diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12026 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 8564, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005838393274170948, + "grad_norm": 2.8441042445199733, + "learning_rate": 5.827505827505828e-07, + "loss": 1.1342, + "step": 5 + }, + { + "epoch": 0.0011676786548341896, + "grad_norm": 2.6915962544965275, + "learning_rate": 1.1655011655011657e-06, + "loss": 1.1191, + "step": 10 + }, + { + "epoch": 0.0017515179822512844, + "grad_norm": 2.493797679678999, + "learning_rate": 1.7482517482517485e-06, + "loss": 1.1026, + "step": 15 + }, + { + "epoch": 0.002335357309668379, + "grad_norm": 1.9460529718731712, + "learning_rate": 2.3310023310023313e-06, + "loss": 1.0839, + "step": 20 + }, + { + "epoch": 0.002919196637085474, + "grad_norm": 1.3422431894949707, + "learning_rate": 2.9137529137529138e-06, + "loss": 1.0331, + "step": 25 + }, + { + "epoch": 0.003503035964502569, + "grad_norm": 1.3292095769200771, + "learning_rate": 3.496503496503497e-06, + "loss": 1.0141, + "step": 30 + }, + { + "epoch": 0.004086875291919664, + "grad_norm": 1.0501059257033536, + "learning_rate": 4.079254079254079e-06, + "loss": 0.9596, + "step": 35 + }, + { + "epoch": 0.004670714619336758, + "grad_norm": 1.0026358935033322, + "learning_rate": 4.662004662004663e-06, + "loss": 0.9244, + "step": 40 + }, + { + "epoch": 0.005254553946753854, + "grad_norm": 0.8828397424182884, + "learning_rate": 5.244755244755245e-06, + "loss": 0.9071, + "step": 45 + }, + { + "epoch": 0.005838393274170948, + "grad_norm": 0.892428474434813, + "learning_rate": 5.8275058275058275e-06, + "loss": 0.8796, + "step": 50 + }, + { + "epoch": 0.006422232601588043, + "grad_norm": 0.8645422537859606, + "learning_rate": 6.41025641025641e-06, + "loss": 0.8939, + "step": 55 + }, + { + "epoch": 0.007006071929005138, + "grad_norm": 0.9772965887821152, + "learning_rate": 6.993006993006994e-06, + "loss": 0.8939, + "step": 60 + }, + { + "epoch": 0.007589911256422233, + "grad_norm": 0.8342203095089143, + "learning_rate": 7.5757575757575764e-06, + "loss": 0.8636, + "step": 65 + }, + { + "epoch": 0.008173750583839328, + "grad_norm": 0.838314283043167, + "learning_rate": 8.158508158508159e-06, + "loss": 0.8683, + "step": 70 + }, + { + "epoch": 0.008757589911256422, + "grad_norm": 0.8531192844964881, + "learning_rate": 8.741258741258741e-06, + "loss": 0.8741, + "step": 75 + }, + { + "epoch": 0.009341429238673517, + "grad_norm": 0.8354241808923245, + "learning_rate": 9.324009324009325e-06, + "loss": 0.8491, + "step": 80 + }, + { + "epoch": 0.009925268566090611, + "grad_norm": 0.8603802781167157, + "learning_rate": 9.906759906759908e-06, + "loss": 0.8469, + "step": 85 + }, + { + "epoch": 0.010509107893507707, + "grad_norm": 0.9094770814499542, + "learning_rate": 1.048951048951049e-05, + "loss": 0.8456, + "step": 90 + }, + { + "epoch": 0.011092947220924802, + "grad_norm": 0.8710487600121014, + "learning_rate": 1.1072261072261073e-05, + "loss": 0.8516, + "step": 95 + }, + { + "epoch": 0.011676786548341896, + "grad_norm": 0.9222268142570368, + "learning_rate": 1.1655011655011655e-05, + "loss": 0.8237, + "step": 100 + }, + { + "epoch": 0.01226062587575899, + "grad_norm": 0.9176565894712276, + "learning_rate": 1.2237762237762239e-05, + "loss": 0.8356, + "step": 105 + }, + { + "epoch": 0.012844465203176086, + "grad_norm": 0.8542848432508544, + "learning_rate": 1.282051282051282e-05, + "loss": 0.8268, + "step": 110 + }, + { + "epoch": 0.01342830453059318, + "grad_norm": 0.8547012949765272, + "learning_rate": 1.3403263403263406e-05, + "loss": 0.8179, + "step": 115 + }, + { + "epoch": 0.014012143858010275, + "grad_norm": 0.8927678257049521, + "learning_rate": 1.3986013986013988e-05, + "loss": 0.8261, + "step": 120 + }, + { + "epoch": 0.014595983185427371, + "grad_norm": 0.8771767679168678, + "learning_rate": 1.456876456876457e-05, + "loss": 0.8156, + "step": 125 + }, + { + "epoch": 0.015179822512844466, + "grad_norm": 0.9346761034876638, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.8156, + "step": 130 + }, + { + "epoch": 0.01576366184026156, + "grad_norm": 0.9967648383739597, + "learning_rate": 1.5734265734265734e-05, + "loss": 0.8236, + "step": 135 + }, + { + "epoch": 0.016347501167678656, + "grad_norm": 0.950604341293268, + "learning_rate": 1.6317016317016318e-05, + "loss": 0.8011, + "step": 140 + }, + { + "epoch": 0.01693134049509575, + "grad_norm": 0.8454797159633483, + "learning_rate": 1.68997668997669e-05, + "loss": 0.8076, + "step": 145 + }, + { + "epoch": 0.017515179822512845, + "grad_norm": 0.9570558077264374, + "learning_rate": 1.7482517482517483e-05, + "loss": 0.7957, + "step": 150 + }, + { + "epoch": 0.01809901914992994, + "grad_norm": 0.9239857096216728, + "learning_rate": 1.8065268065268067e-05, + "loss": 0.8054, + "step": 155 + }, + { + "epoch": 0.018682858477347034, + "grad_norm": 0.9907328060405487, + "learning_rate": 1.864801864801865e-05, + "loss": 0.8123, + "step": 160 + }, + { + "epoch": 0.01926669780476413, + "grad_norm": 0.9282534846828877, + "learning_rate": 1.923076923076923e-05, + "loss": 0.8131, + "step": 165 + }, + { + "epoch": 0.019850537132181222, + "grad_norm": 0.8627148393669002, + "learning_rate": 1.9813519813519816e-05, + "loss": 0.7845, + "step": 170 + }, + { + "epoch": 0.02043437645959832, + "grad_norm": 0.990414565208542, + "learning_rate": 2.0396270396270396e-05, + "loss": 0.8354, + "step": 175 + }, + { + "epoch": 0.021018215787015414, + "grad_norm": 1.0141975698256604, + "learning_rate": 2.097902097902098e-05, + "loss": 0.8142, + "step": 180 + }, + { + "epoch": 0.021602055114432507, + "grad_norm": 0.9843786330547074, + "learning_rate": 2.156177156177156e-05, + "loss": 0.8031, + "step": 185 + }, + { + "epoch": 0.022185894441849603, + "grad_norm": 0.934570429819127, + "learning_rate": 2.2144522144522145e-05, + "loss": 0.8138, + "step": 190 + }, + { + "epoch": 0.0227697337692667, + "grad_norm": 1.0220562837350744, + "learning_rate": 2.272727272727273e-05, + "loss": 0.7814, + "step": 195 + }, + { + "epoch": 0.023353573096683792, + "grad_norm": 0.961440804678633, + "learning_rate": 2.331002331002331e-05, + "loss": 0.7928, + "step": 200 + }, + { + "epoch": 0.023937412424100888, + "grad_norm": 0.94391785220549, + "learning_rate": 2.3892773892773894e-05, + "loss": 0.8014, + "step": 205 + }, + { + "epoch": 0.02452125175151798, + "grad_norm": 0.9664977176711759, + "learning_rate": 2.4475524475524478e-05, + "loss": 0.8239, + "step": 210 + }, + { + "epoch": 0.025105091078935077, + "grad_norm": 1.0561418913054852, + "learning_rate": 2.505827505827506e-05, + "loss": 0.7741, + "step": 215 + }, + { + "epoch": 0.025688930406352173, + "grad_norm": 0.9358379347102888, + "learning_rate": 2.564102564102564e-05, + "loss": 0.7983, + "step": 220 + }, + { + "epoch": 0.026272769733769265, + "grad_norm": 0.8539282155631721, + "learning_rate": 2.6223776223776224e-05, + "loss": 0.793, + "step": 225 + }, + { + "epoch": 0.02685660906118636, + "grad_norm": 0.8554815403018589, + "learning_rate": 2.680652680652681e-05, + "loss": 0.8046, + "step": 230 + }, + { + "epoch": 0.027440448388603458, + "grad_norm": 1.0012026939383618, + "learning_rate": 2.738927738927739e-05, + "loss": 0.7918, + "step": 235 + }, + { + "epoch": 0.02802428771602055, + "grad_norm": 0.9302399659722874, + "learning_rate": 2.7972027972027976e-05, + "loss": 0.7666, + "step": 240 + }, + { + "epoch": 0.028608127043437646, + "grad_norm": 0.9745631817754927, + "learning_rate": 2.8554778554778557e-05, + "loss": 0.7945, + "step": 245 + }, + { + "epoch": 0.029191966370854742, + "grad_norm": 0.9663155411494311, + "learning_rate": 2.913752913752914e-05, + "loss": 0.8208, + "step": 250 + }, + { + "epoch": 0.029775805698271835, + "grad_norm": 0.9665096896834287, + "learning_rate": 2.972027972027972e-05, + "loss": 0.8155, + "step": 255 + }, + { + "epoch": 0.03035964502568893, + "grad_norm": 0.9211131832233894, + "learning_rate": 3.0303030303030306e-05, + "loss": 0.804, + "step": 260 + }, + { + "epoch": 0.030943484353106024, + "grad_norm": 0.9278894919237144, + "learning_rate": 3.088578088578088e-05, + "loss": 0.7997, + "step": 265 + }, + { + "epoch": 0.03152732368052312, + "grad_norm": 0.9453801400501536, + "learning_rate": 3.146853146853147e-05, + "loss": 0.782, + "step": 270 + }, + { + "epoch": 0.032111163007940216, + "grad_norm": 1.0236007828019342, + "learning_rate": 3.205128205128206e-05, + "loss": 0.7889, + "step": 275 + }, + { + "epoch": 0.03269500233535731, + "grad_norm": 0.959267234083067, + "learning_rate": 3.2634032634032635e-05, + "loss": 0.7898, + "step": 280 + }, + { + "epoch": 0.0332788416627744, + "grad_norm": 1.1310249700433657, + "learning_rate": 3.321678321678322e-05, + "loss": 0.8123, + "step": 285 + }, + { + "epoch": 0.0338626809901915, + "grad_norm": 1.187081207110975, + "learning_rate": 3.37995337995338e-05, + "loss": 0.8001, + "step": 290 + }, + { + "epoch": 0.034446520317608594, + "grad_norm": 0.9811581650923609, + "learning_rate": 3.438228438228439e-05, + "loss": 0.7883, + "step": 295 + }, + { + "epoch": 0.03503035964502569, + "grad_norm": 0.9889952112158437, + "learning_rate": 3.4965034965034965e-05, + "loss": 0.7819, + "step": 300 + }, + { + "epoch": 0.035614198972442786, + "grad_norm": 0.9792726994248039, + "learning_rate": 3.554778554778555e-05, + "loss": 0.7908, + "step": 305 + }, + { + "epoch": 0.03619803829985988, + "grad_norm": 1.0330116552686042, + "learning_rate": 3.613053613053613e-05, + "loss": 0.8034, + "step": 310 + }, + { + "epoch": 0.03678187762727697, + "grad_norm": 0.9791641673387794, + "learning_rate": 3.671328671328672e-05, + "loss": 0.7838, + "step": 315 + }, + { + "epoch": 0.03736571695469407, + "grad_norm": 1.087198648692349, + "learning_rate": 3.72960372960373e-05, + "loss": 0.7891, + "step": 320 + }, + { + "epoch": 0.03794955628211116, + "grad_norm": 1.1453703170207792, + "learning_rate": 3.787878787878788e-05, + "loss": 0.7828, + "step": 325 + }, + { + "epoch": 0.03853339560952826, + "grad_norm": 1.1352672915900581, + "learning_rate": 3.846153846153846e-05, + "loss": 0.7944, + "step": 330 + }, + { + "epoch": 0.039117234936945355, + "grad_norm": 1.11091425012644, + "learning_rate": 3.904428904428905e-05, + "loss": 0.8031, + "step": 335 + }, + { + "epoch": 0.039701074264362445, + "grad_norm": 0.9944507467068269, + "learning_rate": 3.962703962703963e-05, + "loss": 0.7886, + "step": 340 + }, + { + "epoch": 0.04028491359177954, + "grad_norm": 1.00813220894139, + "learning_rate": 4.020979020979021e-05, + "loss": 0.7759, + "step": 345 + }, + { + "epoch": 0.04086875291919664, + "grad_norm": 1.0770061694184807, + "learning_rate": 4.079254079254079e-05, + "loss": 0.788, + "step": 350 + }, + { + "epoch": 0.04145259224661373, + "grad_norm": 0.9596462282334578, + "learning_rate": 4.1375291375291377e-05, + "loss": 0.7962, + "step": 355 + }, + { + "epoch": 0.04203643157403083, + "grad_norm": 1.1540120979481814, + "learning_rate": 4.195804195804196e-05, + "loss": 0.797, + "step": 360 + }, + { + "epoch": 0.04262027090144792, + "grad_norm": 1.080346740497132, + "learning_rate": 4.254079254079254e-05, + "loss": 0.7954, + "step": 365 + }, + { + "epoch": 0.043204110228865014, + "grad_norm": 1.0577009933461277, + "learning_rate": 4.312354312354312e-05, + "loss": 0.7877, + "step": 370 + }, + { + "epoch": 0.04378794955628211, + "grad_norm": 0.9154501100858539, + "learning_rate": 4.370629370629371e-05, + "loss": 0.7587, + "step": 375 + }, + { + "epoch": 0.044371788883699206, + "grad_norm": 1.0086154566043717, + "learning_rate": 4.428904428904429e-05, + "loss": 0.7761, + "step": 380 + }, + { + "epoch": 0.0449556282111163, + "grad_norm": 0.9201594510651068, + "learning_rate": 4.4871794871794874e-05, + "loss": 0.7981, + "step": 385 + }, + { + "epoch": 0.0455394675385334, + "grad_norm": 0.9666106924921596, + "learning_rate": 4.545454545454546e-05, + "loss": 0.7832, + "step": 390 + }, + { + "epoch": 0.04612330686595049, + "grad_norm": 0.9815206957934799, + "learning_rate": 4.603729603729604e-05, + "loss": 0.7804, + "step": 395 + }, + { + "epoch": 0.046707146193367584, + "grad_norm": 1.0289413215392271, + "learning_rate": 4.662004662004662e-05, + "loss": 0.8053, + "step": 400 + }, + { + "epoch": 0.04729098552078468, + "grad_norm": 0.9565887758710183, + "learning_rate": 4.7202797202797204e-05, + "loss": 0.7905, + "step": 405 + }, + { + "epoch": 0.047874824848201776, + "grad_norm": 1.0540380731530687, + "learning_rate": 4.778554778554779e-05, + "loss": 0.8063, + "step": 410 + }, + { + "epoch": 0.04845866417561887, + "grad_norm": 1.0638685751627128, + "learning_rate": 4.836829836829837e-05, + "loss": 0.7806, + "step": 415 + }, + { + "epoch": 0.04904250350303596, + "grad_norm": 0.9719537263872465, + "learning_rate": 4.8951048951048956e-05, + "loss": 0.7731, + "step": 420 + }, + { + "epoch": 0.04962634283045306, + "grad_norm": 1.1429570378556555, + "learning_rate": 4.9533799533799534e-05, + "loss": 0.8072, + "step": 425 + }, + { + "epoch": 0.050210182157870153, + "grad_norm": 0.9863534850571196, + "learning_rate": 4.999999832221176e-05, + "loss": 0.7794, + "step": 430 + }, + { + "epoch": 0.05079402148528725, + "grad_norm": 0.9364682491937721, + "learning_rate": 4.999993959964937e-05, + "loss": 0.7781, + "step": 435 + }, + { + "epoch": 0.051377860812704346, + "grad_norm": 0.9768522284524611, + "learning_rate": 4.999979698792484e-05, + "loss": 0.7949, + "step": 440 + }, + { + "epoch": 0.05196170014012144, + "grad_norm": 1.2907678437139853, + "learning_rate": 4.999957048756989e-05, + "loss": 0.7946, + "step": 445 + }, + { + "epoch": 0.05254553946753853, + "grad_norm": 0.9378885425942916, + "learning_rate": 4.999926009942899e-05, + "loss": 0.7881, + "step": 450 + }, + { + "epoch": 0.05312937879495563, + "grad_norm": 0.8634203598226072, + "learning_rate": 4.999886582465941e-05, + "loss": 0.8047, + "step": 455 + }, + { + "epoch": 0.05371321812237272, + "grad_norm": 0.9278097068563849, + "learning_rate": 4.999838766473116e-05, + "loss": 0.7856, + "step": 460 + }, + { + "epoch": 0.05429705744978982, + "grad_norm": 0.8609084473770569, + "learning_rate": 4.999782562142702e-05, + "loss": 0.7634, + "step": 465 + }, + { + "epoch": 0.054880896777206915, + "grad_norm": 1.0123125316299562, + "learning_rate": 4.999717969684254e-05, + "loss": 0.7955, + "step": 470 + }, + { + "epoch": 0.055464736104624004, + "grad_norm": 1.0346559365063257, + "learning_rate": 4.999644989338598e-05, + "loss": 0.8073, + "step": 475 + }, + { + "epoch": 0.0560485754320411, + "grad_norm": 1.1426053676638168, + "learning_rate": 4.9995636213778354e-05, + "loss": 0.7719, + "step": 480 + }, + { + "epoch": 0.0566324147594582, + "grad_norm": 0.9150402070927298, + "learning_rate": 4.99947386610534e-05, + "loss": 0.7764, + "step": 485 + }, + { + "epoch": 0.05721625408687529, + "grad_norm": 0.987847326887783, + "learning_rate": 4.9993757238557564e-05, + "loss": 0.7913, + "step": 490 + }, + { + "epoch": 0.05780009341429239, + "grad_norm": 0.9689685701244068, + "learning_rate": 4.999269194995001e-05, + "loss": 0.7566, + "step": 495 + }, + { + "epoch": 0.058383932741709485, + "grad_norm": 0.9521343691976072, + "learning_rate": 4.9991542799202574e-05, + "loss": 0.7682, + "step": 500 + }, + { + "epoch": 0.058967772069126574, + "grad_norm": 0.8498214034434336, + "learning_rate": 4.999030979059977e-05, + "loss": 0.7898, + "step": 505 + }, + { + "epoch": 0.05955161139654367, + "grad_norm": 0.884610027782479, + "learning_rate": 4.998899292873876e-05, + "loss": 0.7933, + "step": 510 + }, + { + "epoch": 0.060135450723960766, + "grad_norm": 0.9009920893944248, + "learning_rate": 4.9987592218529364e-05, + "loss": 0.7807, + "step": 515 + }, + { + "epoch": 0.06071929005137786, + "grad_norm": 0.982774008971455, + "learning_rate": 4.998610766519401e-05, + "loss": 0.7755, + "step": 520 + }, + { + "epoch": 0.06130312937879496, + "grad_norm": 0.9363074199246613, + "learning_rate": 4.9984539274267726e-05, + "loss": 0.7678, + "step": 525 + }, + { + "epoch": 0.06188696870621205, + "grad_norm": 0.8818074337149633, + "learning_rate": 4.998288705159815e-05, + "loss": 0.7985, + "step": 530 + }, + { + "epoch": 0.062470808033629144, + "grad_norm": 0.9582202860157079, + "learning_rate": 4.9981151003345436e-05, + "loss": 0.7718, + "step": 535 + }, + { + "epoch": 0.06305464736104624, + "grad_norm": 0.9799283014400538, + "learning_rate": 4.99793311359823e-05, + "loss": 0.7965, + "step": 540 + }, + { + "epoch": 0.06363848668846334, + "grad_norm": 1.0277101299431106, + "learning_rate": 4.997742745629397e-05, + "loss": 0.7912, + "step": 545 + }, + { + "epoch": 0.06422232601588043, + "grad_norm": 0.9196693224733947, + "learning_rate": 4.997543997137816e-05, + "loss": 0.7742, + "step": 550 + }, + { + "epoch": 0.06480616534329753, + "grad_norm": 1.1756967119581367, + "learning_rate": 4.9973368688645034e-05, + "loss": 0.7904, + "step": 555 + }, + { + "epoch": 0.06539000467071462, + "grad_norm": 0.9727112642584299, + "learning_rate": 4.997121361581721e-05, + "loss": 0.7702, + "step": 560 + }, + { + "epoch": 0.06597384399813172, + "grad_norm": 0.9340751229066998, + "learning_rate": 4.9968974760929694e-05, + "loss": 0.7611, + "step": 565 + }, + { + "epoch": 0.0665576833255488, + "grad_norm": 0.8451161838818951, + "learning_rate": 4.996665213232987e-05, + "loss": 0.7394, + "step": 570 + }, + { + "epoch": 0.0671415226529659, + "grad_norm": 0.9437782768508981, + "learning_rate": 4.9964245738677465e-05, + "loss": 0.7721, + "step": 575 + }, + { + "epoch": 0.067725361980383, + "grad_norm": 0.8666049009060005, + "learning_rate": 4.996175558894452e-05, + "loss": 0.7649, + "step": 580 + }, + { + "epoch": 0.06830920130780009, + "grad_norm": 0.9088580093169708, + "learning_rate": 4.9959181692415345e-05, + "loss": 0.7895, + "step": 585 + }, + { + "epoch": 0.06889304063521719, + "grad_norm": 1.0295119037274578, + "learning_rate": 4.995652405868652e-05, + "loss": 0.7724, + "step": 590 + }, + { + "epoch": 0.06947687996263428, + "grad_norm": 0.9432589650717397, + "learning_rate": 4.99537826976668e-05, + "loss": 0.7707, + "step": 595 + }, + { + "epoch": 0.07006071929005138, + "grad_norm": 1.1052404077350295, + "learning_rate": 4.9950957619577115e-05, + "loss": 0.7931, + "step": 600 + }, + { + "epoch": 0.07064455861746848, + "grad_norm": 0.9419128366721217, + "learning_rate": 4.9948048834950546e-05, + "loss": 0.7881, + "step": 605 + }, + { + "epoch": 0.07122839794488557, + "grad_norm": 0.954093991240324, + "learning_rate": 4.9945056354632255e-05, + "loss": 0.7806, + "step": 610 + }, + { + "epoch": 0.07181223727230267, + "grad_norm": 1.0272959906124222, + "learning_rate": 4.994198018977945e-05, + "loss": 0.7791, + "step": 615 + }, + { + "epoch": 0.07239607659971976, + "grad_norm": 0.8921172892298924, + "learning_rate": 4.993882035186136e-05, + "loss": 0.7755, + "step": 620 + }, + { + "epoch": 0.07297991592713685, + "grad_norm": 0.9518130993302121, + "learning_rate": 4.9935576852659175e-05, + "loss": 0.7655, + "step": 625 + }, + { + "epoch": 0.07356375525455394, + "grad_norm": 0.8353315117685729, + "learning_rate": 4.993224970426603e-05, + "loss": 0.7547, + "step": 630 + }, + { + "epoch": 0.07414759458197104, + "grad_norm": 0.8586464703013177, + "learning_rate": 4.99288389190869e-05, + "loss": 0.7703, + "step": 635 + }, + { + "epoch": 0.07473143390938813, + "grad_norm": 0.889200186047205, + "learning_rate": 4.992534450983864e-05, + "loss": 0.784, + "step": 640 + }, + { + "epoch": 0.07531527323680523, + "grad_norm": 1.0890280918725501, + "learning_rate": 4.9921766489549835e-05, + "loss": 0.7829, + "step": 645 + }, + { + "epoch": 0.07589911256422233, + "grad_norm": 0.8261199193795216, + "learning_rate": 4.991810487156087e-05, + "loss": 0.7644, + "step": 650 + }, + { + "epoch": 0.07648295189163942, + "grad_norm": 0.9504620852001426, + "learning_rate": 4.991435966952376e-05, + "loss": 0.7865, + "step": 655 + }, + { + "epoch": 0.07706679121905652, + "grad_norm": 0.9843549810556219, + "learning_rate": 4.991053089740219e-05, + "loss": 0.773, + "step": 660 + }, + { + "epoch": 0.07765063054647361, + "grad_norm": 1.100665136082888, + "learning_rate": 4.990661856947142e-05, + "loss": 0.7745, + "step": 665 + }, + { + "epoch": 0.07823446987389071, + "grad_norm": 1.0689962692182553, + "learning_rate": 4.990262270031824e-05, + "loss": 0.749, + "step": 670 + }, + { + "epoch": 0.0788183092013078, + "grad_norm": 0.86450476503251, + "learning_rate": 4.989854330484092e-05, + "loss": 0.7923, + "step": 675 + }, + { + "epoch": 0.07940214852872489, + "grad_norm": 0.9313631425218568, + "learning_rate": 4.9894380398249135e-05, + "loss": 0.7851, + "step": 680 + }, + { + "epoch": 0.07998598785614199, + "grad_norm": 0.9422363860930384, + "learning_rate": 4.989013399606396e-05, + "loss": 0.7589, + "step": 685 + }, + { + "epoch": 0.08056982718355908, + "grad_norm": 0.9383975558454795, + "learning_rate": 4.988580411411774e-05, + "loss": 0.7577, + "step": 690 + }, + { + "epoch": 0.08115366651097618, + "grad_norm": 0.8678732287007085, + "learning_rate": 4.988139076855408e-05, + "loss": 0.7808, + "step": 695 + }, + { + "epoch": 0.08173750583839327, + "grad_norm": 0.9443261935414679, + "learning_rate": 4.9876893975827774e-05, + "loss": 0.7678, + "step": 700 + }, + { + "epoch": 0.08232134516581037, + "grad_norm": 0.893410559184795, + "learning_rate": 4.987231375270475e-05, + "loss": 0.7859, + "step": 705 + }, + { + "epoch": 0.08290518449322747, + "grad_norm": 0.908958440524122, + "learning_rate": 4.9867650116261994e-05, + "loss": 0.7507, + "step": 710 + }, + { + "epoch": 0.08348902382064456, + "grad_norm": 0.9226106440329441, + "learning_rate": 4.986290308388747e-05, + "loss": 0.7676, + "step": 715 + }, + { + "epoch": 0.08407286314806166, + "grad_norm": 0.8181830616959214, + "learning_rate": 4.98580726732801e-05, + "loss": 0.7931, + "step": 720 + }, + { + "epoch": 0.08465670247547875, + "grad_norm": 0.9202581836834515, + "learning_rate": 4.985315890244969e-05, + "loss": 0.748, + "step": 725 + }, + { + "epoch": 0.08524054180289584, + "grad_norm": 0.8637096331068913, + "learning_rate": 4.9848161789716804e-05, + "loss": 0.7902, + "step": 730 + }, + { + "epoch": 0.08582438113031293, + "grad_norm": 1.0251325008801968, + "learning_rate": 4.9843081353712765e-05, + "loss": 0.7838, + "step": 735 + }, + { + "epoch": 0.08640822045773003, + "grad_norm": 0.9799386743626911, + "learning_rate": 4.983791761337958e-05, + "loss": 0.7528, + "step": 740 + }, + { + "epoch": 0.08699205978514712, + "grad_norm": 1.1092058160482392, + "learning_rate": 4.9832670587969804e-05, + "loss": 0.7717, + "step": 745 + }, + { + "epoch": 0.08757589911256422, + "grad_norm": 1.0895863177487297, + "learning_rate": 4.9827340297046546e-05, + "loss": 0.747, + "step": 750 + }, + { + "epoch": 0.08815973843998132, + "grad_norm": 0.8899437101345212, + "learning_rate": 4.9821926760483354e-05, + "loss": 0.772, + "step": 755 + }, + { + "epoch": 0.08874357776739841, + "grad_norm": 1.0639534479618253, + "learning_rate": 4.9816429998464155e-05, + "loss": 0.7671, + "step": 760 + }, + { + "epoch": 0.08932741709481551, + "grad_norm": 0.9208650929263082, + "learning_rate": 4.9810850031483155e-05, + "loss": 0.7498, + "step": 765 + }, + { + "epoch": 0.0899112564222326, + "grad_norm": 1.1978688347410331, + "learning_rate": 4.9805186880344826e-05, + "loss": 0.7656, + "step": 770 + }, + { + "epoch": 0.0904950957496497, + "grad_norm": 0.8388720338093995, + "learning_rate": 4.9799440566163726e-05, + "loss": 0.7634, + "step": 775 + }, + { + "epoch": 0.0910789350770668, + "grad_norm": 1.0025093358414843, + "learning_rate": 4.979361111036454e-05, + "loss": 0.779, + "step": 780 + }, + { + "epoch": 0.09166277440448388, + "grad_norm": 0.8893526188128671, + "learning_rate": 4.9787698534681896e-05, + "loss": 0.784, + "step": 785 + }, + { + "epoch": 0.09224661373190098, + "grad_norm": 0.9052342021850083, + "learning_rate": 4.978170286116035e-05, + "loss": 0.7654, + "step": 790 + }, + { + "epoch": 0.09283045305931807, + "grad_norm": 0.7721847329666197, + "learning_rate": 4.9775624112154275e-05, + "loss": 0.7434, + "step": 795 + }, + { + "epoch": 0.09341429238673517, + "grad_norm": 0.9097917170134736, + "learning_rate": 4.976946231032777e-05, + "loss": 0.7375, + "step": 800 + }, + { + "epoch": 0.09399813171415226, + "grad_norm": 0.855353795632668, + "learning_rate": 4.976321747865462e-05, + "loss": 0.7678, + "step": 805 + }, + { + "epoch": 0.09458197104156936, + "grad_norm": 0.9001694450903074, + "learning_rate": 4.975688964041816e-05, + "loss": 0.7696, + "step": 810 + }, + { + "epoch": 0.09516581036898646, + "grad_norm": 0.9036123636475458, + "learning_rate": 4.975047881921119e-05, + "loss": 0.7382, + "step": 815 + }, + { + "epoch": 0.09574964969640355, + "grad_norm": 0.7946893689315172, + "learning_rate": 4.974398503893596e-05, + "loss": 0.7701, + "step": 820 + }, + { + "epoch": 0.09633348902382065, + "grad_norm": 0.8207619623724253, + "learning_rate": 4.973740832380397e-05, + "loss": 0.7788, + "step": 825 + }, + { + "epoch": 0.09691732835123774, + "grad_norm": 0.9272781944556916, + "learning_rate": 4.9730748698335954e-05, + "loss": 0.79, + "step": 830 + }, + { + "epoch": 0.09750116767865484, + "grad_norm": 0.9100307149491746, + "learning_rate": 4.9724006187361794e-05, + "loss": 0.7823, + "step": 835 + }, + { + "epoch": 0.09808500700607192, + "grad_norm": 0.8933455668978604, + "learning_rate": 4.971718081602037e-05, + "loss": 0.7968, + "step": 840 + }, + { + "epoch": 0.09866884633348902, + "grad_norm": 0.8676070117116492, + "learning_rate": 4.971027260975952e-05, + "loss": 0.763, + "step": 845 + }, + { + "epoch": 0.09925268566090611, + "grad_norm": 0.8909257634802544, + "learning_rate": 4.9703281594335904e-05, + "loss": 0.7597, + "step": 850 + }, + { + "epoch": 0.09983652498832321, + "grad_norm": 0.8735398370337358, + "learning_rate": 4.969620779581497e-05, + "loss": 0.7643, + "step": 855 + }, + { + "epoch": 0.10042036431574031, + "grad_norm": 1.0661381628959834, + "learning_rate": 4.968905124057077e-05, + "loss": 0.7495, + "step": 860 + }, + { + "epoch": 0.1010042036431574, + "grad_norm": 0.997806166540193, + "learning_rate": 4.968181195528594e-05, + "loss": 0.7488, + "step": 865 + }, + { + "epoch": 0.1015880429705745, + "grad_norm": 0.8212097797501837, + "learning_rate": 4.9674489966951545e-05, + "loss": 0.7656, + "step": 870 + }, + { + "epoch": 0.1021718822979916, + "grad_norm": 0.9256535225832584, + "learning_rate": 4.9667085302867015e-05, + "loss": 0.7499, + "step": 875 + }, + { + "epoch": 0.10275572162540869, + "grad_norm": 0.8960932163986034, + "learning_rate": 4.9659597990640045e-05, + "loss": 0.752, + "step": 880 + }, + { + "epoch": 0.10333956095282579, + "grad_norm": 0.9177441285582005, + "learning_rate": 4.9652028058186435e-05, + "loss": 0.7639, + "step": 885 + }, + { + "epoch": 0.10392340028024288, + "grad_norm": 0.9855352847812788, + "learning_rate": 4.9644375533730056e-05, + "loss": 0.7861, + "step": 890 + }, + { + "epoch": 0.10450723960765997, + "grad_norm": 1.1755603725509243, + "learning_rate": 4.963664044580272e-05, + "loss": 0.7644, + "step": 895 + }, + { + "epoch": 0.10509107893507706, + "grad_norm": 0.9704947639839278, + "learning_rate": 4.9628822823244056e-05, + "loss": 0.7457, + "step": 900 + }, + { + "epoch": 0.10567491826249416, + "grad_norm": 0.8135916392341626, + "learning_rate": 4.962092269520143e-05, + "loss": 0.743, + "step": 905 + }, + { + "epoch": 0.10625875758991125, + "grad_norm": 0.9718963175942106, + "learning_rate": 4.96129400911298e-05, + "loss": 0.7747, + "step": 910 + }, + { + "epoch": 0.10684259691732835, + "grad_norm": 0.8056530889300901, + "learning_rate": 4.960487504079166e-05, + "loss": 0.7468, + "step": 915 + }, + { + "epoch": 0.10742643624474545, + "grad_norm": 1.0253838704385734, + "learning_rate": 4.959672757425688e-05, + "loss": 0.7689, + "step": 920 + }, + { + "epoch": 0.10801027557216254, + "grad_norm": 0.8972042104044596, + "learning_rate": 4.958849772190261e-05, + "loss": 0.7634, + "step": 925 + }, + { + "epoch": 0.10859411489957964, + "grad_norm": 0.8252991480283688, + "learning_rate": 4.958018551441317e-05, + "loss": 0.7423, + "step": 930 + }, + { + "epoch": 0.10917795422699673, + "grad_norm": 1.0182109584106913, + "learning_rate": 4.957179098277994e-05, + "loss": 0.7679, + "step": 935 + }, + { + "epoch": 0.10976179355441383, + "grad_norm": 0.8857745022475216, + "learning_rate": 4.956331415830125e-05, + "loss": 0.7367, + "step": 940 + }, + { + "epoch": 0.11034563288183093, + "grad_norm": 0.9324065138620737, + "learning_rate": 4.955475507258222e-05, + "loss": 0.7713, + "step": 945 + }, + { + "epoch": 0.11092947220924801, + "grad_norm": 0.8898078164147581, + "learning_rate": 4.95461137575347e-05, + "loss": 0.7414, + "step": 950 + }, + { + "epoch": 0.1115133115366651, + "grad_norm": 0.9976876371835823, + "learning_rate": 4.953739024537712e-05, + "loss": 0.724, + "step": 955 + }, + { + "epoch": 0.1120971508640822, + "grad_norm": 0.7955416046502354, + "learning_rate": 4.952858456863437e-05, + "loss": 0.7772, + "step": 960 + }, + { + "epoch": 0.1126809901914993, + "grad_norm": 0.9382741081225117, + "learning_rate": 4.951969676013768e-05, + "loss": 0.7523, + "step": 965 + }, + { + "epoch": 0.1132648295189164, + "grad_norm": 0.903157971998188, + "learning_rate": 4.951072685302452e-05, + "loss": 0.7503, + "step": 970 + }, + { + "epoch": 0.11384866884633349, + "grad_norm": 0.83555937924101, + "learning_rate": 4.950167488073844e-05, + "loss": 0.742, + "step": 975 + }, + { + "epoch": 0.11443250817375059, + "grad_norm": 0.9192402633470362, + "learning_rate": 4.949254087702896e-05, + "loss": 0.7509, + "step": 980 + }, + { + "epoch": 0.11501634750116768, + "grad_norm": 0.7677750625588471, + "learning_rate": 4.948332487595148e-05, + "loss": 0.7572, + "step": 985 + }, + { + "epoch": 0.11560018682858478, + "grad_norm": 0.7809723688586555, + "learning_rate": 4.9474026911867084e-05, + "loss": 0.7603, + "step": 990 + }, + { + "epoch": 0.11618402615600187, + "grad_norm": 0.8246656807985904, + "learning_rate": 4.9464647019442465e-05, + "loss": 0.7269, + "step": 995 + }, + { + "epoch": 0.11676786548341897, + "grad_norm": 0.9446736222402072, + "learning_rate": 4.945518523364976e-05, + "loss": 0.758, + "step": 1000 + }, + { + "epoch": 0.11735170481083605, + "grad_norm": 0.8111859145900984, + "learning_rate": 4.944564158976647e-05, + "loss": 0.7619, + "step": 1005 + }, + { + "epoch": 0.11793554413825315, + "grad_norm": 0.9039015274993903, + "learning_rate": 4.943601612337528e-05, + "loss": 0.7535, + "step": 1010 + }, + { + "epoch": 0.11851938346567024, + "grad_norm": 1.0091119488608187, + "learning_rate": 4.9426308870363934e-05, + "loss": 0.7536, + "step": 1015 + }, + { + "epoch": 0.11910322279308734, + "grad_norm": 0.842082203309652, + "learning_rate": 4.941651986692514e-05, + "loss": 0.7656, + "step": 1020 + }, + { + "epoch": 0.11968706212050444, + "grad_norm": 0.8944306880913642, + "learning_rate": 4.940664914955637e-05, + "loss": 0.7566, + "step": 1025 + }, + { + "epoch": 0.12027090144792153, + "grad_norm": 0.7868127840279026, + "learning_rate": 4.939669675505978e-05, + "loss": 0.7677, + "step": 1030 + }, + { + "epoch": 0.12085474077533863, + "grad_norm": 0.8392305677420804, + "learning_rate": 4.938666272054205e-05, + "loss": 0.7372, + "step": 1035 + }, + { + "epoch": 0.12143858010275572, + "grad_norm": 0.8275154165543548, + "learning_rate": 4.937654708341425e-05, + "loss": 0.7684, + "step": 1040 + }, + { + "epoch": 0.12202241943017282, + "grad_norm": 1.0271569251545638, + "learning_rate": 4.93663498813917e-05, + "loss": 0.7559, + "step": 1045 + }, + { + "epoch": 0.12260625875758992, + "grad_norm": 0.8001356326071442, + "learning_rate": 4.9356071152493815e-05, + "loss": 0.7479, + "step": 1050 + }, + { + "epoch": 0.123190098085007, + "grad_norm": 0.7615039077476046, + "learning_rate": 4.934571093504398e-05, + "loss": 0.7436, + "step": 1055 + }, + { + "epoch": 0.1237739374124241, + "grad_norm": 0.8327757922786666, + "learning_rate": 4.933526926766943e-05, + "loss": 0.7756, + "step": 1060 + }, + { + "epoch": 0.12435777673984119, + "grad_norm": 0.782215645789506, + "learning_rate": 4.9324746189301027e-05, + "loss": 0.7631, + "step": 1065 + }, + { + "epoch": 0.12494161606725829, + "grad_norm": 0.7784532705295107, + "learning_rate": 4.9314141739173223e-05, + "loss": 0.7589, + "step": 1070 + }, + { + "epoch": 0.12552545539467538, + "grad_norm": 0.8109850694960712, + "learning_rate": 4.9303455956823816e-05, + "loss": 0.7711, + "step": 1075 + }, + { + "epoch": 0.12610929472209248, + "grad_norm": 0.8610019246237505, + "learning_rate": 4.929268888209388e-05, + "loss": 0.7381, + "step": 1080 + }, + { + "epoch": 0.12669313404950958, + "grad_norm": 0.8738321504500893, + "learning_rate": 4.928184055512754e-05, + "loss": 0.7527, + "step": 1085 + }, + { + "epoch": 0.12727697337692667, + "grad_norm": 0.7909309571669735, + "learning_rate": 4.927091101637189e-05, + "loss": 0.7435, + "step": 1090 + }, + { + "epoch": 0.12786081270434377, + "grad_norm": 0.8289072357424887, + "learning_rate": 4.9259900306576825e-05, + "loss": 0.7299, + "step": 1095 + }, + { + "epoch": 0.12844465203176086, + "grad_norm": 0.8249302215432155, + "learning_rate": 4.924880846679485e-05, + "loss": 0.7479, + "step": 1100 + }, + { + "epoch": 0.12902849135917796, + "grad_norm": 0.759611385335739, + "learning_rate": 4.923763553838098e-05, + "loss": 0.7535, + "step": 1105 + }, + { + "epoch": 0.12961233068659506, + "grad_norm": 0.7534304608019847, + "learning_rate": 4.9226381562992546e-05, + "loss": 0.7599, + "step": 1110 + }, + { + "epoch": 0.13019617001401215, + "grad_norm": 0.7560322412131688, + "learning_rate": 4.9215046582589066e-05, + "loss": 0.7515, + "step": 1115 + }, + { + "epoch": 0.13078000934142925, + "grad_norm": 0.8830214475388138, + "learning_rate": 4.9203630639432083e-05, + "loss": 0.7437, + "step": 1120 + }, + { + "epoch": 0.13136384866884634, + "grad_norm": 1.0110046096851761, + "learning_rate": 4.919213377608499e-05, + "loss": 0.7612, + "step": 1125 + }, + { + "epoch": 0.13194768799626344, + "grad_norm": 0.8362938959468142, + "learning_rate": 4.9180556035412876e-05, + "loss": 0.7444, + "step": 1130 + }, + { + "epoch": 0.13253152732368054, + "grad_norm": 1.0345951790579992, + "learning_rate": 4.916889746058242e-05, + "loss": 0.7683, + "step": 1135 + }, + { + "epoch": 0.1331153666510976, + "grad_norm": 0.9305435521245572, + "learning_rate": 4.9157158095061636e-05, + "loss": 0.7439, + "step": 1140 + }, + { + "epoch": 0.1336992059785147, + "grad_norm": 0.9211532554113626, + "learning_rate": 4.914533798261977e-05, + "loss": 0.7392, + "step": 1145 + }, + { + "epoch": 0.1342830453059318, + "grad_norm": 1.0550561983312177, + "learning_rate": 4.913343716732713e-05, + "loss": 0.7592, + "step": 1150 + }, + { + "epoch": 0.1348668846333489, + "grad_norm": 0.8617952378635508, + "learning_rate": 4.912145569355495e-05, + "loss": 0.7462, + "step": 1155 + }, + { + "epoch": 0.135450723960766, + "grad_norm": 0.8369067397902832, + "learning_rate": 4.910939360597514e-05, + "loss": 0.7325, + "step": 1160 + }, + { + "epoch": 0.13603456328818309, + "grad_norm": 0.8662955701532267, + "learning_rate": 4.909725094956019e-05, + "loss": 0.7572, + "step": 1165 + }, + { + "epoch": 0.13661840261560018, + "grad_norm": 0.8781226126848558, + "learning_rate": 4.908502776958301e-05, + "loss": 0.7414, + "step": 1170 + }, + { + "epoch": 0.13720224194301728, + "grad_norm": 1.0068828191786867, + "learning_rate": 4.907272411161668e-05, + "loss": 0.7407, + "step": 1175 + }, + { + "epoch": 0.13778608127043437, + "grad_norm": 0.8102116162804266, + "learning_rate": 4.9060340021534415e-05, + "loss": 0.7424, + "step": 1180 + }, + { + "epoch": 0.13836992059785147, + "grad_norm": 0.8182089508424056, + "learning_rate": 4.9047875545509235e-05, + "loss": 0.7369, + "step": 1185 + }, + { + "epoch": 0.13895375992526857, + "grad_norm": 0.8533530387273335, + "learning_rate": 4.9035330730013926e-05, + "loss": 0.7384, + "step": 1190 + }, + { + "epoch": 0.13953759925268566, + "grad_norm": 0.8163147298892877, + "learning_rate": 4.9022705621820786e-05, + "loss": 0.7241, + "step": 1195 + }, + { + "epoch": 0.14012143858010276, + "grad_norm": 0.885744418567555, + "learning_rate": 4.901000026800148e-05, + "loss": 0.7528, + "step": 1200 + }, + { + "epoch": 0.14070527790751985, + "grad_norm": 0.8738516104757809, + "learning_rate": 4.899721471592688e-05, + "loss": 0.732, + "step": 1205 + }, + { + "epoch": 0.14128911723493695, + "grad_norm": 0.8068156315381868, + "learning_rate": 4.898434901326685e-05, + "loss": 0.726, + "step": 1210 + }, + { + "epoch": 0.14187295656235405, + "grad_norm": 0.7856848802153881, + "learning_rate": 4.897140320799011e-05, + "loss": 0.7325, + "step": 1215 + }, + { + "epoch": 0.14245679588977114, + "grad_norm": 0.8355251687297792, + "learning_rate": 4.8958377348364e-05, + "loss": 0.7528, + "step": 1220 + }, + { + "epoch": 0.14304063521718824, + "grad_norm": 0.7684488878166239, + "learning_rate": 4.894527148295438e-05, + "loss": 0.7509, + "step": 1225 + }, + { + "epoch": 0.14362447454460534, + "grad_norm": 1.0616657426243217, + "learning_rate": 4.8932085660625374e-05, + "loss": 0.7547, + "step": 1230 + }, + { + "epoch": 0.14420831387202243, + "grad_norm": 0.9230922768754513, + "learning_rate": 4.8918819930539244e-05, + "loss": 0.7397, + "step": 1235 + }, + { + "epoch": 0.14479215319943953, + "grad_norm": 0.9018167634804453, + "learning_rate": 4.8905474342156144e-05, + "loss": 0.7488, + "step": 1240 + }, + { + "epoch": 0.1453759925268566, + "grad_norm": 0.8880821241737537, + "learning_rate": 4.889204894523401e-05, + "loss": 0.7489, + "step": 1245 + }, + { + "epoch": 0.1459598318542737, + "grad_norm": 0.8306320426031331, + "learning_rate": 4.8878543789828314e-05, + "loss": 0.7403, + "step": 1250 + }, + { + "epoch": 0.1465436711816908, + "grad_norm": 0.8903153304358166, + "learning_rate": 4.886495892629191e-05, + "loss": 0.7558, + "step": 1255 + }, + { + "epoch": 0.14712751050910788, + "grad_norm": 0.8380421321813797, + "learning_rate": 4.8851294405274855e-05, + "loss": 0.7252, + "step": 1260 + }, + { + "epoch": 0.14771134983652498, + "grad_norm": 0.9364679991310748, + "learning_rate": 4.8837550277724165e-05, + "loss": 0.734, + "step": 1265 + }, + { + "epoch": 0.14829518916394208, + "grad_norm": 0.9251425941623983, + "learning_rate": 4.8823726594883696e-05, + "loss": 0.7472, + "step": 1270 + }, + { + "epoch": 0.14887902849135917, + "grad_norm": 0.9273829246305205, + "learning_rate": 4.8809823408293887e-05, + "loss": 0.7477, + "step": 1275 + }, + { + "epoch": 0.14946286781877627, + "grad_norm": 0.7946084300359817, + "learning_rate": 4.8795840769791634e-05, + "loss": 0.7515, + "step": 1280 + }, + { + "epoch": 0.15004670714619336, + "grad_norm": 0.8503320421933336, + "learning_rate": 4.878177873151004e-05, + "loss": 0.7465, + "step": 1285 + }, + { + "epoch": 0.15063054647361046, + "grad_norm": 0.8694883327865317, + "learning_rate": 4.876763734587825e-05, + "loss": 0.738, + "step": 1290 + }, + { + "epoch": 0.15121438580102756, + "grad_norm": 0.7569044838064122, + "learning_rate": 4.8753416665621255e-05, + "loss": 0.7217, + "step": 1295 + }, + { + "epoch": 0.15179822512844465, + "grad_norm": 0.810783326961325, + "learning_rate": 4.873911674375968e-05, + "loss": 0.7536, + "step": 1300 + }, + { + "epoch": 0.15238206445586175, + "grad_norm": 0.8698111120094745, + "learning_rate": 4.87247376336096e-05, + "loss": 0.7481, + "step": 1305 + }, + { + "epoch": 0.15296590378327884, + "grad_norm": 0.9178163836621003, + "learning_rate": 4.8710279388782345e-05, + "loss": 0.7401, + "step": 1310 + }, + { + "epoch": 0.15354974311069594, + "grad_norm": 0.8785969161453483, + "learning_rate": 4.869574206318427e-05, + "loss": 0.7308, + "step": 1315 + }, + { + "epoch": 0.15413358243811304, + "grad_norm": 0.787087494439056, + "learning_rate": 4.868112571101659e-05, + "loss": 0.7142, + "step": 1320 + }, + { + "epoch": 0.15471742176553013, + "grad_norm": 0.9221866180118231, + "learning_rate": 4.866643038677519e-05, + "loss": 0.7439, + "step": 1325 + }, + { + "epoch": 0.15530126109294723, + "grad_norm": 0.8861566553427299, + "learning_rate": 4.865165614525033e-05, + "loss": 0.7452, + "step": 1330 + }, + { + "epoch": 0.15588510042036433, + "grad_norm": 0.8470239189350767, + "learning_rate": 4.863680304152657e-05, + "loss": 0.7486, + "step": 1335 + }, + { + "epoch": 0.15646893974778142, + "grad_norm": 0.8491579579103056, + "learning_rate": 4.862187113098249e-05, + "loss": 0.7489, + "step": 1340 + }, + { + "epoch": 0.15705277907519852, + "grad_norm": 0.9525828664204632, + "learning_rate": 4.8606860469290454e-05, + "loss": 0.7364, + "step": 1345 + }, + { + "epoch": 0.1576366184026156, + "grad_norm": 0.9121064096531482, + "learning_rate": 4.859177111241649e-05, + "loss": 0.72, + "step": 1350 + }, + { + "epoch": 0.15822045773003268, + "grad_norm": 0.8488919401410446, + "learning_rate": 4.8576603116620004e-05, + "loss": 0.7352, + "step": 1355 + }, + { + "epoch": 0.15880429705744978, + "grad_norm": 0.8145226432017278, + "learning_rate": 4.8561356538453625e-05, + "loss": 0.7236, + "step": 1360 + }, + { + "epoch": 0.15938813638486687, + "grad_norm": 0.7858513627050332, + "learning_rate": 4.8546031434762954e-05, + "loss": 0.7581, + "step": 1365 + }, + { + "epoch": 0.15997197571228397, + "grad_norm": 0.8632804578998556, + "learning_rate": 4.853062786268636e-05, + "loss": 0.7529, + "step": 1370 + }, + { + "epoch": 0.16055581503970107, + "grad_norm": 0.8662933609475204, + "learning_rate": 4.85151458796548e-05, + "loss": 0.7196, + "step": 1375 + }, + { + "epoch": 0.16113965436711816, + "grad_norm": 0.9008272613777886, + "learning_rate": 4.849958554339156e-05, + "loss": 0.7562, + "step": 1380 + }, + { + "epoch": 0.16172349369453526, + "grad_norm": 1.0396019649473314, + "learning_rate": 4.8483946911912064e-05, + "loss": 0.7474, + "step": 1385 + }, + { + "epoch": 0.16230733302195235, + "grad_norm": 0.9021657264167294, + "learning_rate": 4.846823004352366e-05, + "loss": 0.7286, + "step": 1390 + }, + { + "epoch": 0.16289117234936945, + "grad_norm": 0.9831547552660019, + "learning_rate": 4.845243499682539e-05, + "loss": 0.7538, + "step": 1395 + }, + { + "epoch": 0.16347501167678655, + "grad_norm": 0.8494290490564191, + "learning_rate": 4.8436561830707786e-05, + "loss": 0.7359, + "step": 1400 + }, + { + "epoch": 0.16405885100420364, + "grad_norm": 0.8777452106207012, + "learning_rate": 4.842061060435261e-05, + "loss": 0.7323, + "step": 1405 + }, + { + "epoch": 0.16464269033162074, + "grad_norm": 0.7876098244119992, + "learning_rate": 4.840458137723271e-05, + "loss": 0.7043, + "step": 1410 + }, + { + "epoch": 0.16522652965903784, + "grad_norm": 0.9636798433350332, + "learning_rate": 4.838847420911172e-05, + "loss": 0.7255, + "step": 1415 + }, + { + "epoch": 0.16581036898645493, + "grad_norm": 0.9020478525489855, + "learning_rate": 4.8372289160043895e-05, + "loss": 0.7421, + "step": 1420 + }, + { + "epoch": 0.16639420831387203, + "grad_norm": 0.9043231536695949, + "learning_rate": 4.835602629037384e-05, + "loss": 0.7333, + "step": 1425 + }, + { + "epoch": 0.16697804764128912, + "grad_norm": 0.8053495862344994, + "learning_rate": 4.8339685660736324e-05, + "loss": 0.7263, + "step": 1430 + }, + { + "epoch": 0.16756188696870622, + "grad_norm": 0.9550285318543651, + "learning_rate": 4.8323267332056026e-05, + "loss": 0.7368, + "step": 1435 + }, + { + "epoch": 0.16814572629612332, + "grad_norm": 0.9381409461089547, + "learning_rate": 4.830677136554733e-05, + "loss": 0.7341, + "step": 1440 + }, + { + "epoch": 0.1687295656235404, + "grad_norm": 0.8683417476073385, + "learning_rate": 4.829019782271408e-05, + "loss": 0.7556, + "step": 1445 + }, + { + "epoch": 0.1693134049509575, + "grad_norm": 0.8447674256758361, + "learning_rate": 4.827354676534937e-05, + "loss": 0.7492, + "step": 1450 + }, + { + "epoch": 0.1698972442783746, + "grad_norm": 0.8050509416825757, + "learning_rate": 4.825681825553527e-05, + "loss": 0.7421, + "step": 1455 + }, + { + "epoch": 0.17048108360579167, + "grad_norm": 0.7649657099473817, + "learning_rate": 4.824001235564265e-05, + "loss": 0.7487, + "step": 1460 + }, + { + "epoch": 0.17106492293320877, + "grad_norm": 0.857389296357302, + "learning_rate": 4.822312912833092e-05, + "loss": 0.733, + "step": 1465 + }, + { + "epoch": 0.17164876226062586, + "grad_norm": 0.8424832802837008, + "learning_rate": 4.82061686365478e-05, + "loss": 0.7362, + "step": 1470 + }, + { + "epoch": 0.17223260158804296, + "grad_norm": 0.9243836306800552, + "learning_rate": 4.818913094352907e-05, + "loss": 0.7342, + "step": 1475 + }, + { + "epoch": 0.17281644091546006, + "grad_norm": 0.7705246634042684, + "learning_rate": 4.8172016112798364e-05, + "loss": 0.7474, + "step": 1480 + }, + { + "epoch": 0.17340028024287715, + "grad_norm": 0.8561759097930621, + "learning_rate": 4.8154824208166906e-05, + "loss": 0.7628, + "step": 1485 + }, + { + "epoch": 0.17398411957029425, + "grad_norm": 0.8716723514678405, + "learning_rate": 4.8137555293733294e-05, + "loss": 0.7296, + "step": 1490 + }, + { + "epoch": 0.17456795889771135, + "grad_norm": 0.8590395423969877, + "learning_rate": 4.812020943388324e-05, + "loss": 0.7156, + "step": 1495 + }, + { + "epoch": 0.17515179822512844, + "grad_norm": 0.8227359438250613, + "learning_rate": 4.810278669328935e-05, + "loss": 0.7458, + "step": 1500 + }, + { + "epoch": 0.17573563755254554, + "grad_norm": 0.9291613782890741, + "learning_rate": 4.808528713691087e-05, + "loss": 0.7197, + "step": 1505 + }, + { + "epoch": 0.17631947687996263, + "grad_norm": 0.9252768812683061, + "learning_rate": 4.806771082999346e-05, + "loss": 0.752, + "step": 1510 + }, + { + "epoch": 0.17690331620737973, + "grad_norm": 0.828069657263595, + "learning_rate": 4.8050057838068904e-05, + "loss": 0.7454, + "step": 1515 + }, + { + "epoch": 0.17748715553479683, + "grad_norm": 0.8547710287364533, + "learning_rate": 4.803232822695493e-05, + "loss": 0.7365, + "step": 1520 + }, + { + "epoch": 0.17807099486221392, + "grad_norm": 0.9291014403307367, + "learning_rate": 4.801452206275493e-05, + "loss": 0.7315, + "step": 1525 + }, + { + "epoch": 0.17865483418963102, + "grad_norm": 0.8381441138725285, + "learning_rate": 4.79966394118577e-05, + "loss": 0.7246, + "step": 1530 + }, + { + "epoch": 0.1792386735170481, + "grad_norm": 1.012812492678937, + "learning_rate": 4.797868034093724e-05, + "loss": 0.7514, + "step": 1535 + }, + { + "epoch": 0.1798225128444652, + "grad_norm": 1.0163395309432404, + "learning_rate": 4.7960644916952444e-05, + "loss": 0.7448, + "step": 1540 + }, + { + "epoch": 0.1804063521718823, + "grad_norm": 0.8675054579386493, + "learning_rate": 4.7942533207146916e-05, + "loss": 0.7533, + "step": 1545 + }, + { + "epoch": 0.1809901914992994, + "grad_norm": 0.8546813403666974, + "learning_rate": 4.792434527904864e-05, + "loss": 0.7451, + "step": 1550 + }, + { + "epoch": 0.1815740308267165, + "grad_norm": 0.7424392194511835, + "learning_rate": 4.7906081200469835e-05, + "loss": 0.7326, + "step": 1555 + }, + { + "epoch": 0.1821578701541336, + "grad_norm": 0.7825121640532524, + "learning_rate": 4.788774103950657e-05, + "loss": 0.7286, + "step": 1560 + }, + { + "epoch": 0.1827417094815507, + "grad_norm": 0.980002561746577, + "learning_rate": 4.7869324864538636e-05, + "loss": 0.7346, + "step": 1565 + }, + { + "epoch": 0.18332554880896776, + "grad_norm": 0.8672472363588772, + "learning_rate": 4.7850832744229216e-05, + "loss": 0.7424, + "step": 1570 + }, + { + "epoch": 0.18390938813638485, + "grad_norm": 0.8646730016652088, + "learning_rate": 4.783226474752465e-05, + "loss": 0.7412, + "step": 1575 + }, + { + "epoch": 0.18449322746380195, + "grad_norm": 0.8337338110769495, + "learning_rate": 4.781362094365417e-05, + "loss": 0.7218, + "step": 1580 + }, + { + "epoch": 0.18507706679121905, + "grad_norm": 0.8339883285485024, + "learning_rate": 4.779490140212966e-05, + "loss": 0.7208, + "step": 1585 + }, + { + "epoch": 0.18566090611863614, + "grad_norm": 0.934085237869705, + "learning_rate": 4.777610619274539e-05, + "loss": 0.7376, + "step": 1590 + }, + { + "epoch": 0.18624474544605324, + "grad_norm": 0.755725492136055, + "learning_rate": 4.775723538557772e-05, + "loss": 0.7166, + "step": 1595 + }, + { + "epoch": 0.18682858477347034, + "grad_norm": 0.9508762513943106, + "learning_rate": 4.7738289050984905e-05, + "loss": 0.7424, + "step": 1600 + }, + { + "epoch": 0.18741242410088743, + "grad_norm": 0.9845480086954423, + "learning_rate": 4.7719267259606795e-05, + "loss": 0.7306, + "step": 1605 + }, + { + "epoch": 0.18799626342830453, + "grad_norm": 1.0187787645203552, + "learning_rate": 4.770017008236455e-05, + "loss": 0.7429, + "step": 1610 + }, + { + "epoch": 0.18858010275572162, + "grad_norm": 1.0766199613681091, + "learning_rate": 4.768099759046042e-05, + "loss": 0.7139, + "step": 1615 + }, + { + "epoch": 0.18916394208313872, + "grad_norm": 0.8018949169094243, + "learning_rate": 4.766174985537745e-05, + "loss": 0.7064, + "step": 1620 + }, + { + "epoch": 0.18974778141055582, + "grad_norm": 0.9410002469224804, + "learning_rate": 4.7642426948879234e-05, + "loss": 0.7142, + "step": 1625 + }, + { + "epoch": 0.1903316207379729, + "grad_norm": 0.9924778731480929, + "learning_rate": 4.762302894300962e-05, + "loss": 0.7469, + "step": 1630 + }, + { + "epoch": 0.19091546006539, + "grad_norm": 0.8928365170914925, + "learning_rate": 4.760355591009247e-05, + "loss": 0.7424, + "step": 1635 + }, + { + "epoch": 0.1914992993928071, + "grad_norm": 0.7719462216054435, + "learning_rate": 4.7584007922731383e-05, + "loss": 0.7381, + "step": 1640 + }, + { + "epoch": 0.1920831387202242, + "grad_norm": 0.7713047418375075, + "learning_rate": 4.75643850538094e-05, + "loss": 0.719, + "step": 1645 + }, + { + "epoch": 0.1926669780476413, + "grad_norm": 0.9075548502670161, + "learning_rate": 4.754468737648878e-05, + "loss": 0.7274, + "step": 1650 + }, + { + "epoch": 0.1932508173750584, + "grad_norm": 0.8580706227087707, + "learning_rate": 4.752491496421066e-05, + "loss": 0.7336, + "step": 1655 + }, + { + "epoch": 0.1938346567024755, + "grad_norm": 0.7720549473121491, + "learning_rate": 4.750506789069486e-05, + "loss": 0.7284, + "step": 1660 + }, + { + "epoch": 0.19441849602989258, + "grad_norm": 0.9120434353934009, + "learning_rate": 4.7485146229939545e-05, + "loss": 0.726, + "step": 1665 + }, + { + "epoch": 0.19500233535730968, + "grad_norm": 0.8772498797608074, + "learning_rate": 4.746515005622097e-05, + "loss": 0.721, + "step": 1670 + }, + { + "epoch": 0.19558617468472678, + "grad_norm": 1.0402305568608219, + "learning_rate": 4.744507944409322e-05, + "loss": 0.7474, + "step": 1675 + }, + { + "epoch": 0.19617001401214385, + "grad_norm": 0.9480289631067886, + "learning_rate": 4.742493446838791e-05, + "loss": 0.7127, + "step": 1680 + }, + { + "epoch": 0.19675385333956094, + "grad_norm": 0.8549076161944649, + "learning_rate": 4.740471520421392e-05, + "loss": 0.7283, + "step": 1685 + }, + { + "epoch": 0.19733769266697804, + "grad_norm": 0.8916814608078604, + "learning_rate": 4.73844217269571e-05, + "loss": 0.726, + "step": 1690 + }, + { + "epoch": 0.19792153199439513, + "grad_norm": 0.719882334233512, + "learning_rate": 4.736405411228e-05, + "loss": 0.717, + "step": 1695 + }, + { + "epoch": 0.19850537132181223, + "grad_norm": 0.9257642921113124, + "learning_rate": 4.7343612436121575e-05, + "loss": 0.765, + "step": 1700 + }, + { + "epoch": 0.19908921064922933, + "grad_norm": 0.7913682603162452, + "learning_rate": 4.732309677469693e-05, + "loss": 0.7444, + "step": 1705 + }, + { + "epoch": 0.19967304997664642, + "grad_norm": 0.8706125916216539, + "learning_rate": 4.7302507204497026e-05, + "loss": 0.7213, + "step": 1710 + }, + { + "epoch": 0.20025688930406352, + "grad_norm": 0.8791791140311229, + "learning_rate": 4.728184380228834e-05, + "loss": 0.7396, + "step": 1715 + }, + { + "epoch": 0.20084072863148061, + "grad_norm": 0.8625015103710599, + "learning_rate": 4.7261106645112677e-05, + "loss": 0.7301, + "step": 1720 + }, + { + "epoch": 0.2014245679588977, + "grad_norm": 0.8861750728990333, + "learning_rate": 4.72402958102868e-05, + "loss": 0.7491, + "step": 1725 + }, + { + "epoch": 0.2020084072863148, + "grad_norm": 0.9514113714928437, + "learning_rate": 4.72194113754022e-05, + "loss": 0.7327, + "step": 1730 + }, + { + "epoch": 0.2025922466137319, + "grad_norm": 0.9369626542739419, + "learning_rate": 4.719845341832475e-05, + "loss": 0.6914, + "step": 1735 + }, + { + "epoch": 0.203176085941149, + "grad_norm": 1.2929308377238355, + "learning_rate": 4.7177422017194464e-05, + "loss": 0.7227, + "step": 1740 + }, + { + "epoch": 0.2037599252685661, + "grad_norm": 1.0761261573620378, + "learning_rate": 4.715631725042517e-05, + "loss": 0.7453, + "step": 1745 + }, + { + "epoch": 0.2043437645959832, + "grad_norm": 0.9015493784655515, + "learning_rate": 4.7135139196704254e-05, + "loss": 0.7262, + "step": 1750 + }, + { + "epoch": 0.2049276039234003, + "grad_norm": 0.8448441635518296, + "learning_rate": 4.711388793499233e-05, + "loss": 0.7215, + "step": 1755 + }, + { + "epoch": 0.20551144325081738, + "grad_norm": 0.911123740513117, + "learning_rate": 4.709256354452298e-05, + "loss": 0.7317, + "step": 1760 + }, + { + "epoch": 0.20609528257823448, + "grad_norm": 0.9212102275736465, + "learning_rate": 4.7071166104802415e-05, + "loss": 0.7261, + "step": 1765 + }, + { + "epoch": 0.20667912190565157, + "grad_norm": 0.8280598637016623, + "learning_rate": 4.7049695695609224e-05, + "loss": 0.7123, + "step": 1770 + }, + { + "epoch": 0.20726296123306867, + "grad_norm": 0.7931744829771525, + "learning_rate": 4.702815239699405e-05, + "loss": 0.7169, + "step": 1775 + }, + { + "epoch": 0.20784680056048577, + "grad_norm": 0.8016643144301633, + "learning_rate": 4.7006536289279285e-05, + "loss": 0.7195, + "step": 1780 + }, + { + "epoch": 0.20843063988790284, + "grad_norm": 0.8263592261687366, + "learning_rate": 4.698484745305882e-05, + "loss": 0.7033, + "step": 1785 + }, + { + "epoch": 0.20901447921531993, + "grad_norm": 0.8507324933533925, + "learning_rate": 4.696308596919767e-05, + "loss": 0.7479, + "step": 1790 + }, + { + "epoch": 0.20959831854273703, + "grad_norm": 0.8328383640267244, + "learning_rate": 4.694125191883174e-05, + "loss": 0.7219, + "step": 1795 + }, + { + "epoch": 0.21018215787015412, + "grad_norm": 0.7346869424532056, + "learning_rate": 4.691934538336746e-05, + "loss": 0.7272, + "step": 1800 + }, + { + "epoch": 0.21076599719757122, + "grad_norm": 0.7553117082839136, + "learning_rate": 4.6897366444481545e-05, + "loss": 0.7209, + "step": 1805 + }, + { + "epoch": 0.21134983652498832, + "grad_norm": 0.8261600749869238, + "learning_rate": 4.687531518412065e-05, + "loss": 0.733, + "step": 1810 + }, + { + "epoch": 0.2119336758524054, + "grad_norm": 0.8631456199547245, + "learning_rate": 4.685319168450107e-05, + "loss": 0.6968, + "step": 1815 + }, + { + "epoch": 0.2125175151798225, + "grad_norm": 0.9026029059640308, + "learning_rate": 4.683099602810845e-05, + "loss": 0.7473, + "step": 1820 + }, + { + "epoch": 0.2131013545072396, + "grad_norm": 0.8594021123209141, + "learning_rate": 4.680872829769745e-05, + "loss": 0.7164, + "step": 1825 + }, + { + "epoch": 0.2136851938346567, + "grad_norm": 0.9662714499957493, + "learning_rate": 4.6786388576291446e-05, + "loss": 0.7405, + "step": 1830 + }, + { + "epoch": 0.2142690331620738, + "grad_norm": 0.8755429706535786, + "learning_rate": 4.6763976947182256e-05, + "loss": 0.7169, + "step": 1835 + }, + { + "epoch": 0.2148528724894909, + "grad_norm": 0.8295598426423224, + "learning_rate": 4.6741493493929794e-05, + "loss": 0.722, + "step": 1840 + }, + { + "epoch": 0.215436711816908, + "grad_norm": 0.8762905396970913, + "learning_rate": 4.671893830036174e-05, + "loss": 0.73, + "step": 1845 + }, + { + "epoch": 0.21602055114432508, + "grad_norm": 0.9509644103150598, + "learning_rate": 4.6696311450573266e-05, + "loss": 0.7474, + "step": 1850 + }, + { + "epoch": 0.21660439047174218, + "grad_norm": 0.9699081790127314, + "learning_rate": 4.667361302892671e-05, + "loss": 0.7062, + "step": 1855 + }, + { + "epoch": 0.21718822979915928, + "grad_norm": 0.9442815326623876, + "learning_rate": 4.665084312005126e-05, + "loss": 0.6956, + "step": 1860 + }, + { + "epoch": 0.21777206912657637, + "grad_norm": 0.8067541386553867, + "learning_rate": 4.662800180884263e-05, + "loss": 0.7417, + "step": 1865 + }, + { + "epoch": 0.21835590845399347, + "grad_norm": 0.8697450351973937, + "learning_rate": 4.660508918046277e-05, + "loss": 0.7252, + "step": 1870 + }, + { + "epoch": 0.21893974778141057, + "grad_norm": 0.772276879857256, + "learning_rate": 4.658210532033951e-05, + "loss": 0.7317, + "step": 1875 + }, + { + "epoch": 0.21952358710882766, + "grad_norm": 0.8192894910458268, + "learning_rate": 4.6559050314166264e-05, + "loss": 0.7196, + "step": 1880 + }, + { + "epoch": 0.22010742643624476, + "grad_norm": 0.868446881287682, + "learning_rate": 4.653592424790172e-05, + "loss": 0.734, + "step": 1885 + }, + { + "epoch": 0.22069126576366185, + "grad_norm": 0.8099440249953018, + "learning_rate": 4.6512727207769504e-05, + "loss": 0.7362, + "step": 1890 + }, + { + "epoch": 0.22127510509107892, + "grad_norm": 0.7864716011858071, + "learning_rate": 4.6489459280257856e-05, + "loss": 0.7188, + "step": 1895 + }, + { + "epoch": 0.22185894441849602, + "grad_norm": 0.8300689479187022, + "learning_rate": 4.646612055211933e-05, + "loss": 0.7418, + "step": 1900 + }, + { + "epoch": 0.22244278374591311, + "grad_norm": 0.7697294087146005, + "learning_rate": 4.6442711110370424e-05, + "loss": 0.6993, + "step": 1905 + }, + { + "epoch": 0.2230266230733302, + "grad_norm": 0.908818261954294, + "learning_rate": 4.64192310422913e-05, + "loss": 0.7115, + "step": 1910 + }, + { + "epoch": 0.2236104624007473, + "grad_norm": 0.7593820871173316, + "learning_rate": 4.639568043542548e-05, + "loss": 0.7061, + "step": 1915 + }, + { + "epoch": 0.2241943017281644, + "grad_norm": 0.8524203826838567, + "learning_rate": 4.6372059377579414e-05, + "loss": 0.7242, + "step": 1920 + }, + { + "epoch": 0.2247781410555815, + "grad_norm": 0.8993830914176336, + "learning_rate": 4.634836795682228e-05, + "loss": 0.7171, + "step": 1925 + }, + { + "epoch": 0.2253619803829986, + "grad_norm": 0.7814420865591079, + "learning_rate": 4.632460626148558e-05, + "loss": 0.7285, + "step": 1930 + }, + { + "epoch": 0.2259458197104157, + "grad_norm": 0.74700148461869, + "learning_rate": 4.6300774380162825e-05, + "loss": 0.7325, + "step": 1935 + }, + { + "epoch": 0.2265296590378328, + "grad_norm": 0.8530190257657563, + "learning_rate": 4.627687240170921e-05, + "loss": 0.7224, + "step": 1940 + }, + { + "epoch": 0.22711349836524988, + "grad_norm": 0.8588558793942196, + "learning_rate": 4.625290041524128e-05, + "loss": 0.7253, + "step": 1945 + }, + { + "epoch": 0.22769733769266698, + "grad_norm": 0.7613854196909419, + "learning_rate": 4.6228858510136616e-05, + "loss": 0.7166, + "step": 1950 + }, + { + "epoch": 0.22828117702008408, + "grad_norm": 0.8223207615558868, + "learning_rate": 4.620474677603345e-05, + "loss": 0.726, + "step": 1955 + }, + { + "epoch": 0.22886501634750117, + "grad_norm": 0.7697519948932519, + "learning_rate": 4.61805653028304e-05, + "loss": 0.7218, + "step": 1960 + }, + { + "epoch": 0.22944885567491827, + "grad_norm": 0.8675067664713971, + "learning_rate": 4.615631418068609e-05, + "loss": 0.739, + "step": 1965 + }, + { + "epoch": 0.23003269500233536, + "grad_norm": 0.8036791190421861, + "learning_rate": 4.613199350001881e-05, + "loss": 0.7338, + "step": 1970 + }, + { + "epoch": 0.23061653432975246, + "grad_norm": 1.052515044144309, + "learning_rate": 4.6107603351506205e-05, + "loss": 0.7263, + "step": 1975 + }, + { + "epoch": 0.23120037365716956, + "grad_norm": 0.8356991201425433, + "learning_rate": 4.608314382608493e-05, + "loss": 0.7207, + "step": 1980 + }, + { + "epoch": 0.23178421298458665, + "grad_norm": 0.8232997169558762, + "learning_rate": 4.6058615014950315e-05, + "loss": 0.7367, + "step": 1985 + }, + { + "epoch": 0.23236805231200375, + "grad_norm": 0.797210179690259, + "learning_rate": 4.6034017009555975e-05, + "loss": 0.73, + "step": 1990 + }, + { + "epoch": 0.23295189163942084, + "grad_norm": 0.8241062552935673, + "learning_rate": 4.600934990161355e-05, + "loss": 0.7171, + "step": 1995 + }, + { + "epoch": 0.23353573096683794, + "grad_norm": 0.7937008955270574, + "learning_rate": 4.598461378309231e-05, + "loss": 0.7291, + "step": 2000 + }, + { + "epoch": 0.234119570294255, + "grad_norm": 0.78035068225242, + "learning_rate": 4.5959808746218823e-05, + "loss": 0.7146, + "step": 2005 + }, + { + "epoch": 0.2347034096216721, + "grad_norm": 0.8134248627791105, + "learning_rate": 4.593493488347662e-05, + "loss": 0.7262, + "step": 2010 + }, + { + "epoch": 0.2352872489490892, + "grad_norm": 0.8986352822533618, + "learning_rate": 4.590999228760583e-05, + "loss": 0.7224, + "step": 2015 + }, + { + "epoch": 0.2358710882765063, + "grad_norm": 0.8799624641629938, + "learning_rate": 4.5884981051602873e-05, + "loss": 0.7039, + "step": 2020 + }, + { + "epoch": 0.2364549276039234, + "grad_norm": 0.8062832337406176, + "learning_rate": 4.585990126872006e-05, + "loss": 0.7105, + "step": 2025 + }, + { + "epoch": 0.2370387669313405, + "grad_norm": 0.8023617406960752, + "learning_rate": 4.583475303246527e-05, + "loss": 0.7082, + "step": 2030 + }, + { + "epoch": 0.23762260625875758, + "grad_norm": 0.7858415440677039, + "learning_rate": 4.580953643660165e-05, + "loss": 0.7148, + "step": 2035 + }, + { + "epoch": 0.23820644558617468, + "grad_norm": 0.8747517661212155, + "learning_rate": 4.5784251575147176e-05, + "loss": 0.7322, + "step": 2040 + }, + { + "epoch": 0.23879028491359178, + "grad_norm": 0.8011757891417564, + "learning_rate": 4.5758898542374354e-05, + "loss": 0.7281, + "step": 2045 + }, + { + "epoch": 0.23937412424100887, + "grad_norm": 0.8088704185187473, + "learning_rate": 4.5733477432809884e-05, + "loss": 0.7048, + "step": 2050 + }, + { + "epoch": 0.23995796356842597, + "grad_norm": 0.7544430888220308, + "learning_rate": 4.570798834123425e-05, + "loss": 0.7111, + "step": 2055 + }, + { + "epoch": 0.24054180289584307, + "grad_norm": 0.8717789597793989, + "learning_rate": 4.5682431362681435e-05, + "loss": 0.7111, + "step": 2060 + }, + { + "epoch": 0.24112564222326016, + "grad_norm": 0.7714012324044205, + "learning_rate": 4.565680659243851e-05, + "loss": 0.6974, + "step": 2065 + }, + { + "epoch": 0.24170948155067726, + "grad_norm": 0.7555562896364412, + "learning_rate": 4.5631114126045315e-05, + "loss": 0.718, + "step": 2070 + }, + { + "epoch": 0.24229332087809435, + "grad_norm": 0.7097810232586401, + "learning_rate": 4.560535405929408e-05, + "loss": 0.7062, + "step": 2075 + }, + { + "epoch": 0.24287716020551145, + "grad_norm": 0.769470044177514, + "learning_rate": 4.557952648822908e-05, + "loss": 0.7189, + "step": 2080 + }, + { + "epoch": 0.24346099953292855, + "grad_norm": 0.7904005906275823, + "learning_rate": 4.555363150914628e-05, + "loss": 0.7314, + "step": 2085 + }, + { + "epoch": 0.24404483886034564, + "grad_norm": 0.8463560711087779, + "learning_rate": 4.552766921859297e-05, + "loss": 0.707, + "step": 2090 + }, + { + "epoch": 0.24462867818776274, + "grad_norm": 0.845867432164641, + "learning_rate": 4.5501639713367386e-05, + "loss": 0.6973, + "step": 2095 + }, + { + "epoch": 0.24521251751517983, + "grad_norm": 0.8119973038192364, + "learning_rate": 4.547554309051839e-05, + "loss": 0.7359, + "step": 2100 + }, + { + "epoch": 0.24579635684259693, + "grad_norm": 0.8828267070913842, + "learning_rate": 4.5449379447345084e-05, + "loss": 0.7217, + "step": 2105 + }, + { + "epoch": 0.246380196170014, + "grad_norm": 0.7889889781042302, + "learning_rate": 4.5423148881396444e-05, + "loss": 0.7285, + "step": 2110 + }, + { + "epoch": 0.2469640354974311, + "grad_norm": 0.8253943240619854, + "learning_rate": 4.539685149047097e-05, + "loss": 0.7175, + "step": 2115 + }, + { + "epoch": 0.2475478748248482, + "grad_norm": 0.8463995819623495, + "learning_rate": 4.5370487372616285e-05, + "loss": 0.7088, + "step": 2120 + }, + { + "epoch": 0.2481317141522653, + "grad_norm": 0.7557279563999925, + "learning_rate": 4.5344056626128847e-05, + "loss": 0.7273, + "step": 2125 + }, + { + "epoch": 0.24871555347968238, + "grad_norm": 0.8227994525525993, + "learning_rate": 4.53175593495535e-05, + "loss": 0.7221, + "step": 2130 + }, + { + "epoch": 0.24929939280709948, + "grad_norm": 0.7571552923515421, + "learning_rate": 4.529099564168312e-05, + "loss": 0.7311, + "step": 2135 + }, + { + "epoch": 0.24988323213451658, + "grad_norm": 0.8020862994896764, + "learning_rate": 4.526436560155833e-05, + "loss": 0.7156, + "step": 2140 + }, + { + "epoch": 0.2504670714619337, + "grad_norm": 0.8257081755099529, + "learning_rate": 4.5237669328467e-05, + "loss": 0.7039, + "step": 2145 + }, + { + "epoch": 0.25105091078935077, + "grad_norm": 0.9252224979344469, + "learning_rate": 4.5210906921944e-05, + "loss": 0.7203, + "step": 2150 + }, + { + "epoch": 0.2516347501167679, + "grad_norm": 0.7820690558066279, + "learning_rate": 4.518407848177073e-05, + "loss": 0.7283, + "step": 2155 + }, + { + "epoch": 0.25221858944418496, + "grad_norm": 0.7900246259761322, + "learning_rate": 4.515718410797481e-05, + "loss": 0.7405, + "step": 2160 + }, + { + "epoch": 0.25280242877160203, + "grad_norm": 0.9165560691178495, + "learning_rate": 4.513022390082969e-05, + "loss": 0.7043, + "step": 2165 + }, + { + "epoch": 0.25338626809901915, + "grad_norm": 0.7375679989625112, + "learning_rate": 4.510319796085428e-05, + "loss": 0.7343, + "step": 2170 + }, + { + "epoch": 0.2539701074264362, + "grad_norm": 0.7827496430575249, + "learning_rate": 4.5076106388812534e-05, + "loss": 0.7266, + "step": 2175 + }, + { + "epoch": 0.25455394675385334, + "grad_norm": 0.8723807945365605, + "learning_rate": 4.504894928571315e-05, + "loss": 0.7182, + "step": 2180 + }, + { + "epoch": 0.2551377860812704, + "grad_norm": 0.8482133158531135, + "learning_rate": 4.502172675280915e-05, + "loss": 0.7104, + "step": 2185 + }, + { + "epoch": 0.25572162540868754, + "grad_norm": 0.8524857579835551, + "learning_rate": 4.4994438891597486e-05, + "loss": 0.7635, + "step": 2190 + }, + { + "epoch": 0.2563054647361046, + "grad_norm": 0.778666322067883, + "learning_rate": 4.496708580381868e-05, + "loss": 0.7338, + "step": 2195 + }, + { + "epoch": 0.25688930406352173, + "grad_norm": 0.8723980465894413, + "learning_rate": 4.4939667591456465e-05, + "loss": 0.712, + "step": 2200 + }, + { + "epoch": 0.2574731433909388, + "grad_norm": 0.8421038493972052, + "learning_rate": 4.491218435673737e-05, + "loss": 0.722, + "step": 2205 + }, + { + "epoch": 0.2580569827183559, + "grad_norm": 0.7426905139587554, + "learning_rate": 4.4884636202130365e-05, + "loss": 0.7111, + "step": 2210 + }, + { + "epoch": 0.258640822045773, + "grad_norm": 0.8604558266845712, + "learning_rate": 4.485702323034647e-05, + "loss": 0.721, + "step": 2215 + }, + { + "epoch": 0.2592246613731901, + "grad_norm": 0.8220760507302662, + "learning_rate": 4.4829345544338355e-05, + "loss": 0.7273, + "step": 2220 + }, + { + "epoch": 0.2598085007006072, + "grad_norm": 0.871686191616385, + "learning_rate": 4.480160324729998e-05, + "loss": 0.7092, + "step": 2225 + }, + { + "epoch": 0.2603923400280243, + "grad_norm": 0.844341787935979, + "learning_rate": 4.477379644266621e-05, + "loss": 0.7131, + "step": 2230 + }, + { + "epoch": 0.2609761793554414, + "grad_norm": 0.7900108496693427, + "learning_rate": 4.47459252341124e-05, + "loss": 0.7559, + "step": 2235 + }, + { + "epoch": 0.2615600186828585, + "grad_norm": 0.8273038621128911, + "learning_rate": 4.471798972555407e-05, + "loss": 0.7148, + "step": 2240 + }, + { + "epoch": 0.26214385801027557, + "grad_norm": 0.7539025667177797, + "learning_rate": 4.468999002114642e-05, + "loss": 0.7315, + "step": 2245 + }, + { + "epoch": 0.2627276973376927, + "grad_norm": 0.7613321385392502, + "learning_rate": 4.4661926225284057e-05, + "loss": 0.7127, + "step": 2250 + }, + { + "epoch": 0.26331153666510976, + "grad_norm": 0.750959574414275, + "learning_rate": 4.463379844260051e-05, + "loss": 0.7253, + "step": 2255 + }, + { + "epoch": 0.2638953759925269, + "grad_norm": 0.7479404319087383, + "learning_rate": 4.460560677796788e-05, + "loss": 0.706, + "step": 2260 + }, + { + "epoch": 0.26447921531994395, + "grad_norm": 0.7999080096714866, + "learning_rate": 4.4577351336496466e-05, + "loss": 0.7199, + "step": 2265 + }, + { + "epoch": 0.2650630546473611, + "grad_norm": 0.7979384420318076, + "learning_rate": 4.454903222353433e-05, + "loss": 0.7034, + "step": 2270 + }, + { + "epoch": 0.26564689397477814, + "grad_norm": 0.8482408600249933, + "learning_rate": 4.4520649544666955e-05, + "loss": 0.7157, + "step": 2275 + }, + { + "epoch": 0.2662307333021952, + "grad_norm": 0.7981351178418482, + "learning_rate": 4.4492203405716804e-05, + "loss": 0.7092, + "step": 2280 + }, + { + "epoch": 0.26681457262961233, + "grad_norm": 0.7962766501017232, + "learning_rate": 4.4463693912742944e-05, + "loss": 0.7097, + "step": 2285 + }, + { + "epoch": 0.2673984119570294, + "grad_norm": 0.7507841063947497, + "learning_rate": 4.4435121172040674e-05, + "loss": 0.7101, + "step": 2290 + }, + { + "epoch": 0.2679822512844465, + "grad_norm": 0.7783944622822936, + "learning_rate": 4.4406485290141075e-05, + "loss": 0.7024, + "step": 2295 + }, + { + "epoch": 0.2685660906118636, + "grad_norm": 0.7543292868458843, + "learning_rate": 4.437778637381068e-05, + "loss": 0.7033, + "step": 2300 + }, + { + "epoch": 0.2691499299392807, + "grad_norm": 0.839842240820925, + "learning_rate": 4.434902453005101e-05, + "loss": 0.7396, + "step": 2305 + }, + { + "epoch": 0.2697337692666978, + "grad_norm": 0.7745574814360575, + "learning_rate": 4.4320199866098216e-05, + "loss": 0.7096, + "step": 2310 + }, + { + "epoch": 0.2703176085941149, + "grad_norm": 0.8333262012405849, + "learning_rate": 4.4291312489422684e-05, + "loss": 0.7312, + "step": 2315 + }, + { + "epoch": 0.270901447921532, + "grad_norm": 0.7527817460814907, + "learning_rate": 4.426236250772859e-05, + "loss": 0.7105, + "step": 2320 + }, + { + "epoch": 0.2714852872489491, + "grad_norm": 0.7419140402986222, + "learning_rate": 4.423335002895358e-05, + "loss": 0.7066, + "step": 2325 + }, + { + "epoch": 0.27206912657636617, + "grad_norm": 0.822685880830946, + "learning_rate": 4.420427516126822e-05, + "loss": 0.7345, + "step": 2330 + }, + { + "epoch": 0.2726529659037833, + "grad_norm": 0.7195809448647575, + "learning_rate": 4.4175138013075804e-05, + "loss": 0.7137, + "step": 2335 + }, + { + "epoch": 0.27323680523120036, + "grad_norm": 0.8631758008930391, + "learning_rate": 4.4145938693011747e-05, + "loss": 0.7133, + "step": 2340 + }, + { + "epoch": 0.2738206445586175, + "grad_norm": 1.0308474440769033, + "learning_rate": 4.4116677309943295e-05, + "loss": 0.7185, + "step": 2345 + }, + { + "epoch": 0.27440448388603456, + "grad_norm": 0.7570257114511578, + "learning_rate": 4.40873539729691e-05, + "loss": 0.7278, + "step": 2350 + }, + { + "epoch": 0.2749883232134517, + "grad_norm": 0.7901622570063865, + "learning_rate": 4.405796879141881e-05, + "loss": 0.7064, + "step": 2355 + }, + { + "epoch": 0.27557216254086875, + "grad_norm": 0.7788901006471353, + "learning_rate": 4.402852187485262e-05, + "loss": 0.7224, + "step": 2360 + }, + { + "epoch": 0.27615600186828587, + "grad_norm": 0.8123579581408774, + "learning_rate": 4.3999013333060936e-05, + "loss": 0.7254, + "step": 2365 + }, + { + "epoch": 0.27673984119570294, + "grad_norm": 0.8237555662802201, + "learning_rate": 4.396944327606389e-05, + "loss": 0.7064, + "step": 2370 + }, + { + "epoch": 0.27732368052312006, + "grad_norm": 0.7557624979447745, + "learning_rate": 4.393981181411102e-05, + "loss": 0.7262, + "step": 2375 + }, + { + "epoch": 0.27790751985053713, + "grad_norm": 0.7866415313414409, + "learning_rate": 4.3910119057680765e-05, + "loss": 0.6906, + "step": 2380 + }, + { + "epoch": 0.2784913591779542, + "grad_norm": 0.8148376946961301, + "learning_rate": 4.3880365117480114e-05, + "loss": 0.725, + "step": 2385 + }, + { + "epoch": 0.2790751985053713, + "grad_norm": 0.8034343055523648, + "learning_rate": 4.385055010444416e-05, + "loss": 0.7053, + "step": 2390 + }, + { + "epoch": 0.2796590378327884, + "grad_norm": 0.8266078864974, + "learning_rate": 4.382067412973573e-05, + "loss": 0.7029, + "step": 2395 + }, + { + "epoch": 0.2802428771602055, + "grad_norm": 0.7053898192924525, + "learning_rate": 4.3790737304744906e-05, + "loss": 0.7212, + "step": 2400 + }, + { + "epoch": 0.2808267164876226, + "grad_norm": 0.7266265709842735, + "learning_rate": 4.376073974108866e-05, + "loss": 0.7253, + "step": 2405 + }, + { + "epoch": 0.2814105558150397, + "grad_norm": 0.6798592601288477, + "learning_rate": 4.373068155061043e-05, + "loss": 0.7097, + "step": 2410 + }, + { + "epoch": 0.2819943951424568, + "grad_norm": 0.7235954522188496, + "learning_rate": 4.37005628453797e-05, + "loss": 0.7025, + "step": 2415 + }, + { + "epoch": 0.2825782344698739, + "grad_norm": 0.775734945025186, + "learning_rate": 4.367038373769155e-05, + "loss": 0.7324, + "step": 2420 + }, + { + "epoch": 0.28316207379729097, + "grad_norm": 0.770747806314939, + "learning_rate": 4.36401443400663e-05, + "loss": 0.7285, + "step": 2425 + }, + { + "epoch": 0.2837459131247081, + "grad_norm": 0.8291732339761384, + "learning_rate": 4.3609844765249034e-05, + "loss": 0.6969, + "step": 2430 + }, + { + "epoch": 0.28432975245212516, + "grad_norm": 0.8759927257734268, + "learning_rate": 4.357948512620922e-05, + "loss": 0.7069, + "step": 2435 + }, + { + "epoch": 0.2849135917795423, + "grad_norm": 0.9555634569262756, + "learning_rate": 4.354906553614024e-05, + "loss": 0.7152, + "step": 2440 + }, + { + "epoch": 0.28549743110695935, + "grad_norm": 0.7926964374849049, + "learning_rate": 4.3518586108459034e-05, + "loss": 0.6953, + "step": 2445 + }, + { + "epoch": 0.2860812704343765, + "grad_norm": 0.8670597981303715, + "learning_rate": 4.34880469568056e-05, + "loss": 0.6939, + "step": 2450 + }, + { + "epoch": 0.28666510976179355, + "grad_norm": 0.7940656148228097, + "learning_rate": 4.345744819504266e-05, + "loss": 0.714, + "step": 2455 + }, + { + "epoch": 0.28724894908921067, + "grad_norm": 0.7161534778292359, + "learning_rate": 4.342678993725517e-05, + "loss": 0.7063, + "step": 2460 + }, + { + "epoch": 0.28783278841662774, + "grad_norm": 0.7974259411882264, + "learning_rate": 4.339607229774989e-05, + "loss": 0.7138, + "step": 2465 + }, + { + "epoch": 0.28841662774404486, + "grad_norm": 0.7732673861806778, + "learning_rate": 4.3365295391054996e-05, + "loss": 0.7073, + "step": 2470 + }, + { + "epoch": 0.28900046707146193, + "grad_norm": 0.7590336694258303, + "learning_rate": 4.333445933191964e-05, + "loss": 0.7202, + "step": 2475 + }, + { + "epoch": 0.28958430639887905, + "grad_norm": 0.7575975541713273, + "learning_rate": 4.330356423531352e-05, + "loss": 0.7143, + "step": 2480 + }, + { + "epoch": 0.2901681457262961, + "grad_norm": 0.7628200424802012, + "learning_rate": 4.327261021642644e-05, + "loss": 0.722, + "step": 2485 + }, + { + "epoch": 0.2907519850537132, + "grad_norm": 0.7622798779523068, + "learning_rate": 4.32415973906679e-05, + "loss": 0.7105, + "step": 2490 + }, + { + "epoch": 0.2913358243811303, + "grad_norm": 0.8001106062213871, + "learning_rate": 4.3210525873666656e-05, + "loss": 0.7084, + "step": 2495 + }, + { + "epoch": 0.2919196637085474, + "grad_norm": 1.204280692595554, + "learning_rate": 4.317939578127029e-05, + "loss": 0.7143, + "step": 2500 + }, + { + "epoch": 0.2925035030359645, + "grad_norm": 0.9009941077991439, + "learning_rate": 4.314820722954476e-05, + "loss": 0.7216, + "step": 2505 + }, + { + "epoch": 0.2930873423633816, + "grad_norm": 0.8938308491195663, + "learning_rate": 4.3116960334774e-05, + "loss": 0.712, + "step": 2510 + }, + { + "epoch": 0.2936711816907987, + "grad_norm": 0.8599772730180362, + "learning_rate": 4.308565521345949e-05, + "loss": 0.7203, + "step": 2515 + }, + { + "epoch": 0.29425502101821577, + "grad_norm": 2.102049767135387, + "learning_rate": 4.305429198231977e-05, + "loss": 0.7123, + "step": 2520 + }, + { + "epoch": 0.2948388603456329, + "grad_norm": 0.7496640756396827, + "learning_rate": 4.302287075829005e-05, + "loss": 0.7075, + "step": 2525 + }, + { + "epoch": 0.29542269967304996, + "grad_norm": 0.664311795120198, + "learning_rate": 4.2991391658521765e-05, + "loss": 0.7018, + "step": 2530 + }, + { + "epoch": 0.2960065390004671, + "grad_norm": 0.805681195205423, + "learning_rate": 4.2959854800382136e-05, + "loss": 0.7075, + "step": 2535 + }, + { + "epoch": 0.29659037832788415, + "grad_norm": 0.76027156646289, + "learning_rate": 4.292826030145372e-05, + "loss": 0.7134, + "step": 2540 + }, + { + "epoch": 0.2971742176553013, + "grad_norm": 0.8764032798835537, + "learning_rate": 4.289660827953399e-05, + "loss": 0.6904, + "step": 2545 + }, + { + "epoch": 0.29775805698271834, + "grad_norm": 0.721305727819197, + "learning_rate": 4.28648988526349e-05, + "loss": 0.7283, + "step": 2550 + }, + { + "epoch": 0.29834189631013547, + "grad_norm": 19.423448089121035, + "learning_rate": 4.2833132138982415e-05, + "loss": 0.7238, + "step": 2555 + }, + { + "epoch": 0.29892573563755254, + "grad_norm": 0.7950100019932609, + "learning_rate": 4.280130825701609e-05, + "loss": 0.7136, + "step": 2560 + }, + { + "epoch": 0.29950957496496966, + "grad_norm": 0.733815093607993, + "learning_rate": 4.276942732538866e-05, + "loss": 0.7032, + "step": 2565 + }, + { + "epoch": 0.30009341429238673, + "grad_norm": 0.7970021565058928, + "learning_rate": 4.273748946296552e-05, + "loss": 0.737, + "step": 2570 + }, + { + "epoch": 0.30067725361980385, + "grad_norm": 0.8628432834272957, + "learning_rate": 4.2705494788824345e-05, + "loss": 0.7149, + "step": 2575 + }, + { + "epoch": 0.3012610929472209, + "grad_norm": 0.7834419896434103, + "learning_rate": 4.267344342225463e-05, + "loss": 0.7256, + "step": 2580 + }, + { + "epoch": 0.30184493227463804, + "grad_norm": 0.7595038379537414, + "learning_rate": 4.264133548275725e-05, + "loss": 0.7034, + "step": 2585 + }, + { + "epoch": 0.3024287716020551, + "grad_norm": 0.7610589658032497, + "learning_rate": 4.2609171090044e-05, + "loss": 0.6928, + "step": 2590 + }, + { + "epoch": 0.30301261092947224, + "grad_norm": 0.7343922486197118, + "learning_rate": 4.257695036403714e-05, + "loss": 0.7057, + "step": 2595 + }, + { + "epoch": 0.3035964502568893, + "grad_norm": 0.7102829296413985, + "learning_rate": 4.2544673424868994e-05, + "loss": 0.7031, + "step": 2600 + }, + { + "epoch": 0.3041802895843064, + "grad_norm": 0.774479875646621, + "learning_rate": 4.251234039288145e-05, + "loss": 0.7086, + "step": 2605 + }, + { + "epoch": 0.3047641289117235, + "grad_norm": 0.800845583870741, + "learning_rate": 4.2479951388625546e-05, + "loss": 0.7312, + "step": 2610 + }, + { + "epoch": 0.30534796823914057, + "grad_norm": 0.7493080904949945, + "learning_rate": 4.2447506532861e-05, + "loss": 0.7276, + "step": 2615 + }, + { + "epoch": 0.3059318075665577, + "grad_norm": 0.8127313007347643, + "learning_rate": 4.241500594655577e-05, + "loss": 0.7069, + "step": 2620 + }, + { + "epoch": 0.30651564689397476, + "grad_norm": 0.8277530192656021, + "learning_rate": 4.2382449750885604e-05, + "loss": 0.6874, + "step": 2625 + }, + { + "epoch": 0.3070994862213919, + "grad_norm": 0.7963199104898718, + "learning_rate": 4.2349838067233575e-05, + "loss": 0.6903, + "step": 2630 + }, + { + "epoch": 0.30768332554880895, + "grad_norm": 0.8146094866973611, + "learning_rate": 4.231717101718967e-05, + "loss": 0.6871, + "step": 2635 + }, + { + "epoch": 0.3082671648762261, + "grad_norm": 0.7062438712561713, + "learning_rate": 4.228444872255025e-05, + "loss": 0.7, + "step": 2640 + }, + { + "epoch": 0.30885100420364314, + "grad_norm": 0.7536796043688521, + "learning_rate": 4.2251671305317696e-05, + "loss": 0.7114, + "step": 2645 + }, + { + "epoch": 0.30943484353106027, + "grad_norm": 0.7327638261755937, + "learning_rate": 4.2218838887699894e-05, + "loss": 0.7166, + "step": 2650 + }, + { + "epoch": 0.31001868285847733, + "grad_norm": 0.9041359970185278, + "learning_rate": 4.2185951592109794e-05, + "loss": 0.7, + "step": 2655 + }, + { + "epoch": 0.31060252218589446, + "grad_norm": 0.8351437587118097, + "learning_rate": 4.2153009541164965e-05, + "loss": 0.7162, + "step": 2660 + }, + { + "epoch": 0.3111863615133115, + "grad_norm": 0.8334803832595723, + "learning_rate": 4.21200128576871e-05, + "loss": 0.7272, + "step": 2665 + }, + { + "epoch": 0.31177020084072865, + "grad_norm": 0.7645748557972607, + "learning_rate": 4.208696166470161e-05, + "loss": 0.7203, + "step": 2670 + }, + { + "epoch": 0.3123540401681457, + "grad_norm": 0.8010226975358403, + "learning_rate": 4.2053856085437124e-05, + "loss": 0.7127, + "step": 2675 + }, + { + "epoch": 0.31293787949556284, + "grad_norm": 0.778105181138758, + "learning_rate": 4.202069624332507e-05, + "loss": 0.7093, + "step": 2680 + }, + { + "epoch": 0.3135217188229799, + "grad_norm": 0.8269537229295518, + "learning_rate": 4.1987482261999164e-05, + "loss": 0.7256, + "step": 2685 + }, + { + "epoch": 0.31410555815039704, + "grad_norm": 0.7273731295392446, + "learning_rate": 4.1954214265294985e-05, + "loss": 0.6877, + "step": 2690 + }, + { + "epoch": 0.3146893974778141, + "grad_norm": 0.6387268961733149, + "learning_rate": 4.192089237724951e-05, + "loss": 0.6936, + "step": 2695 + }, + { + "epoch": 0.3152732368052312, + "grad_norm": 0.7135748637677033, + "learning_rate": 4.188751672210063e-05, + "loss": 0.7118, + "step": 2700 + }, + { + "epoch": 0.3158570761326483, + "grad_norm": 0.7798832279992028, + "learning_rate": 4.1854087424286725e-05, + "loss": 0.6884, + "step": 2705 + }, + { + "epoch": 0.31644091546006536, + "grad_norm": 0.9679464940606846, + "learning_rate": 4.182060460844615e-05, + "loss": 0.6979, + "step": 2710 + }, + { + "epoch": 0.3170247547874825, + "grad_norm": 0.796914375651688, + "learning_rate": 4.1787068399416825e-05, + "loss": 0.7012, + "step": 2715 + }, + { + "epoch": 0.31760859411489956, + "grad_norm": 0.761095764820495, + "learning_rate": 4.175347892223572e-05, + "loss": 0.6793, + "step": 2720 + }, + { + "epoch": 0.3181924334423167, + "grad_norm": 0.7258359028766036, + "learning_rate": 4.1719836302138426e-05, + "loss": 0.6994, + "step": 2725 + }, + { + "epoch": 0.31877627276973375, + "grad_norm": 0.8162191545953326, + "learning_rate": 4.168614066455867e-05, + "loss": 0.7015, + "step": 2730 + }, + { + "epoch": 0.31936011209715087, + "grad_norm": 0.8218062352011269, + "learning_rate": 4.165239213512784e-05, + "loss": 0.6949, + "step": 2735 + }, + { + "epoch": 0.31994395142456794, + "grad_norm": 0.7739752042429793, + "learning_rate": 4.161859083967454e-05, + "loss": 0.7024, + "step": 2740 + }, + { + "epoch": 0.32052779075198506, + "grad_norm": 0.7856375360797179, + "learning_rate": 4.158473690422409e-05, + "loss": 0.6931, + "step": 2745 + }, + { + "epoch": 0.32111163007940213, + "grad_norm": 0.767076262950083, + "learning_rate": 4.15508304549981e-05, + "loss": 0.6763, + "step": 2750 + }, + { + "epoch": 0.32169546940681926, + "grad_norm": 0.8186303246875027, + "learning_rate": 4.1516871618413947e-05, + "loss": 0.7077, + "step": 2755 + }, + { + "epoch": 0.3222793087342363, + "grad_norm": 0.7566057113699275, + "learning_rate": 4.148286052108436e-05, + "loss": 0.7095, + "step": 2760 + }, + { + "epoch": 0.32286314806165345, + "grad_norm": 0.685642900616039, + "learning_rate": 4.144879728981688e-05, + "loss": 0.7056, + "step": 2765 + }, + { + "epoch": 0.3234469873890705, + "grad_norm": 0.7506545072890666, + "learning_rate": 4.141468205161345e-05, + "loss": 0.6991, + "step": 2770 + }, + { + "epoch": 0.32403082671648764, + "grad_norm": 0.705634756504342, + "learning_rate": 4.1380514933669916e-05, + "loss": 0.6777, + "step": 2775 + }, + { + "epoch": 0.3246146660439047, + "grad_norm": 0.7386898729822018, + "learning_rate": 4.134629606337555e-05, + "loss": 0.692, + "step": 2780 + }, + { + "epoch": 0.32519850537132183, + "grad_norm": 0.7914830335645638, + "learning_rate": 4.131202556831257e-05, + "loss": 0.7072, + "step": 2785 + }, + { + "epoch": 0.3257823446987389, + "grad_norm": 0.7822067236686193, + "learning_rate": 4.1277703576255685e-05, + "loss": 0.692, + "step": 2790 + }, + { + "epoch": 0.326366184026156, + "grad_norm": 0.7781485290409217, + "learning_rate": 4.12433302151716e-05, + "loss": 0.6948, + "step": 2795 + }, + { + "epoch": 0.3269500233535731, + "grad_norm": 0.6879972141894926, + "learning_rate": 4.1208905613218547e-05, + "loss": 0.7009, + "step": 2800 + }, + { + "epoch": 0.3275338626809902, + "grad_norm": 0.7101221257232809, + "learning_rate": 4.1174429898745795e-05, + "loss": 0.6856, + "step": 2805 + }, + { + "epoch": 0.3281177020084073, + "grad_norm": 0.731271995376911, + "learning_rate": 4.113990320029321e-05, + "loss": 0.6805, + "step": 2810 + }, + { + "epoch": 0.32870154133582435, + "grad_norm": 0.7196696016295292, + "learning_rate": 4.1105325646590714e-05, + "loss": 0.7144, + "step": 2815 + }, + { + "epoch": 0.3292853806632415, + "grad_norm": 0.7149670007135108, + "learning_rate": 4.1070697366557856e-05, + "loss": 0.6956, + "step": 2820 + }, + { + "epoch": 0.32986921999065855, + "grad_norm": 0.8184512447679282, + "learning_rate": 4.103601848930332e-05, + "loss": 0.6878, + "step": 2825 + }, + { + "epoch": 0.33045305931807567, + "grad_norm": 0.8345056924813619, + "learning_rate": 4.100128914412442e-05, + "loss": 0.7327, + "step": 2830 + }, + { + "epoch": 0.33103689864549274, + "grad_norm": 0.8215005253430337, + "learning_rate": 4.096650946050664e-05, + "loss": 0.6886, + "step": 2835 + }, + { + "epoch": 0.33162073797290986, + "grad_norm": 0.7963405506325654, + "learning_rate": 4.0931679568123174e-05, + "loss": 0.7143, + "step": 2840 + }, + { + "epoch": 0.33220457730032693, + "grad_norm": 0.753898680551099, + "learning_rate": 4.0896799596834365e-05, + "loss": 0.6857, + "step": 2845 + }, + { + "epoch": 0.33278841662774405, + "grad_norm": 0.8495093118519794, + "learning_rate": 4.086186967668731e-05, + "loss": 0.7166, + "step": 2850 + }, + { + "epoch": 0.3333722559551611, + "grad_norm": 0.8162454411979833, + "learning_rate": 4.082688993791533e-05, + "loss": 0.674, + "step": 2855 + }, + { + "epoch": 0.33395609528257825, + "grad_norm": 1.043859849113058, + "learning_rate": 4.079186051093747e-05, + "loss": 0.6954, + "step": 2860 + }, + { + "epoch": 0.3345399346099953, + "grad_norm": 0.7987678758616251, + "learning_rate": 4.075678152635807e-05, + "loss": 0.7152, + "step": 2865 + }, + { + "epoch": 0.33512377393741244, + "grad_norm": 0.7413546464797619, + "learning_rate": 4.07216531149662e-05, + "loss": 0.7196, + "step": 2870 + }, + { + "epoch": 0.3357076132648295, + "grad_norm": 0.7678397674426483, + "learning_rate": 4.068647540773524e-05, + "loss": 0.7076, + "step": 2875 + }, + { + "epoch": 0.33629145259224663, + "grad_norm": 0.7752892357655767, + "learning_rate": 4.065124853582237e-05, + "loss": 0.7129, + "step": 2880 + }, + { + "epoch": 0.3368752919196637, + "grad_norm": 0.741984984049898, + "learning_rate": 4.0615972630568055e-05, + "loss": 0.7047, + "step": 2885 + }, + { + "epoch": 0.3374591312470808, + "grad_norm": 0.796527501085706, + "learning_rate": 4.0580647823495587e-05, + "loss": 0.7061, + "step": 2890 + }, + { + "epoch": 0.3380429705744979, + "grad_norm": 0.7365729327389902, + "learning_rate": 4.054527424631059e-05, + "loss": 0.7329, + "step": 2895 + }, + { + "epoch": 0.338626809901915, + "grad_norm": 0.7723495813661937, + "learning_rate": 4.0509852030900506e-05, + "loss": 0.678, + "step": 2900 + }, + { + "epoch": 0.3392106492293321, + "grad_norm": 0.7655325772084528, + "learning_rate": 4.047438130933415e-05, + "loss": 0.7041, + "step": 2905 + }, + { + "epoch": 0.3397944885567492, + "grad_norm": 0.8181753674636343, + "learning_rate": 4.043886221386117e-05, + "loss": 0.6914, + "step": 2910 + }, + { + "epoch": 0.3403783278841663, + "grad_norm": 0.7221488554746541, + "learning_rate": 4.040329487691155e-05, + "loss": 0.6992, + "step": 2915 + }, + { + "epoch": 0.34096216721158334, + "grad_norm": 0.9106238776241381, + "learning_rate": 4.036767943109519e-05, + "loss": 0.7062, + "step": 2920 + }, + { + "epoch": 0.34154600653900047, + "grad_norm": 0.8667028137565969, + "learning_rate": 4.0332016009201315e-05, + "loss": 0.7055, + "step": 2925 + }, + { + "epoch": 0.34212984586641754, + "grad_norm": 0.7285032338549546, + "learning_rate": 4.0296304744198045e-05, + "loss": 0.6967, + "step": 2930 + }, + { + "epoch": 0.34271368519383466, + "grad_norm": 0.7111450740341209, + "learning_rate": 4.0260545769231875e-05, + "loss": 0.6921, + "step": 2935 + }, + { + "epoch": 0.34329752452125173, + "grad_norm": 0.8082397092555128, + "learning_rate": 4.022473921762719e-05, + "loss": 0.7086, + "step": 2940 + }, + { + "epoch": 0.34388136384866885, + "grad_norm": 0.7588740994953772, + "learning_rate": 4.018888522288574e-05, + "loss": 0.714, + "step": 2945 + }, + { + "epoch": 0.3444652031760859, + "grad_norm": 0.757055884032176, + "learning_rate": 4.0152983918686175e-05, + "loss": 0.6955, + "step": 2950 + }, + { + "epoch": 0.34504904250350305, + "grad_norm": 0.7543938506218743, + "learning_rate": 4.0117035438883553e-05, + "loss": 0.6826, + "step": 2955 + }, + { + "epoch": 0.3456328818309201, + "grad_norm": 0.8522266982138246, + "learning_rate": 4.008103991750879e-05, + "loss": 0.7168, + "step": 2960 + }, + { + "epoch": 0.34621672115833724, + "grad_norm": 0.6940565372247784, + "learning_rate": 4.004499748876821e-05, + "loss": 0.696, + "step": 2965 + }, + { + "epoch": 0.3468005604857543, + "grad_norm": 0.6862029386417299, + "learning_rate": 4.000890828704304e-05, + "loss": 0.6824, + "step": 2970 + }, + { + "epoch": 0.34738439981317143, + "grad_norm": 0.7315368280765282, + "learning_rate": 3.997277244688886e-05, + "loss": 0.7093, + "step": 2975 + }, + { + "epoch": 0.3479682391405885, + "grad_norm": 0.8399426230955965, + "learning_rate": 3.993659010303517e-05, + "loss": 0.7088, + "step": 2980 + }, + { + "epoch": 0.3485520784680056, + "grad_norm": 0.7447428590500363, + "learning_rate": 3.9900361390384836e-05, + "loss": 0.6729, + "step": 2985 + }, + { + "epoch": 0.3491359177954227, + "grad_norm": 0.6894785309334255, + "learning_rate": 3.986408644401362e-05, + "loss": 0.7095, + "step": 2990 + }, + { + "epoch": 0.3497197571228398, + "grad_norm": 0.7895286945877132, + "learning_rate": 3.982776539916966e-05, + "loss": 0.6963, + "step": 2995 + }, + { + "epoch": 0.3503035964502569, + "grad_norm": 0.7542557790149979, + "learning_rate": 3.979139839127296e-05, + "loss": 0.6805, + "step": 3000 + }, + { + "epoch": 0.350887435777674, + "grad_norm": 0.7985071306992715, + "learning_rate": 3.975498555591489e-05, + "loss": 0.6959, + "step": 3005 + }, + { + "epoch": 0.3514712751050911, + "grad_norm": 0.7109389878914494, + "learning_rate": 3.971852702885772e-05, + "loss": 0.6878, + "step": 3010 + }, + { + "epoch": 0.3520551144325082, + "grad_norm": 0.7031748076507193, + "learning_rate": 3.9682022946034006e-05, + "loss": 0.6931, + "step": 3015 + }, + { + "epoch": 0.35263895375992527, + "grad_norm": 0.7925651121254325, + "learning_rate": 3.964547344354624e-05, + "loss": 0.697, + "step": 3020 + }, + { + "epoch": 0.3532227930873424, + "grad_norm": 0.7244597774458819, + "learning_rate": 3.9608878657666195e-05, + "loss": 0.7022, + "step": 3025 + }, + { + "epoch": 0.35380663241475946, + "grad_norm": 0.7581238408555705, + "learning_rate": 3.9572238724834503e-05, + "loss": 0.7025, + "step": 3030 + }, + { + "epoch": 0.3543904717421765, + "grad_norm": 0.7831980832064683, + "learning_rate": 3.953555378166012e-05, + "loss": 0.7033, + "step": 3035 + }, + { + "epoch": 0.35497431106959365, + "grad_norm": 0.8547349083576906, + "learning_rate": 3.9498823964919827e-05, + "loss": 0.7058, + "step": 3040 + }, + { + "epoch": 0.3555581503970107, + "grad_norm": 0.7440878870880225, + "learning_rate": 3.94620494115577e-05, + "loss": 0.6895, + "step": 3045 + }, + { + "epoch": 0.35614198972442784, + "grad_norm": 0.6709320077587175, + "learning_rate": 3.942523025868461e-05, + "loss": 0.6934, + "step": 3050 + }, + { + "epoch": 0.3567258290518449, + "grad_norm": 0.8342619568452645, + "learning_rate": 3.9388366643577745e-05, + "loss": 0.6912, + "step": 3055 + }, + { + "epoch": 0.35730966837926204, + "grad_norm": 0.7265224003909866, + "learning_rate": 3.9351458703680017e-05, + "loss": 0.7109, + "step": 3060 + }, + { + "epoch": 0.3578935077066791, + "grad_norm": 0.7513155570253139, + "learning_rate": 3.931450657659963e-05, + "loss": 0.6973, + "step": 3065 + }, + { + "epoch": 0.3584773470340962, + "grad_norm": 0.7609685128527147, + "learning_rate": 3.927751040010954e-05, + "loss": 0.6772, + "step": 3070 + }, + { + "epoch": 0.3590611863615133, + "grad_norm": 0.898794715325684, + "learning_rate": 3.924047031214691e-05, + "loss": 0.71, + "step": 3075 + }, + { + "epoch": 0.3596450256889304, + "grad_norm": 0.7576048114562289, + "learning_rate": 3.920338645081266e-05, + "loss": 0.6958, + "step": 3080 + }, + { + "epoch": 0.3602288650163475, + "grad_norm": 0.7682741662829687, + "learning_rate": 3.916625895437089e-05, + "loss": 0.6701, + "step": 3085 + }, + { + "epoch": 0.3608127043437646, + "grad_norm": 0.8781282581181049, + "learning_rate": 3.912908796124839e-05, + "loss": 0.6979, + "step": 3090 + }, + { + "epoch": 0.3613965436711817, + "grad_norm": 0.7855289086951455, + "learning_rate": 3.909187361003414e-05, + "loss": 0.6866, + "step": 3095 + }, + { + "epoch": 0.3619803829985988, + "grad_norm": 0.7648549467063727, + "learning_rate": 3.905461603947878e-05, + "loss": 0.7006, + "step": 3100 + }, + { + "epoch": 0.3625642223260159, + "grad_norm": 0.792581012808363, + "learning_rate": 3.9017315388494044e-05, + "loss": 0.6871, + "step": 3105 + }, + { + "epoch": 0.363148061653433, + "grad_norm": 0.7754578864170861, + "learning_rate": 3.8979971796152346e-05, + "loss": 0.6933, + "step": 3110 + }, + { + "epoch": 0.36373190098085006, + "grad_norm": 0.6662434046820936, + "learning_rate": 3.894258540168618e-05, + "loss": 0.6746, + "step": 3115 + }, + { + "epoch": 0.3643157403082672, + "grad_norm": 0.7253242083521453, + "learning_rate": 3.89051563444876e-05, + "loss": 0.6802, + "step": 3120 + }, + { + "epoch": 0.36489957963568426, + "grad_norm": 0.6650194558269369, + "learning_rate": 3.886768476410777e-05, + "loss": 0.6848, + "step": 3125 + }, + { + "epoch": 0.3654834189631014, + "grad_norm": 0.8598275401566264, + "learning_rate": 3.883017080025638e-05, + "loss": 0.6858, + "step": 3130 + }, + { + "epoch": 0.36606725829051845, + "grad_norm": 0.8053667856247443, + "learning_rate": 3.879261459280111e-05, + "loss": 0.6885, + "step": 3135 + }, + { + "epoch": 0.3666510976179355, + "grad_norm": 0.7702969353671916, + "learning_rate": 3.875501628176719e-05, + "loss": 0.6956, + "step": 3140 + }, + { + "epoch": 0.36723493694535264, + "grad_norm": 0.7722892835874596, + "learning_rate": 3.8717376007336814e-05, + "loss": 0.6898, + "step": 3145 + }, + { + "epoch": 0.3678187762727697, + "grad_norm": 0.784670074007829, + "learning_rate": 3.867969390984862e-05, + "loss": 0.7156, + "step": 3150 + }, + { + "epoch": 0.36840261560018683, + "grad_norm": 0.745287207769695, + "learning_rate": 3.864197012979719e-05, + "loss": 0.7175, + "step": 3155 + }, + { + "epoch": 0.3689864549276039, + "grad_norm": 0.8145729590116079, + "learning_rate": 3.8604204807832516e-05, + "loss": 0.7291, + "step": 3160 + }, + { + "epoch": 0.369570294255021, + "grad_norm": 0.743613056899968, + "learning_rate": 3.856639808475947e-05, + "loss": 0.6812, + "step": 3165 + }, + { + "epoch": 0.3701541335824381, + "grad_norm": 0.7118971478527887, + "learning_rate": 3.85285501015373e-05, + "loss": 0.6914, + "step": 3170 + }, + { + "epoch": 0.3707379729098552, + "grad_norm": 0.7086191684890616, + "learning_rate": 3.8490660999279074e-05, + "loss": 0.6884, + "step": 3175 + }, + { + "epoch": 0.3713218122372723, + "grad_norm": 0.6696579150369846, + "learning_rate": 3.8452730919251174e-05, + "loss": 0.6986, + "step": 3180 + }, + { + "epoch": 0.3719056515646894, + "grad_norm": 0.6900937913313506, + "learning_rate": 3.841476000287275e-05, + "loss": 0.7078, + "step": 3185 + }, + { + "epoch": 0.3724894908921065, + "grad_norm": 0.7853924347903385, + "learning_rate": 3.837674839171524e-05, + "loss": 0.6879, + "step": 3190 + }, + { + "epoch": 0.3730733302195236, + "grad_norm": 0.7312106575367651, + "learning_rate": 3.833869622750177e-05, + "loss": 0.7089, + "step": 3195 + }, + { + "epoch": 0.37365716954694067, + "grad_norm": 0.9065474676918042, + "learning_rate": 3.8300603652106704e-05, + "loss": 0.6972, + "step": 3200 + }, + { + "epoch": 0.3742410088743578, + "grad_norm": 0.8566499866250796, + "learning_rate": 3.8262470807555045e-05, + "loss": 0.6824, + "step": 3205 + }, + { + "epoch": 0.37482484820177486, + "grad_norm": 0.7687481924225721, + "learning_rate": 3.822429783602195e-05, + "loss": 0.7017, + "step": 3210 + }, + { + "epoch": 0.375408687529192, + "grad_norm": 0.8140836228945466, + "learning_rate": 3.818608487983218e-05, + "loss": 0.6792, + "step": 3215 + }, + { + "epoch": 0.37599252685660906, + "grad_norm": 0.733345067405923, + "learning_rate": 3.8147832081459574e-05, + "loss": 0.6881, + "step": 3220 + }, + { + "epoch": 0.3765763661840262, + "grad_norm": 0.7744637883489761, + "learning_rate": 3.810953958352653e-05, + "loss": 0.7123, + "step": 3225 + }, + { + "epoch": 0.37716020551144325, + "grad_norm": 0.708946933223857, + "learning_rate": 3.807120752880346e-05, + "loss": 0.6907, + "step": 3230 + }, + { + "epoch": 0.37774404483886037, + "grad_norm": 0.7041806485060429, + "learning_rate": 3.8032836060208265e-05, + "loss": 0.6783, + "step": 3235 + }, + { + "epoch": 0.37832788416627744, + "grad_norm": 0.7341040187993833, + "learning_rate": 3.799442532080577e-05, + "loss": 0.6811, + "step": 3240 + }, + { + "epoch": 0.3789117234936945, + "grad_norm": 0.8543179231859879, + "learning_rate": 3.795597545380724e-05, + "loss": 0.6942, + "step": 3245 + }, + { + "epoch": 0.37949556282111163, + "grad_norm": 0.8149701441077691, + "learning_rate": 3.791748660256983e-05, + "loss": 0.6913, + "step": 3250 + }, + { + "epoch": 0.3800794021485287, + "grad_norm": 0.75207798303696, + "learning_rate": 3.787895891059603e-05, + "loss": 0.6746, + "step": 3255 + }, + { + "epoch": 0.3806632414759458, + "grad_norm": 0.7703254571890686, + "learning_rate": 3.784039252153315e-05, + "loss": 0.685, + "step": 3260 + }, + { + "epoch": 0.3812470808033629, + "grad_norm": 0.8155529508918891, + "learning_rate": 3.780178757917278e-05, + "loss": 0.7064, + "step": 3265 + }, + { + "epoch": 0.38183092013078, + "grad_norm": 0.675555849154923, + "learning_rate": 3.776314422745026e-05, + "loss": 0.6764, + "step": 3270 + }, + { + "epoch": 0.3824147594581971, + "grad_norm": 0.8278771662993144, + "learning_rate": 3.772446261044411e-05, + "loss": 0.6873, + "step": 3275 + }, + { + "epoch": 0.3829985987856142, + "grad_norm": 0.7773386206193695, + "learning_rate": 3.768574287237555e-05, + "loss": 0.6847, + "step": 3280 + }, + { + "epoch": 0.3835824381130313, + "grad_norm": 0.8031577269955347, + "learning_rate": 3.7646985157607915e-05, + "loss": 0.692, + "step": 3285 + }, + { + "epoch": 0.3841662774404484, + "grad_norm": 0.8218626150048584, + "learning_rate": 3.760818961064614e-05, + "loss": 0.6854, + "step": 3290 + }, + { + "epoch": 0.38475011676786547, + "grad_norm": 0.930027350274466, + "learning_rate": 3.75693563761362e-05, + "loss": 0.6914, + "step": 3295 + }, + { + "epoch": 0.3853339560952826, + "grad_norm": 0.7818858106760599, + "learning_rate": 3.75304855988646e-05, + "loss": 0.703, + "step": 3300 + }, + { + "epoch": 0.38591779542269966, + "grad_norm": 0.8740352987061476, + "learning_rate": 3.749157742375782e-05, + "loss": 0.6992, + "step": 3305 + }, + { + "epoch": 0.3865016347501168, + "grad_norm": 0.802111462845047, + "learning_rate": 3.745263199588176e-05, + "loss": 0.6957, + "step": 3310 + }, + { + "epoch": 0.38708547407753385, + "grad_norm": 0.8587440611545933, + "learning_rate": 3.741364946044123e-05, + "loss": 0.6907, + "step": 3315 + }, + { + "epoch": 0.387669313404951, + "grad_norm": 0.6825891536044698, + "learning_rate": 3.737462996277939e-05, + "loss": 0.688, + "step": 3320 + }, + { + "epoch": 0.38825315273236805, + "grad_norm": 0.7500018133835387, + "learning_rate": 3.73355736483772e-05, + "loss": 0.7262, + "step": 3325 + }, + { + "epoch": 0.38883699205978517, + "grad_norm": 0.7995432384520971, + "learning_rate": 3.72964806628529e-05, + "loss": 0.681, + "step": 3330 + }, + { + "epoch": 0.38942083138720224, + "grad_norm": 0.7595096108564414, + "learning_rate": 3.725735115196145e-05, + "loss": 0.7099, + "step": 3335 + }, + { + "epoch": 0.39000467071461936, + "grad_norm": 0.7440639912876408, + "learning_rate": 3.7218185261593984e-05, + "loss": 0.6958, + "step": 3340 + }, + { + "epoch": 0.39058851004203643, + "grad_norm": 0.7373854715092809, + "learning_rate": 3.717898313777729e-05, + "loss": 0.7001, + "step": 3345 + }, + { + "epoch": 0.39117234936945355, + "grad_norm": 0.7613559077274907, + "learning_rate": 3.713974492667324e-05, + "loss": 0.698, + "step": 3350 + }, + { + "epoch": 0.3917561886968706, + "grad_norm": 0.7015801575619542, + "learning_rate": 3.710047077457826e-05, + "loss": 0.6907, + "step": 3355 + }, + { + "epoch": 0.3923400280242877, + "grad_norm": 0.713928801821472, + "learning_rate": 3.706116082792276e-05, + "loss": 0.6829, + "step": 3360 + }, + { + "epoch": 0.3929238673517048, + "grad_norm": 0.7154651053309964, + "learning_rate": 3.702181523327064e-05, + "loss": 0.7117, + "step": 3365 + }, + { + "epoch": 0.3935077066791219, + "grad_norm": 0.7575843994242758, + "learning_rate": 3.698243413731867e-05, + "loss": 0.6882, + "step": 3370 + }, + { + "epoch": 0.394091546006539, + "grad_norm": 0.8048906650418256, + "learning_rate": 3.694301768689603e-05, + "loss": 0.6939, + "step": 3375 + }, + { + "epoch": 0.3946753853339561, + "grad_norm": 0.801214633355065, + "learning_rate": 3.690356602896368e-05, + "loss": 0.7002, + "step": 3380 + }, + { + "epoch": 0.3952592246613732, + "grad_norm": 0.8806970202674996, + "learning_rate": 3.686407931061386e-05, + "loss": 0.6787, + "step": 3385 + }, + { + "epoch": 0.39584306398879027, + "grad_norm": 0.9876675473391946, + "learning_rate": 3.682455767906954e-05, + "loss": 0.6949, + "step": 3390 + }, + { + "epoch": 0.3964269033162074, + "grad_norm": 0.8131617221350074, + "learning_rate": 3.678500128168384e-05, + "loss": 0.6765, + "step": 3395 + }, + { + "epoch": 0.39701074264362446, + "grad_norm": 0.807341108618566, + "learning_rate": 3.674541026593952e-05, + "loss": 0.6889, + "step": 3400 + }, + { + "epoch": 0.3975945819710416, + "grad_norm": 0.7925128757565136, + "learning_rate": 3.6705784779448405e-05, + "loss": 0.6795, + "step": 3405 + }, + { + "epoch": 0.39817842129845865, + "grad_norm": 0.7606764204522857, + "learning_rate": 3.6666124969950835e-05, + "loss": 0.6851, + "step": 3410 + }, + { + "epoch": 0.3987622606258758, + "grad_norm": 0.6939460834772246, + "learning_rate": 3.662643098531513e-05, + "loss": 0.6965, + "step": 3415 + }, + { + "epoch": 0.39934609995329284, + "grad_norm": 0.8919480095390583, + "learning_rate": 3.6586702973537025e-05, + "loss": 0.6941, + "step": 3420 + }, + { + "epoch": 0.39992993928070997, + "grad_norm": 0.7608789797993931, + "learning_rate": 3.654694108273912e-05, + "loss": 0.672, + "step": 3425 + }, + { + "epoch": 0.40051377860812704, + "grad_norm": 0.7061952122457605, + "learning_rate": 3.6507145461170345e-05, + "loss": 0.6676, + "step": 3430 + }, + { + "epoch": 0.40109761793554416, + "grad_norm": 0.8011601371400675, + "learning_rate": 3.646731625720537e-05, + "loss": 0.673, + "step": 3435 + }, + { + "epoch": 0.40168145726296123, + "grad_norm": 0.8179789066041869, + "learning_rate": 3.642745361934408e-05, + "loss": 0.6968, + "step": 3440 + }, + { + "epoch": 0.40226529659037835, + "grad_norm": 0.8681890988847925, + "learning_rate": 3.638755769621104e-05, + "loss": 0.6962, + "step": 3445 + }, + { + "epoch": 0.4028491359177954, + "grad_norm": 0.7522001832365489, + "learning_rate": 3.634762863655487e-05, + "loss": 0.6842, + "step": 3450 + }, + { + "epoch": 0.40343297524521254, + "grad_norm": 0.9070294968572723, + "learning_rate": 3.630766658924779e-05, + "loss": 0.7003, + "step": 3455 + }, + { + "epoch": 0.4040168145726296, + "grad_norm": 0.7797395542657742, + "learning_rate": 3.6267671703284963e-05, + "loss": 0.6962, + "step": 3460 + }, + { + "epoch": 0.4046006539000467, + "grad_norm": 0.9737871913004269, + "learning_rate": 3.6227644127784026e-05, + "loss": 0.701, + "step": 3465 + }, + { + "epoch": 0.4051844932274638, + "grad_norm": 0.8142196065459809, + "learning_rate": 3.618758401198447e-05, + "loss": 0.6851, + "step": 3470 + }, + { + "epoch": 0.4057683325548809, + "grad_norm": 0.9056270042476365, + "learning_rate": 3.6147491505247124e-05, + "loss": 0.6925, + "step": 3475 + }, + { + "epoch": 0.406352171882298, + "grad_norm": 0.7315884297176146, + "learning_rate": 3.6107366757053586e-05, + "loss": 0.6938, + "step": 3480 + }, + { + "epoch": 0.40693601120971507, + "grad_norm": 0.8078588240055888, + "learning_rate": 3.606720991700565e-05, + "loss": 0.7011, + "step": 3485 + }, + { + "epoch": 0.4075198505371322, + "grad_norm": 0.7254179613186337, + "learning_rate": 3.602702113482477e-05, + "loss": 0.6944, + "step": 3490 + }, + { + "epoch": 0.40810368986454926, + "grad_norm": 0.7709082948059535, + "learning_rate": 3.59868005603515e-05, + "loss": 0.7007, + "step": 3495 + }, + { + "epoch": 0.4086875291919664, + "grad_norm": 0.7115262527043823, + "learning_rate": 3.5946548343544925e-05, + "loss": 0.6782, + "step": 3500 + }, + { + "epoch": 0.40927136851938345, + "grad_norm": 0.8008414834674444, + "learning_rate": 3.5906264634482084e-05, + "loss": 0.6783, + "step": 3505 + }, + { + "epoch": 0.4098552078468006, + "grad_norm": 0.7982721355936174, + "learning_rate": 3.586594958335747e-05, + "loss": 0.6817, + "step": 3510 + }, + { + "epoch": 0.41043904717421764, + "grad_norm": 0.7268906859455486, + "learning_rate": 3.582560334048241e-05, + "loss": 0.6771, + "step": 3515 + }, + { + "epoch": 0.41102288650163477, + "grad_norm": 0.7779505205619507, + "learning_rate": 3.578522605628453e-05, + "loss": 0.667, + "step": 3520 + }, + { + "epoch": 0.41160672582905183, + "grad_norm": 0.7265217634872664, + "learning_rate": 3.5744817881307184e-05, + "loss": 0.6823, + "step": 3525 + }, + { + "epoch": 0.41219056515646896, + "grad_norm": 0.784282429365926, + "learning_rate": 3.570437896620891e-05, + "loss": 0.6847, + "step": 3530 + }, + { + "epoch": 0.412774404483886, + "grad_norm": 0.7832014455578007, + "learning_rate": 3.566390946176286e-05, + "loss": 0.6984, + "step": 3535 + }, + { + "epoch": 0.41335824381130315, + "grad_norm": 0.8273425356102178, + "learning_rate": 3.5623409518856225e-05, + "loss": 0.7043, + "step": 3540 + }, + { + "epoch": 0.4139420831387202, + "grad_norm": 0.6743765693532546, + "learning_rate": 3.55828792884897e-05, + "loss": 0.6963, + "step": 3545 + }, + { + "epoch": 0.41452592246613734, + "grad_norm": 0.7941581041438405, + "learning_rate": 3.5542318921776886e-05, + "loss": 0.684, + "step": 3550 + }, + { + "epoch": 0.4151097617935544, + "grad_norm": 0.7018226480438704, + "learning_rate": 3.5501728569943746e-05, + "loss": 0.6814, + "step": 3555 + }, + { + "epoch": 0.41569360112097153, + "grad_norm": 0.7685649461733042, + "learning_rate": 3.546110838432806e-05, + "loss": 0.6786, + "step": 3560 + }, + { + "epoch": 0.4162774404483886, + "grad_norm": 0.8047778984511744, + "learning_rate": 3.542045851637883e-05, + "loss": 0.6782, + "step": 3565 + }, + { + "epoch": 0.41686127977580567, + "grad_norm": 0.9128030650660797, + "learning_rate": 3.53797791176557e-05, + "loss": 0.7038, + "step": 3570 + }, + { + "epoch": 0.4174451191032228, + "grad_norm": 0.7706890640928363, + "learning_rate": 3.5339070339828466e-05, + "loss": 0.6882, + "step": 3575 + }, + { + "epoch": 0.41802895843063986, + "grad_norm": 0.7693351266925069, + "learning_rate": 3.529833233467642e-05, + "loss": 0.663, + "step": 3580 + }, + { + "epoch": 0.418612797758057, + "grad_norm": 0.7558947524304466, + "learning_rate": 3.525756525408785e-05, + "loss": 0.695, + "step": 3585 + }, + { + "epoch": 0.41919663708547406, + "grad_norm": 0.8022791818037412, + "learning_rate": 3.521676925005945e-05, + "loss": 0.6911, + "step": 3590 + }, + { + "epoch": 0.4197804764128912, + "grad_norm": 0.7439983041642912, + "learning_rate": 3.517594447469572e-05, + "loss": 0.674, + "step": 3595 + }, + { + "epoch": 0.42036431574030825, + "grad_norm": 0.6965671273422072, + "learning_rate": 3.513509108020846e-05, + "loss": 0.6945, + "step": 3600 + }, + { + "epoch": 0.42094815506772537, + "grad_norm": 0.7009108611745447, + "learning_rate": 3.5094209218916185e-05, + "loss": 0.6792, + "step": 3605 + }, + { + "epoch": 0.42153199439514244, + "grad_norm": 0.6904749909673673, + "learning_rate": 3.505329904324351e-05, + "loss": 0.6739, + "step": 3610 + }, + { + "epoch": 0.42211583372255956, + "grad_norm": 0.8174685162969035, + "learning_rate": 3.501236070572066e-05, + "loss": 0.6787, + "step": 3615 + }, + { + "epoch": 0.42269967304997663, + "grad_norm": 0.7808904635403306, + "learning_rate": 3.497139435898283e-05, + "loss": 0.6861, + "step": 3620 + }, + { + "epoch": 0.42328351237739376, + "grad_norm": 0.7140829015306348, + "learning_rate": 3.4930400155769644e-05, + "loss": 0.6773, + "step": 3625 + }, + { + "epoch": 0.4238673517048108, + "grad_norm": 0.6715183649248256, + "learning_rate": 3.488937824892461e-05, + "loss": 0.6986, + "step": 3630 + }, + { + "epoch": 0.42445119103222795, + "grad_norm": 0.6645867359518857, + "learning_rate": 3.48483287913945e-05, + "loss": 0.6984, + "step": 3635 + }, + { + "epoch": 0.425035030359645, + "grad_norm": 0.7245224781750078, + "learning_rate": 3.480725193622881e-05, + "loss": 0.6768, + "step": 3640 + }, + { + "epoch": 0.42561886968706214, + "grad_norm": 0.7290628767098649, + "learning_rate": 3.476614783657922e-05, + "loss": 0.7027, + "step": 3645 + }, + { + "epoch": 0.4262027090144792, + "grad_norm": 0.7065799709469044, + "learning_rate": 3.472501664569894e-05, + "loss": 0.6906, + "step": 3650 + }, + { + "epoch": 0.42678654834189633, + "grad_norm": 0.7141779437273328, + "learning_rate": 3.468385851694222e-05, + "loss": 0.6877, + "step": 3655 + }, + { + "epoch": 0.4273703876693134, + "grad_norm": 0.763602745903948, + "learning_rate": 3.464267360376373e-05, + "loss": 0.6811, + "step": 3660 + }, + { + "epoch": 0.4279542269967305, + "grad_norm": 0.7078483922513468, + "learning_rate": 3.460146205971802e-05, + "loss": 0.6967, + "step": 3665 + }, + { + "epoch": 0.4285380663241476, + "grad_norm": 0.6864310842103045, + "learning_rate": 3.456022403845891e-05, + "loss": 0.6887, + "step": 3670 + }, + { + "epoch": 0.4291219056515647, + "grad_norm": 0.8108275144233935, + "learning_rate": 3.4518959693738944e-05, + "loss": 0.6802, + "step": 3675 + }, + { + "epoch": 0.4297057449789818, + "grad_norm": 0.7599404194217726, + "learning_rate": 3.4477669179408834e-05, + "loss": 0.6843, + "step": 3680 + }, + { + "epoch": 0.43028958430639885, + "grad_norm": 0.77809828148712, + "learning_rate": 3.443635264941682e-05, + "loss": 0.685, + "step": 3685 + }, + { + "epoch": 0.430873423633816, + "grad_norm": 0.8118714505627129, + "learning_rate": 3.4395010257808185e-05, + "loss": 0.6891, + "step": 3690 + }, + { + "epoch": 0.43145726296123305, + "grad_norm": 0.6807080465607841, + "learning_rate": 3.43536421587246e-05, + "loss": 0.698, + "step": 3695 + }, + { + "epoch": 0.43204110228865017, + "grad_norm": 0.7551098220413885, + "learning_rate": 3.431224850640361e-05, + "loss": 0.6819, + "step": 3700 + }, + { + "epoch": 0.43262494161606724, + "grad_norm": 0.7538181047973818, + "learning_rate": 3.427082945517801e-05, + "loss": 0.677, + "step": 3705 + }, + { + "epoch": 0.43320878094348436, + "grad_norm": 0.7286404689155307, + "learning_rate": 3.422938515947531e-05, + "loss": 0.6716, + "step": 3710 + }, + { + "epoch": 0.43379262027090143, + "grad_norm": 0.7398508893666071, + "learning_rate": 3.418791577381713e-05, + "loss": 0.6785, + "step": 3715 + }, + { + "epoch": 0.43437645959831855, + "grad_norm": 0.7863660031060099, + "learning_rate": 3.4146421452818657e-05, + "loss": 0.7008, + "step": 3720 + }, + { + "epoch": 0.4349602989257356, + "grad_norm": 0.7352662601655674, + "learning_rate": 3.4104902351188e-05, + "loss": 0.6811, + "step": 3725 + }, + { + "epoch": 0.43554413825315275, + "grad_norm": 0.7398589781977044, + "learning_rate": 3.406335862372573e-05, + "loss": 0.6706, + "step": 3730 + }, + { + "epoch": 0.4361279775805698, + "grad_norm": 0.7845350671727488, + "learning_rate": 3.402179042532417e-05, + "loss": 0.6916, + "step": 3735 + }, + { + "epoch": 0.43671181690798694, + "grad_norm": 0.7246822043698157, + "learning_rate": 3.3980197910966915e-05, + "loss": 0.6877, + "step": 3740 + }, + { + "epoch": 0.437295656235404, + "grad_norm": 0.7641661732006494, + "learning_rate": 3.3938581235728214e-05, + "loss": 0.6747, + "step": 3745 + }, + { + "epoch": 0.43787949556282113, + "grad_norm": 0.739738159689709, + "learning_rate": 3.389694055477238e-05, + "loss": 0.6966, + "step": 3750 + }, + { + "epoch": 0.4384633348902382, + "grad_norm": 0.7414932876347806, + "learning_rate": 3.385527602335327e-05, + "loss": 0.6482, + "step": 3755 + }, + { + "epoch": 0.4390471742176553, + "grad_norm": 0.8264691570015129, + "learning_rate": 3.381358779681362e-05, + "loss": 0.6723, + "step": 3760 + }, + { + "epoch": 0.4396310135450724, + "grad_norm": 0.8435514134920983, + "learning_rate": 3.377187603058454e-05, + "loss": 0.6882, + "step": 3765 + }, + { + "epoch": 0.4402148528724895, + "grad_norm": 0.8156187493976549, + "learning_rate": 3.373014088018489e-05, + "loss": 0.7039, + "step": 3770 + }, + { + "epoch": 0.4407986921999066, + "grad_norm": 0.7068325775458664, + "learning_rate": 3.3688382501220727e-05, + "loss": 0.6706, + "step": 3775 + }, + { + "epoch": 0.4413825315273237, + "grad_norm": 0.7900012430179307, + "learning_rate": 3.364660104938472e-05, + "loss": 0.6909, + "step": 3780 + }, + { + "epoch": 0.4419663708547408, + "grad_norm": 0.7948978228461001, + "learning_rate": 3.3604796680455546e-05, + "loss": 0.7045, + "step": 3785 + }, + { + "epoch": 0.44255021018215784, + "grad_norm": 0.6664618406737972, + "learning_rate": 3.356296955029733e-05, + "loss": 0.6803, + "step": 3790 + }, + { + "epoch": 0.44313404950957497, + "grad_norm": 0.6988062398480143, + "learning_rate": 3.3521119814859063e-05, + "loss": 0.686, + "step": 3795 + }, + { + "epoch": 0.44371788883699204, + "grad_norm": 0.6579643187279307, + "learning_rate": 3.347924763017403e-05, + "loss": 0.673, + "step": 3800 + }, + { + "epoch": 0.44430172816440916, + "grad_norm": 0.671136468708349, + "learning_rate": 3.3437353152359195e-05, + "loss": 0.6785, + "step": 3805 + }, + { + "epoch": 0.44488556749182623, + "grad_norm": 0.7125716822949324, + "learning_rate": 3.339543653761466e-05, + "loss": 0.6866, + "step": 3810 + }, + { + "epoch": 0.44546940681924335, + "grad_norm": 0.7777198124067506, + "learning_rate": 3.335349794222304e-05, + "loss": 0.675, + "step": 3815 + }, + { + "epoch": 0.4460532461466604, + "grad_norm": 0.7387167777215246, + "learning_rate": 3.331153752254893e-05, + "loss": 0.6978, + "step": 3820 + }, + { + "epoch": 0.44663708547407754, + "grad_norm": 0.8016313362554687, + "learning_rate": 3.326955543503827e-05, + "loss": 0.705, + "step": 3825 + }, + { + "epoch": 0.4472209248014946, + "grad_norm": 0.749993680363176, + "learning_rate": 3.322755183621779e-05, + "loss": 0.6772, + "step": 3830 + }, + { + "epoch": 0.44780476412891174, + "grad_norm": 0.7348760458565766, + "learning_rate": 3.318552688269446e-05, + "loss": 0.6715, + "step": 3835 + }, + { + "epoch": 0.4483886034563288, + "grad_norm": 0.8064731890629008, + "learning_rate": 3.314348073115481e-05, + "loss": 0.6858, + "step": 3840 + }, + { + "epoch": 0.44897244278374593, + "grad_norm": 0.7578691186858068, + "learning_rate": 3.310141353836446e-05, + "loss": 0.6903, + "step": 3845 + }, + { + "epoch": 0.449556282111163, + "grad_norm": 0.7460901993225614, + "learning_rate": 3.305932546116743e-05, + "loss": 0.681, + "step": 3850 + }, + { + "epoch": 0.4501401214385801, + "grad_norm": 0.7506257697761904, + "learning_rate": 3.301721665648566e-05, + "loss": 0.6984, + "step": 3855 + }, + { + "epoch": 0.4507239607659972, + "grad_norm": 0.7420341018863588, + "learning_rate": 3.297508728131832e-05, + "loss": 0.6758, + "step": 3860 + }, + { + "epoch": 0.4513078000934143, + "grad_norm": 0.7548433309336711, + "learning_rate": 3.29329374927413e-05, + "loss": 0.6858, + "step": 3865 + }, + { + "epoch": 0.4518916394208314, + "grad_norm": 0.7394699898633986, + "learning_rate": 3.2890767447906615e-05, + "loss": 0.6718, + "step": 3870 + }, + { + "epoch": 0.4524754787482485, + "grad_norm": 0.7427273902406784, + "learning_rate": 3.284857730404176e-05, + "loss": 0.6758, + "step": 3875 + }, + { + "epoch": 0.4530593180756656, + "grad_norm": 0.7959414367342418, + "learning_rate": 3.2806367218449216e-05, + "loss": 0.6817, + "step": 3880 + }, + { + "epoch": 0.4536431574030827, + "grad_norm": 0.7495312967063392, + "learning_rate": 3.2764137348505785e-05, + "loss": 0.6781, + "step": 3885 + }, + { + "epoch": 0.45422699673049977, + "grad_norm": 0.8033197593264897, + "learning_rate": 3.2721887851662044e-05, + "loss": 0.6708, + "step": 3890 + }, + { + "epoch": 0.45481083605791683, + "grad_norm": 0.8691456128606405, + "learning_rate": 3.267961888544173e-05, + "loss": 0.6888, + "step": 3895 + }, + { + "epoch": 0.45539467538533396, + "grad_norm": 0.6867694796227263, + "learning_rate": 3.263733060744121e-05, + "loss": 0.6782, + "step": 3900 + }, + { + "epoch": 0.455978514712751, + "grad_norm": 0.8480385798049878, + "learning_rate": 3.25950231753288e-05, + "loss": 0.6706, + "step": 3905 + }, + { + "epoch": 0.45656235404016815, + "grad_norm": 0.7507839598810029, + "learning_rate": 3.255269674684427e-05, + "loss": 0.6969, + "step": 3910 + }, + { + "epoch": 0.4571461933675852, + "grad_norm": 0.7665881732297976, + "learning_rate": 3.2510351479798214e-05, + "loss": 0.6536, + "step": 3915 + }, + { + "epoch": 0.45773003269500234, + "grad_norm": 0.892815003005755, + "learning_rate": 3.2467987532071436e-05, + "loss": 0.6769, + "step": 3920 + }, + { + "epoch": 0.4583138720224194, + "grad_norm": 0.8147955829566529, + "learning_rate": 3.242560506161442e-05, + "loss": 0.6759, + "step": 3925 + }, + { + "epoch": 0.45889771134983653, + "grad_norm": 0.7936781689898847, + "learning_rate": 3.2383204226446706e-05, + "loss": 0.6884, + "step": 3930 + }, + { + "epoch": 0.4594815506772536, + "grad_norm": 0.7769521096875983, + "learning_rate": 3.234078518465628e-05, + "loss": 0.6919, + "step": 3935 + }, + { + "epoch": 0.4600653900046707, + "grad_norm": 0.7368280819168297, + "learning_rate": 3.229834809439904e-05, + "loss": 0.6808, + "step": 3940 + }, + { + "epoch": 0.4606492293320878, + "grad_norm": 0.8226859357051185, + "learning_rate": 3.225589311389816e-05, + "loss": 0.6772, + "step": 3945 + }, + { + "epoch": 0.4612330686595049, + "grad_norm": 0.6799317529875593, + "learning_rate": 3.221342040144352e-05, + "loss": 0.6856, + "step": 3950 + }, + { + "epoch": 0.461816907986922, + "grad_norm": 0.7261626348697685, + "learning_rate": 3.217093011539111e-05, + "loss": 0.6909, + "step": 3955 + }, + { + "epoch": 0.4624007473143391, + "grad_norm": 0.8051796951442984, + "learning_rate": 3.2128422414162454e-05, + "loss": 0.6822, + "step": 3960 + }, + { + "epoch": 0.4629845866417562, + "grad_norm": 0.7507385693708827, + "learning_rate": 3.2085897456243986e-05, + "loss": 0.6803, + "step": 3965 + }, + { + "epoch": 0.4635684259691733, + "grad_norm": 0.8035133685379359, + "learning_rate": 3.204335540018649e-05, + "loss": 0.6765, + "step": 3970 + }, + { + "epoch": 0.46415226529659037, + "grad_norm": 0.7329707765377573, + "learning_rate": 3.200079640460451e-05, + "loss": 0.6812, + "step": 3975 + }, + { + "epoch": 0.4647361046240075, + "grad_norm": 0.7136586791277427, + "learning_rate": 3.195822062817573e-05, + "loss": 0.6649, + "step": 3980 + }, + { + "epoch": 0.46531994395142456, + "grad_norm": 0.6883260567024575, + "learning_rate": 3.191562822964041e-05, + "loss": 0.7012, + "step": 3985 + }, + { + "epoch": 0.4659037832788417, + "grad_norm": 0.6991257110758196, + "learning_rate": 3.187301936780079e-05, + "loss": 0.6555, + "step": 3990 + }, + { + "epoch": 0.46648762260625876, + "grad_norm": 0.6853148064402772, + "learning_rate": 3.183039420152047e-05, + "loss": 0.6846, + "step": 3995 + }, + { + "epoch": 0.4670714619336759, + "grad_norm": 0.6637228068073139, + "learning_rate": 3.178775288972386e-05, + "loss": 0.6824, + "step": 4000 + }, + { + "epoch": 0.46765530126109295, + "grad_norm": 0.7145647310174994, + "learning_rate": 3.174509559139556e-05, + "loss": 0.6867, + "step": 4005 + }, + { + "epoch": 0.46823914058851, + "grad_norm": 0.6478018182375663, + "learning_rate": 3.170242246557978e-05, + "loss": 0.6795, + "step": 4010 + }, + { + "epoch": 0.46882297991592714, + "grad_norm": 0.7153515691503844, + "learning_rate": 3.1659733671379735e-05, + "loss": 0.6872, + "step": 4015 + }, + { + "epoch": 0.4694068192433442, + "grad_norm": 0.6827402551177538, + "learning_rate": 3.1617029367957053e-05, + "loss": 0.6591, + "step": 4020 + }, + { + "epoch": 0.46999065857076133, + "grad_norm": 0.7576014181506056, + "learning_rate": 3.1574309714531195e-05, + "loss": 0.6825, + "step": 4025 + }, + { + "epoch": 0.4705744978981784, + "grad_norm": 0.691219585542754, + "learning_rate": 3.153157487037887e-05, + "loss": 0.6593, + "step": 4030 + }, + { + "epoch": 0.4711583372255955, + "grad_norm": 0.7113323822516596, + "learning_rate": 3.1488824994833395e-05, + "loss": 0.6736, + "step": 4035 + }, + { + "epoch": 0.4717421765530126, + "grad_norm": 0.7106992433701441, + "learning_rate": 3.1446060247284134e-05, + "loss": 0.676, + "step": 4040 + }, + { + "epoch": 0.4723260158804297, + "grad_norm": 0.724318171580972, + "learning_rate": 3.140328078717591e-05, + "loss": 0.6845, + "step": 4045 + }, + { + "epoch": 0.4729098552078468, + "grad_norm": 0.7803058456745933, + "learning_rate": 3.1360486774008415e-05, + "loss": 0.6804, + "step": 4050 + }, + { + "epoch": 0.4734936945352639, + "grad_norm": 0.6717718286032255, + "learning_rate": 3.131767836733556e-05, + "loss": 0.6851, + "step": 4055 + }, + { + "epoch": 0.474077533862681, + "grad_norm": 0.702859871171385, + "learning_rate": 3.127485572676496e-05, + "loss": 0.673, + "step": 4060 + }, + { + "epoch": 0.4746613731900981, + "grad_norm": 0.7784472753189181, + "learning_rate": 3.1232019011957294e-05, + "loss": 0.6696, + "step": 4065 + }, + { + "epoch": 0.47524521251751517, + "grad_norm": 0.7599086934930439, + "learning_rate": 3.118916838262568e-05, + "loss": 0.6618, + "step": 4070 + }, + { + "epoch": 0.4758290518449323, + "grad_norm": 0.6544883749916701, + "learning_rate": 3.114630399853517e-05, + "loss": 0.6659, + "step": 4075 + }, + { + "epoch": 0.47641289117234936, + "grad_norm": 0.676166756954035, + "learning_rate": 3.1103426019502055e-05, + "loss": 0.6565, + "step": 4080 + }, + { + "epoch": 0.4769967304997665, + "grad_norm": 0.7666409275995504, + "learning_rate": 3.1060534605393345e-05, + "loss": 0.6923, + "step": 4085 + }, + { + "epoch": 0.47758056982718355, + "grad_norm": 0.7157437169336734, + "learning_rate": 3.101762991612611e-05, + "loss": 0.6757, + "step": 4090 + }, + { + "epoch": 0.4781644091546007, + "grad_norm": 0.8118839151621046, + "learning_rate": 3.0974712111666935e-05, + "loss": 0.6867, + "step": 4095 + }, + { + "epoch": 0.47874824848201775, + "grad_norm": 0.8147402404625602, + "learning_rate": 3.09317813520313e-05, + "loss": 0.695, + "step": 4100 + }, + { + "epoch": 0.47933208780943487, + "grad_norm": 0.6964765663247126, + "learning_rate": 3.0888837797283005e-05, + "loss": 0.6835, + "step": 4105 + }, + { + "epoch": 0.47991592713685194, + "grad_norm": 0.7354320889290884, + "learning_rate": 3.0845881607533524e-05, + "loss": 0.678, + "step": 4110 + }, + { + "epoch": 0.480499766464269, + "grad_norm": 0.6947896868509368, + "learning_rate": 3.0802912942941453e-05, + "loss": 0.685, + "step": 4115 + }, + { + "epoch": 0.48108360579168613, + "grad_norm": 0.6654318510038829, + "learning_rate": 3.0759931963711913e-05, + "loss": 0.6793, + "step": 4120 + }, + { + "epoch": 0.4816674451191032, + "grad_norm": 0.7815873141024726, + "learning_rate": 3.071693883009591e-05, + "loss": 0.6846, + "step": 4125 + }, + { + "epoch": 0.4822512844465203, + "grad_norm": 0.7091320415900886, + "learning_rate": 3.06739337023898e-05, + "loss": 0.6747, + "step": 4130 + }, + { + "epoch": 0.4828351237739374, + "grad_norm": 0.8021203707814092, + "learning_rate": 3.0630916740934626e-05, + "loss": 0.6771, + "step": 4135 + }, + { + "epoch": 0.4834189631013545, + "grad_norm": 0.6461471445686426, + "learning_rate": 3.058788810611558e-05, + "loss": 0.6718, + "step": 4140 + }, + { + "epoch": 0.4840028024287716, + "grad_norm": 0.814971747463668, + "learning_rate": 3.054484795836136e-05, + "loss": 0.6782, + "step": 4145 + }, + { + "epoch": 0.4845866417561887, + "grad_norm": 0.7356998398110803, + "learning_rate": 3.0501796458143593e-05, + "loss": 0.6755, + "step": 4150 + }, + { + "epoch": 0.4851704810836058, + "grad_norm": 0.7237335237898106, + "learning_rate": 3.045873376597624e-05, + "loss": 0.6688, + "step": 4155 + }, + { + "epoch": 0.4857543204110229, + "grad_norm": 0.7662558686718588, + "learning_rate": 3.041566004241498e-05, + "loss": 0.667, + "step": 4160 + }, + { + "epoch": 0.48633815973843997, + "grad_norm": 0.7105698871150168, + "learning_rate": 3.037257544805661e-05, + "loss": 0.6706, + "step": 4165 + }, + { + "epoch": 0.4869219990658571, + "grad_norm": 0.6840148973525364, + "learning_rate": 3.0329480143538498e-05, + "loss": 0.6827, + "step": 4170 + }, + { + "epoch": 0.48750583839327416, + "grad_norm": 0.758766760947445, + "learning_rate": 3.0286374289537912e-05, + "loss": 0.6451, + "step": 4175 + }, + { + "epoch": 0.4880896777206913, + "grad_norm": 0.6952109543804652, + "learning_rate": 3.0243258046771446e-05, + "loss": 0.6685, + "step": 4180 + }, + { + "epoch": 0.48867351704810835, + "grad_norm": 0.7526199018233571, + "learning_rate": 3.0200131575994456e-05, + "loss": 0.6867, + "step": 4185 + }, + { + "epoch": 0.4892573563755255, + "grad_norm": 0.7015147433957639, + "learning_rate": 3.0156995038000418e-05, + "loss": 0.6571, + "step": 4190 + }, + { + "epoch": 0.48984119570294254, + "grad_norm": 0.733436583796095, + "learning_rate": 3.011384859362034e-05, + "loss": 0.6803, + "step": 4195 + }, + { + "epoch": 0.49042503503035967, + "grad_norm": 0.857995207926992, + "learning_rate": 3.0070692403722162e-05, + "loss": 0.6891, + "step": 4200 + }, + { + "epoch": 0.49100887435777674, + "grad_norm": 0.7292363842938806, + "learning_rate": 3.002752662921018e-05, + "loss": 0.6929, + "step": 4205 + }, + { + "epoch": 0.49159271368519386, + "grad_norm": 0.7839103814367115, + "learning_rate": 2.9984351431024394e-05, + "loss": 0.6712, + "step": 4210 + }, + { + "epoch": 0.49217655301261093, + "grad_norm": 0.6775948265557494, + "learning_rate": 2.9941166970139968e-05, + "loss": 0.648, + "step": 4215 + }, + { + "epoch": 0.492760392340028, + "grad_norm": 0.7198282951734615, + "learning_rate": 2.9897973407566583e-05, + "loss": 0.666, + "step": 4220 + }, + { + "epoch": 0.4933442316674451, + "grad_norm": 0.7766993957049966, + "learning_rate": 2.985477090434786e-05, + "loss": 0.6892, + "step": 4225 + }, + { + "epoch": 0.4939280709948622, + "grad_norm": 0.7291729148417119, + "learning_rate": 2.9811559621560765e-05, + "loss": 0.6806, + "step": 4230 + }, + { + "epoch": 0.4945119103222793, + "grad_norm": 0.7756444851906211, + "learning_rate": 2.976833972031498e-05, + "loss": 0.6555, + "step": 4235 + }, + { + "epoch": 0.4950957496496964, + "grad_norm": 0.693692295860524, + "learning_rate": 2.9725111361752333e-05, + "loss": 0.6962, + "step": 4240 + }, + { + "epoch": 0.4956795889771135, + "grad_norm": 0.6950612733524342, + "learning_rate": 2.968187470704618e-05, + "loss": 0.668, + "step": 4245 + }, + { + "epoch": 0.4962634283045306, + "grad_norm": 0.7551664289622262, + "learning_rate": 2.9638629917400806e-05, + "loss": 0.6921, + "step": 4250 + }, + { + "epoch": 0.4968472676319477, + "grad_norm": 0.7486965420977963, + "learning_rate": 2.9595377154050836e-05, + "loss": 0.6917, + "step": 4255 + }, + { + "epoch": 0.49743110695936477, + "grad_norm": 0.7007684309768127, + "learning_rate": 2.955211657826061e-05, + "loss": 0.6843, + "step": 4260 + }, + { + "epoch": 0.4980149462867819, + "grad_norm": 0.6963601300824674, + "learning_rate": 2.9508848351323597e-05, + "loss": 0.6566, + "step": 4265 + }, + { + "epoch": 0.49859878561419896, + "grad_norm": 0.7667329518085629, + "learning_rate": 2.9465572634561815e-05, + "loss": 0.686, + "step": 4270 + }, + { + "epoch": 0.4991826249416161, + "grad_norm": 0.7231068396927425, + "learning_rate": 2.9422289589325187e-05, + "loss": 0.6723, + "step": 4275 + }, + { + "epoch": 0.49976646426903315, + "grad_norm": 0.8284182332439602, + "learning_rate": 2.9378999376990958e-05, + "loss": 0.6464, + "step": 4280 + }, + { + "epoch": 0.5003503035964503, + "grad_norm": 0.8459390831130426, + "learning_rate": 2.9335702158963107e-05, + "loss": 0.6628, + "step": 4285 + }, + { + "epoch": 0.5009341429238674, + "grad_norm": 0.7011442779440549, + "learning_rate": 2.929239809667172e-05, + "loss": 0.6432, + "step": 4290 + }, + { + "epoch": 0.5015179822512844, + "grad_norm": 0.8746587602987824, + "learning_rate": 2.9249087351572414e-05, + "loss": 0.6903, + "step": 4295 + }, + { + "epoch": 0.5021018215787015, + "grad_norm": 0.7257414684272304, + "learning_rate": 2.9205770085145716e-05, + "loss": 0.692, + "step": 4300 + }, + { + "epoch": 0.5026856609061187, + "grad_norm": 0.805965566100688, + "learning_rate": 2.916244645889647e-05, + "loss": 0.6764, + "step": 4305 + }, + { + "epoch": 0.5032695002335358, + "grad_norm": 0.7462663884955255, + "learning_rate": 2.911911663435322e-05, + "loss": 0.6627, + "step": 4310 + }, + { + "epoch": 0.5038533395609528, + "grad_norm": 0.7448138565708283, + "learning_rate": 2.9075780773067644e-05, + "loss": 0.6561, + "step": 4315 + }, + { + "epoch": 0.5044371788883699, + "grad_norm": 0.7391907955354905, + "learning_rate": 2.9032439036613907e-05, + "loss": 0.6938, + "step": 4320 + }, + { + "epoch": 0.505021018215787, + "grad_norm": 0.7090872369182649, + "learning_rate": 2.8989091586588085e-05, + "loss": 0.6734, + "step": 4325 + }, + { + "epoch": 0.5056048575432041, + "grad_norm": 0.6513803711091114, + "learning_rate": 2.894573858460755e-05, + "loss": 0.6759, + "step": 4330 + }, + { + "epoch": 0.5061886968706212, + "grad_norm": 0.7084768130001255, + "learning_rate": 2.8902380192310392e-05, + "loss": 0.6649, + "step": 4335 + }, + { + "epoch": 0.5067725361980383, + "grad_norm": 0.77828284609766, + "learning_rate": 2.8859016571354778e-05, + "loss": 0.6686, + "step": 4340 + }, + { + "epoch": 0.5073563755254554, + "grad_norm": 0.7501781315460823, + "learning_rate": 2.881564788341839e-05, + "loss": 0.6868, + "step": 4345 + }, + { + "epoch": 0.5079402148528724, + "grad_norm": 0.6544860225443073, + "learning_rate": 2.877227429019778e-05, + "loss": 0.6688, + "step": 4350 + }, + { + "epoch": 0.5085240541802896, + "grad_norm": 0.7406104086931747, + "learning_rate": 2.872889595340781e-05, + "loss": 0.6712, + "step": 4355 + }, + { + "epoch": 0.5091078935077067, + "grad_norm": 0.6811806607091296, + "learning_rate": 2.8685513034781003e-05, + "loss": 0.6626, + "step": 4360 + }, + { + "epoch": 0.5096917328351238, + "grad_norm": 0.767597087471059, + "learning_rate": 2.864212569606699e-05, + "loss": 0.6624, + "step": 4365 + }, + { + "epoch": 0.5102755721625408, + "grad_norm": 0.6614583652870923, + "learning_rate": 2.8598734099031878e-05, + "loss": 0.6575, + "step": 4370 + }, + { + "epoch": 0.510859411489958, + "grad_norm": 0.7740783585451079, + "learning_rate": 2.8555338405457628e-05, + "loss": 0.6863, + "step": 4375 + }, + { + "epoch": 0.5114432508173751, + "grad_norm": 0.7934060933684554, + "learning_rate": 2.85119387771415e-05, + "loss": 0.6706, + "step": 4380 + }, + { + "epoch": 0.5120270901447922, + "grad_norm": 0.7459963061955495, + "learning_rate": 2.8468535375895417e-05, + "loss": 0.6442, + "step": 4385 + }, + { + "epoch": 0.5126109294722092, + "grad_norm": 0.6723748698282871, + "learning_rate": 2.8425128363545362e-05, + "loss": 0.6886, + "step": 4390 + }, + { + "epoch": 0.5131947687996263, + "grad_norm": 0.669676392518392, + "learning_rate": 2.8381717901930792e-05, + "loss": 0.6547, + "step": 4395 + }, + { + "epoch": 0.5137786081270435, + "grad_norm": 0.659571997070763, + "learning_rate": 2.8338304152904016e-05, + "loss": 0.6843, + "step": 4400 + }, + { + "epoch": 0.5143624474544606, + "grad_norm": 0.8288991445997408, + "learning_rate": 2.8294887278329606e-05, + "loss": 0.6763, + "step": 4405 + }, + { + "epoch": 0.5149462867818776, + "grad_norm": 0.707942997453321, + "learning_rate": 2.825146744008378e-05, + "loss": 0.6564, + "step": 4410 + }, + { + "epoch": 0.5155301261092947, + "grad_norm": 0.7278932584750964, + "learning_rate": 2.8208044800053822e-05, + "loss": 0.7021, + "step": 4415 + }, + { + "epoch": 0.5161139654367118, + "grad_norm": 0.6352750094631958, + "learning_rate": 2.8164619520137437e-05, + "loss": 0.6696, + "step": 4420 + }, + { + "epoch": 0.516697804764129, + "grad_norm": 0.6784043685774885, + "learning_rate": 2.8121191762242188e-05, + "loss": 0.6676, + "step": 4425 + }, + { + "epoch": 0.517281644091546, + "grad_norm": 0.6497329718856635, + "learning_rate": 2.8077761688284886e-05, + "loss": 0.6513, + "step": 4430 + }, + { + "epoch": 0.5178654834189631, + "grad_norm": 0.6706848805024374, + "learning_rate": 2.803432946019095e-05, + "loss": 0.669, + "step": 4435 + }, + { + "epoch": 0.5184493227463802, + "grad_norm": 0.7504448419630855, + "learning_rate": 2.7990895239893866e-05, + "loss": 0.664, + "step": 4440 + }, + { + "epoch": 0.5190331620737972, + "grad_norm": 0.6803615489829645, + "learning_rate": 2.7947459189334514e-05, + "loss": 0.6466, + "step": 4445 + }, + { + "epoch": 0.5196170014012144, + "grad_norm": 0.6974815916635871, + "learning_rate": 2.790402147046062e-05, + "loss": 0.6734, + "step": 4450 + }, + { + "epoch": 0.5202008407286315, + "grad_norm": 0.6876467906226407, + "learning_rate": 2.7860582245226114e-05, + "loss": 0.6582, + "step": 4455 + }, + { + "epoch": 0.5207846800560486, + "grad_norm": 0.6834407069996354, + "learning_rate": 2.781714167559056e-05, + "loss": 0.6652, + "step": 4460 + }, + { + "epoch": 0.5213685193834656, + "grad_norm": 0.8004024893171434, + "learning_rate": 2.7773699923518527e-05, + "loss": 0.68, + "step": 4465 + }, + { + "epoch": 0.5219523587108827, + "grad_norm": 0.6831945682441466, + "learning_rate": 2.7730257150978985e-05, + "loss": 0.6804, + "step": 4470 + }, + { + "epoch": 0.5225361980382999, + "grad_norm": 0.6439654412707709, + "learning_rate": 2.7686813519944716e-05, + "loss": 0.6756, + "step": 4475 + }, + { + "epoch": 0.523120037365717, + "grad_norm": 0.6867283041103277, + "learning_rate": 2.7643369192391705e-05, + "loss": 0.6619, + "step": 4480 + }, + { + "epoch": 0.523703876693134, + "grad_norm": 0.7432268603888622, + "learning_rate": 2.759992433029852e-05, + "loss": 0.6728, + "step": 4485 + }, + { + "epoch": 0.5242877160205511, + "grad_norm": 0.7089308367934053, + "learning_rate": 2.7556479095645753e-05, + "loss": 0.6585, + "step": 4490 + }, + { + "epoch": 0.5248715553479683, + "grad_norm": 0.7013335826205295, + "learning_rate": 2.7513033650415352e-05, + "loss": 0.683, + "step": 4495 + }, + { + "epoch": 0.5254553946753854, + "grad_norm": 0.7000494253946886, + "learning_rate": 2.7469588156590065e-05, + "loss": 0.6546, + "step": 4500 + }, + { + "epoch": 0.5260392340028024, + "grad_norm": 0.683894527168804, + "learning_rate": 2.742614277615282e-05, + "loss": 0.6786, + "step": 4505 + }, + { + "epoch": 0.5266230733302195, + "grad_norm": 0.6302791455523522, + "learning_rate": 2.7382697671086115e-05, + "loss": 0.6775, + "step": 4510 + }, + { + "epoch": 0.5272069126576366, + "grad_norm": 0.6529176555018806, + "learning_rate": 2.7339253003371434e-05, + "loss": 0.6769, + "step": 4515 + }, + { + "epoch": 0.5277907519850538, + "grad_norm": 0.6772272567588813, + "learning_rate": 2.729580893498862e-05, + "loss": 0.683, + "step": 4520 + }, + { + "epoch": 0.5283745913124708, + "grad_norm": 0.6902263132262554, + "learning_rate": 2.725236562791529e-05, + "loss": 0.6591, + "step": 4525 + }, + { + "epoch": 0.5289584306398879, + "grad_norm": 0.644687802132002, + "learning_rate": 2.7208923244126218e-05, + "loss": 0.6695, + "step": 4530 + }, + { + "epoch": 0.529542269967305, + "grad_norm": 0.7687528252927972, + "learning_rate": 2.716548194559273e-05, + "loss": 0.6899, + "step": 4535 + }, + { + "epoch": 0.5301261092947221, + "grad_norm": 0.6848256355744042, + "learning_rate": 2.7122041894282113e-05, + "loss": 0.6811, + "step": 4540 + }, + { + "epoch": 0.5307099486221392, + "grad_norm": 0.7221699347481154, + "learning_rate": 2.707860325215701e-05, + "loss": 0.6885, + "step": 4545 + }, + { + "epoch": 0.5312937879495563, + "grad_norm": 0.6872951132103544, + "learning_rate": 2.7035166181174786e-05, + "loss": 0.6732, + "step": 4550 + }, + { + "epoch": 0.5318776272769734, + "grad_norm": 0.7183593593672226, + "learning_rate": 2.6991730843286985e-05, + "loss": 0.6701, + "step": 4555 + }, + { + "epoch": 0.5324614666043904, + "grad_norm": 0.8089135923074973, + "learning_rate": 2.6948297400438654e-05, + "loss": 0.6936, + "step": 4560 + }, + { + "epoch": 0.5330453059318075, + "grad_norm": 0.7222352662881156, + "learning_rate": 2.6904866014567792e-05, + "loss": 0.6828, + "step": 4565 + }, + { + "epoch": 0.5336291452592247, + "grad_norm": 0.7765733283168805, + "learning_rate": 2.686143684760473e-05, + "loss": 0.6808, + "step": 4570 + }, + { + "epoch": 0.5342129845866418, + "grad_norm": 0.6878981817242145, + "learning_rate": 2.6818010061471516e-05, + "loss": 0.655, + "step": 4575 + }, + { + "epoch": 0.5347968239140588, + "grad_norm": 0.7122473652678842, + "learning_rate": 2.6774585818081332e-05, + "loss": 0.6562, + "step": 4580 + }, + { + "epoch": 0.5353806632414759, + "grad_norm": 0.6850388051290822, + "learning_rate": 2.6731164279337867e-05, + "loss": 0.6875, + "step": 4585 + }, + { + "epoch": 0.535964502568893, + "grad_norm": 0.6854622359359065, + "learning_rate": 2.668774560713474e-05, + "loss": 0.6843, + "step": 4590 + }, + { + "epoch": 0.5365483418963102, + "grad_norm": 0.6973189987108477, + "learning_rate": 2.6644329963354882e-05, + "loss": 0.6785, + "step": 4595 + }, + { + "epoch": 0.5371321812237272, + "grad_norm": 0.7330283715603699, + "learning_rate": 2.6600917509869912e-05, + "loss": 0.6652, + "step": 4600 + }, + { + "epoch": 0.5377160205511443, + "grad_norm": 0.6815823319864532, + "learning_rate": 2.655750840853958e-05, + "loss": 0.6531, + "step": 4605 + }, + { + "epoch": 0.5382998598785614, + "grad_norm": 0.7460677526528949, + "learning_rate": 2.6514102821211117e-05, + "loss": 0.6843, + "step": 4610 + }, + { + "epoch": 0.5388836992059786, + "grad_norm": 0.6836960941272519, + "learning_rate": 2.647070090971867e-05, + "loss": 0.672, + "step": 4615 + }, + { + "epoch": 0.5394675385333956, + "grad_norm": 0.7513333647671177, + "learning_rate": 2.6427302835882672e-05, + "loss": 0.6534, + "step": 4620 + }, + { + "epoch": 0.5400513778608127, + "grad_norm": 0.7273470200485447, + "learning_rate": 2.6383908761509252e-05, + "loss": 0.6586, + "step": 4625 + }, + { + "epoch": 0.5406352171882298, + "grad_norm": 0.7056443904596775, + "learning_rate": 2.634051884838961e-05, + "loss": 0.6731, + "step": 4630 + }, + { + "epoch": 0.541219056515647, + "grad_norm": 0.6633888474887489, + "learning_rate": 2.629713325829946e-05, + "loss": 0.6865, + "step": 4635 + }, + { + "epoch": 0.541802895843064, + "grad_norm": 0.7681657660191756, + "learning_rate": 2.625375215299838e-05, + "loss": 0.674, + "step": 4640 + }, + { + "epoch": 0.5423867351704811, + "grad_norm": 0.6753257755390405, + "learning_rate": 2.6210375694229227e-05, + "loss": 0.6872, + "step": 4645 + }, + { + "epoch": 0.5429705744978982, + "grad_norm": 0.6863217893646837, + "learning_rate": 2.6167004043717535e-05, + "loss": 0.6869, + "step": 4650 + }, + { + "epoch": 0.5435544138253152, + "grad_norm": 0.7052451068265377, + "learning_rate": 2.6123637363170912e-05, + "loss": 0.6361, + "step": 4655 + }, + { + "epoch": 0.5441382531527323, + "grad_norm": 0.6881306528333085, + "learning_rate": 2.6080275814278444e-05, + "loss": 0.6682, + "step": 4660 + }, + { + "epoch": 0.5447220924801495, + "grad_norm": 0.6721828847442636, + "learning_rate": 2.6036919558710064e-05, + "loss": 0.6798, + "step": 4665 + }, + { + "epoch": 0.5453059318075666, + "grad_norm": 0.7165806437961335, + "learning_rate": 2.599356875811599e-05, + "loss": 0.674, + "step": 4670 + }, + { + "epoch": 0.5458897711349836, + "grad_norm": 0.7068701259435769, + "learning_rate": 2.595022357412609e-05, + "loss": 0.6427, + "step": 4675 + }, + { + "epoch": 0.5464736104624007, + "grad_norm": 0.79055893788562, + "learning_rate": 2.59068841683493e-05, + "loss": 0.6656, + "step": 4680 + }, + { + "epoch": 0.5470574497898179, + "grad_norm": 0.7125832389911771, + "learning_rate": 2.586355070237301e-05, + "loss": 0.6725, + "step": 4685 + }, + { + "epoch": 0.547641289117235, + "grad_norm": 0.6493827727959639, + "learning_rate": 2.5820223337762438e-05, + "loss": 0.6498, + "step": 4690 + }, + { + "epoch": 0.548225128444652, + "grad_norm": 0.8022416079723034, + "learning_rate": 2.5776902236060096e-05, + "loss": 0.665, + "step": 4695 + }, + { + "epoch": 0.5488089677720691, + "grad_norm": 0.6467026526515551, + "learning_rate": 2.5733587558785126e-05, + "loss": 0.6722, + "step": 4700 + }, + { + "epoch": 0.5493928070994862, + "grad_norm": 0.7303007973795397, + "learning_rate": 2.569027946743271e-05, + "loss": 0.674, + "step": 4705 + }, + { + "epoch": 0.5499766464269034, + "grad_norm": 0.7859509288962917, + "learning_rate": 2.5646978123473477e-05, + "loss": 0.6611, + "step": 4710 + }, + { + "epoch": 0.5505604857543204, + "grad_norm": 0.7065230527690424, + "learning_rate": 2.560368368835291e-05, + "loss": 0.6643, + "step": 4715 + }, + { + "epoch": 0.5511443250817375, + "grad_norm": 0.7599848569335382, + "learning_rate": 2.5560396323490725e-05, + "loss": 0.6709, + "step": 4720 + }, + { + "epoch": 0.5517281644091546, + "grad_norm": 0.7039887949666968, + "learning_rate": 2.5517116190280284e-05, + "loss": 0.6875, + "step": 4725 + }, + { + "epoch": 0.5523120037365717, + "grad_norm": 0.6406624443500122, + "learning_rate": 2.547384345008797e-05, + "loss": 0.6778, + "step": 4730 + }, + { + "epoch": 0.5528958430639888, + "grad_norm": 0.6784449486616754, + "learning_rate": 2.5430578264252612e-05, + "loss": 0.6721, + "step": 4735 + }, + { + "epoch": 0.5534796823914059, + "grad_norm": 0.6718997692773799, + "learning_rate": 2.538732079408489e-05, + "loss": 0.6678, + "step": 4740 + }, + { + "epoch": 0.554063521718823, + "grad_norm": 0.760270195049314, + "learning_rate": 2.534407120086668e-05, + "loss": 0.6776, + "step": 4745 + }, + { + "epoch": 0.5546473610462401, + "grad_norm": 0.6944859631748597, + "learning_rate": 2.5300829645850533e-05, + "loss": 0.6511, + "step": 4750 + }, + { + "epoch": 0.5552312003736571, + "grad_norm": 0.7573413588300381, + "learning_rate": 2.5257596290258983e-05, + "loss": 0.6439, + "step": 4755 + }, + { + "epoch": 0.5558150397010743, + "grad_norm": 0.726557474037718, + "learning_rate": 2.5214371295284028e-05, + "loss": 0.6808, + "step": 4760 + }, + { + "epoch": 0.5563988790284914, + "grad_norm": 0.748698584956097, + "learning_rate": 2.517115482208649e-05, + "loss": 0.651, + "step": 4765 + }, + { + "epoch": 0.5569827183559084, + "grad_norm": 0.693531945759846, + "learning_rate": 2.5127947031795397e-05, + "loss": 0.6613, + "step": 4770 + }, + { + "epoch": 0.5575665576833255, + "grad_norm": 0.6803795484479034, + "learning_rate": 2.5084748085507432e-05, + "loss": 0.6682, + "step": 4775 + }, + { + "epoch": 0.5581503970107426, + "grad_norm": 0.7540637765559963, + "learning_rate": 2.5041558144286282e-05, + "loss": 0.6763, + "step": 4780 + }, + { + "epoch": 0.5587342363381598, + "grad_norm": 0.7459185650491311, + "learning_rate": 2.499837736916207e-05, + "loss": 0.6585, + "step": 4785 + }, + { + "epoch": 0.5593180756655768, + "grad_norm": 0.7270311154763253, + "learning_rate": 2.495520592113074e-05, + "loss": 0.647, + "step": 4790 + }, + { + "epoch": 0.5599019149929939, + "grad_norm": 0.7572319932596162, + "learning_rate": 2.4912043961153468e-05, + "loss": 0.6673, + "step": 4795 + }, + { + "epoch": 0.560485754320411, + "grad_norm": 0.7727229372888152, + "learning_rate": 2.486889165015604e-05, + "loss": 0.6651, + "step": 4800 + }, + { + "epoch": 0.5610695936478282, + "grad_norm": 0.6497425153152417, + "learning_rate": 2.4825749149028277e-05, + "loss": 0.6601, + "step": 4805 + }, + { + "epoch": 0.5616534329752452, + "grad_norm": 0.7353593698498104, + "learning_rate": 2.4782616618623428e-05, + "loss": 0.6611, + "step": 4810 + }, + { + "epoch": 0.5622372723026623, + "grad_norm": 0.7327606724742733, + "learning_rate": 2.4739494219757554e-05, + "loss": 0.6742, + "step": 4815 + }, + { + "epoch": 0.5628211116300794, + "grad_norm": 0.639718172618709, + "learning_rate": 2.4696382113208956e-05, + "loss": 0.6642, + "step": 4820 + }, + { + "epoch": 0.5634049509574965, + "grad_norm": 0.7019189464695904, + "learning_rate": 2.465328045971755e-05, + "loss": 0.6645, + "step": 4825 + }, + { + "epoch": 0.5639887902849136, + "grad_norm": 0.67700636805716, + "learning_rate": 2.4610189419984285e-05, + "loss": 0.6732, + "step": 4830 + }, + { + "epoch": 0.5645726296123307, + "grad_norm": 0.8010967962162209, + "learning_rate": 2.4567109154670542e-05, + "loss": 0.6597, + "step": 4835 + }, + { + "epoch": 0.5651564689397478, + "grad_norm": 0.6299080051446841, + "learning_rate": 2.452403982439751e-05, + "loss": 0.6871, + "step": 4840 + }, + { + "epoch": 0.5657403082671649, + "grad_norm": 0.7136247413291765, + "learning_rate": 2.4480981589745632e-05, + "loss": 0.6459, + "step": 4845 + }, + { + "epoch": 0.5663241475945819, + "grad_norm": 0.6576425824086324, + "learning_rate": 2.4437934611253972e-05, + "loss": 0.6675, + "step": 4850 + }, + { + "epoch": 0.5669079869219991, + "grad_norm": 0.6499263354460856, + "learning_rate": 2.4394899049419612e-05, + "loss": 0.6545, + "step": 4855 + }, + { + "epoch": 0.5674918262494162, + "grad_norm": 0.6386357092056523, + "learning_rate": 2.4351875064697093e-05, + "loss": 0.6604, + "step": 4860 + }, + { + "epoch": 0.5680756655768333, + "grad_norm": 0.646440268675398, + "learning_rate": 2.4308862817497786e-05, + "loss": 0.6546, + "step": 4865 + }, + { + "epoch": 0.5686595049042503, + "grad_norm": 0.7126284723406615, + "learning_rate": 2.4265862468189283e-05, + "loss": 0.6683, + "step": 4870 + }, + { + "epoch": 0.5692433442316674, + "grad_norm": 0.6512344053164097, + "learning_rate": 2.4222874177094823e-05, + "loss": 0.6669, + "step": 4875 + }, + { + "epoch": 0.5698271835590846, + "grad_norm": 0.6407495105162438, + "learning_rate": 2.4179898104492705e-05, + "loss": 0.6497, + "step": 4880 + }, + { + "epoch": 0.5704110228865016, + "grad_norm": 0.6742933223157447, + "learning_rate": 2.4136934410615646e-05, + "loss": 0.663, + "step": 4885 + }, + { + "epoch": 0.5709948622139187, + "grad_norm": 0.6763547210626801, + "learning_rate": 2.4093983255650227e-05, + "loss": 0.656, + "step": 4890 + }, + { + "epoch": 0.5715787015413358, + "grad_norm": 0.6378251514665205, + "learning_rate": 2.405104479973628e-05, + "loss": 0.6815, + "step": 4895 + }, + { + "epoch": 0.572162540868753, + "grad_norm": 0.6276578822503801, + "learning_rate": 2.400811920296627e-05, + "loss": 0.6546, + "step": 4900 + }, + { + "epoch": 0.57274638019617, + "grad_norm": 0.6804919947104875, + "learning_rate": 2.396520662538474e-05, + "loss": 0.6362, + "step": 4905 + }, + { + "epoch": 0.5733302195235871, + "grad_norm": 0.687947921848423, + "learning_rate": 2.3922307226987678e-05, + "loss": 0.6567, + "step": 4910 + }, + { + "epoch": 0.5739140588510042, + "grad_norm": 0.7186220123458101, + "learning_rate": 2.3879421167721944e-05, + "loss": 0.6785, + "step": 4915 + }, + { + "epoch": 0.5744978981784213, + "grad_norm": 0.6464932036374117, + "learning_rate": 2.383654860748466e-05, + "loss": 0.6777, + "step": 4920 + }, + { + "epoch": 0.5750817375058384, + "grad_norm": 0.6702554906203262, + "learning_rate": 2.379368970612261e-05, + "loss": 0.6479, + "step": 4925 + }, + { + "epoch": 0.5756655768332555, + "grad_norm": 0.6521028967880418, + "learning_rate": 2.375084462343167e-05, + "loss": 0.6458, + "step": 4930 + }, + { + "epoch": 0.5762494161606726, + "grad_norm": 0.6446069797513071, + "learning_rate": 2.370801351915617e-05, + "loss": 0.6487, + "step": 4935 + }, + { + "epoch": 0.5768332554880897, + "grad_norm": 0.6676918348307936, + "learning_rate": 2.3665196552988357e-05, + "loss": 0.6684, + "step": 4940 + }, + { + "epoch": 0.5774170948155067, + "grad_norm": 0.6762410280464071, + "learning_rate": 2.362239388456773e-05, + "loss": 0.6747, + "step": 4945 + }, + { + "epoch": 0.5780009341429239, + "grad_norm": 0.6347782972936462, + "learning_rate": 2.357960567348049e-05, + "loss": 0.662, + "step": 4950 + }, + { + "epoch": 0.578584773470341, + "grad_norm": 0.6669462226307387, + "learning_rate": 2.3536832079258952e-05, + "loss": 0.6617, + "step": 4955 + }, + { + "epoch": 0.5791686127977581, + "grad_norm": 0.6570631505289427, + "learning_rate": 2.3494073261380915e-05, + "loss": 0.6762, + "step": 4960 + }, + { + "epoch": 0.5797524521251751, + "grad_norm": 0.6296806022007636, + "learning_rate": 2.34513293792691e-05, + "loss": 0.663, + "step": 4965 + }, + { + "epoch": 0.5803362914525922, + "grad_norm": 0.693156151064996, + "learning_rate": 2.340860059229052e-05, + "loss": 0.6434, + "step": 4970 + }, + { + "epoch": 0.5809201307800094, + "grad_norm": 0.6854563034520345, + "learning_rate": 2.3365887059755925e-05, + "loss": 0.677, + "step": 4975 + }, + { + "epoch": 0.5815039701074264, + "grad_norm": 0.6839853383397014, + "learning_rate": 2.3323188940919188e-05, + "loss": 0.6547, + "step": 4980 + }, + { + "epoch": 0.5820878094348435, + "grad_norm": 0.7313864803016533, + "learning_rate": 2.328050639497671e-05, + "loss": 0.6664, + "step": 4985 + }, + { + "epoch": 0.5826716487622606, + "grad_norm": 0.6543910450847746, + "learning_rate": 2.3237839581066828e-05, + "loss": 0.655, + "step": 4990 + }, + { + "epoch": 0.5832554880896778, + "grad_norm": 0.7441738407990897, + "learning_rate": 2.3195188658269224e-05, + "loss": 0.6503, + "step": 4995 + }, + { + "epoch": 0.5838393274170948, + "grad_norm": 0.7099442194938861, + "learning_rate": 2.3152553785604336e-05, + "loss": 0.6605, + "step": 5000 + }, + { + "epoch": 0.5844231667445119, + "grad_norm": 0.7147324216220688, + "learning_rate": 2.3109935122032754e-05, + "loss": 0.6482, + "step": 5005 + }, + { + "epoch": 0.585007006071929, + "grad_norm": 0.735545696956427, + "learning_rate": 2.3067332826454647e-05, + "loss": 0.665, + "step": 5010 + }, + { + "epoch": 0.5855908453993461, + "grad_norm": 0.6164011462893052, + "learning_rate": 2.3024747057709132e-05, + "loss": 0.6779, + "step": 5015 + }, + { + "epoch": 0.5861746847267632, + "grad_norm": 0.7158026426532503, + "learning_rate": 2.2982177974573733e-05, + "loss": 0.6698, + "step": 5020 + }, + { + "epoch": 0.5867585240541803, + "grad_norm": 0.6519119068548397, + "learning_rate": 2.2939625735763743e-05, + "loss": 0.6553, + "step": 5025 + }, + { + "epoch": 0.5873423633815974, + "grad_norm": 0.708133740672085, + "learning_rate": 2.2897090499931674e-05, + "loss": 0.6438, + "step": 5030 + }, + { + "epoch": 0.5879262027090145, + "grad_norm": 0.7959096079436856, + "learning_rate": 2.285457242566662e-05, + "loss": 0.639, + "step": 5035 + }, + { + "epoch": 0.5885100420364315, + "grad_norm": 0.7511747413046199, + "learning_rate": 2.2812071671493713e-05, + "loss": 0.6546, + "step": 5040 + }, + { + "epoch": 0.5890938813638487, + "grad_norm": 0.6565467442204779, + "learning_rate": 2.2769588395873482e-05, + "loss": 0.6462, + "step": 5045 + }, + { + "epoch": 0.5896777206912658, + "grad_norm": 0.6953178188769654, + "learning_rate": 2.272712275720132e-05, + "loss": 0.6675, + "step": 5050 + }, + { + "epoch": 0.5902615600186829, + "grad_norm": 0.7005519055574339, + "learning_rate": 2.268467491380683e-05, + "loss": 0.6592, + "step": 5055 + }, + { + "epoch": 0.5908453993460999, + "grad_norm": 0.6331925247962092, + "learning_rate": 2.264224502395329e-05, + "loss": 0.6615, + "step": 5060 + }, + { + "epoch": 0.591429238673517, + "grad_norm": 0.6788925130695869, + "learning_rate": 2.2599833245837032e-05, + "loss": 0.6804, + "step": 5065 + }, + { + "epoch": 0.5920130780009342, + "grad_norm": 0.7555304492511874, + "learning_rate": 2.2557439737586856e-05, + "loss": 0.6739, + "step": 5070 + }, + { + "epoch": 0.5925969173283513, + "grad_norm": 0.7080213191115895, + "learning_rate": 2.2515064657263447e-05, + "loss": 0.6511, + "step": 5075 + }, + { + "epoch": 0.5931807566557683, + "grad_norm": 0.7125684371696229, + "learning_rate": 2.2472708162858792e-05, + "loss": 0.6588, + "step": 5080 + }, + { + "epoch": 0.5937645959831854, + "grad_norm": 0.6677673198930747, + "learning_rate": 2.2430370412295566e-05, + "loss": 0.6729, + "step": 5085 + }, + { + "epoch": 0.5943484353106026, + "grad_norm": 0.6833790177033672, + "learning_rate": 2.2388051563426577e-05, + "loss": 0.6915, + "step": 5090 + }, + { + "epoch": 0.5949322746380196, + "grad_norm": 0.6961110689822065, + "learning_rate": 2.2345751774034135e-05, + "loss": 0.6573, + "step": 5095 + }, + { + "epoch": 0.5955161139654367, + "grad_norm": 0.6861149572383188, + "learning_rate": 2.230347120182951e-05, + "loss": 0.6729, + "step": 5100 + }, + { + "epoch": 0.5960999532928538, + "grad_norm": 0.6513104746202458, + "learning_rate": 2.226121000445232e-05, + "loss": 0.6547, + "step": 5105 + }, + { + "epoch": 0.5966837926202709, + "grad_norm": 0.6794112240815948, + "learning_rate": 2.2218968339469932e-05, + "loss": 0.645, + "step": 5110 + }, + { + "epoch": 0.597267631947688, + "grad_norm": 0.659653061736524, + "learning_rate": 2.2176746364376904e-05, + "loss": 0.6581, + "step": 5115 + }, + { + "epoch": 0.5978514712751051, + "grad_norm": 0.7388240821848575, + "learning_rate": 2.2134544236594374e-05, + "loss": 0.6396, + "step": 5120 + }, + { + "epoch": 0.5984353106025222, + "grad_norm": 0.7742865515393434, + "learning_rate": 2.2092362113469474e-05, + "loss": 0.664, + "step": 5125 + }, + { + "epoch": 0.5990191499299393, + "grad_norm": 0.6957977775421466, + "learning_rate": 2.2050200152274763e-05, + "loss": 0.6774, + "step": 5130 + }, + { + "epoch": 0.5996029892573563, + "grad_norm": 0.6763495473519054, + "learning_rate": 2.2008058510207635e-05, + "loss": 0.6758, + "step": 5135 + }, + { + "epoch": 0.6001868285847735, + "grad_norm": 0.6311990597582967, + "learning_rate": 2.1965937344389692e-05, + "loss": 0.6529, + "step": 5140 + }, + { + "epoch": 0.6007706679121906, + "grad_norm": 0.664405793100965, + "learning_rate": 2.1923836811866227e-05, + "loss": 0.6547, + "step": 5145 + }, + { + "epoch": 0.6013545072396077, + "grad_norm": 0.6720962869376756, + "learning_rate": 2.188175706960559e-05, + "loss": 0.6612, + "step": 5150 + }, + { + "epoch": 0.6019383465670247, + "grad_norm": 0.6224353508799658, + "learning_rate": 2.1839698274498616e-05, + "loss": 0.6825, + "step": 5155 + }, + { + "epoch": 0.6025221858944418, + "grad_norm": 0.6880116859217059, + "learning_rate": 2.1797660583358032e-05, + "loss": 0.6597, + "step": 5160 + }, + { + "epoch": 0.603106025221859, + "grad_norm": 0.6691884184144905, + "learning_rate": 2.1755644152917903e-05, + "loss": 0.6499, + "step": 5165 + }, + { + "epoch": 0.6036898645492761, + "grad_norm": 0.6708563167435044, + "learning_rate": 2.1713649139833e-05, + "loss": 0.6734, + "step": 5170 + }, + { + "epoch": 0.6042737038766931, + "grad_norm": 0.6494737500130949, + "learning_rate": 2.1671675700678257e-05, + "loss": 0.6619, + "step": 5175 + }, + { + "epoch": 0.6048575432041102, + "grad_norm": 0.6848091657139095, + "learning_rate": 2.1629723991948176e-05, + "loss": 0.6633, + "step": 5180 + }, + { + "epoch": 0.6054413825315274, + "grad_norm": 0.6456725587890948, + "learning_rate": 2.1587794170056213e-05, + "loss": 0.6661, + "step": 5185 + }, + { + "epoch": 0.6060252218589445, + "grad_norm": 0.6494044214651111, + "learning_rate": 2.154588639133425e-05, + "loss": 0.674, + "step": 5190 + }, + { + "epoch": 0.6066090611863615, + "grad_norm": 0.7196799234230328, + "learning_rate": 2.1504000812031966e-05, + "loss": 0.6673, + "step": 5195 + }, + { + "epoch": 0.6071929005137786, + "grad_norm": 0.6588531600130353, + "learning_rate": 2.1462137588316268e-05, + "loss": 0.6417, + "step": 5200 + }, + { + "epoch": 0.6077767398411957, + "grad_norm": 0.732385253858184, + "learning_rate": 2.142029687627074e-05, + "loss": 0.6505, + "step": 5205 + }, + { + "epoch": 0.6083605791686127, + "grad_norm": 0.7348856656234368, + "learning_rate": 2.1378478831895e-05, + "loss": 0.6457, + "step": 5210 + }, + { + "epoch": 0.6089444184960299, + "grad_norm": 0.7643931748530564, + "learning_rate": 2.133668361110417e-05, + "loss": 0.6684, + "step": 5215 + }, + { + "epoch": 0.609528257823447, + "grad_norm": 0.6705244603198324, + "learning_rate": 2.129491136972826e-05, + "loss": 0.6391, + "step": 5220 + }, + { + "epoch": 0.6101120971508641, + "grad_norm": 0.7088148504034617, + "learning_rate": 2.125316226351163e-05, + "loss": 0.6714, + "step": 5225 + }, + { + "epoch": 0.6106959364782811, + "grad_norm": 0.6894959554456558, + "learning_rate": 2.1211436448112356e-05, + "loss": 0.6594, + "step": 5230 + }, + { + "epoch": 0.6112797758056983, + "grad_norm": 0.7072742765848938, + "learning_rate": 2.1169734079101684e-05, + "loss": 0.6652, + "step": 5235 + }, + { + "epoch": 0.6118636151331154, + "grad_norm": 0.7247723315350132, + "learning_rate": 2.1128055311963453e-05, + "loss": 0.6598, + "step": 5240 + }, + { + "epoch": 0.6124474544605325, + "grad_norm": 0.735751394291021, + "learning_rate": 2.1086400302093483e-05, + "loss": 0.6648, + "step": 5245 + }, + { + "epoch": 0.6130312937879495, + "grad_norm": 0.6587002357845835, + "learning_rate": 2.104476920479905e-05, + "loss": 0.6522, + "step": 5250 + }, + { + "epoch": 0.6136151331153666, + "grad_norm": 0.7001904520094278, + "learning_rate": 2.1003162175298234e-05, + "loss": 0.6433, + "step": 5255 + }, + { + "epoch": 0.6141989724427838, + "grad_norm": 0.6732104941631006, + "learning_rate": 2.0961579368719407e-05, + "loss": 0.6618, + "step": 5260 + }, + { + "epoch": 0.6147828117702009, + "grad_norm": 0.6626098058920104, + "learning_rate": 2.0920020940100626e-05, + "loss": 0.6687, + "step": 5265 + }, + { + "epoch": 0.6153666510976179, + "grad_norm": 0.6655483296009259, + "learning_rate": 2.087848704438905e-05, + "loss": 0.6621, + "step": 5270 + }, + { + "epoch": 0.615950490425035, + "grad_norm": 0.686843179853701, + "learning_rate": 2.0836977836440364e-05, + "loss": 0.6427, + "step": 5275 + }, + { + "epoch": 0.6165343297524521, + "grad_norm": 0.6688371953183895, + "learning_rate": 2.0795493471018222e-05, + "loss": 0.6718, + "step": 5280 + }, + { + "epoch": 0.6171181690798693, + "grad_norm": 0.6647003616665795, + "learning_rate": 2.075403410279364e-05, + "loss": 0.6432, + "step": 5285 + }, + { + "epoch": 0.6177020084072863, + "grad_norm": 0.6859825058865614, + "learning_rate": 2.0712599886344447e-05, + "loss": 0.6481, + "step": 5290 + }, + { + "epoch": 0.6182858477347034, + "grad_norm": 0.7808965272022199, + "learning_rate": 2.067119097615468e-05, + "loss": 0.6665, + "step": 5295 + }, + { + "epoch": 0.6188696870621205, + "grad_norm": 0.6599205587793768, + "learning_rate": 2.0629807526614037e-05, + "loss": 0.6712, + "step": 5300 + }, + { + "epoch": 0.6194535263895375, + "grad_norm": 0.7272868234971551, + "learning_rate": 2.0588449692017287e-05, + "loss": 0.6522, + "step": 5305 + }, + { + "epoch": 0.6200373657169547, + "grad_norm": 0.6799580957384425, + "learning_rate": 2.054711762656369e-05, + "loss": 0.6711, + "step": 5310 + }, + { + "epoch": 0.6206212050443718, + "grad_norm": 0.6745926690111709, + "learning_rate": 2.0505811484356424e-05, + "loss": 0.6811, + "step": 5315 + }, + { + "epoch": 0.6212050443717889, + "grad_norm": 0.757885080460303, + "learning_rate": 2.0464531419402026e-05, + "loss": 0.6521, + "step": 5320 + }, + { + "epoch": 0.6217888836992059, + "grad_norm": 0.6362954775845878, + "learning_rate": 2.0423277585609806e-05, + "loss": 0.6591, + "step": 5325 + }, + { + "epoch": 0.622372723026623, + "grad_norm": 0.6604487088077978, + "learning_rate": 2.038205013679127e-05, + "loss": 0.6647, + "step": 5330 + }, + { + "epoch": 0.6229565623540402, + "grad_norm": 0.6670476169325874, + "learning_rate": 2.034084922665953e-05, + "loss": 0.6463, + "step": 5335 + }, + { + "epoch": 0.6235404016814573, + "grad_norm": 0.624587303283789, + "learning_rate": 2.0299675008828783e-05, + "loss": 0.6343, + "step": 5340 + }, + { + "epoch": 0.6241242410088743, + "grad_norm": 0.6491739846872022, + "learning_rate": 2.025852763681369e-05, + "loss": 0.6419, + "step": 5345 + }, + { + "epoch": 0.6247080803362914, + "grad_norm": 0.6944978201182703, + "learning_rate": 2.021740726402882e-05, + "loss": 0.6563, + "step": 5350 + }, + { + "epoch": 0.6252919196637086, + "grad_norm": 0.6332299668332454, + "learning_rate": 2.0176314043788077e-05, + "loss": 0.6624, + "step": 5355 + }, + { + "epoch": 0.6258757589911257, + "grad_norm": 0.7384532382114772, + "learning_rate": 2.0135248129304124e-05, + "loss": 0.6479, + "step": 5360 + }, + { + "epoch": 0.6264595983185427, + "grad_norm": 0.643946451086718, + "learning_rate": 2.009420967368784e-05, + "loss": 0.6397, + "step": 5365 + }, + { + "epoch": 0.6270434376459598, + "grad_norm": 0.6322326637102691, + "learning_rate": 2.00531988299477e-05, + "loss": 0.6664, + "step": 5370 + }, + { + "epoch": 0.627627276973377, + "grad_norm": 0.6673557927836997, + "learning_rate": 2.0012215750989242e-05, + "loss": 0.653, + "step": 5375 + }, + { + "epoch": 0.6282111163007941, + "grad_norm": 0.6706784359446668, + "learning_rate": 1.997126058961448e-05, + "loss": 0.6583, + "step": 5380 + }, + { + "epoch": 0.6287949556282111, + "grad_norm": 0.6516995188208667, + "learning_rate": 1.9930333498521354e-05, + "loss": 0.6444, + "step": 5385 + }, + { + "epoch": 0.6293787949556282, + "grad_norm": 0.6825646068596721, + "learning_rate": 1.9889434630303118e-05, + "loss": 0.6598, + "step": 5390 + }, + { + "epoch": 0.6299626342830453, + "grad_norm": 0.6402118165861358, + "learning_rate": 1.9848564137447823e-05, + "loss": 0.6568, + "step": 5395 + }, + { + "epoch": 0.6305464736104625, + "grad_norm": 0.6391671575175067, + "learning_rate": 1.9807722172337724e-05, + "loss": 0.6502, + "step": 5400 + }, + { + "epoch": 0.6311303129378795, + "grad_norm": 0.6310329791350416, + "learning_rate": 1.9766908887248697e-05, + "loss": 0.6531, + "step": 5405 + }, + { + "epoch": 0.6317141522652966, + "grad_norm": 0.6452334020484827, + "learning_rate": 1.9726124434349706e-05, + "loss": 0.6487, + "step": 5410 + }, + { + "epoch": 0.6322979915927137, + "grad_norm": 0.6445433630385896, + "learning_rate": 1.9685368965702204e-05, + "loss": 0.6711, + "step": 5415 + }, + { + "epoch": 0.6328818309201307, + "grad_norm": 0.6345788332891033, + "learning_rate": 1.9644642633259575e-05, + "loss": 0.6312, + "step": 5420 + }, + { + "epoch": 0.6334656702475479, + "grad_norm": 0.7395695651691918, + "learning_rate": 1.960394558886659e-05, + "loss": 0.6448, + "step": 5425 + }, + { + "epoch": 0.634049509574965, + "grad_norm": 0.6926534147470341, + "learning_rate": 1.95632779842588e-05, + "loss": 0.6539, + "step": 5430 + }, + { + "epoch": 0.6346333489023821, + "grad_norm": 0.673046775389808, + "learning_rate": 1.9522639971062008e-05, + "loss": 0.6704, + "step": 5435 + }, + { + "epoch": 0.6352171882297991, + "grad_norm": 0.6096765129669326, + "learning_rate": 1.948203170079168e-05, + "loss": 0.6622, + "step": 5440 + }, + { + "epoch": 0.6358010275572162, + "grad_norm": 0.6034708595881847, + "learning_rate": 1.9441453324852387e-05, + "loss": 0.6364, + "step": 5445 + }, + { + "epoch": 0.6363848668846334, + "grad_norm": 0.6812639408764771, + "learning_rate": 1.9400904994537257e-05, + "loss": 0.6528, + "step": 5450 + }, + { + "epoch": 0.6369687062120505, + "grad_norm": 0.6436990280580072, + "learning_rate": 1.936038686102736e-05, + "loss": 0.6451, + "step": 5455 + }, + { + "epoch": 0.6375525455394675, + "grad_norm": 0.6462508555062616, + "learning_rate": 1.931989907539123e-05, + "loss": 0.6618, + "step": 5460 + }, + { + "epoch": 0.6381363848668846, + "grad_norm": 0.6640329522229098, + "learning_rate": 1.92794417885842e-05, + "loss": 0.6671, + "step": 5465 + }, + { + "epoch": 0.6387202241943017, + "grad_norm": 0.6785919614768156, + "learning_rate": 1.9239015151447927e-05, + "loss": 0.6517, + "step": 5470 + }, + { + "epoch": 0.6393040635217189, + "grad_norm": 0.7048102648283558, + "learning_rate": 1.919861931470978e-05, + "loss": 0.6591, + "step": 5475 + }, + { + "epoch": 0.6398879028491359, + "grad_norm": 0.6494301311911943, + "learning_rate": 1.9158254428982293e-05, + "loss": 0.6568, + "step": 5480 + }, + { + "epoch": 0.640471742176553, + "grad_norm": 0.6424041857418867, + "learning_rate": 1.9117920644762594e-05, + "loss": 0.6726, + "step": 5485 + }, + { + "epoch": 0.6410555815039701, + "grad_norm": 0.7077906276947327, + "learning_rate": 1.907761811243186e-05, + "loss": 0.6422, + "step": 5490 + }, + { + "epoch": 0.6416394208313873, + "grad_norm": 0.6807661269517251, + "learning_rate": 1.9037346982254755e-05, + "loss": 0.6526, + "step": 5495 + }, + { + "epoch": 0.6422232601588043, + "grad_norm": 0.6685336715529324, + "learning_rate": 1.8997107404378846e-05, + "loss": 0.6568, + "step": 5500 + }, + { + "epoch": 0.6428070994862214, + "grad_norm": 0.6269460502274804, + "learning_rate": 1.8956899528834065e-05, + "loss": 0.639, + "step": 5505 + }, + { + "epoch": 0.6433909388136385, + "grad_norm": 0.6712644610457897, + "learning_rate": 1.8916723505532157e-05, + "loss": 0.6469, + "step": 5510 + }, + { + "epoch": 0.6439747781410555, + "grad_norm": 0.6817260060318665, + "learning_rate": 1.8876579484266094e-05, + "loss": 0.6413, + "step": 5515 + }, + { + "epoch": 0.6445586174684726, + "grad_norm": 0.6709064524905491, + "learning_rate": 1.8836467614709535e-05, + "loss": 0.6603, + "step": 5520 + }, + { + "epoch": 0.6451424567958898, + "grad_norm": 0.6642156209672145, + "learning_rate": 1.8796388046416253e-05, + "loss": 0.6634, + "step": 5525 + }, + { + "epoch": 0.6457262961233069, + "grad_norm": 0.6659027841445241, + "learning_rate": 1.875634092881963e-05, + "loss": 0.6562, + "step": 5530 + }, + { + "epoch": 0.6463101354507239, + "grad_norm": 0.7108394628988333, + "learning_rate": 1.8716326411232016e-05, + "loss": 0.654, + "step": 5535 + }, + { + "epoch": 0.646893974778141, + "grad_norm": 0.6802242093433051, + "learning_rate": 1.8676344642844217e-05, + "loss": 0.6432, + "step": 5540 + }, + { + "epoch": 0.6474778141055582, + "grad_norm": 0.7111830262209046, + "learning_rate": 1.8636395772724952e-05, + "loss": 0.6526, + "step": 5545 + }, + { + "epoch": 0.6480616534329753, + "grad_norm": 0.6067533863276159, + "learning_rate": 1.8596479949820273e-05, + "loss": 0.6529, + "step": 5550 + }, + { + "epoch": 0.6486454927603923, + "grad_norm": 0.6230500326750026, + "learning_rate": 1.8556597322953035e-05, + "loss": 0.6642, + "step": 5555 + }, + { + "epoch": 0.6492293320878094, + "grad_norm": 0.6433493991498862, + "learning_rate": 1.8516748040822295e-05, + "loss": 0.6473, + "step": 5560 + }, + { + "epoch": 0.6498131714152265, + "grad_norm": 0.7761064372664181, + "learning_rate": 1.847693225200281e-05, + "loss": 0.6533, + "step": 5565 + }, + { + "epoch": 0.6503970107426437, + "grad_norm": 0.6380806598893206, + "learning_rate": 1.843715010494445e-05, + "loss": 0.6565, + "step": 5570 + }, + { + "epoch": 0.6509808500700607, + "grad_norm": 0.6596355513028087, + "learning_rate": 1.839740174797166e-05, + "loss": 0.67, + "step": 5575 + }, + { + "epoch": 0.6515646893974778, + "grad_norm": 0.6596141423952829, + "learning_rate": 1.8357687329282896e-05, + "loss": 0.6594, + "step": 5580 + }, + { + "epoch": 0.6521485287248949, + "grad_norm": 0.6720639426332328, + "learning_rate": 1.831800699695008e-05, + "loss": 0.6532, + "step": 5585 + }, + { + "epoch": 0.652732368052312, + "grad_norm": 0.6719169509066691, + "learning_rate": 1.827836089891805e-05, + "loss": 0.6606, + "step": 5590 + }, + { + "epoch": 0.6533162073797291, + "grad_norm": 0.6652163538901605, + "learning_rate": 1.823874918300399e-05, + "loss": 0.6526, + "step": 5595 + }, + { + "epoch": 0.6539000467071462, + "grad_norm": 0.6273512495673599, + "learning_rate": 1.8199171996896912e-05, + "loss": 0.6528, + "step": 5600 + }, + { + "epoch": 0.6544838860345633, + "grad_norm": 0.6956985348066894, + "learning_rate": 1.8159629488157082e-05, + "loss": 0.6711, + "step": 5605 + }, + { + "epoch": 0.6550677253619804, + "grad_norm": 0.6749270880541005, + "learning_rate": 1.8120121804215466e-05, + "loss": 0.6549, + "step": 5610 + }, + { + "epoch": 0.6556515646893974, + "grad_norm": 0.6642392459123238, + "learning_rate": 1.8080649092373187e-05, + "loss": 0.6597, + "step": 5615 + }, + { + "epoch": 0.6562354040168146, + "grad_norm": 0.693501672371854, + "learning_rate": 1.8041211499800992e-05, + "loss": 0.6495, + "step": 5620 + }, + { + "epoch": 0.6568192433442317, + "grad_norm": 0.6696143427708985, + "learning_rate": 1.8001809173538676e-05, + "loss": 0.6462, + "step": 5625 + }, + { + "epoch": 0.6574030826716487, + "grad_norm": 0.6402364490146976, + "learning_rate": 1.796244226049455e-05, + "loss": 0.6558, + "step": 5630 + }, + { + "epoch": 0.6579869219990658, + "grad_norm": 0.6518313143198281, + "learning_rate": 1.792311090744489e-05, + "loss": 0.6248, + "step": 5635 + }, + { + "epoch": 0.658570761326483, + "grad_norm": 0.6779344772697246, + "learning_rate": 1.7883815261033393e-05, + "loss": 0.6534, + "step": 5640 + }, + { + "epoch": 0.6591546006539001, + "grad_norm": 0.6534547467921898, + "learning_rate": 1.7844555467770624e-05, + "loss": 0.667, + "step": 5645 + }, + { + "epoch": 0.6597384399813171, + "grad_norm": 0.6271083968726606, + "learning_rate": 1.7805331674033466e-05, + "loss": 0.6548, + "step": 5650 + }, + { + "epoch": 0.6603222793087342, + "grad_norm": 0.6905713709592075, + "learning_rate": 1.776614402606459e-05, + "loss": 0.6531, + "step": 5655 + }, + { + "epoch": 0.6609061186361513, + "grad_norm": 0.6879979135812365, + "learning_rate": 1.7726992669971904e-05, + "loss": 0.6525, + "step": 5660 + }, + { + "epoch": 0.6614899579635685, + "grad_norm": 0.6255401526115754, + "learning_rate": 1.768787775172799e-05, + "loss": 0.6556, + "step": 5665 + }, + { + "epoch": 0.6620737972909855, + "grad_norm": 0.6633550080165926, + "learning_rate": 1.7648799417169588e-05, + "loss": 0.6552, + "step": 5670 + }, + { + "epoch": 0.6626576366184026, + "grad_norm": 0.6926249702866363, + "learning_rate": 1.7609757811997023e-05, + "loss": 0.6513, + "step": 5675 + }, + { + "epoch": 0.6632414759458197, + "grad_norm": 0.6807036634968471, + "learning_rate": 1.75707530817737e-05, + "loss": 0.6505, + "step": 5680 + }, + { + "epoch": 0.6638253152732368, + "grad_norm": 0.6560608306838643, + "learning_rate": 1.753178537192551e-05, + "loss": 0.6545, + "step": 5685 + }, + { + "epoch": 0.6644091546006539, + "grad_norm": 0.6987714592200602, + "learning_rate": 1.7492854827740353e-05, + "loss": 0.6609, + "step": 5690 + }, + { + "epoch": 0.664992993928071, + "grad_norm": 0.6772091217855885, + "learning_rate": 1.7453961594367528e-05, + "loss": 0.6534, + "step": 5695 + }, + { + "epoch": 0.6655768332554881, + "grad_norm": 0.7098236239614998, + "learning_rate": 1.741510581681724e-05, + "loss": 0.649, + "step": 5700 + }, + { + "epoch": 0.6661606725829052, + "grad_norm": 0.7266652414785809, + "learning_rate": 1.737628763996005e-05, + "loss": 0.6679, + "step": 5705 + }, + { + "epoch": 0.6667445119103222, + "grad_norm": 0.7617695462993247, + "learning_rate": 1.7337507208526295e-05, + "loss": 0.6551, + "step": 5710 + }, + { + "epoch": 0.6673283512377394, + "grad_norm": 0.6540180478678356, + "learning_rate": 1.729876466710561e-05, + "loss": 0.6541, + "step": 5715 + }, + { + "epoch": 0.6679121905651565, + "grad_norm": 0.676683188269362, + "learning_rate": 1.726006016014637e-05, + "loss": 0.6713, + "step": 5720 + }, + { + "epoch": 0.6684960298925736, + "grad_norm": 0.6604985309242145, + "learning_rate": 1.7221393831955102e-05, + "loss": 0.6533, + "step": 5725 + }, + { + "epoch": 0.6690798692199906, + "grad_norm": 0.7549157711141019, + "learning_rate": 1.718276582669602e-05, + "loss": 0.6421, + "step": 5730 + }, + { + "epoch": 0.6696637085474078, + "grad_norm": 0.6359539861738778, + "learning_rate": 1.7144176288390448e-05, + "loss": 0.6425, + "step": 5735 + }, + { + "epoch": 0.6702475478748249, + "grad_norm": 0.6531250484152343, + "learning_rate": 1.7105625360916276e-05, + "loss": 0.6625, + "step": 5740 + }, + { + "epoch": 0.6708313872022419, + "grad_norm": 0.6784205320156709, + "learning_rate": 1.7067113188007457e-05, + "loss": 0.6406, + "step": 5745 + }, + { + "epoch": 0.671415226529659, + "grad_norm": 0.634047717162136, + "learning_rate": 1.7028639913253426e-05, + "loss": 0.6538, + "step": 5750 + }, + { + "epoch": 0.6719990658570761, + "grad_norm": 0.6710077949566347, + "learning_rate": 1.6990205680098612e-05, + "loss": 0.6629, + "step": 5755 + }, + { + "epoch": 0.6725829051844933, + "grad_norm": 0.6318362431313652, + "learning_rate": 1.695181063184187e-05, + "loss": 0.6607, + "step": 5760 + }, + { + "epoch": 0.6731667445119103, + "grad_norm": 0.644406552288753, + "learning_rate": 1.6913454911635954e-05, + "loss": 0.6469, + "step": 5765 + }, + { + "epoch": 0.6737505838393274, + "grad_norm": 0.6636822016723358, + "learning_rate": 1.6875138662486997e-05, + "loss": 0.6422, + "step": 5770 + }, + { + "epoch": 0.6743344231667445, + "grad_norm": 0.7029211459350307, + "learning_rate": 1.6836862027253963e-05, + "loss": 0.6354, + "step": 5775 + }, + { + "epoch": 0.6749182624941616, + "grad_norm": 0.7475098263254912, + "learning_rate": 1.6798625148648113e-05, + "loss": 0.6567, + "step": 5780 + }, + { + "epoch": 0.6755021018215787, + "grad_norm": 0.6382798450507685, + "learning_rate": 1.6760428169232483e-05, + "loss": 0.6381, + "step": 5785 + }, + { + "epoch": 0.6760859411489958, + "grad_norm": 0.6153848441637845, + "learning_rate": 1.672227123142136e-05, + "loss": 0.6588, + "step": 5790 + }, + { + "epoch": 0.6766697804764129, + "grad_norm": 0.6186569358031658, + "learning_rate": 1.668415447747971e-05, + "loss": 0.6342, + "step": 5795 + }, + { + "epoch": 0.67725361980383, + "grad_norm": 0.6656027062177072, + "learning_rate": 1.6646078049522706e-05, + "loss": 0.6396, + "step": 5800 + }, + { + "epoch": 0.677837459131247, + "grad_norm": 0.6577646859368105, + "learning_rate": 1.660804208951516e-05, + "loss": 0.6533, + "step": 5805 + }, + { + "epoch": 0.6784212984586642, + "grad_norm": 0.5927313773388592, + "learning_rate": 1.6570046739270988e-05, + "loss": 0.6531, + "step": 5810 + }, + { + "epoch": 0.6790051377860813, + "grad_norm": 0.6091518390067749, + "learning_rate": 1.6532092140452725e-05, + "loss": 0.6553, + "step": 5815 + }, + { + "epoch": 0.6795889771134984, + "grad_norm": 0.613863353598546, + "learning_rate": 1.649417843457094e-05, + "loss": 0.6383, + "step": 5820 + }, + { + "epoch": 0.6801728164409154, + "grad_norm": 0.7011048803452364, + "learning_rate": 1.6456305762983742e-05, + "loss": 0.6564, + "step": 5825 + }, + { + "epoch": 0.6807566557683326, + "grad_norm": 0.6584740117106023, + "learning_rate": 1.6418474266896257e-05, + "loss": 0.6436, + "step": 5830 + }, + { + "epoch": 0.6813404950957497, + "grad_norm": 0.7055538531657063, + "learning_rate": 1.6380684087360088e-05, + "loss": 0.6332, + "step": 5835 + }, + { + "epoch": 0.6819243344231667, + "grad_norm": 0.6552792858883486, + "learning_rate": 1.6342935365272785e-05, + "loss": 0.6605, + "step": 5840 + }, + { + "epoch": 0.6825081737505838, + "grad_norm": 0.6556248652540059, + "learning_rate": 1.6305228241377347e-05, + "loss": 0.6349, + "step": 5845 + }, + { + "epoch": 0.6830920130780009, + "grad_norm": 0.6321426350760702, + "learning_rate": 1.6267562856261638e-05, + "loss": 0.647, + "step": 5850 + }, + { + "epoch": 0.6836758524054181, + "grad_norm": 0.6699560404989997, + "learning_rate": 1.6229939350357952e-05, + "loss": 0.6434, + "step": 5855 + }, + { + "epoch": 0.6842596917328351, + "grad_norm": 0.6345164186545303, + "learning_rate": 1.6192357863942415e-05, + "loss": 0.6469, + "step": 5860 + }, + { + "epoch": 0.6848435310602522, + "grad_norm": 0.6569370988654168, + "learning_rate": 1.615481853713448e-05, + "loss": 0.634, + "step": 5865 + }, + { + "epoch": 0.6854273703876693, + "grad_norm": 0.6997172396660267, + "learning_rate": 1.6117321509896422e-05, + "loss": 0.6445, + "step": 5870 + }, + { + "epoch": 0.6860112097150864, + "grad_norm": 0.632180172250705, + "learning_rate": 1.60798669220328e-05, + "loss": 0.6419, + "step": 5875 + }, + { + "epoch": 0.6865950490425035, + "grad_norm": 0.7019041464086315, + "learning_rate": 1.6042454913189946e-05, + "loss": 0.6746, + "step": 5880 + }, + { + "epoch": 0.6871788883699206, + "grad_norm": 0.6782651171936802, + "learning_rate": 1.600508562285544e-05, + "loss": 0.672, + "step": 5885 + }, + { + "epoch": 0.6877627276973377, + "grad_norm": 0.6173469226028959, + "learning_rate": 1.5967759190357585e-05, + "loss": 0.6372, + "step": 5890 + }, + { + "epoch": 0.6883465670247548, + "grad_norm": 0.6523808993714774, + "learning_rate": 1.5930475754864898e-05, + "loss": 0.6481, + "step": 5895 + }, + { + "epoch": 0.6889304063521718, + "grad_norm": 0.6943963703663875, + "learning_rate": 1.5893235455385575e-05, + "loss": 0.6613, + "step": 5900 + }, + { + "epoch": 0.689514245679589, + "grad_norm": 0.6205909347546901, + "learning_rate": 1.5856038430766994e-05, + "loss": 0.6504, + "step": 5905 + }, + { + "epoch": 0.6900980850070061, + "grad_norm": 0.6577250115332157, + "learning_rate": 1.5818884819695184e-05, + "loss": 0.643, + "step": 5910 + }, + { + "epoch": 0.6906819243344232, + "grad_norm": 0.7131285746606613, + "learning_rate": 1.5781774760694304e-05, + "loss": 0.6319, + "step": 5915 + }, + { + "epoch": 0.6912657636618402, + "grad_norm": 0.6702222134956441, + "learning_rate": 1.5744708392126138e-05, + "loss": 0.6441, + "step": 5920 + }, + { + "epoch": 0.6918496029892574, + "grad_norm": 0.6458299715715866, + "learning_rate": 1.5707685852189573e-05, + "loss": 0.6546, + "step": 5925 + }, + { + "epoch": 0.6924334423166745, + "grad_norm": 0.6467592412179095, + "learning_rate": 1.5670707278920084e-05, + "loss": 0.6529, + "step": 5930 + }, + { + "epoch": 0.6930172816440916, + "grad_norm": 0.66541301231305, + "learning_rate": 1.563377281018922e-05, + "loss": 0.6447, + "step": 5935 + }, + { + "epoch": 0.6936011209715086, + "grad_norm": 0.6988507897107378, + "learning_rate": 1.5596882583704092e-05, + "loss": 0.6706, + "step": 5940 + }, + { + "epoch": 0.6941849602989257, + "grad_norm": 0.6253772210358882, + "learning_rate": 1.5560036737006856e-05, + "loss": 0.6469, + "step": 5945 + }, + { + "epoch": 0.6947687996263429, + "grad_norm": 0.6636080001462308, + "learning_rate": 1.5523235407474195e-05, + "loss": 0.6516, + "step": 5950 + }, + { + "epoch": 0.6953526389537599, + "grad_norm": 0.6904274005251589, + "learning_rate": 1.5486478732316827e-05, + "loss": 0.6692, + "step": 5955 + }, + { + "epoch": 0.695936478281177, + "grad_norm": 0.6225295732898889, + "learning_rate": 1.5449766848578968e-05, + "loss": 0.6476, + "step": 5960 + }, + { + "epoch": 0.6965203176085941, + "grad_norm": 0.678099724634499, + "learning_rate": 1.541309989313784e-05, + "loss": 0.6688, + "step": 5965 + }, + { + "epoch": 0.6971041569360112, + "grad_norm": 0.6507846990956992, + "learning_rate": 1.5376478002703154e-05, + "loss": 0.655, + "step": 5970 + }, + { + "epoch": 0.6976879962634283, + "grad_norm": 0.6491939428328597, + "learning_rate": 1.5339901313816584e-05, + "loss": 0.6407, + "step": 5975 + }, + { + "epoch": 0.6982718355908454, + "grad_norm": 0.6535516438873002, + "learning_rate": 1.5303369962851298e-05, + "loss": 0.6601, + "step": 5980 + }, + { + "epoch": 0.6988556749182625, + "grad_norm": 0.6230417912205047, + "learning_rate": 1.5266884086011406e-05, + "loss": 0.6554, + "step": 5985 + }, + { + "epoch": 0.6994395142456796, + "grad_norm": 0.6135419131501751, + "learning_rate": 1.5230443819331492e-05, + "loss": 0.6445, + "step": 5990 + }, + { + "epoch": 0.7000233535730966, + "grad_norm": 0.6511954072256156, + "learning_rate": 1.5194049298676061e-05, + "loss": 0.629, + "step": 5995 + }, + { + "epoch": 0.7006071929005138, + "grad_norm": 0.6748956312601866, + "learning_rate": 1.515770065973907e-05, + "loss": 0.6444, + "step": 6000 + }, + { + "epoch": 0.7011910322279309, + "grad_norm": 0.643988678018105, + "learning_rate": 1.5121398038043421e-05, + "loss": 0.6582, + "step": 6005 + }, + { + "epoch": 0.701774871555348, + "grad_norm": 0.6643325729889801, + "learning_rate": 1.5085141568940419e-05, + "loss": 0.6415, + "step": 6010 + }, + { + "epoch": 0.702358710882765, + "grad_norm": 0.6061098072733991, + "learning_rate": 1.5048931387609321e-05, + "loss": 0.6382, + "step": 6015 + }, + { + "epoch": 0.7029425502101821, + "grad_norm": 0.6170191771001262, + "learning_rate": 1.501276762905679e-05, + "loss": 0.6531, + "step": 6020 + }, + { + "epoch": 0.7035263895375993, + "grad_norm": 0.7559185866317533, + "learning_rate": 1.4976650428116401e-05, + "loss": 0.6604, + "step": 6025 + }, + { + "epoch": 0.7041102288650164, + "grad_norm": 0.6226993559992735, + "learning_rate": 1.4940579919448147e-05, + "loss": 0.6448, + "step": 6030 + }, + { + "epoch": 0.7046940681924334, + "grad_norm": 0.6305958040557355, + "learning_rate": 1.4904556237537936e-05, + "loss": 0.683, + "step": 6035 + }, + { + "epoch": 0.7052779075198505, + "grad_norm": 0.6589110960306788, + "learning_rate": 1.4868579516697079e-05, + "loss": 0.6518, + "step": 6040 + }, + { + "epoch": 0.7058617468472677, + "grad_norm": 0.6271415611314127, + "learning_rate": 1.4832649891061811e-05, + "loss": 0.6427, + "step": 6045 + }, + { + "epoch": 0.7064455861746848, + "grad_norm": 0.6637185598724493, + "learning_rate": 1.4796767494592757e-05, + "loss": 0.6467, + "step": 6050 + }, + { + "epoch": 0.7070294255021018, + "grad_norm": 0.6167876593558868, + "learning_rate": 1.4760932461074467e-05, + "loss": 0.642, + "step": 6055 + }, + { + "epoch": 0.7076132648295189, + "grad_norm": 0.659548658248431, + "learning_rate": 1.4725144924114891e-05, + "loss": 0.6369, + "step": 6060 + }, + { + "epoch": 0.708197104156936, + "grad_norm": 0.6560272710570144, + "learning_rate": 1.4689405017144908e-05, + "loss": 0.641, + "step": 6065 + }, + { + "epoch": 0.708780943484353, + "grad_norm": 0.7199923240487227, + "learning_rate": 1.4653712873417796e-05, + "loss": 0.6585, + "step": 6070 + }, + { + "epoch": 0.7093647828117702, + "grad_norm": 0.6384648928879884, + "learning_rate": 1.4618068626008755e-05, + "loss": 0.6331, + "step": 6075 + }, + { + "epoch": 0.7099486221391873, + "grad_norm": 0.6373012832164493, + "learning_rate": 1.4582472407814419e-05, + "loss": 0.6442, + "step": 6080 + }, + { + "epoch": 0.7105324614666044, + "grad_norm": 0.5981352121846706, + "learning_rate": 1.4546924351552333e-05, + "loss": 0.6559, + "step": 6085 + }, + { + "epoch": 0.7111163007940214, + "grad_norm": 0.6394867149017587, + "learning_rate": 1.4511424589760486e-05, + "loss": 0.6361, + "step": 6090 + }, + { + "epoch": 0.7117001401214386, + "grad_norm": 0.6479792085825549, + "learning_rate": 1.4475973254796799e-05, + "loss": 0.6661, + "step": 6095 + }, + { + "epoch": 0.7122839794488557, + "grad_norm": 0.6260055427994634, + "learning_rate": 1.4440570478838645e-05, + "loss": 0.6552, + "step": 6100 + }, + { + "epoch": 0.7128678187762728, + "grad_norm": 0.6540376062658972, + "learning_rate": 1.440521639388233e-05, + "loss": 0.6459, + "step": 6105 + }, + { + "epoch": 0.7134516581036898, + "grad_norm": 0.6627728228170229, + "learning_rate": 1.436991113174265e-05, + "loss": 0.6577, + "step": 6110 + }, + { + "epoch": 0.714035497431107, + "grad_norm": 0.6265546084283484, + "learning_rate": 1.4334654824052351e-05, + "loss": 0.6368, + "step": 6115 + }, + { + "epoch": 0.7146193367585241, + "grad_norm": 0.5997797137504358, + "learning_rate": 1.429944760226164e-05, + "loss": 0.6568, + "step": 6120 + }, + { + "epoch": 0.7152031760859412, + "grad_norm": 0.6434752600173411, + "learning_rate": 1.4264289597637741e-05, + "loss": 0.6604, + "step": 6125 + }, + { + "epoch": 0.7157870154133582, + "grad_norm": 0.6311149099041024, + "learning_rate": 1.4229180941264364e-05, + "loss": 0.6358, + "step": 6130 + }, + { + "epoch": 0.7163708547407753, + "grad_norm": 0.6668547608565908, + "learning_rate": 1.4194121764041224e-05, + "loss": 0.6403, + "step": 6135 + }, + { + "epoch": 0.7169546940681925, + "grad_norm": 0.6769321842488776, + "learning_rate": 1.4159112196683564e-05, + "loss": 0.639, + "step": 6140 + }, + { + "epoch": 0.7175385333956096, + "grad_norm": 0.7170192890511556, + "learning_rate": 1.4124152369721655e-05, + "loss": 0.655, + "step": 6145 + }, + { + "epoch": 0.7181223727230266, + "grad_norm": 0.6372648272171578, + "learning_rate": 1.408924241350032e-05, + "loss": 0.622, + "step": 6150 + }, + { + "epoch": 0.7187062120504437, + "grad_norm": 0.6219636894551943, + "learning_rate": 1.4054382458178439e-05, + "loss": 0.6433, + "step": 6155 + }, + { + "epoch": 0.7192900513778608, + "grad_norm": 0.6907119907534172, + "learning_rate": 1.4019572633728473e-05, + "loss": 0.6538, + "step": 6160 + }, + { + "epoch": 0.7198738907052779, + "grad_norm": 0.6671337056917069, + "learning_rate": 1.3984813069935967e-05, + "loss": 0.6483, + "step": 6165 + }, + { + "epoch": 0.720457730032695, + "grad_norm": 0.6141381534242153, + "learning_rate": 1.395010389639908e-05, + "loss": 0.6588, + "step": 6170 + }, + { + "epoch": 0.7210415693601121, + "grad_norm": 0.5936128850781495, + "learning_rate": 1.391544524252808e-05, + "loss": 0.6411, + "step": 6175 + }, + { + "epoch": 0.7216254086875292, + "grad_norm": 0.6480723618326938, + "learning_rate": 1.388083723754491e-05, + "loss": 0.6507, + "step": 6180 + }, + { + "epoch": 0.7222092480149462, + "grad_norm": 0.6484221169199865, + "learning_rate": 1.384628001048264e-05, + "loss": 0.6429, + "step": 6185 + }, + { + "epoch": 0.7227930873423634, + "grad_norm": 0.774949721677888, + "learning_rate": 1.381177369018503e-05, + "loss": 0.6418, + "step": 6190 + }, + { + "epoch": 0.7233769266697805, + "grad_norm": 0.6723770744606823, + "learning_rate": 1.377731840530604e-05, + "loss": 0.6444, + "step": 6195 + }, + { + "epoch": 0.7239607659971976, + "grad_norm": 0.6286827888599001, + "learning_rate": 1.374291428430935e-05, + "loss": 0.6528, + "step": 6200 + }, + { + "epoch": 0.7245446053246146, + "grad_norm": 0.6690865901700592, + "learning_rate": 1.3708561455467872e-05, + "loss": 0.6472, + "step": 6205 + }, + { + "epoch": 0.7251284446520317, + "grad_norm": 0.6206607920597657, + "learning_rate": 1.3674260046863285e-05, + "loss": 0.6374, + "step": 6210 + }, + { + "epoch": 0.7257122839794489, + "grad_norm": 0.6091583021662523, + "learning_rate": 1.3640010186385552e-05, + "loss": 0.6393, + "step": 6215 + }, + { + "epoch": 0.726296123306866, + "grad_norm": 0.6621947921900122, + "learning_rate": 1.3605812001732444e-05, + "loss": 0.6546, + "step": 6220 + }, + { + "epoch": 0.726879962634283, + "grad_norm": 0.6526068291619619, + "learning_rate": 1.3571665620409064e-05, + "loss": 0.6501, + "step": 6225 + }, + { + "epoch": 0.7274638019617001, + "grad_norm": 2.4602752313470493, + "learning_rate": 1.3537571169727359e-05, + "loss": 0.6393, + "step": 6230 + }, + { + "epoch": 0.7280476412891173, + "grad_norm": 0.6356268036541916, + "learning_rate": 1.3503528776805676e-05, + "loss": 0.6306, + "step": 6235 + }, + { + "epoch": 0.7286314806165344, + "grad_norm": 0.6326305074104267, + "learning_rate": 1.3469538568568255e-05, + "loss": 0.6295, + "step": 6240 + }, + { + "epoch": 0.7292153199439514, + "grad_norm": 0.6222833616408877, + "learning_rate": 1.3435600671744768e-05, + "loss": 0.6283, + "step": 6245 + }, + { + "epoch": 0.7297991592713685, + "grad_norm": 0.6133561961949014, + "learning_rate": 1.3401715212869864e-05, + "loss": 0.6224, + "step": 6250 + }, + { + "epoch": 0.7303829985987856, + "grad_norm": 0.6037967169995397, + "learning_rate": 1.3367882318282666e-05, + "loss": 0.6486, + "step": 6255 + }, + { + "epoch": 0.7309668379262028, + "grad_norm": 0.6471850372789167, + "learning_rate": 1.3334102114126314e-05, + "loss": 0.6519, + "step": 6260 + }, + { + "epoch": 0.7315506772536198, + "grad_norm": 0.691632251325258, + "learning_rate": 1.330037472634752e-05, + "loss": 0.652, + "step": 6265 + }, + { + "epoch": 0.7321345165810369, + "grad_norm": 0.646234135333679, + "learning_rate": 1.3266700280696042e-05, + "loss": 0.6329, + "step": 6270 + }, + { + "epoch": 0.732718355908454, + "grad_norm": 0.6198426403579138, + "learning_rate": 1.3233078902724266e-05, + "loss": 0.6577, + "step": 6275 + }, + { + "epoch": 0.733302195235871, + "grad_norm": 0.7320942573246623, + "learning_rate": 1.3199510717786714e-05, + "loss": 0.6533, + "step": 6280 + }, + { + "epoch": 0.7338860345632882, + "grad_norm": 0.6568792492349543, + "learning_rate": 1.3165995851039591e-05, + "loss": 0.6712, + "step": 6285 + }, + { + "epoch": 0.7344698738907053, + "grad_norm": 0.6837902650711637, + "learning_rate": 1.3132534427440301e-05, + "loss": 0.6426, + "step": 6290 + }, + { + "epoch": 0.7350537132181224, + "grad_norm": 0.6962511458886425, + "learning_rate": 1.309912657174699e-05, + "loss": 0.6495, + "step": 6295 + }, + { + "epoch": 0.7356375525455394, + "grad_norm": 0.6218195805719087, + "learning_rate": 1.3065772408518085e-05, + "loss": 0.6378, + "step": 6300 + }, + { + "epoch": 0.7362213918729565, + "grad_norm": 0.6577106692550687, + "learning_rate": 1.3032472062111823e-05, + "loss": 0.639, + "step": 6305 + }, + { + "epoch": 0.7368052312003737, + "grad_norm": 0.6695193352974672, + "learning_rate": 1.2999225656685781e-05, + "loss": 0.6452, + "step": 6310 + }, + { + "epoch": 0.7373890705277908, + "grad_norm": 0.6531494260665233, + "learning_rate": 1.2966033316196435e-05, + "loss": 0.6539, + "step": 6315 + }, + { + "epoch": 0.7379729098552078, + "grad_norm": 0.6108770743645784, + "learning_rate": 1.2932895164398684e-05, + "loss": 0.6473, + "step": 6320 + }, + { + "epoch": 0.7385567491826249, + "grad_norm": 0.6125085475187015, + "learning_rate": 1.2899811324845373e-05, + "loss": 0.6457, + "step": 6325 + }, + { + "epoch": 0.739140588510042, + "grad_norm": 0.6373905010866605, + "learning_rate": 1.2866781920886873e-05, + "loss": 0.6608, + "step": 6330 + }, + { + "epoch": 0.7397244278374592, + "grad_norm": 0.6187004994232301, + "learning_rate": 1.2833807075670564e-05, + "loss": 0.6462, + "step": 6335 + }, + { + "epoch": 0.7403082671648762, + "grad_norm": 0.6461021466819432, + "learning_rate": 1.2800886912140433e-05, + "loss": 0.6439, + "step": 6340 + }, + { + "epoch": 0.7408921064922933, + "grad_norm": 0.664306032806025, + "learning_rate": 1.2768021553036596e-05, + "loss": 0.6286, + "step": 6345 + }, + { + "epoch": 0.7414759458197104, + "grad_norm": 0.6983665960240255, + "learning_rate": 1.2735211120894813e-05, + "loss": 0.648, + "step": 6350 + }, + { + "epoch": 0.7420597851471276, + "grad_norm": 0.6199633025878769, + "learning_rate": 1.2702455738046068e-05, + "loss": 0.6481, + "step": 6355 + }, + { + "epoch": 0.7426436244745446, + "grad_norm": 0.6938764591430603, + "learning_rate": 1.2669755526616093e-05, + "loss": 0.6588, + "step": 6360 + }, + { + "epoch": 0.7432274638019617, + "grad_norm": 0.7073147544566707, + "learning_rate": 1.2637110608524916e-05, + "loss": 0.6221, + "step": 6365 + }, + { + "epoch": 0.7438113031293788, + "grad_norm": 0.6894625288865952, + "learning_rate": 1.2604521105486417e-05, + "loss": 0.641, + "step": 6370 + }, + { + "epoch": 0.7443951424567959, + "grad_norm": 0.6006031068931637, + "learning_rate": 1.2571987139007856e-05, + "loss": 0.6429, + "step": 6375 + }, + { + "epoch": 0.744978981784213, + "grad_norm": 0.6552582169334152, + "learning_rate": 1.253950883038944e-05, + "loss": 0.6517, + "step": 6380 + }, + { + "epoch": 0.7455628211116301, + "grad_norm": 0.6397535486475485, + "learning_rate": 1.2507086300723846e-05, + "loss": 0.6541, + "step": 6385 + }, + { + "epoch": 0.7461466604390472, + "grad_norm": 0.6902838347118688, + "learning_rate": 1.2474719670895796e-05, + "loss": 0.6468, + "step": 6390 + }, + { + "epoch": 0.7467304997664642, + "grad_norm": 0.6896130002515913, + "learning_rate": 1.2442409061581587e-05, + "loss": 0.6275, + "step": 6395 + }, + { + "epoch": 0.7473143390938813, + "grad_norm": 0.6540524402657386, + "learning_rate": 1.2410154593248657e-05, + "loss": 0.6257, + "step": 6400 + }, + { + "epoch": 0.7478981784212985, + "grad_norm": 0.6346234250470586, + "learning_rate": 1.2377956386155114e-05, + "loss": 0.6415, + "step": 6405 + }, + { + "epoch": 0.7484820177487156, + "grad_norm": 0.6812865894965642, + "learning_rate": 1.2345814560349316e-05, + "loss": 0.6569, + "step": 6410 + }, + { + "epoch": 0.7490658570761326, + "grad_norm": 0.7266805424106589, + "learning_rate": 1.231372923566939e-05, + "loss": 0.6433, + "step": 6415 + }, + { + "epoch": 0.7496496964035497, + "grad_norm": 0.6985012316377752, + "learning_rate": 1.2281700531742818e-05, + "loss": 0.6503, + "step": 6420 + }, + { + "epoch": 0.7502335357309668, + "grad_norm": 0.6279926697643428, + "learning_rate": 1.2249728567985966e-05, + "loss": 0.6516, + "step": 6425 + }, + { + "epoch": 0.750817375058384, + "grad_norm": 0.6823315113068185, + "learning_rate": 1.2217813463603664e-05, + "loss": 0.6479, + "step": 6430 + }, + { + "epoch": 0.751401214385801, + "grad_norm": 0.6012248291651592, + "learning_rate": 1.2185955337588727e-05, + "loss": 0.6334, + "step": 6435 + }, + { + "epoch": 0.7519850537132181, + "grad_norm": 0.6823126280567675, + "learning_rate": 1.2154154308721546e-05, + "loss": 0.6418, + "step": 6440 + }, + { + "epoch": 0.7525688930406352, + "grad_norm": 0.6157410397011505, + "learning_rate": 1.2122410495569623e-05, + "loss": 0.6399, + "step": 6445 + }, + { + "epoch": 0.7531527323680524, + "grad_norm": 0.6607304490574921, + "learning_rate": 1.2090724016487137e-05, + "loss": 0.6613, + "step": 6450 + }, + { + "epoch": 0.7537365716954694, + "grad_norm": 0.6448381380250503, + "learning_rate": 1.2059094989614503e-05, + "loss": 0.6402, + "step": 6455 + }, + { + "epoch": 0.7543204110228865, + "grad_norm": 0.72890234313802, + "learning_rate": 1.2027523532877928e-05, + "loss": 0.6339, + "step": 6460 + }, + { + "epoch": 0.7549042503503036, + "grad_norm": 0.6795166242573744, + "learning_rate": 1.1996009763988974e-05, + "loss": 0.6309, + "step": 6465 + }, + { + "epoch": 0.7554880896777207, + "grad_norm": 0.6893940287221781, + "learning_rate": 1.1964553800444123e-05, + "loss": 0.6471, + "step": 6470 + }, + { + "epoch": 0.7560719290051378, + "grad_norm": 0.634194352566182, + "learning_rate": 1.1933155759524332e-05, + "loss": 0.6387, + "step": 6475 + }, + { + "epoch": 0.7566557683325549, + "grad_norm": 0.7448512153381929, + "learning_rate": 1.1901815758294589e-05, + "loss": 0.6345, + "step": 6480 + }, + { + "epoch": 0.757239607659972, + "grad_norm": 0.763372636963995, + "learning_rate": 1.18705339136035e-05, + "loss": 0.6579, + "step": 6485 + }, + { + "epoch": 0.757823446987389, + "grad_norm": 0.6862268846936076, + "learning_rate": 1.1839310342082835e-05, + "loss": 0.6688, + "step": 6490 + }, + { + "epoch": 0.7584072863148061, + "grad_norm": 0.6320332817548495, + "learning_rate": 1.1808145160147092e-05, + "loss": 0.618, + "step": 6495 + }, + { + "epoch": 0.7589911256422233, + "grad_norm": 0.6133186430419542, + "learning_rate": 1.1777038483993066e-05, + "loss": 0.6359, + "step": 6500 + }, + { + "epoch": 0.7595749649696404, + "grad_norm": 0.6508243392201695, + "learning_rate": 1.1745990429599439e-05, + "loss": 0.658, + "step": 6505 + }, + { + "epoch": 0.7601588042970574, + "grad_norm": 0.6605087610839377, + "learning_rate": 1.1715001112726304e-05, + "loss": 0.6095, + "step": 6510 + }, + { + "epoch": 0.7607426436244745, + "grad_norm": 0.629256704641095, + "learning_rate": 1.1684070648914763e-05, + "loss": 0.651, + "step": 6515 + }, + { + "epoch": 0.7613264829518916, + "grad_norm": 0.6021258354982866, + "learning_rate": 1.1653199153486488e-05, + "loss": 0.6345, + "step": 6520 + }, + { + "epoch": 0.7619103222793088, + "grad_norm": 0.6659280501504915, + "learning_rate": 1.1622386741543295e-05, + "loss": 0.6344, + "step": 6525 + }, + { + "epoch": 0.7624941616067258, + "grad_norm": 0.6831457044400984, + "learning_rate": 1.1591633527966713e-05, + "loss": 0.6467, + "step": 6530 + }, + { + "epoch": 0.7630780009341429, + "grad_norm": 0.672942340382294, + "learning_rate": 1.1560939627417555e-05, + "loss": 0.6515, + "step": 6535 + }, + { + "epoch": 0.76366184026156, + "grad_norm": 0.6744624848853943, + "learning_rate": 1.1530305154335482e-05, + "loss": 0.633, + "step": 6540 + }, + { + "epoch": 0.7642456795889772, + "grad_norm": 0.6579110785895681, + "learning_rate": 1.1499730222938595e-05, + "loss": 0.6431, + "step": 6545 + }, + { + "epoch": 0.7648295189163942, + "grad_norm": 0.6262168723984547, + "learning_rate": 1.1469214947222993e-05, + "loss": 0.6359, + "step": 6550 + }, + { + "epoch": 0.7654133582438113, + "grad_norm": 0.6528034993009432, + "learning_rate": 1.1438759440962353e-05, + "loss": 0.6242, + "step": 6555 + }, + { + "epoch": 0.7659971975712284, + "grad_norm": 0.6200492980546738, + "learning_rate": 1.1408363817707523e-05, + "loss": 0.6453, + "step": 6560 + }, + { + "epoch": 0.7665810368986455, + "grad_norm": 0.7120307987935346, + "learning_rate": 1.1378028190786045e-05, + "loss": 0.6332, + "step": 6565 + }, + { + "epoch": 0.7671648762260626, + "grad_norm": 0.6343103468365546, + "learning_rate": 1.134775267330181e-05, + "loss": 0.6217, + "step": 6570 + }, + { + "epoch": 0.7677487155534797, + "grad_norm": 0.6443793067325617, + "learning_rate": 1.1317537378134568e-05, + "loss": 0.651, + "step": 6575 + }, + { + "epoch": 0.7683325548808968, + "grad_norm": 0.618840567288472, + "learning_rate": 1.1287382417939555e-05, + "loss": 0.6548, + "step": 6580 + }, + { + "epoch": 0.7689163942083139, + "grad_norm": 0.6460389555802754, + "learning_rate": 1.1257287905147035e-05, + "loss": 0.6326, + "step": 6585 + }, + { + "epoch": 0.7695002335357309, + "grad_norm": 0.5810628693651504, + "learning_rate": 1.1227253951961911e-05, + "loss": 0.6204, + "step": 6590 + }, + { + "epoch": 0.7700840728631481, + "grad_norm": 0.5872874463618635, + "learning_rate": 1.1197280670363297e-05, + "loss": 0.6376, + "step": 6595 + }, + { + "epoch": 0.7706679121905652, + "grad_norm": 0.6086855468161991, + "learning_rate": 1.1167368172104084e-05, + "loss": 0.64, + "step": 6600 + }, + { + "epoch": 0.7712517515179822, + "grad_norm": 0.6218777515538456, + "learning_rate": 1.1137516568710548e-05, + "loss": 0.6462, + "step": 6605 + }, + { + "epoch": 0.7718355908453993, + "grad_norm": 0.6036656429455959, + "learning_rate": 1.1107725971481923e-05, + "loss": 0.6391, + "step": 6610 + }, + { + "epoch": 0.7724194301728164, + "grad_norm": 0.5855667797586916, + "learning_rate": 1.107799649148998e-05, + "loss": 0.6282, + "step": 6615 + }, + { + "epoch": 0.7730032695002336, + "grad_norm": 0.6470380891791336, + "learning_rate": 1.1048328239578631e-05, + "loss": 0.6409, + "step": 6620 + }, + { + "epoch": 0.7735871088276506, + "grad_norm": 0.6152164633750634, + "learning_rate": 1.1018721326363493e-05, + "loss": 0.629, + "step": 6625 + }, + { + "epoch": 0.7741709481550677, + "grad_norm": 0.67733200025291, + "learning_rate": 1.0989175862231488e-05, + "loss": 0.651, + "step": 6630 + }, + { + "epoch": 0.7747547874824848, + "grad_norm": 1.146412390170155, + "learning_rate": 1.095969195734044e-05, + "loss": 0.6529, + "step": 6635 + }, + { + "epoch": 0.775338626809902, + "grad_norm": 0.59037588657831, + "learning_rate": 1.0930269721618641e-05, + "loss": 0.6113, + "step": 6640 + }, + { + "epoch": 0.775922466137319, + "grad_norm": 0.6131068191439204, + "learning_rate": 1.0900909264764463e-05, + "loss": 0.6272, + "step": 6645 + }, + { + "epoch": 0.7765063054647361, + "grad_norm": 0.6254292350299556, + "learning_rate": 1.0871610696245941e-05, + "loss": 0.6588, + "step": 6650 + }, + { + "epoch": 0.7770901447921532, + "grad_norm": 0.6513113097969633, + "learning_rate": 1.0842374125300364e-05, + "loss": 0.6484, + "step": 6655 + }, + { + "epoch": 0.7776739841195703, + "grad_norm": 0.6751648149069668, + "learning_rate": 1.081319966093386e-05, + "loss": 0.6664, + "step": 6660 + }, + { + "epoch": 0.7782578234469874, + "grad_norm": 0.6268332764660168, + "learning_rate": 1.0784087411921e-05, + "loss": 0.6215, + "step": 6665 + }, + { + "epoch": 0.7788416627744045, + "grad_norm": 0.6030514136330601, + "learning_rate": 1.0755037486804411e-05, + "loss": 0.6459, + "step": 6670 + }, + { + "epoch": 0.7794255021018216, + "grad_norm": 0.6059821182693572, + "learning_rate": 1.0726049993894324e-05, + "loss": 0.6296, + "step": 6675 + }, + { + "epoch": 0.7800093414292387, + "grad_norm": 0.6358704318338096, + "learning_rate": 1.0697125041268207e-05, + "loss": 0.6284, + "step": 6680 + }, + { + "epoch": 0.7805931807566557, + "grad_norm": 0.623055096194753, + "learning_rate": 1.0668262736770356e-05, + "loss": 0.6318, + "step": 6685 + }, + { + "epoch": 0.7811770200840729, + "grad_norm": 0.6221943017251522, + "learning_rate": 1.0639463188011476e-05, + "loss": 0.6553, + "step": 6690 + }, + { + "epoch": 0.78176085941149, + "grad_norm": 0.6274165949170496, + "learning_rate": 1.0610726502368303e-05, + "loss": 0.6499, + "step": 6695 + }, + { + "epoch": 0.7823446987389071, + "grad_norm": 0.6558366089961987, + "learning_rate": 1.0582052786983194e-05, + "loss": 0.642, + "step": 6700 + }, + { + "epoch": 0.7829285380663241, + "grad_norm": 0.6168624390569022, + "learning_rate": 1.0553442148763725e-05, + "loss": 0.6635, + "step": 6705 + }, + { + "epoch": 0.7835123773937412, + "grad_norm": 0.660505645249963, + "learning_rate": 1.0524894694382284e-05, + "loss": 0.6347, + "step": 6710 + }, + { + "epoch": 0.7840962167211584, + "grad_norm": 0.6422313039212652, + "learning_rate": 1.0496410530275694e-05, + "loss": 0.6387, + "step": 6715 + }, + { + "epoch": 0.7846800560485754, + "grad_norm": 0.6327033532056747, + "learning_rate": 1.0467989762644803e-05, + "loss": 0.6277, + "step": 6720 + }, + { + "epoch": 0.7852638953759925, + "grad_norm": 0.6472227494580378, + "learning_rate": 1.0439632497454093e-05, + "loss": 0.643, + "step": 6725 + }, + { + "epoch": 0.7858477347034096, + "grad_norm": 0.6216051002622686, + "learning_rate": 1.0411338840431278e-05, + "loss": 0.6412, + "step": 6730 + }, + { + "epoch": 0.7864315740308268, + "grad_norm": 0.6313937791651838, + "learning_rate": 1.0383108897066915e-05, + "loss": 0.641, + "step": 6735 + }, + { + "epoch": 0.7870154133582438, + "grad_norm": 0.633226750358156, + "learning_rate": 1.035494277261401e-05, + "loss": 0.6113, + "step": 6740 + }, + { + "epoch": 0.7875992526856609, + "grad_norm": 0.6101629183833445, + "learning_rate": 1.0326840572087633e-05, + "loss": 0.627, + "step": 6745 + }, + { + "epoch": 0.788183092013078, + "grad_norm": 0.603736417034934, + "learning_rate": 1.0298802400264502e-05, + "loss": 0.6303, + "step": 6750 + }, + { + "epoch": 0.7887669313404951, + "grad_norm": 0.6397082011815393, + "learning_rate": 1.0270828361682628e-05, + "loss": 0.6386, + "step": 6755 + }, + { + "epoch": 0.7893507706679121, + "grad_norm": 0.6666697928521588, + "learning_rate": 1.0242918560640893e-05, + "loss": 0.6172, + "step": 6760 + }, + { + "epoch": 0.7899346099953293, + "grad_norm": 0.6589671968178897, + "learning_rate": 1.0215073101198683e-05, + "loss": 0.6366, + "step": 6765 + }, + { + "epoch": 0.7905184493227464, + "grad_norm": 0.6257936502758339, + "learning_rate": 1.0187292087175485e-05, + "loss": 0.6245, + "step": 6770 + }, + { + "epoch": 0.7911022886501635, + "grad_norm": 0.6265837040701255, + "learning_rate": 1.0159575622150513e-05, + "loss": 0.6409, + "step": 6775 + }, + { + "epoch": 0.7916861279775805, + "grad_norm": 0.6374339844464507, + "learning_rate": 1.0131923809462313e-05, + "loss": 0.6481, + "step": 6780 + }, + { + "epoch": 0.7922699673049977, + "grad_norm": 0.6621139588212137, + "learning_rate": 1.0104336752208374e-05, + "loss": 0.6197, + "step": 6785 + }, + { + "epoch": 0.7928538066324148, + "grad_norm": 1.1720758186676021, + "learning_rate": 1.0076814553244762e-05, + "loss": 0.6477, + "step": 6790 + }, + { + "epoch": 0.7934376459598319, + "grad_norm": 0.6112390664794839, + "learning_rate": 1.0049357315185711e-05, + "loss": 0.6368, + "step": 6795 + }, + { + "epoch": 0.7940214852872489, + "grad_norm": 0.6025289483169135, + "learning_rate": 1.0021965140403267e-05, + "loss": 0.6117, + "step": 6800 + }, + { + "epoch": 0.794605324614666, + "grad_norm": 0.6585643058560275, + "learning_rate": 9.99463813102688e-06, + "loss": 0.6222, + "step": 6805 + }, + { + "epoch": 0.7951891639420832, + "grad_norm": 0.6715809889446958, + "learning_rate": 9.967376388943042e-06, + "loss": 0.6418, + "step": 6810 + }, + { + "epoch": 0.7957730032695002, + "grad_norm": 0.6217978701228573, + "learning_rate": 9.940180015794908e-06, + "loss": 0.6577, + "step": 6815 + }, + { + "epoch": 0.7963568425969173, + "grad_norm": 0.6183066216767451, + "learning_rate": 9.913049112981897e-06, + "loss": 0.6496, + "step": 6820 + }, + { + "epoch": 0.7969406819243344, + "grad_norm": 0.6099618127843874, + "learning_rate": 9.885983781659332e-06, + "loss": 0.6558, + "step": 6825 + }, + { + "epoch": 0.7975245212517516, + "grad_norm": 0.658463790742885, + "learning_rate": 9.858984122738072e-06, + "loss": 0.627, + "step": 6830 + }, + { + "epoch": 0.7981083605791686, + "grad_norm": 0.6615276020621076, + "learning_rate": 9.832050236884102e-06, + "loss": 0.6497, + "step": 6835 + }, + { + "epoch": 0.7986921999065857, + "grad_norm": 0.6259325557193028, + "learning_rate": 9.805182224518186e-06, + "loss": 0.6414, + "step": 6840 + }, + { + "epoch": 0.7992760392340028, + "grad_norm": 0.5925039243497241, + "learning_rate": 9.778380185815486e-06, + "loss": 0.6381, + "step": 6845 + }, + { + "epoch": 0.7998598785614199, + "grad_norm": 0.6550300289531977, + "learning_rate": 9.751644220705187e-06, + "loss": 0.5937, + "step": 6850 + }, + { + "epoch": 0.800443717888837, + "grad_norm": 0.6498679334529014, + "learning_rate": 9.72497442887012e-06, + "loss": 0.6589, + "step": 6855 + }, + { + "epoch": 0.8010275572162541, + "grad_norm": 0.6258303217846111, + "learning_rate": 9.698370909746387e-06, + "loss": 0.6314, + "step": 6860 + }, + { + "epoch": 0.8016113965436712, + "grad_norm": 0.6110263661620758, + "learning_rate": 9.671833762523016e-06, + "loss": 0.6422, + "step": 6865 + }, + { + "epoch": 0.8021952358710883, + "grad_norm": 0.6604564267266404, + "learning_rate": 9.645363086141561e-06, + "loss": 0.6687, + "step": 6870 + }, + { + "epoch": 0.8027790751985053, + "grad_norm": 0.6133261669233432, + "learning_rate": 9.618958979295747e-06, + "loss": 0.6495, + "step": 6875 + }, + { + "epoch": 0.8033629145259225, + "grad_norm": 0.5805003568606534, + "learning_rate": 9.592621540431101e-06, + "loss": 0.6325, + "step": 6880 + }, + { + "epoch": 0.8039467538533396, + "grad_norm": 0.6016490003111251, + "learning_rate": 9.566350867744584e-06, + "loss": 0.6496, + "step": 6885 + }, + { + "epoch": 0.8045305931807567, + "grad_norm": 0.5860139017883728, + "learning_rate": 9.540147059184226e-06, + "loss": 0.6305, + "step": 6890 + }, + { + "epoch": 0.8051144325081737, + "grad_norm": 0.7230562270791053, + "learning_rate": 9.514010212448751e-06, + "loss": 0.6328, + "step": 6895 + }, + { + "epoch": 0.8056982718355908, + "grad_norm": 0.5980225557666627, + "learning_rate": 9.487940424987235e-06, + "loss": 0.6408, + "step": 6900 + }, + { + "epoch": 0.806282111163008, + "grad_norm": 0.6002231741684171, + "learning_rate": 9.461937793998723e-06, + "loss": 0.656, + "step": 6905 + }, + { + "epoch": 0.8068659504904251, + "grad_norm": 0.6367968204118127, + "learning_rate": 9.436002416431868e-06, + "loss": 0.6306, + "step": 6910 + }, + { + "epoch": 0.8074497898178421, + "grad_norm": 0.6065896754400874, + "learning_rate": 9.41013438898458e-06, + "loss": 0.632, + "step": 6915 + }, + { + "epoch": 0.8080336291452592, + "grad_norm": 0.6605091785427533, + "learning_rate": 9.384333808103656e-06, + "loss": 0.6321, + "step": 6920 + }, + { + "epoch": 0.8086174684726763, + "grad_norm": 0.649728718647983, + "learning_rate": 9.358600769984432e-06, + "loss": 0.6358, + "step": 6925 + }, + { + "epoch": 0.8092013078000934, + "grad_norm": 0.6052521632195271, + "learning_rate": 9.332935370570402e-06, + "loss": 0.6501, + "step": 6930 + }, + { + "epoch": 0.8097851471275105, + "grad_norm": 0.6327178716698119, + "learning_rate": 9.30733770555289e-06, + "loss": 0.6401, + "step": 6935 + }, + { + "epoch": 0.8103689864549276, + "grad_norm": 0.6531262103718444, + "learning_rate": 9.281807870370666e-06, + "loss": 0.6448, + "step": 6940 + }, + { + "epoch": 0.8109528257823447, + "grad_norm": 0.6301419895570477, + "learning_rate": 9.256345960209608e-06, + "loss": 0.6264, + "step": 6945 + }, + { + "epoch": 0.8115366651097617, + "grad_norm": 0.6963741903466704, + "learning_rate": 9.23095207000234e-06, + "loss": 0.6741, + "step": 6950 + }, + { + "epoch": 0.8121205044371789, + "grad_norm": 0.6464706174782056, + "learning_rate": 9.205626294427885e-06, + "loss": 0.63, + "step": 6955 + }, + { + "epoch": 0.812704343764596, + "grad_norm": 0.6334091306063494, + "learning_rate": 9.18036872791129e-06, + "loss": 0.6497, + "step": 6960 + }, + { + "epoch": 0.8132881830920131, + "grad_norm": 0.6022796874371679, + "learning_rate": 9.155179464623312e-06, + "loss": 0.6326, + "step": 6965 + }, + { + "epoch": 0.8138720224194301, + "grad_norm": 0.6427806776747196, + "learning_rate": 9.130058598480027e-06, + "loss": 0.6307, + "step": 6970 + }, + { + "epoch": 0.8144558617468473, + "grad_norm": 0.6393493440257442, + "learning_rate": 9.105006223142507e-06, + "loss": 0.6296, + "step": 6975 + }, + { + "epoch": 0.8150397010742644, + "grad_norm": 0.6183999459604124, + "learning_rate": 9.080022432016457e-06, + "loss": 0.6277, + "step": 6980 + }, + { + "epoch": 0.8156235404016815, + "grad_norm": 0.599802408180061, + "learning_rate": 9.05510731825188e-06, + "loss": 0.6452, + "step": 6985 + }, + { + "epoch": 0.8162073797290985, + "grad_norm": 0.6467508872801025, + "learning_rate": 9.030260974742701e-06, + "loss": 0.6469, + "step": 6990 + }, + { + "epoch": 0.8167912190565156, + "grad_norm": 0.6400772659223442, + "learning_rate": 9.005483494126474e-06, + "loss": 0.6382, + "step": 6995 + }, + { + "epoch": 0.8173750583839328, + "grad_norm": 0.6070945909106871, + "learning_rate": 8.980774968783978e-06, + "loss": 0.6492, + "step": 7000 + }, + { + "epoch": 0.8179588977113499, + "grad_norm": 0.6348123346621048, + "learning_rate": 8.9561354908389e-06, + "loss": 0.6478, + "step": 7005 + }, + { + "epoch": 0.8185427370387669, + "grad_norm": 0.69411335593364, + "learning_rate": 8.931565152157492e-06, + "loss": 0.6511, + "step": 7010 + }, + { + "epoch": 0.819126576366184, + "grad_norm": 0.6676483425443996, + "learning_rate": 8.907064044348232e-06, + "loss": 0.6458, + "step": 7015 + }, + { + "epoch": 0.8197104156936011, + "grad_norm": 0.6295085789424556, + "learning_rate": 8.88263225876147e-06, + "loss": 0.6046, + "step": 7020 + }, + { + "epoch": 0.8202942550210183, + "grad_norm": 0.589645571190248, + "learning_rate": 8.858269886489099e-06, + "loss": 0.6126, + "step": 7025 + }, + { + "epoch": 0.8208780943484353, + "grad_norm": 0.6212383481031892, + "learning_rate": 8.8339770183642e-06, + "loss": 0.6433, + "step": 7030 + }, + { + "epoch": 0.8214619336758524, + "grad_norm": 0.6121504319306436, + "learning_rate": 8.809753744960733e-06, + "loss": 0.647, + "step": 7035 + }, + { + "epoch": 0.8220457730032695, + "grad_norm": 0.6020644990184818, + "learning_rate": 8.785600156593157e-06, + "loss": 0.632, + "step": 7040 + }, + { + "epoch": 0.8226296123306865, + "grad_norm": 0.612215270277333, + "learning_rate": 8.761516343316131e-06, + "loss": 0.6439, + "step": 7045 + }, + { + "epoch": 0.8232134516581037, + "grad_norm": 0.6111428805532173, + "learning_rate": 8.737502394924158e-06, + "loss": 0.6374, + "step": 7050 + }, + { + "epoch": 0.8237972909855208, + "grad_norm": 0.6324321344274677, + "learning_rate": 8.713558400951254e-06, + "loss": 0.6315, + "step": 7055 + }, + { + "epoch": 0.8243811303129379, + "grad_norm": 0.650140068168635, + "learning_rate": 8.689684450670627e-06, + "loss": 0.6447, + "step": 7060 + }, + { + "epoch": 0.8249649696403549, + "grad_norm": 0.6441498808124666, + "learning_rate": 8.665880633094314e-06, + "loss": 0.6141, + "step": 7065 + }, + { + "epoch": 0.825548808967772, + "grad_norm": 0.5710054884737357, + "learning_rate": 8.642147036972887e-06, + "loss": 0.6333, + "step": 7070 + }, + { + "epoch": 0.8261326482951892, + "grad_norm": 0.6220206555668683, + "learning_rate": 8.618483750795087e-06, + "loss": 0.6437, + "step": 7075 + }, + { + "epoch": 0.8267164876226063, + "grad_norm": 0.6003035270757918, + "learning_rate": 8.594890862787518e-06, + "loss": 0.6234, + "step": 7080 + }, + { + "epoch": 0.8273003269500233, + "grad_norm": 0.6292916954987227, + "learning_rate": 8.571368460914316e-06, + "loss": 0.6315, + "step": 7085 + }, + { + "epoch": 0.8278841662774404, + "grad_norm": 0.6174540855499432, + "learning_rate": 8.547916632876806e-06, + "loss": 0.6425, + "step": 7090 + }, + { + "epoch": 0.8284680056048576, + "grad_norm": 0.6132426662760693, + "learning_rate": 8.524535466113185e-06, + "loss": 0.6289, + "step": 7095 + }, + { + "epoch": 0.8290518449322747, + "grad_norm": 0.6175813113333095, + "learning_rate": 8.5012250477982e-06, + "loss": 0.6333, + "step": 7100 + }, + { + "epoch": 0.8296356842596917, + "grad_norm": 0.6012843126303246, + "learning_rate": 8.477985464842816e-06, + "loss": 0.6356, + "step": 7105 + }, + { + "epoch": 0.8302195235871088, + "grad_norm": 0.6689824872334442, + "learning_rate": 8.454816803893893e-06, + "loss": 0.6373, + "step": 7110 + }, + { + "epoch": 0.830803362914526, + "grad_norm": 0.6391248959131548, + "learning_rate": 8.431719151333864e-06, + "loss": 0.6247, + "step": 7115 + }, + { + "epoch": 0.8313872022419431, + "grad_norm": 0.6181793298039209, + "learning_rate": 8.40869259328042e-06, + "loss": 0.6443, + "step": 7120 + }, + { + "epoch": 0.8319710415693601, + "grad_norm": 0.636510927044935, + "learning_rate": 8.385737215586171e-06, + "loss": 0.6455, + "step": 7125 + }, + { + "epoch": 0.8325548808967772, + "grad_norm": 0.5973412707457408, + "learning_rate": 8.362853103838344e-06, + "loss": 0.6257, + "step": 7130 + }, + { + "epoch": 0.8331387202241943, + "grad_norm": 0.6492295469372185, + "learning_rate": 8.340040343358455e-06, + "loss": 0.6115, + "step": 7135 + }, + { + "epoch": 0.8337225595516113, + "grad_norm": 0.662621688734901, + "learning_rate": 8.317299019201996e-06, + "loss": 0.6387, + "step": 7140 + }, + { + "epoch": 0.8343063988790285, + "grad_norm": 0.6043337811826702, + "learning_rate": 8.294629216158107e-06, + "loss": 0.6382, + "step": 7145 + }, + { + "epoch": 0.8348902382064456, + "grad_norm": 0.6031307093474805, + "learning_rate": 8.272031018749272e-06, + "loss": 0.6442, + "step": 7150 + }, + { + "epoch": 0.8354740775338627, + "grad_norm": 0.6010727899754135, + "learning_rate": 8.249504511231005e-06, + "loss": 0.6496, + "step": 7155 + }, + { + "epoch": 0.8360579168612797, + "grad_norm": 0.6131715223251057, + "learning_rate": 8.227049777591516e-06, + "loss": 0.6277, + "step": 7160 + }, + { + "epoch": 0.8366417561886969, + "grad_norm": 0.6447679727511777, + "learning_rate": 8.204666901551428e-06, + "loss": 0.6469, + "step": 7165 + }, + { + "epoch": 0.837225595516114, + "grad_norm": 0.6122691237665745, + "learning_rate": 8.182355966563438e-06, + "loss": 0.6307, + "step": 7170 + }, + { + "epoch": 0.8378094348435311, + "grad_norm": 0.648928872074787, + "learning_rate": 8.160117055812019e-06, + "loss": 0.6491, + "step": 7175 + }, + { + "epoch": 0.8383932741709481, + "grad_norm": 0.607879109395071, + "learning_rate": 8.13795025221311e-06, + "loss": 0.6267, + "step": 7180 + }, + { + "epoch": 0.8389771134983652, + "grad_norm": 0.6059669756904347, + "learning_rate": 8.115855638413806e-06, + "loss": 0.6218, + "step": 7185 + }, + { + "epoch": 0.8395609528257824, + "grad_norm": 0.6439391595439827, + "learning_rate": 8.09383329679204e-06, + "loss": 0.627, + "step": 7190 + }, + { + "epoch": 0.8401447921531995, + "grad_norm": 0.631501317582565, + "learning_rate": 8.071883309456292e-06, + "loss": 0.6269, + "step": 7195 + }, + { + "epoch": 0.8407286314806165, + "grad_norm": 0.6122310277654351, + "learning_rate": 8.050005758245274e-06, + "loss": 0.6029, + "step": 7200 + }, + { + "epoch": 0.8413124708080336, + "grad_norm": 0.6397694501906913, + "learning_rate": 8.028200724727623e-06, + "loss": 0.6271, + "step": 7205 + }, + { + "epoch": 0.8418963101354507, + "grad_norm": 0.6195119302731571, + "learning_rate": 8.006468290201603e-06, + "loss": 0.6253, + "step": 7210 + }, + { + "epoch": 0.8424801494628679, + "grad_norm": 0.5710130842633037, + "learning_rate": 7.984808535694794e-06, + "loss": 0.6338, + "step": 7215 + }, + { + "epoch": 0.8430639887902849, + "grad_norm": 0.6170905160754272, + "learning_rate": 7.963221541963799e-06, + "loss": 0.643, + "step": 7220 + }, + { + "epoch": 0.843647828117702, + "grad_norm": 0.5929177989523172, + "learning_rate": 7.94170738949394e-06, + "loss": 0.622, + "step": 7225 + }, + { + "epoch": 0.8442316674451191, + "grad_norm": 0.6066005138729927, + "learning_rate": 7.920266158498948e-06, + "loss": 0.639, + "step": 7230 + }, + { + "epoch": 0.8448155067725363, + "grad_norm": 0.6368084808396752, + "learning_rate": 7.898897928920684e-06, + "loss": 0.6206, + "step": 7235 + }, + { + "epoch": 0.8453993460999533, + "grad_norm": 0.6065549696314979, + "learning_rate": 7.877602780428816e-06, + "loss": 0.6193, + "step": 7240 + }, + { + "epoch": 0.8459831854273704, + "grad_norm": 0.6184574053651811, + "learning_rate": 7.856380792420549e-06, + "loss": 0.631, + "step": 7245 + }, + { + "epoch": 0.8465670247547875, + "grad_norm": 0.6329384365136123, + "learning_rate": 7.835232044020304e-06, + "loss": 0.6373, + "step": 7250 + }, + { + "epoch": 0.8471508640822045, + "grad_norm": 0.6105416018238302, + "learning_rate": 7.81415661407944e-06, + "loss": 0.6242, + "step": 7255 + }, + { + "epoch": 0.8477347034096216, + "grad_norm": 0.6261425215195263, + "learning_rate": 7.793154581175954e-06, + "loss": 0.6315, + "step": 7260 + }, + { + "epoch": 0.8483185427370388, + "grad_norm": 0.5756730477657247, + "learning_rate": 7.772226023614185e-06, + "loss": 0.6348, + "step": 7265 + }, + { + "epoch": 0.8489023820644559, + "grad_norm": 0.564793355503493, + "learning_rate": 7.751371019424528e-06, + "loss": 0.6295, + "step": 7270 + }, + { + "epoch": 0.8494862213918729, + "grad_norm": 0.6778890237819897, + "learning_rate": 7.730589646363141e-06, + "loss": 0.6357, + "step": 7275 + }, + { + "epoch": 0.85007006071929, + "grad_norm": 0.5872963788822205, + "learning_rate": 7.709881981911648e-06, + "loss": 0.6263, + "step": 7280 + }, + { + "epoch": 0.8506539000467072, + "grad_norm": 0.6047046430959456, + "learning_rate": 7.689248103276873e-06, + "loss": 0.6428, + "step": 7285 + }, + { + "epoch": 0.8512377393741243, + "grad_norm": 0.6596981335365075, + "learning_rate": 7.668688087390509e-06, + "loss": 0.6219, + "step": 7290 + }, + { + "epoch": 0.8518215787015413, + "grad_norm": 0.586612617500815, + "learning_rate": 7.648202010908884e-06, + "loss": 0.6494, + "step": 7295 + }, + { + "epoch": 0.8524054180289584, + "grad_norm": 0.595787806735689, + "learning_rate": 7.627789950212635e-06, + "loss": 0.6442, + "step": 7300 + }, + { + "epoch": 0.8529892573563755, + "grad_norm": 0.5935995589497031, + "learning_rate": 7.607451981406441e-06, + "loss": 0.6407, + "step": 7305 + }, + { + "epoch": 0.8535730966837927, + "grad_norm": 0.6351732011308916, + "learning_rate": 7.587188180318736e-06, + "loss": 0.6338, + "step": 7310 + }, + { + "epoch": 0.8541569360112097, + "grad_norm": 0.6423062007453759, + "learning_rate": 7.5669986225014215e-06, + "loss": 0.6145, + "step": 7315 + }, + { + "epoch": 0.8547407753386268, + "grad_norm": 0.6104054060862208, + "learning_rate": 7.546883383229594e-06, + "loss": 0.6302, + "step": 7320 + }, + { + "epoch": 0.8553246146660439, + "grad_norm": 0.5732066016302394, + "learning_rate": 7.526842537501259e-06, + "loss": 0.6194, + "step": 7325 + }, + { + "epoch": 0.855908453993461, + "grad_norm": 0.6305709160866012, + "learning_rate": 7.50687616003705e-06, + "loss": 0.6515, + "step": 7330 + }, + { + "epoch": 0.8564922933208781, + "grad_norm": 0.5996923205852378, + "learning_rate": 7.486984325279956e-06, + "loss": 0.6349, + "step": 7335 + }, + { + "epoch": 0.8570761326482952, + "grad_norm": 0.6278616591211431, + "learning_rate": 7.467167107395028e-06, + "loss": 0.6516, + "step": 7340 + }, + { + "epoch": 0.8576599719757123, + "grad_norm": 0.6269815485337418, + "learning_rate": 7.44742458026913e-06, + "loss": 0.6456, + "step": 7345 + }, + { + "epoch": 0.8582438113031294, + "grad_norm": 0.6156463362595231, + "learning_rate": 7.427756817510634e-06, + "loss": 0.6323, + "step": 7350 + }, + { + "epoch": 0.8588276506305464, + "grad_norm": 0.5956719624976204, + "learning_rate": 7.408163892449172e-06, + "loss": 0.6574, + "step": 7355 + }, + { + "epoch": 0.8594114899579636, + "grad_norm": 0.6454289614556328, + "learning_rate": 7.388645878135338e-06, + "loss": 0.6256, + "step": 7360 + }, + { + "epoch": 0.8599953292853807, + "grad_norm": 0.6210045159647282, + "learning_rate": 7.369202847340432e-06, + "loss": 0.6354, + "step": 7365 + }, + { + "epoch": 0.8605791686127977, + "grad_norm": 0.6214434983612692, + "learning_rate": 7.349834872556187e-06, + "loss": 0.6189, + "step": 7370 + }, + { + "epoch": 0.8611630079402148, + "grad_norm": 0.6600710665798625, + "learning_rate": 7.330542025994495e-06, + "loss": 0.6404, + "step": 7375 + }, + { + "epoch": 0.861746847267632, + "grad_norm": 0.5860627523438156, + "learning_rate": 7.311324379587136e-06, + "loss": 0.6246, + "step": 7380 + }, + { + "epoch": 0.8623306865950491, + "grad_norm": 0.6212770319942422, + "learning_rate": 7.292182004985511e-06, + "loss": 0.6246, + "step": 7385 + }, + { + "epoch": 0.8629145259224661, + "grad_norm": 0.6035809127554331, + "learning_rate": 7.2731149735603825e-06, + "loss": 0.6387, + "step": 7390 + }, + { + "epoch": 0.8634983652498832, + "grad_norm": 0.5579844843636018, + "learning_rate": 7.254123356401597e-06, + "loss": 0.6445, + "step": 7395 + }, + { + "epoch": 0.8640822045773003, + "grad_norm": 0.5936151599474442, + "learning_rate": 7.23520722431783e-06, + "loss": 0.6334, + "step": 7400 + }, + { + "epoch": 0.8646660439047175, + "grad_norm": 0.650546590602176, + "learning_rate": 7.216366647836306e-06, + "loss": 0.6116, + "step": 7405 + }, + { + "epoch": 0.8652498832321345, + "grad_norm": 0.6487524123219067, + "learning_rate": 7.197601697202565e-06, + "loss": 0.6526, + "step": 7410 + }, + { + "epoch": 0.8658337225595516, + "grad_norm": 0.6535618503329066, + "learning_rate": 7.1789124423801645e-06, + "loss": 0.638, + "step": 7415 + }, + { + "epoch": 0.8664175618869687, + "grad_norm": 0.6027481431879828, + "learning_rate": 7.160298953050448e-06, + "loss": 0.6198, + "step": 7420 + }, + { + "epoch": 0.8670014012143858, + "grad_norm": 0.6369758086341853, + "learning_rate": 7.141761298612267e-06, + "loss": 0.6502, + "step": 7425 + }, + { + "epoch": 0.8675852405418029, + "grad_norm": 0.6345290792167012, + "learning_rate": 7.123299548181732e-06, + "loss": 0.6479, + "step": 7430 + }, + { + "epoch": 0.86816907986922, + "grad_norm": 0.6498287886811438, + "learning_rate": 7.104913770591953e-06, + "loss": 0.6323, + "step": 7435 + }, + { + "epoch": 0.8687529191966371, + "grad_norm": 0.610453182797401, + "learning_rate": 7.086604034392777e-06, + "loss": 0.6526, + "step": 7440 + }, + { + "epoch": 0.8693367585240542, + "grad_norm": 0.6132209789616724, + "learning_rate": 7.068370407850541e-06, + "loss": 0.6514, + "step": 7445 + }, + { + "epoch": 0.8699205978514712, + "grad_norm": 0.6042376490660476, + "learning_rate": 7.050212958947813e-06, + "loss": 0.619, + "step": 7450 + }, + { + "epoch": 0.8705044371788884, + "grad_norm": 0.6254622221090369, + "learning_rate": 7.032131755383134e-06, + "loss": 0.6131, + "step": 7455 + }, + { + "epoch": 0.8710882765063055, + "grad_norm": 0.6095513928040593, + "learning_rate": 7.014126864570782e-06, + "loss": 0.61, + "step": 7460 + }, + { + "epoch": 0.8716721158337225, + "grad_norm": 0.6000952610701883, + "learning_rate": 6.996198353640495e-06, + "loss": 0.6328, + "step": 7465 + }, + { + "epoch": 0.8722559551611396, + "grad_norm": 0.6267837879440427, + "learning_rate": 6.978346289437245e-06, + "loss": 0.6386, + "step": 7470 + }, + { + "epoch": 0.8728397944885568, + "grad_norm": 0.6099846619504717, + "learning_rate": 6.9605707385209755e-06, + "loss": 0.6322, + "step": 7475 + }, + { + "epoch": 0.8734236338159739, + "grad_norm": 0.6337055827813153, + "learning_rate": 6.942871767166354e-06, + "loss": 0.6176, + "step": 7480 + }, + { + "epoch": 0.8740074731433909, + "grad_norm": 0.6027124220327612, + "learning_rate": 6.925249441362533e-06, + "loss": 0.6454, + "step": 7485 + }, + { + "epoch": 0.874591312470808, + "grad_norm": 0.5825546360394736, + "learning_rate": 6.907703826812895e-06, + "loss": 0.6256, + "step": 7490 + }, + { + "epoch": 0.8751751517982251, + "grad_norm": 0.6364269400726253, + "learning_rate": 6.89023498893481e-06, + "loss": 0.6203, + "step": 7495 + }, + { + "epoch": 0.8757589911256423, + "grad_norm": 0.5866690674431253, + "learning_rate": 6.872842992859395e-06, + "loss": 0.6426, + "step": 7500 + }, + { + "epoch": 0.8763428304530593, + "grad_norm": 0.606473729787098, + "learning_rate": 6.855527903431267e-06, + "loss": 0.6555, + "step": 7505 + }, + { + "epoch": 0.8769266697804764, + "grad_norm": 0.6175430857724672, + "learning_rate": 6.838289785208303e-06, + "loss": 0.6228, + "step": 7510 + }, + { + "epoch": 0.8775105091078935, + "grad_norm": 0.6441178869926913, + "learning_rate": 6.821128702461401e-06, + "loss": 0.6232, + "step": 7515 + }, + { + "epoch": 0.8780943484353106, + "grad_norm": 0.6455378058084096, + "learning_rate": 6.804044719174235e-06, + "loss": 0.6397, + "step": 7520 + }, + { + "epoch": 0.8786781877627277, + "grad_norm": 0.6474729332887994, + "learning_rate": 6.787037899043027e-06, + "loss": 0.6493, + "step": 7525 + }, + { + "epoch": 0.8792620270901448, + "grad_norm": 0.6399294710875848, + "learning_rate": 6.770108305476293e-06, + "loss": 0.6339, + "step": 7530 + }, + { + "epoch": 0.8798458664175619, + "grad_norm": 0.5902882884714229, + "learning_rate": 6.753256001594622e-06, + "loss": 0.6269, + "step": 7535 + }, + { + "epoch": 0.880429705744979, + "grad_norm": 0.6258926337290766, + "learning_rate": 6.736481050230438e-06, + "loss": 0.65, + "step": 7540 + }, + { + "epoch": 0.881013545072396, + "grad_norm": 0.6781590777121034, + "learning_rate": 6.719783513927755e-06, + "loss": 0.6483, + "step": 7545 + }, + { + "epoch": 0.8815973843998132, + "grad_norm": 0.5475048535917718, + "learning_rate": 6.703163454941953e-06, + "loss": 0.6237, + "step": 7550 + }, + { + "epoch": 0.8821812237272303, + "grad_norm": 0.6058898403074587, + "learning_rate": 6.686620935239552e-06, + "loss": 0.6305, + "step": 7555 + }, + { + "epoch": 0.8827650630546474, + "grad_norm": 0.6062332874729838, + "learning_rate": 6.670156016497958e-06, + "loss": 0.6165, + "step": 7560 + }, + { + "epoch": 0.8833489023820644, + "grad_norm": 0.5993012988315325, + "learning_rate": 6.653768760105268e-06, + "loss": 0.6329, + "step": 7565 + }, + { + "epoch": 0.8839327417094816, + "grad_norm": 0.6004488967091748, + "learning_rate": 6.637459227160004e-06, + "loss": 0.6275, + "step": 7570 + }, + { + "epoch": 0.8845165810368987, + "grad_norm": 0.5954153667320861, + "learning_rate": 6.621227478470911e-06, + "loss": 0.6391, + "step": 7575 + }, + { + "epoch": 0.8851004203643157, + "grad_norm": 0.6211794389009575, + "learning_rate": 6.605073574556721e-06, + "loss": 0.6481, + "step": 7580 + }, + { + "epoch": 0.8856842596917328, + "grad_norm": 0.6117471748709564, + "learning_rate": 6.588997575645929e-06, + "loss": 0.6144, + "step": 7585 + }, + { + "epoch": 0.8862680990191499, + "grad_norm": 0.633929275434095, + "learning_rate": 6.572999541676563e-06, + "loss": 0.6475, + "step": 7590 + }, + { + "epoch": 0.8868519383465671, + "grad_norm": 0.6085055988668767, + "learning_rate": 6.557079532295968e-06, + "loss": 0.6327, + "step": 7595 + }, + { + "epoch": 0.8874357776739841, + "grad_norm": 0.5942186040054447, + "learning_rate": 6.541237606860582e-06, + "loss": 0.6254, + "step": 7600 + }, + { + "epoch": 0.8880196170014012, + "grad_norm": 0.5846661536893463, + "learning_rate": 6.525473824435714e-06, + "loss": 0.6437, + "step": 7605 + }, + { + "epoch": 0.8886034563288183, + "grad_norm": 0.5951544079113886, + "learning_rate": 6.5097882437953205e-06, + "loss": 0.6225, + "step": 7610 + }, + { + "epoch": 0.8891872956562354, + "grad_norm": 0.6258276348637012, + "learning_rate": 6.49418092342179e-06, + "loss": 0.6452, + "step": 7615 + }, + { + "epoch": 0.8897711349836525, + "grad_norm": 0.5737411429686861, + "learning_rate": 6.478651921505727e-06, + "loss": 0.634, + "step": 7620 + }, + { + "epoch": 0.8903549743110696, + "grad_norm": 0.6039253224606503, + "learning_rate": 6.463201295945727e-06, + "loss": 0.6411, + "step": 7625 + }, + { + "epoch": 0.8909388136384867, + "grad_norm": 0.5847198844312848, + "learning_rate": 6.447829104348171e-06, + "loss": 0.6348, + "step": 7630 + }, + { + "epoch": 0.8915226529659038, + "grad_norm": 0.6030492636470951, + "learning_rate": 6.432535404026997e-06, + "loss": 0.6321, + "step": 7635 + }, + { + "epoch": 0.8921064922933208, + "grad_norm": 0.5846736812852988, + "learning_rate": 6.417320252003505e-06, + "loss": 0.6243, + "step": 7640 + }, + { + "epoch": 0.892690331620738, + "grad_norm": 0.5869240165168146, + "learning_rate": 6.402183705006127e-06, + "loss": 0.6257, + "step": 7645 + }, + { + "epoch": 0.8932741709481551, + "grad_norm": 0.6051339628375705, + "learning_rate": 6.387125819470231e-06, + "loss": 0.6286, + "step": 7650 + }, + { + "epoch": 0.8938580102755722, + "grad_norm": 0.6029543179587162, + "learning_rate": 6.372146651537892e-06, + "loss": 0.6343, + "step": 7655 + }, + { + "epoch": 0.8944418496029892, + "grad_norm": 0.5978429103009526, + "learning_rate": 6.3572462570576985e-06, + "loss": 0.6379, + "step": 7660 + }, + { + "epoch": 0.8950256889304063, + "grad_norm": 0.601501640872941, + "learning_rate": 6.3424246915845395e-06, + "loss": 0.6513, + "step": 7665 + }, + { + "epoch": 0.8956095282578235, + "grad_norm": 0.6176265112274103, + "learning_rate": 6.327682010379392e-06, + "loss": 0.6327, + "step": 7670 + }, + { + "epoch": 0.8961933675852406, + "grad_norm": 0.6347108525818637, + "learning_rate": 6.313018268409122e-06, + "loss": 0.6128, + "step": 7675 + }, + { + "epoch": 0.8967772069126576, + "grad_norm": 0.6073779195946353, + "learning_rate": 6.2984335203462825e-06, + "loss": 0.6373, + "step": 7680 + }, + { + "epoch": 0.8973610462400747, + "grad_norm": 0.575256225610877, + "learning_rate": 6.283927820568894e-06, + "loss": 0.6161, + "step": 7685 + }, + { + "epoch": 0.8979448855674919, + "grad_norm": 0.6155556881105437, + "learning_rate": 6.269501223160259e-06, + "loss": 0.6312, + "step": 7690 + }, + { + "epoch": 0.8985287248949089, + "grad_norm": 0.577643059355464, + "learning_rate": 6.255153781908754e-06, + "loss": 0.6359, + "step": 7695 + }, + { + "epoch": 0.899112564222326, + "grad_norm": 0.5977737921135581, + "learning_rate": 6.240885550307624e-06, + "loss": 0.6129, + "step": 7700 + }, + { + "epoch": 0.8996964035497431, + "grad_norm": 0.6140981499537501, + "learning_rate": 6.2266965815547865e-06, + "loss": 0.6253, + "step": 7705 + }, + { + "epoch": 0.9002802428771602, + "grad_norm": 0.5975649843560508, + "learning_rate": 6.212586928552641e-06, + "loss": 0.6163, + "step": 7710 + }, + { + "epoch": 0.9008640822045773, + "grad_norm": 0.5950454352388025, + "learning_rate": 6.19855664390786e-06, + "loss": 0.6274, + "step": 7715 + }, + { + "epoch": 0.9014479215319944, + "grad_norm": 0.6094660507160992, + "learning_rate": 6.184605779931197e-06, + "loss": 0.6435, + "step": 7720 + }, + { + "epoch": 0.9020317608594115, + "grad_norm": 0.6119309598856335, + "learning_rate": 6.170734388637294e-06, + "loss": 0.6352, + "step": 7725 + }, + { + "epoch": 0.9026156001868286, + "grad_norm": 0.595911716393697, + "learning_rate": 6.156942521744484e-06, + "loss": 0.6194, + "step": 7730 + }, + { + "epoch": 0.9031994395142456, + "grad_norm": 0.5993346947122322, + "learning_rate": 6.143230230674602e-06, + "loss": 0.629, + "step": 7735 + }, + { + "epoch": 0.9037832788416628, + "grad_norm": 0.6044325398571229, + "learning_rate": 6.12959756655279e-06, + "loss": 0.6271, + "step": 7740 + }, + { + "epoch": 0.9043671181690799, + "grad_norm": 0.6343098779398217, + "learning_rate": 6.11604458020731e-06, + "loss": 0.6373, + "step": 7745 + }, + { + "epoch": 0.904950957496497, + "grad_norm": 0.577313456911306, + "learning_rate": 6.102571322169347e-06, + "loss": 0.6361, + "step": 7750 + }, + { + "epoch": 0.905534796823914, + "grad_norm": 0.6239547800150814, + "learning_rate": 6.089177842672826e-06, + "loss": 0.6477, + "step": 7755 + }, + { + "epoch": 0.9061186361513311, + "grad_norm": 0.6188422351793207, + "learning_rate": 6.075864191654231e-06, + "loss": 0.628, + "step": 7760 + }, + { + "epoch": 0.9067024754787483, + "grad_norm": 0.5911029453007852, + "learning_rate": 6.062630418752404e-06, + "loss": 0.6219, + "step": 7765 + }, + { + "epoch": 0.9072863148061654, + "grad_norm": 0.6284611964804068, + "learning_rate": 6.049476573308375e-06, + "loss": 0.6482, + "step": 7770 + }, + { + "epoch": 0.9078701541335824, + "grad_norm": 0.5686403476250719, + "learning_rate": 6.036402704365168e-06, + "loss": 0.6252, + "step": 7775 + }, + { + "epoch": 0.9084539934609995, + "grad_norm": 0.6248813037354658, + "learning_rate": 6.023408860667617e-06, + "loss": 0.6457, + "step": 7780 + }, + { + "epoch": 0.9090378327884167, + "grad_norm": 0.6116229301425355, + "learning_rate": 6.010495090662197e-06, + "loss": 0.6218, + "step": 7785 + }, + { + "epoch": 0.9096216721158337, + "grad_norm": 0.5982997364765981, + "learning_rate": 5.9976614424968245e-06, + "loss": 0.6238, + "step": 7790 + }, + { + "epoch": 0.9102055114432508, + "grad_norm": 0.6092146507527899, + "learning_rate": 5.9849079640207e-06, + "loss": 0.6292, + "step": 7795 + }, + { + "epoch": 0.9107893507706679, + "grad_norm": 0.6625607886398457, + "learning_rate": 5.972234702784106e-06, + "loss": 0.6401, + "step": 7800 + }, + { + "epoch": 0.911373190098085, + "grad_norm": 0.6123956879925448, + "learning_rate": 5.9596417060382545e-06, + "loss": 0.6317, + "step": 7805 + }, + { + "epoch": 0.911957029425502, + "grad_norm": 0.6042691351169877, + "learning_rate": 5.9471290207350925e-06, + "loss": 0.6133, + "step": 7810 + }, + { + "epoch": 0.9125408687529192, + "grad_norm": 0.62392183374982, + "learning_rate": 5.934696693527123e-06, + "loss": 0.6335, + "step": 7815 + }, + { + "epoch": 0.9131247080803363, + "grad_norm": 0.5796174304624362, + "learning_rate": 5.9223447707672564e-06, + "loss": 0.6163, + "step": 7820 + }, + { + "epoch": 0.9137085474077534, + "grad_norm": 0.6278698472258146, + "learning_rate": 5.910073298508609e-06, + "loss": 0.6433, + "step": 7825 + }, + { + "epoch": 0.9142923867351704, + "grad_norm": 0.5895985868489902, + "learning_rate": 5.8978823225043555e-06, + "loss": 0.6251, + "step": 7830 + }, + { + "epoch": 0.9148762260625876, + "grad_norm": 0.6008029537552508, + "learning_rate": 5.8857718882075325e-06, + "loss": 0.6469, + "step": 7835 + }, + { + "epoch": 0.9154600653900047, + "grad_norm": 0.5817527833570267, + "learning_rate": 5.8737420407708985e-06, + "loss": 0.6292, + "step": 7840 + }, + { + "epoch": 0.9160439047174218, + "grad_norm": 0.599543094939994, + "learning_rate": 5.861792825046739e-06, + "loss": 0.6255, + "step": 7845 + }, + { + "epoch": 0.9166277440448388, + "grad_norm": 0.5776097905597614, + "learning_rate": 5.849924285586719e-06, + "loss": 0.6391, + "step": 7850 + }, + { + "epoch": 0.917211583372256, + "grad_norm": 0.6107022416917586, + "learning_rate": 5.838136466641704e-06, + "loss": 0.6291, + "step": 7855 + }, + { + "epoch": 0.9177954226996731, + "grad_norm": 0.6021931223535016, + "learning_rate": 5.8264294121616e-06, + "loss": 0.6275, + "step": 7860 + }, + { + "epoch": 0.9183792620270902, + "grad_norm": 0.6246355143005593, + "learning_rate": 5.814803165795194e-06, + "loss": 0.6338, + "step": 7865 + }, + { + "epoch": 0.9189631013545072, + "grad_norm": 0.6034832752476175, + "learning_rate": 5.803257770889978e-06, + "loss": 0.6411, + "step": 7870 + }, + { + "epoch": 0.9195469406819243, + "grad_norm": 0.6092008532994692, + "learning_rate": 5.791793270492006e-06, + "loss": 0.6328, + "step": 7875 + }, + { + "epoch": 0.9201307800093415, + "grad_norm": 0.5948761862413096, + "learning_rate": 5.780409707345714e-06, + "loss": 0.6473, + "step": 7880 + }, + { + "epoch": 0.9207146193367586, + "grad_norm": 0.5708828641760395, + "learning_rate": 5.769107123893781e-06, + "loss": 0.618, + "step": 7885 + }, + { + "epoch": 0.9212984586641756, + "grad_norm": 0.5787863997950811, + "learning_rate": 5.757885562276948e-06, + "loss": 0.6221, + "step": 7890 + }, + { + "epoch": 0.9218822979915927, + "grad_norm": 0.6141574953858627, + "learning_rate": 5.7467450643338804e-06, + "loss": 0.6296, + "step": 7895 + }, + { + "epoch": 0.9224661373190098, + "grad_norm": 0.5958732143906431, + "learning_rate": 5.7356856716010014e-06, + "loss": 0.6257, + "step": 7900 + }, + { + "epoch": 0.9230499766464269, + "grad_norm": 0.6431949588838122, + "learning_rate": 5.724707425312344e-06, + "loss": 0.6255, + "step": 7905 + }, + { + "epoch": 0.923633815973844, + "grad_norm": 0.6051972855303055, + "learning_rate": 5.7138103663993895e-06, + "loss": 0.6263, + "step": 7910 + }, + { + "epoch": 0.9242176553012611, + "grad_norm": 0.6063472814698573, + "learning_rate": 5.70299453549092e-06, + "loss": 0.6331, + "step": 7915 + }, + { + "epoch": 0.9248014946286782, + "grad_norm": 0.6305201290600937, + "learning_rate": 5.692259972912865e-06, + "loss": 0.626, + "step": 7920 + }, + { + "epoch": 0.9253853339560952, + "grad_norm": 0.6033166123290059, + "learning_rate": 5.681606718688152e-06, + "loss": 0.6286, + "step": 7925 + }, + { + "epoch": 0.9259691732835124, + "grad_norm": 0.5744059957483572, + "learning_rate": 5.671034812536561e-06, + "loss": 0.6008, + "step": 7930 + }, + { + "epoch": 0.9265530126109295, + "grad_norm": 0.6197956332271047, + "learning_rate": 5.660544293874561e-06, + "loss": 0.6141, + "step": 7935 + }, + { + "epoch": 0.9271368519383466, + "grad_norm": 0.6021158964286558, + "learning_rate": 5.65013520181519e-06, + "loss": 0.6221, + "step": 7940 + }, + { + "epoch": 0.9277206912657636, + "grad_norm": 0.5656423666481871, + "learning_rate": 5.639807575167886e-06, + "loss": 0.6226, + "step": 7945 + }, + { + "epoch": 0.9283045305931807, + "grad_norm": 0.600675095444175, + "learning_rate": 5.6295614524383436e-06, + "loss": 0.6249, + "step": 7950 + }, + { + "epoch": 0.9288883699205979, + "grad_norm": 0.6133243190992899, + "learning_rate": 5.619396871828387e-06, + "loss": 0.6291, + "step": 7955 + }, + { + "epoch": 0.929472209248015, + "grad_norm": 0.6511471891410207, + "learning_rate": 5.6093138712358155e-06, + "loss": 0.6333, + "step": 7960 + }, + { + "epoch": 0.930056048575432, + "grad_norm": 0.6223717747889685, + "learning_rate": 5.5993124882542584e-06, + "loss": 0.6338, + "step": 7965 + }, + { + "epoch": 0.9306398879028491, + "grad_norm": 0.6128977080868971, + "learning_rate": 5.589392760173047e-06, + "loss": 0.6348, + "step": 7970 + }, + { + "epoch": 0.9312237272302663, + "grad_norm": 0.5952529850359788, + "learning_rate": 5.579554723977065e-06, + "loss": 0.6218, + "step": 7975 + }, + { + "epoch": 0.9318075665576834, + "grad_norm": 0.5926314655193932, + "learning_rate": 5.569798416346615e-06, + "loss": 0.6278, + "step": 7980 + }, + { + "epoch": 0.9323914058851004, + "grad_norm": 0.6364867297259194, + "learning_rate": 5.560123873657284e-06, + "loss": 0.6227, + "step": 7985 + }, + { + "epoch": 0.9329752452125175, + "grad_norm": 0.588741898129217, + "learning_rate": 5.550531131979804e-06, + "loss": 0.6322, + "step": 7990 + }, + { + "epoch": 0.9335590845399346, + "grad_norm": 0.5868391154734851, + "learning_rate": 5.5410202270799165e-06, + "loss": 0.6335, + "step": 7995 + }, + { + "epoch": 0.9341429238673518, + "grad_norm": 0.6177367560309492, + "learning_rate": 5.531591194418244e-06, + "loss": 0.6243, + "step": 8000 + }, + { + "epoch": 0.9347267631947688, + "grad_norm": 0.5846412401374685, + "learning_rate": 5.5222440691501534e-06, + "loss": 0.6239, + "step": 8005 + }, + { + "epoch": 0.9353106025221859, + "grad_norm": 0.5952668242128202, + "learning_rate": 5.512978886125628e-06, + "loss": 0.6086, + "step": 8010 + }, + { + "epoch": 0.935894441849603, + "grad_norm": 0.5975481591744062, + "learning_rate": 5.5037956798891345e-06, + "loss": 0.627, + "step": 8015 + }, + { + "epoch": 0.93647828117702, + "grad_norm": 0.6493163522370307, + "learning_rate": 5.494694484679501e-06, + "loss": 0.6105, + "step": 8020 + }, + { + "epoch": 0.9370621205044372, + "grad_norm": 0.6067098748167189, + "learning_rate": 5.485675334429776e-06, + "loss": 0.6048, + "step": 8025 + }, + { + "epoch": 0.9376459598318543, + "grad_norm": 0.6112238279178478, + "learning_rate": 5.476738262767116e-06, + "loss": 0.6409, + "step": 8030 + }, + { + "epoch": 0.9382297991592714, + "grad_norm": 0.6018698466530003, + "learning_rate": 5.467883303012653e-06, + "loss": 0.6414, + "step": 8035 + }, + { + "epoch": 0.9388136384866884, + "grad_norm": 0.5596954894197848, + "learning_rate": 5.459110488181373e-06, + "loss": 0.6177, + "step": 8040 + }, + { + "epoch": 0.9393974778141055, + "grad_norm": 0.6181282912666148, + "learning_rate": 5.450419850981987e-06, + "loss": 0.6499, + "step": 8045 + }, + { + "epoch": 0.9399813171415227, + "grad_norm": 0.600484249263648, + "learning_rate": 5.441811423816817e-06, + "loss": 0.6407, + "step": 8050 + }, + { + "epoch": 0.9405651564689398, + "grad_norm": 0.6038979619569053, + "learning_rate": 5.433285238781674e-06, + "loss": 0.6426, + "step": 8055 + }, + { + "epoch": 0.9411489957963568, + "grad_norm": 0.5930425789751672, + "learning_rate": 5.424841327665728e-06, + "loss": 0.6363, + "step": 8060 + }, + { + "epoch": 0.9417328351237739, + "grad_norm": 0.6188823703473455, + "learning_rate": 5.416479721951409e-06, + "loss": 0.6239, + "step": 8065 + }, + { + "epoch": 0.942316674451191, + "grad_norm": 0.5894093077115455, + "learning_rate": 5.408200452814265e-06, + "loss": 0.6174, + "step": 8070 + }, + { + "epoch": 0.9429005137786082, + "grad_norm": 0.5803109373806545, + "learning_rate": 5.400003551122871e-06, + "loss": 0.6368, + "step": 8075 + }, + { + "epoch": 0.9434843531060252, + "grad_norm": 0.6397959955822484, + "learning_rate": 5.391889047438692e-06, + "loss": 0.6244, + "step": 8080 + }, + { + "epoch": 0.9440681924334423, + "grad_norm": 0.6258497625424511, + "learning_rate": 5.383856972015984e-06, + "loss": 0.6226, + "step": 8085 + }, + { + "epoch": 0.9446520317608594, + "grad_norm": 0.6069468999871956, + "learning_rate": 5.3759073548016776e-06, + "loss": 0.6404, + "step": 8090 + }, + { + "epoch": 0.9452358710882766, + "grad_norm": 0.5861493386306807, + "learning_rate": 5.368040225435264e-06, + "loss": 0.6187, + "step": 8095 + }, + { + "epoch": 0.9458197104156936, + "grad_norm": 0.5634537016801693, + "learning_rate": 5.360255613248679e-06, + "loss": 0.6155, + "step": 8100 + }, + { + "epoch": 0.9464035497431107, + "grad_norm": 0.5807799609418582, + "learning_rate": 5.352553547266205e-06, + "loss": 0.6346, + "step": 8105 + }, + { + "epoch": 0.9469873890705278, + "grad_norm": 0.5985805165038645, + "learning_rate": 5.34493405620436e-06, + "loss": 0.6456, + "step": 8110 + }, + { + "epoch": 0.9475712283979448, + "grad_norm": 0.6470813090081771, + "learning_rate": 5.337397168471786e-06, + "loss": 0.6403, + "step": 8115 + }, + { + "epoch": 0.948155067725362, + "grad_norm": 0.6212004861636161, + "learning_rate": 5.329942912169144e-06, + "loss": 0.6335, + "step": 8120 + }, + { + "epoch": 0.9487389070527791, + "grad_norm": 0.5867741211903176, + "learning_rate": 5.322571315089009e-06, + "loss": 0.6264, + "step": 8125 + }, + { + "epoch": 0.9493227463801962, + "grad_norm": 0.5827684325148621, + "learning_rate": 5.315282404715776e-06, + "loss": 0.6268, + "step": 8130 + }, + { + "epoch": 0.9499065857076132, + "grad_norm": 0.5762150106079065, + "learning_rate": 5.308076208225538e-06, + "loss": 0.6261, + "step": 8135 + }, + { + "epoch": 0.9504904250350303, + "grad_norm": 0.6484971222881835, + "learning_rate": 5.300952752486006e-06, + "loss": 0.6416, + "step": 8140 + }, + { + "epoch": 0.9510742643624475, + "grad_norm": 0.5891698336069202, + "learning_rate": 5.293912064056394e-06, + "loss": 0.6229, + "step": 8145 + }, + { + "epoch": 0.9516581036898646, + "grad_norm": 0.5916315255808878, + "learning_rate": 5.286954169187325e-06, + "loss": 0.6166, + "step": 8150 + }, + { + "epoch": 0.9522419430172816, + "grad_norm": 0.6674962228781914, + "learning_rate": 5.280079093820737e-06, + "loss": 0.6451, + "step": 8155 + }, + { + "epoch": 0.9528257823446987, + "grad_norm": 0.6069069688549191, + "learning_rate": 5.273286863589776e-06, + "loss": 0.6415, + "step": 8160 + }, + { + "epoch": 0.9534096216721158, + "grad_norm": 0.5958144997860393, + "learning_rate": 5.266577503818708e-06, + "loss": 0.6527, + "step": 8165 + }, + { + "epoch": 0.953993460999533, + "grad_norm": 0.6155935484948483, + "learning_rate": 5.259951039522832e-06, + "loss": 0.6461, + "step": 8170 + }, + { + "epoch": 0.95457730032695, + "grad_norm": 0.6252376719795079, + "learning_rate": 5.253407495408368e-06, + "loss": 0.6239, + "step": 8175 + }, + { + "epoch": 0.9551611396543671, + "grad_norm": 0.5776249542863285, + "learning_rate": 5.24694689587238e-06, + "loss": 0.6237, + "step": 8180 + }, + { + "epoch": 0.9557449789817842, + "grad_norm": 0.5863217640086233, + "learning_rate": 5.240569265002673e-06, + "loss": 0.6237, + "step": 8185 + }, + { + "epoch": 0.9563288183092014, + "grad_norm": 0.6073735304248427, + "learning_rate": 5.234274626577723e-06, + "loss": 0.6423, + "step": 8190 + }, + { + "epoch": 0.9569126576366184, + "grad_norm": 0.5999967869487696, + "learning_rate": 5.228063004066567e-06, + "loss": 0.6189, + "step": 8195 + }, + { + "epoch": 0.9574964969640355, + "grad_norm": 0.6441911130662245, + "learning_rate": 5.22193442062872e-06, + "loss": 0.6232, + "step": 8200 + }, + { + "epoch": 0.9580803362914526, + "grad_norm": 0.6439126904936041, + "learning_rate": 5.2158888991141055e-06, + "loss": 0.6339, + "step": 8205 + }, + { + "epoch": 0.9586641756188697, + "grad_norm": 0.6032589048854394, + "learning_rate": 5.2099264620629425e-06, + "loss": 0.6349, + "step": 8210 + }, + { + "epoch": 0.9592480149462868, + "grad_norm": 0.5960007625552923, + "learning_rate": 5.204047131705689e-06, + "loss": 0.6204, + "step": 8215 + }, + { + "epoch": 0.9598318542737039, + "grad_norm": 0.6233901017209507, + "learning_rate": 5.198250929962939e-06, + "loss": 0.6415, + "step": 8220 + }, + { + "epoch": 0.960415693601121, + "grad_norm": 0.6066027994598999, + "learning_rate": 5.192537878445356e-06, + "loss": 0.6384, + "step": 8225 + }, + { + "epoch": 0.960999532928538, + "grad_norm": 0.5806070059656948, + "learning_rate": 5.186907998453573e-06, + "loss": 0.6247, + "step": 8230 + }, + { + "epoch": 0.9615833722559551, + "grad_norm": 0.6233770326846283, + "learning_rate": 5.181361310978133e-06, + "loss": 0.6167, + "step": 8235 + }, + { + "epoch": 0.9621672115833723, + "grad_norm": 0.5843093926235634, + "learning_rate": 5.175897836699403e-06, + "loss": 0.6435, + "step": 8240 + }, + { + "epoch": 0.9627510509107894, + "grad_norm": 0.6235011595985152, + "learning_rate": 5.170517595987493e-06, + "loss": 0.634, + "step": 8245 + }, + { + "epoch": 0.9633348902382064, + "grad_norm": 0.6556016479037008, + "learning_rate": 5.165220608902186e-06, + "loss": 0.6291, + "step": 8250 + }, + { + "epoch": 0.9639187295656235, + "grad_norm": 0.6375416786937221, + "learning_rate": 5.160006895192858e-06, + "loss": 0.6282, + "step": 8255 + }, + { + "epoch": 0.9645025688930406, + "grad_norm": 0.6137703050348912, + "learning_rate": 5.154876474298412e-06, + "loss": 0.6088, + "step": 8260 + }, + { + "epoch": 0.9650864082204578, + "grad_norm": 0.6390723634134673, + "learning_rate": 5.149829365347197e-06, + "loss": 0.6298, + "step": 8265 + }, + { + "epoch": 0.9656702475478748, + "grad_norm": 0.6067102292176847, + "learning_rate": 5.14486558715694e-06, + "loss": 0.6339, + "step": 8270 + }, + { + "epoch": 0.9662540868752919, + "grad_norm": 0.627573621858438, + "learning_rate": 5.139985158234677e-06, + "loss": 0.666, + "step": 8275 + }, + { + "epoch": 0.966837926202709, + "grad_norm": 0.6233363637808976, + "learning_rate": 5.135188096776682e-06, + "loss": 0.627, + "step": 8280 + }, + { + "epoch": 0.9674217655301262, + "grad_norm": 0.6007626857020777, + "learning_rate": 5.130474420668403e-06, + "loss": 0.6404, + "step": 8285 + }, + { + "epoch": 0.9680056048575432, + "grad_norm": 0.6298620628413639, + "learning_rate": 5.125844147484391e-06, + "loss": 0.6202, + "step": 8290 + }, + { + "epoch": 0.9685894441849603, + "grad_norm": 0.675465228558122, + "learning_rate": 5.121297294488237e-06, + "loss": 0.6518, + "step": 8295 + }, + { + "epoch": 0.9691732835123774, + "grad_norm": 0.5865659966177293, + "learning_rate": 5.1168338786325025e-06, + "loss": 0.6247, + "step": 8300 + }, + { + "epoch": 0.9697571228397945, + "grad_norm": 0.6678929540821399, + "learning_rate": 5.112453916558671e-06, + "loss": 0.6494, + "step": 8305 + }, + { + "epoch": 0.9703409621672116, + "grad_norm": 0.6009288621686822, + "learning_rate": 5.108157424597062e-06, + "loss": 0.6234, + "step": 8310 + }, + { + "epoch": 0.9709248014946287, + "grad_norm": 0.6193467021477096, + "learning_rate": 5.103944418766791e-06, + "loss": 0.635, + "step": 8315 + }, + { + "epoch": 0.9715086408220458, + "grad_norm": 0.622608684023799, + "learning_rate": 5.099814914775706e-06, + "loss": 0.6237, + "step": 8320 + }, + { + "epoch": 0.9720924801494629, + "grad_norm": 0.6052720531668783, + "learning_rate": 5.095768928020314e-06, + "loss": 0.6396, + "step": 8325 + }, + { + "epoch": 0.9726763194768799, + "grad_norm": 0.6736182968411951, + "learning_rate": 5.09180647358575e-06, + "loss": 0.6247, + "step": 8330 + }, + { + "epoch": 0.9732601588042971, + "grad_norm": 0.6389566384205301, + "learning_rate": 5.087927566245688e-06, + "loss": 0.6234, + "step": 8335 + }, + { + "epoch": 0.9738439981317142, + "grad_norm": 0.5607228041623084, + "learning_rate": 5.0841322204623205e-06, + "loss": 0.6178, + "step": 8340 + }, + { + "epoch": 0.9744278374591312, + "grad_norm": 0.5892368416358901, + "learning_rate": 5.080420450386274e-06, + "loss": 0.6296, + "step": 8345 + }, + { + "epoch": 0.9750116767865483, + "grad_norm": 0.6172458659875123, + "learning_rate": 5.076792269856582e-06, + "loss": 0.6265, + "step": 8350 + }, + { + "epoch": 0.9755955161139654, + "grad_norm": 0.6232719417629363, + "learning_rate": 5.073247692400609e-06, + "loss": 0.6198, + "step": 8355 + }, + { + "epoch": 0.9761793554413826, + "grad_norm": 0.6131296828724799, + "learning_rate": 5.069786731234025e-06, + "loss": 0.636, + "step": 8360 + }, + { + "epoch": 0.9767631947687996, + "grad_norm": 0.5852765716886096, + "learning_rate": 5.066409399260733e-06, + "loss": 0.648, + "step": 8365 + }, + { + "epoch": 0.9773470340962167, + "grad_norm": 0.5745904770680967, + "learning_rate": 5.063115709072837e-06, + "loss": 0.6363, + "step": 8370 + }, + { + "epoch": 0.9779308734236338, + "grad_norm": 0.605526140146533, + "learning_rate": 5.059905672950588e-06, + "loss": 0.6341, + "step": 8375 + }, + { + "epoch": 0.978514712751051, + "grad_norm": 0.6286995218711023, + "learning_rate": 5.056779302862337e-06, + "loss": 0.6349, + "step": 8380 + }, + { + "epoch": 0.979098552078468, + "grad_norm": 0.5967884859689654, + "learning_rate": 5.0537366104645e-06, + "loss": 0.6194, + "step": 8385 + }, + { + "epoch": 0.9796823914058851, + "grad_norm": 0.6180004875088996, + "learning_rate": 5.050777607101506e-06, + "loss": 0.6273, + "step": 8390 + }, + { + "epoch": 0.9802662307333022, + "grad_norm": 0.6310278459157885, + "learning_rate": 5.047902303805746e-06, + "loss": 0.6483, + "step": 8395 + }, + { + "epoch": 0.9808500700607193, + "grad_norm": 0.6032724694985684, + "learning_rate": 5.045110711297557e-06, + "loss": 0.6257, + "step": 8400 + }, + { + "epoch": 0.9814339093881363, + "grad_norm": 0.5781867381020175, + "learning_rate": 5.042402839985161e-06, + "loss": 0.6134, + "step": 8405 + }, + { + "epoch": 0.9820177487155535, + "grad_norm": 0.6260381778592984, + "learning_rate": 5.039778699964626e-06, + "loss": 0.6379, + "step": 8410 + }, + { + "epoch": 0.9826015880429706, + "grad_norm": 0.6005383882247752, + "learning_rate": 5.037238301019845e-06, + "loss": 0.6145, + "step": 8415 + }, + { + "epoch": 0.9831854273703877, + "grad_norm": 0.594314717849409, + "learning_rate": 5.034781652622484e-06, + "loss": 0.6172, + "step": 8420 + }, + { + "epoch": 0.9837692666978047, + "grad_norm": 0.600166199851812, + "learning_rate": 5.032408763931956e-06, + "loss": 0.6019, + "step": 8425 + }, + { + "epoch": 0.9843531060252219, + "grad_norm": 0.6307929931128271, + "learning_rate": 5.0301196437953755e-06, + "loss": 0.6229, + "step": 8430 + }, + { + "epoch": 0.984936945352639, + "grad_norm": 0.6082576984810791, + "learning_rate": 5.0279143007475425e-06, + "loss": 0.6346, + "step": 8435 + }, + { + "epoch": 0.985520784680056, + "grad_norm": 0.6326185892061758, + "learning_rate": 5.02579274301089e-06, + "loss": 0.633, + "step": 8440 + }, + { + "epoch": 0.9861046240074731, + "grad_norm": 0.6516301943360185, + "learning_rate": 5.0237549784954745e-06, + "loss": 0.6085, + "step": 8445 + }, + { + "epoch": 0.9866884633348902, + "grad_norm": 0.6213879865286841, + "learning_rate": 5.021801014798933e-06, + "loss": 0.6147, + "step": 8450 + }, + { + "epoch": 0.9872723026623074, + "grad_norm": 0.5780399611890119, + "learning_rate": 5.0199308592064535e-06, + "loss": 0.6184, + "step": 8455 + }, + { + "epoch": 0.9878561419897244, + "grad_norm": 0.601910927657346, + "learning_rate": 5.018144518690761e-06, + "loss": 0.6251, + "step": 8460 + }, + { + "epoch": 0.9884399813171415, + "grad_norm": 0.5659705673632358, + "learning_rate": 5.016441999912074e-06, + "loss": 0.6261, + "step": 8465 + }, + { + "epoch": 0.9890238206445586, + "grad_norm": 0.6329264636005011, + "learning_rate": 5.014823309218096e-06, + "loss": 0.6228, + "step": 8470 + }, + { + "epoch": 0.9896076599719758, + "grad_norm": 0.6362609489056387, + "learning_rate": 5.013288452643979e-06, + "loss": 0.6366, + "step": 8475 + }, + { + "epoch": 0.9901914992993928, + "grad_norm": 0.5997302170613081, + "learning_rate": 5.011837435912308e-06, + "loss": 0.6249, + "step": 8480 + }, + { + "epoch": 0.9907753386268099, + "grad_norm": 0.5704092338174375, + "learning_rate": 5.010470264433083e-06, + "loss": 0.633, + "step": 8485 + }, + { + "epoch": 0.991359177954227, + "grad_norm": 0.5666559915764748, + "learning_rate": 5.009186943303684e-06, + "loss": 0.6375, + "step": 8490 + }, + { + "epoch": 0.9919430172816441, + "grad_norm": 0.5759983030786257, + "learning_rate": 5.0079874773088735e-06, + "loss": 0.6313, + "step": 8495 + }, + { + "epoch": 0.9925268566090611, + "grad_norm": 0.608225948606098, + "learning_rate": 5.006871870920757e-06, + "loss": 0.6208, + "step": 8500 + }, + { + "epoch": 0.9931106959364783, + "grad_norm": 0.5628644591986458, + "learning_rate": 5.005840128298783e-06, + "loss": 0.6231, + "step": 8505 + }, + { + "epoch": 0.9936945352638954, + "grad_norm": 0.6025508957094532, + "learning_rate": 5.004892253289714e-06, + "loss": 0.6318, + "step": 8510 + }, + { + "epoch": 0.9942783745913125, + "grad_norm": 0.6076801376241487, + "learning_rate": 5.004028249427629e-06, + "loss": 0.623, + "step": 8515 + }, + { + "epoch": 0.9948622139187295, + "grad_norm": 0.569738097090106, + "learning_rate": 5.003248119933894e-06, + "loss": 0.6146, + "step": 8520 + }, + { + "epoch": 0.9954460532461467, + "grad_norm": 0.5841965142439793, + "learning_rate": 5.002551867717153e-06, + "loss": 0.6164, + "step": 8525 + }, + { + "epoch": 0.9960298925735638, + "grad_norm": 0.6266992389455439, + "learning_rate": 5.00193949537333e-06, + "loss": 0.646, + "step": 8530 + }, + { + "epoch": 0.9966137319009809, + "grad_norm": 0.6190833079470848, + "learning_rate": 5.0014110051856e-06, + "loss": 0.6542, + "step": 8535 + }, + { + "epoch": 0.9971975712283979, + "grad_norm": 0.5923866630501219, + "learning_rate": 5.000966399124398e-06, + "loss": 0.63, + "step": 8540 + }, + { + "epoch": 0.997781410555815, + "grad_norm": 0.6286317642862608, + "learning_rate": 5.000605678847399e-06, + "loss": 0.6213, + "step": 8545 + }, + { + "epoch": 0.9983652498832322, + "grad_norm": 0.5777027991615938, + "learning_rate": 5.000328845699522e-06, + "loss": 0.6262, + "step": 8550 + }, + { + "epoch": 0.9989490892106492, + "grad_norm": 0.6183899021112303, + "learning_rate": 5.000135900712914e-06, + "loss": 0.6248, + "step": 8555 + }, + { + "epoch": 0.9995329285380663, + "grad_norm": 0.5929731972464702, + "learning_rate": 5.000026844606953e-06, + "loss": 0.6319, + "step": 8560 + }, + { + "epoch": 1.0, + "step": 8564, + "total_flos": 487937544290304.0, + "train_loss": 0.687042359323604, + "train_runtime": 31744.1838, + "train_samples_per_second": 17.266, + "train_steps_per_second": 0.27 + } + ], + "logging_steps": 5, + "max_steps": 8564, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 487937544290304.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}