{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.303632988597189, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0026518164942985947, "grad_norm": 0.1258944869041443, "learning_rate": 5e-05, "loss": 0.537, "step": 10 }, { "epoch": 0.005303632988597189, "grad_norm": 0.08864187449216843, "learning_rate": 5e-05, "loss": 0.2604, "step": 20 }, { "epoch": 0.007955449482895784, "grad_norm": 0.04630456119775772, "learning_rate": 5e-05, "loss": 0.1491, "step": 30 }, { "epoch": 0.010607265977194379, "grad_norm": 0.034610189497470856, "learning_rate": 5e-05, "loss": 0.0877, "step": 40 }, { "epoch": 0.013259082471492973, "grad_norm": 0.02519925870001316, "learning_rate": 5e-05, "loss": 0.0404, "step": 50 }, { "epoch": 0.015910898965791568, "grad_norm": 0.021403878927230835, "learning_rate": 5e-05, "loss": 0.0237, "step": 60 }, { "epoch": 0.01856271546009016, "grad_norm": 0.013028358109295368, "learning_rate": 5e-05, "loss": 0.0175, "step": 70 }, { "epoch": 0.021214531954388757, "grad_norm": 0.009956196881830692, "learning_rate": 5e-05, "loss": 0.0147, "step": 80 }, { "epoch": 0.02386634844868735, "grad_norm": 0.016326865181326866, "learning_rate": 5e-05, "loss": 0.014, "step": 90 }, { "epoch": 0.026518164942985947, "grad_norm": 0.018968934193253517, "learning_rate": 5e-05, "loss": 0.0138, "step": 100 }, { "epoch": 0.02916998143728454, "grad_norm": 0.019122008234262466, "learning_rate": 5e-05, "loss": 0.013, "step": 110 }, { "epoch": 0.031821797931583136, "grad_norm": 0.01659848913550377, "learning_rate": 5e-05, "loss": 0.0134, "step": 120 }, { "epoch": 0.03447361442588173, "grad_norm": 0.010206062346696854, "learning_rate": 5e-05, "loss": 0.0122, "step": 130 }, { "epoch": 0.03712543092018032, "grad_norm": 0.01594812050461769, "learning_rate": 5e-05, "loss": 0.0143, "step": 140 }, { "epoch": 0.03977724741447892, "grad_norm": 0.013471633195877075, "learning_rate": 5e-05, "loss": 0.0123, "step": 150 }, { "epoch": 0.042429063908777515, "grad_norm": 0.016392529010772705, "learning_rate": 5e-05, "loss": 0.0119, "step": 160 }, { "epoch": 0.045080880403076104, "grad_norm": 0.012609810568392277, "learning_rate": 5e-05, "loss": 0.0123, "step": 170 }, { "epoch": 0.0477326968973747, "grad_norm": 0.011995252221822739, "learning_rate": 5e-05, "loss": 0.013, "step": 180 }, { "epoch": 0.0503845133916733, "grad_norm": 0.01840459555387497, "learning_rate": 5e-05, "loss": 0.0121, "step": 190 }, { "epoch": 0.05303632988597189, "grad_norm": 0.019262684509158134, "learning_rate": 5e-05, "loss": 0.0118, "step": 200 }, { "epoch": 0.05568814638027048, "grad_norm": 0.012004096060991287, "learning_rate": 5e-05, "loss": 0.0123, "step": 210 }, { "epoch": 0.05833996287456908, "grad_norm": 0.011293444782495499, "learning_rate": 5e-05, "loss": 0.0114, "step": 220 }, { "epoch": 0.060991779368867675, "grad_norm": 0.01743623986840248, "learning_rate": 5e-05, "loss": 0.0122, "step": 230 }, { "epoch": 0.06364359586316627, "grad_norm": 0.017951998859643936, "learning_rate": 5e-05, "loss": 0.0117, "step": 240 }, { "epoch": 0.06629541235746486, "grad_norm": 0.01696779578924179, "learning_rate": 5e-05, "loss": 0.0121, "step": 250 }, { "epoch": 0.06894722885176346, "grad_norm": 0.013187007047235966, "learning_rate": 5e-05, "loss": 0.0114, "step": 260 }, { "epoch": 0.07159904534606205, "grad_norm": 0.014351295307278633, "learning_rate": 5e-05, "loss": 0.0105, "step": 270 }, { "epoch": 0.07425086184036064, "grad_norm": 0.021265747025609016, "learning_rate": 5e-05, "loss": 0.0119, "step": 280 }, { "epoch": 0.07690267833465925, "grad_norm": 0.017219804227352142, "learning_rate": 5e-05, "loss": 0.0118, "step": 290 }, { "epoch": 0.07955449482895784, "grad_norm": 0.018998468294739723, "learning_rate": 5e-05, "loss": 0.0112, "step": 300 }, { "epoch": 0.08220631132325643, "grad_norm": 0.012130018323659897, "learning_rate": 5e-05, "loss": 0.0112, "step": 310 }, { "epoch": 0.08485812781755503, "grad_norm": 0.009609296917915344, "learning_rate": 5e-05, "loss": 0.01, "step": 320 }, { "epoch": 0.08750994431185362, "grad_norm": 0.012953165918588638, "learning_rate": 5e-05, "loss": 0.0117, "step": 330 }, { "epoch": 0.09016176080615221, "grad_norm": 0.017377154901623726, "learning_rate": 5e-05, "loss": 0.011, "step": 340 }, { "epoch": 0.09281357730045081, "grad_norm": 0.015518328174948692, "learning_rate": 5e-05, "loss": 0.0104, "step": 350 }, { "epoch": 0.0954653937947494, "grad_norm": 0.020566442981362343, "learning_rate": 5e-05, "loss": 0.0114, "step": 360 }, { "epoch": 0.098117210289048, "grad_norm": 0.013780680485069752, "learning_rate": 5e-05, "loss": 0.011, "step": 370 }, { "epoch": 0.1007690267833466, "grad_norm": 0.010043540969491005, "learning_rate": 5e-05, "loss": 0.0107, "step": 380 }, { "epoch": 0.10342084327764518, "grad_norm": 0.009996985085308552, "learning_rate": 5e-05, "loss": 0.0106, "step": 390 }, { "epoch": 0.10607265977194379, "grad_norm": 0.01948331855237484, "learning_rate": 5e-05, "loss": 0.0104, "step": 400 }, { "epoch": 0.10872447626624238, "grad_norm": 0.018410008400678635, "learning_rate": 5e-05, "loss": 0.0111, "step": 410 }, { "epoch": 0.11137629276054097, "grad_norm": 0.012905648909509182, "learning_rate": 5e-05, "loss": 0.011, "step": 420 }, { "epoch": 0.11402810925483957, "grad_norm": 0.015588047914206982, "learning_rate": 5e-05, "loss": 0.0106, "step": 430 }, { "epoch": 0.11667992574913816, "grad_norm": 0.010237782262265682, "learning_rate": 5e-05, "loss": 0.0107, "step": 440 }, { "epoch": 0.11933174224343675, "grad_norm": 0.01513813529163599, "learning_rate": 5e-05, "loss": 0.0102, "step": 450 }, { "epoch": 0.12198355873773535, "grad_norm": 0.016143858432769775, "learning_rate": 5e-05, "loss": 0.0107, "step": 460 }, { "epoch": 0.12463537523203394, "grad_norm": 0.00828468892723322, "learning_rate": 5e-05, "loss": 0.0103, "step": 470 }, { "epoch": 0.12728719172633254, "grad_norm": 0.015572180040180683, "learning_rate": 5e-05, "loss": 0.0095, "step": 480 }, { "epoch": 0.12993900822063112, "grad_norm": 0.017650986090302467, "learning_rate": 5e-05, "loss": 0.0108, "step": 490 }, { "epoch": 0.13259082471492972, "grad_norm": 0.022239752113819122, "learning_rate": 5e-05, "loss": 0.0113, "step": 500 }, { "epoch": 0.13524264120922833, "grad_norm": 0.010186909697949886, "learning_rate": 5e-05, "loss": 0.0106, "step": 510 }, { "epoch": 0.13789445770352693, "grad_norm": 0.013059939257800579, "learning_rate": 5e-05, "loss": 0.0107, "step": 520 }, { "epoch": 0.1405462741978255, "grad_norm": 0.010432315990328789, "learning_rate": 5e-05, "loss": 0.0094, "step": 530 }, { "epoch": 0.1431980906921241, "grad_norm": 0.019246794283390045, "learning_rate": 5e-05, "loss": 0.0108, "step": 540 }, { "epoch": 0.1458499071864227, "grad_norm": 0.01299999002367258, "learning_rate": 5e-05, "loss": 0.0098, "step": 550 }, { "epoch": 0.1485017236807213, "grad_norm": 0.010865003801882267, "learning_rate": 5e-05, "loss": 0.0101, "step": 560 }, { "epoch": 0.1511535401750199, "grad_norm": 0.015246598049998283, "learning_rate": 5e-05, "loss": 0.0105, "step": 570 }, { "epoch": 0.1538053566693185, "grad_norm": 0.026161620393395424, "learning_rate": 5e-05, "loss": 0.0102, "step": 580 }, { "epoch": 0.15645717316361707, "grad_norm": 0.015734069049358368, "learning_rate": 5e-05, "loss": 0.0107, "step": 590 }, { "epoch": 0.15910898965791567, "grad_norm": 0.016834575682878494, "learning_rate": 5e-05, "loss": 0.0098, "step": 600 }, { "epoch": 0.16176080615221428, "grad_norm": 0.00990761537104845, "learning_rate": 5e-05, "loss": 0.0099, "step": 610 }, { "epoch": 0.16441262264651285, "grad_norm": 0.013068127445876598, "learning_rate": 5e-05, "loss": 0.0096, "step": 620 }, { "epoch": 0.16706443914081145, "grad_norm": 0.013438420370221138, "learning_rate": 5e-05, "loss": 0.0094, "step": 630 }, { "epoch": 0.16971625563511006, "grad_norm": 0.01631905697286129, "learning_rate": 5e-05, "loss": 0.0101, "step": 640 }, { "epoch": 0.17236807212940863, "grad_norm": 0.008468487299978733, "learning_rate": 5e-05, "loss": 0.0093, "step": 650 }, { "epoch": 0.17501988862370724, "grad_norm": 0.013419297523796558, "learning_rate": 5e-05, "loss": 0.0101, "step": 660 }, { "epoch": 0.17767170511800584, "grad_norm": 0.013997181318700314, "learning_rate": 5e-05, "loss": 0.0105, "step": 670 }, { "epoch": 0.18032352161230442, "grad_norm": 0.011292782612144947, "learning_rate": 5e-05, "loss": 0.0101, "step": 680 }, { "epoch": 0.18297533810660302, "grad_norm": 0.012854757718741894, "learning_rate": 5e-05, "loss": 0.0093, "step": 690 }, { "epoch": 0.18562715460090162, "grad_norm": 0.009599885903298855, "learning_rate": 5e-05, "loss": 0.0092, "step": 700 }, { "epoch": 0.18827897109520023, "grad_norm": 0.02002909407019615, "learning_rate": 5e-05, "loss": 0.009, "step": 710 }, { "epoch": 0.1909307875894988, "grad_norm": 0.013872887939214706, "learning_rate": 5e-05, "loss": 0.0102, "step": 720 }, { "epoch": 0.1935826040837974, "grad_norm": 0.01644757017493248, "learning_rate": 5e-05, "loss": 0.0104, "step": 730 }, { "epoch": 0.196234420578096, "grad_norm": 0.012152507901191711, "learning_rate": 5e-05, "loss": 0.0098, "step": 740 }, { "epoch": 0.19888623707239458, "grad_norm": 0.012348605319857597, "learning_rate": 5e-05, "loss": 0.0091, "step": 750 }, { "epoch": 0.2015380535666932, "grad_norm": 0.02040923573076725, "learning_rate": 5e-05, "loss": 0.0096, "step": 760 }, { "epoch": 0.2041898700609918, "grad_norm": 0.013828651048243046, "learning_rate": 5e-05, "loss": 0.0092, "step": 770 }, { "epoch": 0.20684168655529037, "grad_norm": 0.009244026616215706, "learning_rate": 5e-05, "loss": 0.0094, "step": 780 }, { "epoch": 0.20949350304958897, "grad_norm": 0.014571444131433964, "learning_rate": 5e-05, "loss": 0.0091, "step": 790 }, { "epoch": 0.21214531954388757, "grad_norm": 0.018429681658744812, "learning_rate": 5e-05, "loss": 0.0096, "step": 800 }, { "epoch": 0.21479713603818615, "grad_norm": 0.008341381326317787, "learning_rate": 5e-05, "loss": 0.0098, "step": 810 }, { "epoch": 0.21744895253248475, "grad_norm": 0.012733936309814453, "learning_rate": 5e-05, "loss": 0.01, "step": 820 }, { "epoch": 0.22010076902678335, "grad_norm": 0.01471877470612526, "learning_rate": 5e-05, "loss": 0.0094, "step": 830 }, { "epoch": 0.22275258552108193, "grad_norm": 0.017787477001547813, "learning_rate": 5e-05, "loss": 0.0094, "step": 840 }, { "epoch": 0.22540440201538053, "grad_norm": 0.009511825628578663, "learning_rate": 5e-05, "loss": 0.0094, "step": 850 }, { "epoch": 0.22805621850967914, "grad_norm": 0.011159355752170086, "learning_rate": 5e-05, "loss": 0.0085, "step": 860 }, { "epoch": 0.2307080350039777, "grad_norm": 0.014395744539797306, "learning_rate": 5e-05, "loss": 0.0086, "step": 870 }, { "epoch": 0.23335985149827632, "grad_norm": 0.010089886374771595, "learning_rate": 5e-05, "loss": 0.0092, "step": 880 }, { "epoch": 0.23601166799257492, "grad_norm": 0.010723458603024483, "learning_rate": 5e-05, "loss": 0.0086, "step": 890 }, { "epoch": 0.2386634844868735, "grad_norm": 0.014126202091574669, "learning_rate": 5e-05, "loss": 0.0088, "step": 900 }, { "epoch": 0.2413153009811721, "grad_norm": 0.011570971459150314, "learning_rate": 5e-05, "loss": 0.0084, "step": 910 }, { "epoch": 0.2439671174754707, "grad_norm": 0.012601399794220924, "learning_rate": 5e-05, "loss": 0.0092, "step": 920 }, { "epoch": 0.2466189339697693, "grad_norm": 0.010908788070082664, "learning_rate": 5e-05, "loss": 0.008, "step": 930 }, { "epoch": 0.24927075046406788, "grad_norm": 0.012425004504621029, "learning_rate": 5e-05, "loss": 0.0089, "step": 940 }, { "epoch": 0.25192256695836646, "grad_norm": 0.013276235200464725, "learning_rate": 5e-05, "loss": 0.0086, "step": 950 }, { "epoch": 0.2545743834526651, "grad_norm": 0.015362740494310856, "learning_rate": 5e-05, "loss": 0.0088, "step": 960 }, { "epoch": 0.25722619994696366, "grad_norm": 0.012183173559606075, "learning_rate": 5e-05, "loss": 0.0089, "step": 970 }, { "epoch": 0.25987801644126224, "grad_norm": 0.01438547670841217, "learning_rate": 5e-05, "loss": 0.0088, "step": 980 }, { "epoch": 0.26252983293556087, "grad_norm": 0.013665788806974888, "learning_rate": 5e-05, "loss": 0.0092, "step": 990 }, { "epoch": 0.26518164942985945, "grad_norm": 0.01046668365597725, "learning_rate": 5e-05, "loss": 0.0085, "step": 1000 }, { "epoch": 0.267833465924158, "grad_norm": 0.013813899829983711, "learning_rate": 5e-05, "loss": 0.0088, "step": 1010 }, { "epoch": 0.27048528241845665, "grad_norm": 0.01947537064552307, "learning_rate": 5e-05, "loss": 0.0084, "step": 1020 }, { "epoch": 0.2731370989127552, "grad_norm": 0.011912846006453037, "learning_rate": 5e-05, "loss": 0.0079, "step": 1030 }, { "epoch": 0.27578891540705386, "grad_norm": 0.02369276061654091, "learning_rate": 5e-05, "loss": 0.0085, "step": 1040 }, { "epoch": 0.27844073190135243, "grad_norm": 0.008832289837300777, "learning_rate": 5e-05, "loss": 0.0086, "step": 1050 }, { "epoch": 0.281092548395651, "grad_norm": 0.025068864226341248, "learning_rate": 5e-05, "loss": 0.0085, "step": 1060 }, { "epoch": 0.28374436488994964, "grad_norm": 0.013990101404488087, "learning_rate": 5e-05, "loss": 0.0082, "step": 1070 }, { "epoch": 0.2863961813842482, "grad_norm": 0.014582226984202862, "learning_rate": 5e-05, "loss": 0.0085, "step": 1080 }, { "epoch": 0.2890479978785468, "grad_norm": 0.013114459812641144, "learning_rate": 5e-05, "loss": 0.0083, "step": 1090 }, { "epoch": 0.2916998143728454, "grad_norm": 0.010913411155343056, "learning_rate": 5e-05, "loss": 0.0073, "step": 1100 }, { "epoch": 0.294351630867144, "grad_norm": 0.018149925395846367, "learning_rate": 5e-05, "loss": 0.0079, "step": 1110 }, { "epoch": 0.2970034473614426, "grad_norm": 0.012796511873602867, "learning_rate": 5e-05, "loss": 0.0076, "step": 1120 }, { "epoch": 0.2996552638557412, "grad_norm": 0.021942973136901855, "learning_rate": 5e-05, "loss": 0.008, "step": 1130 }, { "epoch": 0.3023070803500398, "grad_norm": 0.018051836639642715, "learning_rate": 5e-05, "loss": 0.0082, "step": 1140 }, { "epoch": 0.30495889684433836, "grad_norm": 0.013402791693806648, "learning_rate": 5e-05, "loss": 0.0076, "step": 1150 }, { "epoch": 0.307610713338637, "grad_norm": 0.011157209984958172, "learning_rate": 5e-05, "loss": 0.0082, "step": 1160 }, { "epoch": 0.31026252983293556, "grad_norm": 0.013290155678987503, "learning_rate": 5e-05, "loss": 0.0074, "step": 1170 }, { "epoch": 0.31291434632723414, "grad_norm": 0.01983226276934147, "learning_rate": 5e-05, "loss": 0.0077, "step": 1180 }, { "epoch": 0.31556616282153277, "grad_norm": 0.010223600082099438, "learning_rate": 5e-05, "loss": 0.0084, "step": 1190 }, { "epoch": 0.31821797931583135, "grad_norm": 0.00978254247456789, "learning_rate": 5e-05, "loss": 0.008, "step": 1200 }, { "epoch": 0.3208697958101299, "grad_norm": 0.014222302474081516, "learning_rate": 5e-05, "loss": 0.0089, "step": 1210 }, { "epoch": 0.32352161230442855, "grad_norm": 0.01081007532775402, "learning_rate": 5e-05, "loss": 0.0084, "step": 1220 }, { "epoch": 0.3261734287987271, "grad_norm": 0.013335522264242172, "learning_rate": 5e-05, "loss": 0.0077, "step": 1230 }, { "epoch": 0.3288252452930257, "grad_norm": 0.018239237368106842, "learning_rate": 5e-05, "loss": 0.0082, "step": 1240 }, { "epoch": 0.33147706178732433, "grad_norm": 0.012316283769905567, "learning_rate": 5e-05, "loss": 0.0075, "step": 1250 }, { "epoch": 0.3341288782816229, "grad_norm": 0.0161879975348711, "learning_rate": 5e-05, "loss": 0.0084, "step": 1260 }, { "epoch": 0.3367806947759215, "grad_norm": 0.011896328069269657, "learning_rate": 5e-05, "loss": 0.0083, "step": 1270 }, { "epoch": 0.3394325112702201, "grad_norm": 0.01583375409245491, "learning_rate": 5e-05, "loss": 0.0085, "step": 1280 }, { "epoch": 0.3420843277645187, "grad_norm": 0.015839409083127975, "learning_rate": 5e-05, "loss": 0.0076, "step": 1290 }, { "epoch": 0.34473614425881727, "grad_norm": 0.015172465704381466, "learning_rate": 5e-05, "loss": 0.0076, "step": 1300 }, { "epoch": 0.3473879607531159, "grad_norm": 0.014984076842665672, "learning_rate": 5e-05, "loss": 0.008, "step": 1310 }, { "epoch": 0.3500397772474145, "grad_norm": 0.011987367644906044, "learning_rate": 5e-05, "loss": 0.0078, "step": 1320 }, { "epoch": 0.35269159374171305, "grad_norm": 0.011574046686291695, "learning_rate": 5e-05, "loss": 0.0082, "step": 1330 }, { "epoch": 0.3553434102360117, "grad_norm": 0.01762913167476654, "learning_rate": 5e-05, "loss": 0.0077, "step": 1340 }, { "epoch": 0.35799522673031026, "grad_norm": 0.012356298975646496, "learning_rate": 5e-05, "loss": 0.0074, "step": 1350 }, { "epoch": 0.36064704322460883, "grad_norm": 0.010079051367938519, "learning_rate": 5e-05, "loss": 0.0074, "step": 1360 }, { "epoch": 0.36329885971890746, "grad_norm": 0.014865169301629066, "learning_rate": 5e-05, "loss": 0.0077, "step": 1370 }, { "epoch": 0.36595067621320604, "grad_norm": 0.011747049167752266, "learning_rate": 5e-05, "loss": 0.0076, "step": 1380 }, { "epoch": 0.3686024927075046, "grad_norm": 0.010203711688518524, "learning_rate": 5e-05, "loss": 0.0078, "step": 1390 }, { "epoch": 0.37125430920180325, "grad_norm": 0.015804342925548553, "learning_rate": 5e-05, "loss": 0.0084, "step": 1400 }, { "epoch": 0.3739061256961018, "grad_norm": 0.012406069785356522, "learning_rate": 5e-05, "loss": 0.0075, "step": 1410 }, { "epoch": 0.37655794219040045, "grad_norm": 0.017000149935483932, "learning_rate": 5e-05, "loss": 0.0064, "step": 1420 }, { "epoch": 0.379209758684699, "grad_norm": 0.015780480578541756, "learning_rate": 5e-05, "loss": 0.0068, "step": 1430 }, { "epoch": 0.3818615751789976, "grad_norm": 0.010282334871590137, "learning_rate": 5e-05, "loss": 0.0072, "step": 1440 }, { "epoch": 0.38451339167329623, "grad_norm": 0.009841980412602425, "learning_rate": 5e-05, "loss": 0.0069, "step": 1450 }, { "epoch": 0.3871652081675948, "grad_norm": 0.02233537845313549, "learning_rate": 5e-05, "loss": 0.008, "step": 1460 }, { "epoch": 0.3898170246618934, "grad_norm": 0.012376459315419197, "learning_rate": 5e-05, "loss": 0.0076, "step": 1470 }, { "epoch": 0.392468841156192, "grad_norm": 0.01513287890702486, "learning_rate": 5e-05, "loss": 0.007, "step": 1480 }, { "epoch": 0.3951206576504906, "grad_norm": 0.013680401258170605, "learning_rate": 5e-05, "loss": 0.0066, "step": 1490 }, { "epoch": 0.39777247414478917, "grad_norm": 0.012688511982560158, "learning_rate": 5e-05, "loss": 0.0078, "step": 1500 }, { "epoch": 0.4004242906390878, "grad_norm": 0.007461309898644686, "learning_rate": 5e-05, "loss": 0.0073, "step": 1510 }, { "epoch": 0.4030761071333864, "grad_norm": 0.012309460900723934, "learning_rate": 5e-05, "loss": 0.0064, "step": 1520 }, { "epoch": 0.40572792362768495, "grad_norm": 0.00997267384082079, "learning_rate": 5e-05, "loss": 0.0071, "step": 1530 }, { "epoch": 0.4083797401219836, "grad_norm": 0.01195466611534357, "learning_rate": 5e-05, "loss": 0.007, "step": 1540 }, { "epoch": 0.41103155661628216, "grad_norm": 0.015032176859676838, "learning_rate": 5e-05, "loss": 0.0071, "step": 1550 }, { "epoch": 0.41368337311058073, "grad_norm": 0.008342492394149303, "learning_rate": 5e-05, "loss": 0.0069, "step": 1560 }, { "epoch": 0.41633518960487936, "grad_norm": 0.017913976684212685, "learning_rate": 5e-05, "loss": 0.0074, "step": 1570 }, { "epoch": 0.41898700609917794, "grad_norm": 0.011689859442412853, "learning_rate": 5e-05, "loss": 0.0069, "step": 1580 }, { "epoch": 0.4216388225934765, "grad_norm": 0.019604161381721497, "learning_rate": 5e-05, "loss": 0.0066, "step": 1590 }, { "epoch": 0.42429063908777515, "grad_norm": 0.01538645476102829, "learning_rate": 5e-05, "loss": 0.0074, "step": 1600 }, { "epoch": 0.4269424555820737, "grad_norm": 0.007185685448348522, "learning_rate": 5e-05, "loss": 0.0071, "step": 1610 }, { "epoch": 0.4295942720763723, "grad_norm": 0.009462813846766949, "learning_rate": 5e-05, "loss": 0.007, "step": 1620 }, { "epoch": 0.4322460885706709, "grad_norm": 0.016012225300073624, "learning_rate": 5e-05, "loss": 0.0079, "step": 1630 }, { "epoch": 0.4348979050649695, "grad_norm": 0.011162541806697845, "learning_rate": 5e-05, "loss": 0.0076, "step": 1640 }, { "epoch": 0.4375497215592681, "grad_norm": 0.014166356064379215, "learning_rate": 5e-05, "loss": 0.0071, "step": 1650 }, { "epoch": 0.4402015380535667, "grad_norm": 0.012407203204929829, "learning_rate": 5e-05, "loss": 0.0073, "step": 1660 }, { "epoch": 0.4428533545478653, "grad_norm": 0.015585038810968399, "learning_rate": 5e-05, "loss": 0.0069, "step": 1670 }, { "epoch": 0.44550517104216386, "grad_norm": 0.010211491025984287, "learning_rate": 5e-05, "loss": 0.0062, "step": 1680 }, { "epoch": 0.4481569875364625, "grad_norm": 0.01717487908899784, "learning_rate": 5e-05, "loss": 0.0071, "step": 1690 }, { "epoch": 0.45080880403076107, "grad_norm": 0.01590793766081333, "learning_rate": 5e-05, "loss": 0.0066, "step": 1700 }, { "epoch": 0.45346062052505964, "grad_norm": 0.013059757649898529, "learning_rate": 5e-05, "loss": 0.0067, "step": 1710 }, { "epoch": 0.4561124370193583, "grad_norm": 0.017237195745110512, "learning_rate": 5e-05, "loss": 0.0071, "step": 1720 }, { "epoch": 0.45876425351365685, "grad_norm": 0.016273258253932, "learning_rate": 5e-05, "loss": 0.007, "step": 1730 }, { "epoch": 0.4614160700079554, "grad_norm": 0.00963173620402813, "learning_rate": 5e-05, "loss": 0.0069, "step": 1740 }, { "epoch": 0.46406788650225406, "grad_norm": 0.011312242597341537, "learning_rate": 5e-05, "loss": 0.0068, "step": 1750 }, { "epoch": 0.46671970299655263, "grad_norm": 0.010186634957790375, "learning_rate": 5e-05, "loss": 0.0069, "step": 1760 }, { "epoch": 0.4693715194908512, "grad_norm": 0.01625797525048256, "learning_rate": 5e-05, "loss": 0.0063, "step": 1770 }, { "epoch": 0.47202333598514984, "grad_norm": 0.01162825245410204, "learning_rate": 5e-05, "loss": 0.0064, "step": 1780 }, { "epoch": 0.4746751524794484, "grad_norm": 0.009352140128612518, "learning_rate": 5e-05, "loss": 0.007, "step": 1790 }, { "epoch": 0.477326968973747, "grad_norm": 0.017744529992341995, "learning_rate": 5e-05, "loss": 0.0064, "step": 1800 }, { "epoch": 0.4799787854680456, "grad_norm": 0.01081256102770567, "learning_rate": 5e-05, "loss": 0.0069, "step": 1810 }, { "epoch": 0.4826306019623442, "grad_norm": 0.012644189409911633, "learning_rate": 5e-05, "loss": 0.007, "step": 1820 }, { "epoch": 0.48528241845664283, "grad_norm": 0.010520144365727901, "learning_rate": 5e-05, "loss": 0.0063, "step": 1830 }, { "epoch": 0.4879342349509414, "grad_norm": 0.01324478443711996, "learning_rate": 5e-05, "loss": 0.0069, "step": 1840 }, { "epoch": 0.49058605144524, "grad_norm": 0.010949905030429363, "learning_rate": 5e-05, "loss": 0.0065, "step": 1850 }, { "epoch": 0.4932378679395386, "grad_norm": 0.009622184559702873, "learning_rate": 5e-05, "loss": 0.0068, "step": 1860 }, { "epoch": 0.4958896844338372, "grad_norm": 0.009923068806529045, "learning_rate": 5e-05, "loss": 0.0063, "step": 1870 }, { "epoch": 0.49854150092813576, "grad_norm": 0.012701519764959812, "learning_rate": 5e-05, "loss": 0.0071, "step": 1880 }, { "epoch": 0.5011933174224343, "grad_norm": 0.013168574310839176, "learning_rate": 5e-05, "loss": 0.0067, "step": 1890 }, { "epoch": 0.5038451339167329, "grad_norm": 0.012124096974730492, "learning_rate": 5e-05, "loss": 0.0072, "step": 1900 }, { "epoch": 0.5064969504110316, "grad_norm": 0.013571794144809246, "learning_rate": 5e-05, "loss": 0.0066, "step": 1910 }, { "epoch": 0.5091487669053302, "grad_norm": 0.01265135407447815, "learning_rate": 5e-05, "loss": 0.0073, "step": 1920 }, { "epoch": 0.5118005833996288, "grad_norm": 0.007570344489067793, "learning_rate": 5e-05, "loss": 0.0061, "step": 1930 }, { "epoch": 0.5144523998939273, "grad_norm": 0.011190901510417461, "learning_rate": 5e-05, "loss": 0.007, "step": 1940 }, { "epoch": 0.5171042163882259, "grad_norm": 0.014921837486326694, "learning_rate": 5e-05, "loss": 0.0077, "step": 1950 }, { "epoch": 0.5197560328825245, "grad_norm": 0.010060888715088367, "learning_rate": 5e-05, "loss": 0.0066, "step": 1960 }, { "epoch": 0.5224078493768232, "grad_norm": 0.014729388058185577, "learning_rate": 5e-05, "loss": 0.0058, "step": 1970 }, { "epoch": 0.5250596658711217, "grad_norm": 0.02001059241592884, "learning_rate": 5e-05, "loss": 0.0067, "step": 1980 }, { "epoch": 0.5277114823654203, "grad_norm": 0.00968366488814354, "learning_rate": 5e-05, "loss": 0.0062, "step": 1990 }, { "epoch": 0.5303632988597189, "grad_norm": 0.012237307615578175, "learning_rate": 5e-05, "loss": 0.0056, "step": 2000 }, { "epoch": 0.5330151153540175, "grad_norm": 0.010821251198649406, "learning_rate": 5e-05, "loss": 0.0067, "step": 2010 }, { "epoch": 0.535666931848316, "grad_norm": 0.01793757826089859, "learning_rate": 5e-05, "loss": 0.0064, "step": 2020 }, { "epoch": 0.5383187483426147, "grad_norm": 0.014390253461897373, "learning_rate": 5e-05, "loss": 0.0054, "step": 2030 }, { "epoch": 0.5409705648369133, "grad_norm": 0.013489446602761745, "learning_rate": 5e-05, "loss": 0.0063, "step": 2040 }, { "epoch": 0.5436223813312119, "grad_norm": 0.016089778393507004, "learning_rate": 5e-05, "loss": 0.0069, "step": 2050 }, { "epoch": 0.5462741978255105, "grad_norm": 0.010340374894440174, "learning_rate": 5e-05, "loss": 0.0068, "step": 2060 }, { "epoch": 0.548926014319809, "grad_norm": 0.016863029450178146, "learning_rate": 5e-05, "loss": 0.0065, "step": 2070 }, { "epoch": 0.5515778308141077, "grad_norm": 0.010584230534732342, "learning_rate": 5e-05, "loss": 0.0062, "step": 2080 }, { "epoch": 0.5542296473084063, "grad_norm": 0.013457668013870716, "learning_rate": 5e-05, "loss": 0.0071, "step": 2090 }, { "epoch": 0.5568814638027049, "grad_norm": 0.016352346166968346, "learning_rate": 5e-05, "loss": 0.006, "step": 2100 }, { "epoch": 0.5595332802970034, "grad_norm": 0.012239636853337288, "learning_rate": 5e-05, "loss": 0.0065, "step": 2110 }, { "epoch": 0.562185096791302, "grad_norm": 0.012317162938416004, "learning_rate": 5e-05, "loss": 0.0069, "step": 2120 }, { "epoch": 0.5648369132856006, "grad_norm": 0.007824297063052654, "learning_rate": 5e-05, "loss": 0.0068, "step": 2130 }, { "epoch": 0.5674887297798993, "grad_norm": 0.01059173047542572, "learning_rate": 5e-05, "loss": 0.0064, "step": 2140 }, { "epoch": 0.5701405462741979, "grad_norm": 0.018736060708761215, "learning_rate": 5e-05, "loss": 0.0067, "step": 2150 }, { "epoch": 0.5727923627684964, "grad_norm": 0.01113548967987299, "learning_rate": 5e-05, "loss": 0.0064, "step": 2160 }, { "epoch": 0.575444179262795, "grad_norm": 0.0075713880360126495, "learning_rate": 5e-05, "loss": 0.0067, "step": 2170 }, { "epoch": 0.5780959957570936, "grad_norm": 0.01220528595149517, "learning_rate": 5e-05, "loss": 0.0065, "step": 2180 }, { "epoch": 0.5807478122513922, "grad_norm": 0.012628840282559395, "learning_rate": 5e-05, "loss": 0.0064, "step": 2190 }, { "epoch": 0.5833996287456908, "grad_norm": 0.009552238509058952, "learning_rate": 5e-05, "loss": 0.0064, "step": 2200 }, { "epoch": 0.5860514452399894, "grad_norm": 0.008477460592985153, "learning_rate": 5e-05, "loss": 0.0061, "step": 2210 }, { "epoch": 0.588703261734288, "grad_norm": 0.012596933171153069, "learning_rate": 5e-05, "loss": 0.0065, "step": 2220 }, { "epoch": 0.5913550782285866, "grad_norm": 0.01009803544729948, "learning_rate": 5e-05, "loss": 0.0062, "step": 2230 }, { "epoch": 0.5940068947228851, "grad_norm": 0.011355806142091751, "learning_rate": 5e-05, "loss": 0.0063, "step": 2240 }, { "epoch": 0.5966587112171837, "grad_norm": 0.00824103131890297, "learning_rate": 5e-05, "loss": 0.0057, "step": 2250 }, { "epoch": 0.5993105277114824, "grad_norm": 0.00816698744893074, "learning_rate": 5e-05, "loss": 0.0063, "step": 2260 }, { "epoch": 0.601962344205781, "grad_norm": 0.01775503344833851, "learning_rate": 5e-05, "loss": 0.0055, "step": 2270 }, { "epoch": 0.6046141607000796, "grad_norm": 0.011369360610842705, "learning_rate": 5e-05, "loss": 0.0067, "step": 2280 }, { "epoch": 0.6072659771943781, "grad_norm": 0.014426524750888348, "learning_rate": 5e-05, "loss": 0.0058, "step": 2290 }, { "epoch": 0.6099177936886767, "grad_norm": 0.012124336324632168, "learning_rate": 5e-05, "loss": 0.0068, "step": 2300 }, { "epoch": 0.6125696101829753, "grad_norm": 0.006931654643267393, "learning_rate": 5e-05, "loss": 0.0066, "step": 2310 }, { "epoch": 0.615221426677274, "grad_norm": 0.013428520411252975, "learning_rate": 5e-05, "loss": 0.006, "step": 2320 }, { "epoch": 0.6178732431715726, "grad_norm": 0.009586717002093792, "learning_rate": 5e-05, "loss": 0.0067, "step": 2330 }, { "epoch": 0.6205250596658711, "grad_norm": 0.015284504741430283, "learning_rate": 5e-05, "loss": 0.0056, "step": 2340 }, { "epoch": 0.6231768761601697, "grad_norm": 0.010159710422158241, "learning_rate": 5e-05, "loss": 0.0052, "step": 2350 }, { "epoch": 0.6258286926544683, "grad_norm": 0.014477217569947243, "learning_rate": 5e-05, "loss": 0.0051, "step": 2360 }, { "epoch": 0.6284805091487669, "grad_norm": 0.011998296715319157, "learning_rate": 5e-05, "loss": 0.0061, "step": 2370 }, { "epoch": 0.6311323256430655, "grad_norm": 0.012489769607782364, "learning_rate": 5e-05, "loss": 0.0063, "step": 2380 }, { "epoch": 0.6337841421373641, "grad_norm": 0.012630645185709, "learning_rate": 5e-05, "loss": 0.006, "step": 2390 }, { "epoch": 0.6364359586316627, "grad_norm": 0.013784768991172314, "learning_rate": 5e-05, "loss": 0.0062, "step": 2400 }, { "epoch": 0.6390877751259613, "grad_norm": 0.011258543469011784, "learning_rate": 5e-05, "loss": 0.006, "step": 2410 }, { "epoch": 0.6417395916202598, "grad_norm": 0.015645228326320648, "learning_rate": 5e-05, "loss": 0.0061, "step": 2420 }, { "epoch": 0.6443914081145584, "grad_norm": 0.009739172644913197, "learning_rate": 5e-05, "loss": 0.0055, "step": 2430 }, { "epoch": 0.6470432246088571, "grad_norm": 0.009579052217304707, "learning_rate": 5e-05, "loss": 0.0061, "step": 2440 }, { "epoch": 0.6496950411031557, "grad_norm": 0.01509283296763897, "learning_rate": 5e-05, "loss": 0.0064, "step": 2450 }, { "epoch": 0.6523468575974543, "grad_norm": 0.011856339871883392, "learning_rate": 5e-05, "loss": 0.0068, "step": 2460 }, { "epoch": 0.6549986740917528, "grad_norm": 0.011422081850469112, "learning_rate": 5e-05, "loss": 0.0061, "step": 2470 }, { "epoch": 0.6576504905860514, "grad_norm": 0.009676282294094563, "learning_rate": 5e-05, "loss": 0.0066, "step": 2480 }, { "epoch": 0.6603023070803501, "grad_norm": 0.017975283786654472, "learning_rate": 5e-05, "loss": 0.0065, "step": 2490 }, { "epoch": 0.6629541235746487, "grad_norm": 0.013262840919196606, "learning_rate": 5e-05, "loss": 0.0058, "step": 2500 }, { "epoch": 0.6656059400689472, "grad_norm": 0.011979634873569012, "learning_rate": 5e-05, "loss": 0.0065, "step": 2510 }, { "epoch": 0.6682577565632458, "grad_norm": 0.009137840941548347, "learning_rate": 5e-05, "loss": 0.0068, "step": 2520 }, { "epoch": 0.6709095730575444, "grad_norm": 0.015563786961138248, "learning_rate": 5e-05, "loss": 0.0064, "step": 2530 }, { "epoch": 0.673561389551843, "grad_norm": 0.009518592618405819, "learning_rate": 5e-05, "loss": 0.0057, "step": 2540 }, { "epoch": 0.6762132060461417, "grad_norm": 0.010752581991255283, "learning_rate": 5e-05, "loss": 0.0062, "step": 2550 }, { "epoch": 0.6788650225404402, "grad_norm": 0.012611442245543003, "learning_rate": 5e-05, "loss": 0.0058, "step": 2560 }, { "epoch": 0.6815168390347388, "grad_norm": 0.011413481086492538, "learning_rate": 5e-05, "loss": 0.0066, "step": 2570 }, { "epoch": 0.6841686555290374, "grad_norm": 0.010332215577363968, "learning_rate": 5e-05, "loss": 0.0058, "step": 2580 }, { "epoch": 0.686820472023336, "grad_norm": 0.008225159719586372, "learning_rate": 5e-05, "loss": 0.0051, "step": 2590 }, { "epoch": 0.6894722885176345, "grad_norm": 0.008972653187811375, "learning_rate": 5e-05, "loss": 0.0054, "step": 2600 }, { "epoch": 0.6921241050119332, "grad_norm": 0.017176972702145576, "learning_rate": 5e-05, "loss": 0.0054, "step": 2610 }, { "epoch": 0.6947759215062318, "grad_norm": 0.014187118038535118, "learning_rate": 5e-05, "loss": 0.006, "step": 2620 }, { "epoch": 0.6974277380005304, "grad_norm": 0.010718374513089657, "learning_rate": 5e-05, "loss": 0.0063, "step": 2630 }, { "epoch": 0.700079554494829, "grad_norm": 0.01777811534702778, "learning_rate": 5e-05, "loss": 0.0061, "step": 2640 }, { "epoch": 0.7027313709891275, "grad_norm": 0.007840079255402088, "learning_rate": 5e-05, "loss": 0.006, "step": 2650 }, { "epoch": 0.7053831874834261, "grad_norm": 0.012339567765593529, "learning_rate": 5e-05, "loss": 0.0055, "step": 2660 }, { "epoch": 0.7080350039777248, "grad_norm": 0.01373356580734253, "learning_rate": 5e-05, "loss": 0.0061, "step": 2670 }, { "epoch": 0.7106868204720234, "grad_norm": 0.012406235560774803, "learning_rate": 5e-05, "loss": 0.0056, "step": 2680 }, { "epoch": 0.7133386369663219, "grad_norm": 0.013570855371654034, "learning_rate": 5e-05, "loss": 0.006, "step": 2690 }, { "epoch": 0.7159904534606205, "grad_norm": 0.013746225275099277, "learning_rate": 5e-05, "loss": 0.0061, "step": 2700 }, { "epoch": 0.7186422699549191, "grad_norm": 0.014308486133813858, "learning_rate": 5e-05, "loss": 0.0064, "step": 2710 }, { "epoch": 0.7212940864492177, "grad_norm": 0.014248129911720753, "learning_rate": 5e-05, "loss": 0.0058, "step": 2720 }, { "epoch": 0.7239459029435164, "grad_norm": 0.010117997415363789, "learning_rate": 5e-05, "loss": 0.0067, "step": 2730 }, { "epoch": 0.7265977194378149, "grad_norm": 0.011920841410756111, "learning_rate": 5e-05, "loss": 0.0056, "step": 2740 }, { "epoch": 0.7292495359321135, "grad_norm": 0.017600057646632195, "learning_rate": 5e-05, "loss": 0.0055, "step": 2750 }, { "epoch": 0.7319013524264121, "grad_norm": 0.012431267648935318, "learning_rate": 5e-05, "loss": 0.0058, "step": 2760 }, { "epoch": 0.7345531689207107, "grad_norm": 0.010622302070260048, "learning_rate": 5e-05, "loss": 0.006, "step": 2770 }, { "epoch": 0.7372049854150092, "grad_norm": 0.011106299236416817, "learning_rate": 5e-05, "loss": 0.006, "step": 2780 }, { "epoch": 0.7398568019093079, "grad_norm": 0.01017810683697462, "learning_rate": 5e-05, "loss": 0.0056, "step": 2790 }, { "epoch": 0.7425086184036065, "grad_norm": 0.009219973348081112, "learning_rate": 5e-05, "loss": 0.0057, "step": 2800 }, { "epoch": 0.7451604348979051, "grad_norm": 0.010090688243508339, "learning_rate": 5e-05, "loss": 0.0064, "step": 2810 }, { "epoch": 0.7478122513922036, "grad_norm": 0.012486668303608894, "learning_rate": 5e-05, "loss": 0.006, "step": 2820 }, { "epoch": 0.7504640678865022, "grad_norm": 0.010908860713243484, "learning_rate": 5e-05, "loss": 0.0054, "step": 2830 }, { "epoch": 0.7531158843808009, "grad_norm": 0.010980230756103992, "learning_rate": 5e-05, "loss": 0.0067, "step": 2840 }, { "epoch": 0.7557677008750995, "grad_norm": 0.010295581072568893, "learning_rate": 5e-05, "loss": 0.0058, "step": 2850 }, { "epoch": 0.758419517369398, "grad_norm": 0.006702885031700134, "learning_rate": 5e-05, "loss": 0.0057, "step": 2860 }, { "epoch": 0.7610713338636966, "grad_norm": 0.010913851670920849, "learning_rate": 5e-05, "loss": 0.0063, "step": 2870 }, { "epoch": 0.7637231503579952, "grad_norm": 0.014394965954124928, "learning_rate": 5e-05, "loss": 0.0057, "step": 2880 }, { "epoch": 0.7663749668522938, "grad_norm": 0.007123112678527832, "learning_rate": 5e-05, "loss": 0.0055, "step": 2890 }, { "epoch": 0.7690267833465925, "grad_norm": 0.015240196138620377, "learning_rate": 5e-05, "loss": 0.0063, "step": 2900 }, { "epoch": 0.771678599840891, "grad_norm": 0.007446739822626114, "learning_rate": 5e-05, "loss": 0.0064, "step": 2910 }, { "epoch": 0.7743304163351896, "grad_norm": 0.009338028728961945, "learning_rate": 5e-05, "loss": 0.0056, "step": 2920 }, { "epoch": 0.7769822328294882, "grad_norm": 0.012741082347929478, "learning_rate": 5e-05, "loss": 0.0056, "step": 2930 }, { "epoch": 0.7796340493237868, "grad_norm": 0.010148140601813793, "learning_rate": 5e-05, "loss": 0.0057, "step": 2940 }, { "epoch": 0.7822858658180853, "grad_norm": 0.00822310708463192, "learning_rate": 5e-05, "loss": 0.0054, "step": 2950 }, { "epoch": 0.784937682312384, "grad_norm": 0.014910020865499973, "learning_rate": 5e-05, "loss": 0.0051, "step": 2960 }, { "epoch": 0.7875894988066826, "grad_norm": 0.01279065664857626, "learning_rate": 5e-05, "loss": 0.0061, "step": 2970 }, { "epoch": 0.7902413153009812, "grad_norm": 0.00997047871351242, "learning_rate": 5e-05, "loss": 0.0055, "step": 2980 }, { "epoch": 0.7928931317952798, "grad_norm": 0.010397320613265038, "learning_rate": 5e-05, "loss": 0.0065, "step": 2990 }, { "epoch": 0.7955449482895783, "grad_norm": 0.009744018316268921, "learning_rate": 5e-05, "loss": 0.006, "step": 3000 }, { "epoch": 0.7981967647838769, "grad_norm": 0.012422683648765087, "learning_rate": 5e-05, "loss": 0.0056, "step": 3010 }, { "epoch": 0.8008485812781756, "grad_norm": 0.008513445034623146, "learning_rate": 5e-05, "loss": 0.005, "step": 3020 }, { "epoch": 0.8035003977724742, "grad_norm": 0.01307771448045969, "learning_rate": 5e-05, "loss": 0.0056, "step": 3030 }, { "epoch": 0.8061522142667727, "grad_norm": 0.01529394555836916, "learning_rate": 5e-05, "loss": 0.0057, "step": 3040 }, { "epoch": 0.8088040307610713, "grad_norm": 0.01072612963616848, "learning_rate": 5e-05, "loss": 0.0053, "step": 3050 }, { "epoch": 0.8114558472553699, "grad_norm": 0.012130209244787693, "learning_rate": 5e-05, "loss": 0.0052, "step": 3060 }, { "epoch": 0.8141076637496685, "grad_norm": 0.010140764527022839, "learning_rate": 5e-05, "loss": 0.0063, "step": 3070 }, { "epoch": 0.8167594802439672, "grad_norm": 0.008750601671636105, "learning_rate": 5e-05, "loss": 0.0052, "step": 3080 }, { "epoch": 0.8194112967382657, "grad_norm": 0.00959628727287054, "learning_rate": 5e-05, "loss": 0.0051, "step": 3090 }, { "epoch": 0.8220631132325643, "grad_norm": 0.00768613163381815, "learning_rate": 5e-05, "loss": 0.0049, "step": 3100 }, { "epoch": 0.8247149297268629, "grad_norm": 0.012720347382128239, "learning_rate": 5e-05, "loss": 0.0056, "step": 3110 }, { "epoch": 0.8273667462211615, "grad_norm": 0.011753101833164692, "learning_rate": 5e-05, "loss": 0.0051, "step": 3120 }, { "epoch": 0.83001856271546, "grad_norm": 0.010131241753697395, "learning_rate": 5e-05, "loss": 0.0054, "step": 3130 }, { "epoch": 0.8326703792097587, "grad_norm": 0.015864761546254158, "learning_rate": 5e-05, "loss": 0.0059, "step": 3140 }, { "epoch": 0.8353221957040573, "grad_norm": 0.00866832584142685, "learning_rate": 5e-05, "loss": 0.0058, "step": 3150 }, { "epoch": 0.8379740121983559, "grad_norm": 0.013319578021764755, "learning_rate": 5e-05, "loss": 0.0061, "step": 3160 }, { "epoch": 0.8406258286926545, "grad_norm": 0.014210108667612076, "learning_rate": 5e-05, "loss": 0.0062, "step": 3170 }, { "epoch": 0.843277645186953, "grad_norm": 0.007536872755736113, "learning_rate": 5e-05, "loss": 0.0056, "step": 3180 }, { "epoch": 0.8459294616812516, "grad_norm": 0.010182585567235947, "learning_rate": 5e-05, "loss": 0.0061, "step": 3190 }, { "epoch": 0.8485812781755503, "grad_norm": 0.010220678523182869, "learning_rate": 5e-05, "loss": 0.0054, "step": 3200 }, { "epoch": 0.8512330946698489, "grad_norm": 0.015781976282596588, "learning_rate": 5e-05, "loss": 0.0053, "step": 3210 }, { "epoch": 0.8538849111641474, "grad_norm": 0.020468879491090775, "learning_rate": 5e-05, "loss": 0.0048, "step": 3220 }, { "epoch": 0.856536727658446, "grad_norm": 0.011455283500254154, "learning_rate": 5e-05, "loss": 0.0053, "step": 3230 }, { "epoch": 0.8591885441527446, "grad_norm": 0.01053438987582922, "learning_rate": 5e-05, "loss": 0.0048, "step": 3240 }, { "epoch": 0.8618403606470433, "grad_norm": 0.019055433571338654, "learning_rate": 5e-05, "loss": 0.0051, "step": 3250 }, { "epoch": 0.8644921771413419, "grad_norm": 0.009914879687130451, "learning_rate": 5e-05, "loss": 0.0055, "step": 3260 }, { "epoch": 0.8671439936356404, "grad_norm": 0.014600240625441074, "learning_rate": 5e-05, "loss": 0.006, "step": 3270 }, { "epoch": 0.869795810129939, "grad_norm": 0.009259520098567009, "learning_rate": 5e-05, "loss": 0.0056, "step": 3280 }, { "epoch": 0.8724476266242376, "grad_norm": 0.0117458151653409, "learning_rate": 5e-05, "loss": 0.0056, "step": 3290 }, { "epoch": 0.8750994431185362, "grad_norm": 0.008083544671535492, "learning_rate": 5e-05, "loss": 0.0054, "step": 3300 }, { "epoch": 0.8777512596128348, "grad_norm": 0.0181112103164196, "learning_rate": 5e-05, "loss": 0.0051, "step": 3310 }, { "epoch": 0.8804030761071334, "grad_norm": 0.012190030887722969, "learning_rate": 5e-05, "loss": 0.0057, "step": 3320 }, { "epoch": 0.883054892601432, "grad_norm": 0.010855783708393574, "learning_rate": 5e-05, "loss": 0.0056, "step": 3330 }, { "epoch": 0.8857067090957306, "grad_norm": 0.008543924428522587, "learning_rate": 5e-05, "loss": 0.0049, "step": 3340 }, { "epoch": 0.8883585255900291, "grad_norm": 0.011487052775919437, "learning_rate": 5e-05, "loss": 0.0052, "step": 3350 }, { "epoch": 0.8910103420843277, "grad_norm": 0.014448055997490883, "learning_rate": 5e-05, "loss": 0.0062, "step": 3360 }, { "epoch": 0.8936621585786264, "grad_norm": 0.010259400121867657, "learning_rate": 5e-05, "loss": 0.0059, "step": 3370 }, { "epoch": 0.896313975072925, "grad_norm": 0.008330618031322956, "learning_rate": 5e-05, "loss": 0.005, "step": 3380 }, { "epoch": 0.8989657915672236, "grad_norm": 0.010353799909353256, "learning_rate": 5e-05, "loss": 0.0054, "step": 3390 }, { "epoch": 0.9016176080615221, "grad_norm": 0.010005519725382328, "learning_rate": 5e-05, "loss": 0.0049, "step": 3400 }, { "epoch": 0.9042694245558207, "grad_norm": 0.008485163561999798, "learning_rate": 5e-05, "loss": 0.0054, "step": 3410 }, { "epoch": 0.9069212410501193, "grad_norm": 0.01682075671851635, "learning_rate": 5e-05, "loss": 0.0056, "step": 3420 }, { "epoch": 0.909573057544418, "grad_norm": 0.013550261035561562, "learning_rate": 5e-05, "loss": 0.0052, "step": 3430 }, { "epoch": 0.9122248740387165, "grad_norm": 0.01084277592599392, "learning_rate": 5e-05, "loss": 0.0055, "step": 3440 }, { "epoch": 0.9148766905330151, "grad_norm": 0.015856586396694183, "learning_rate": 5e-05, "loss": 0.0061, "step": 3450 }, { "epoch": 0.9175285070273137, "grad_norm": 0.01264201384037733, "learning_rate": 5e-05, "loss": 0.005, "step": 3460 }, { "epoch": 0.9201803235216123, "grad_norm": 0.013790092431008816, "learning_rate": 5e-05, "loss": 0.0054, "step": 3470 }, { "epoch": 0.9228321400159109, "grad_norm": 0.010162296704947948, "learning_rate": 5e-05, "loss": 0.0051, "step": 3480 }, { "epoch": 0.9254839565102095, "grad_norm": 0.009699148125946522, "learning_rate": 5e-05, "loss": 0.0054, "step": 3490 }, { "epoch": 0.9281357730045081, "grad_norm": 0.011140543967485428, "learning_rate": 5e-05, "loss": 0.0054, "step": 3500 }, { "epoch": 0.9307875894988067, "grad_norm": 0.013018625788390636, "learning_rate": 5e-05, "loss": 0.0053, "step": 3510 }, { "epoch": 0.9334394059931053, "grad_norm": 0.01986619457602501, "learning_rate": 5e-05, "loss": 0.0054, "step": 3520 }, { "epoch": 0.9360912224874038, "grad_norm": 0.013184898532927036, "learning_rate": 5e-05, "loss": 0.0056, "step": 3530 }, { "epoch": 0.9387430389817024, "grad_norm": 0.01006197277456522, "learning_rate": 5e-05, "loss": 0.0055, "step": 3540 }, { "epoch": 0.9413948554760011, "grad_norm": 0.01527655404061079, "learning_rate": 5e-05, "loss": 0.006, "step": 3550 }, { "epoch": 0.9440466719702997, "grad_norm": 0.01600155420601368, "learning_rate": 5e-05, "loss": 0.0049, "step": 3560 }, { "epoch": 0.9466984884645983, "grad_norm": 0.0091208191588521, "learning_rate": 5e-05, "loss": 0.0053, "step": 3570 }, { "epoch": 0.9493503049588968, "grad_norm": 0.009641831740736961, "learning_rate": 5e-05, "loss": 0.0045, "step": 3580 }, { "epoch": 0.9520021214531954, "grad_norm": 0.012004656717181206, "learning_rate": 5e-05, "loss": 0.0047, "step": 3590 }, { "epoch": 0.954653937947494, "grad_norm": 0.017074277624487877, "learning_rate": 5e-05, "loss": 0.0057, "step": 3600 }, { "epoch": 0.9573057544417927, "grad_norm": 0.011384370736777782, "learning_rate": 5e-05, "loss": 0.0055, "step": 3610 }, { "epoch": 0.9599575709360912, "grad_norm": 0.01091021578758955, "learning_rate": 5e-05, "loss": 0.0051, "step": 3620 }, { "epoch": 0.9626093874303898, "grad_norm": 0.006411446724087, "learning_rate": 5e-05, "loss": 0.0046, "step": 3630 }, { "epoch": 0.9652612039246884, "grad_norm": 0.011761453002691269, "learning_rate": 5e-05, "loss": 0.0057, "step": 3640 }, { "epoch": 0.967913020418987, "grad_norm": 0.008403790183365345, "learning_rate": 5e-05, "loss": 0.0054, "step": 3650 }, { "epoch": 0.9705648369132857, "grad_norm": 0.011681593023240566, "learning_rate": 5e-05, "loss": 0.0056, "step": 3660 }, { "epoch": 0.9732166534075842, "grad_norm": 0.013260429725050926, "learning_rate": 5e-05, "loss": 0.0056, "step": 3670 }, { "epoch": 0.9758684699018828, "grad_norm": 0.012290672399103642, "learning_rate": 5e-05, "loss": 0.0058, "step": 3680 }, { "epoch": 0.9785202863961814, "grad_norm": 0.010819220915436745, "learning_rate": 5e-05, "loss": 0.0053, "step": 3690 }, { "epoch": 0.98117210289048, "grad_norm": 0.015099594369530678, "learning_rate": 5e-05, "loss": 0.0055, "step": 3700 }, { "epoch": 0.9838239193847785, "grad_norm": 0.012548113241791725, "learning_rate": 5e-05, "loss": 0.0046, "step": 3710 }, { "epoch": 0.9864757358790772, "grad_norm": 0.017624972388148308, "learning_rate": 5e-05, "loss": 0.0062, "step": 3720 }, { "epoch": 0.9891275523733758, "grad_norm": 0.01035949308425188, "learning_rate": 5e-05, "loss": 0.0064, "step": 3730 }, { "epoch": 0.9917793688676744, "grad_norm": 0.009944017976522446, "learning_rate": 5e-05, "loss": 0.0051, "step": 3740 }, { "epoch": 0.994431185361973, "grad_norm": 0.010977090336382389, "learning_rate": 5e-05, "loss": 0.0051, "step": 3750 }, { "epoch": 0.9970830018562715, "grad_norm": 0.010577734559774399, "learning_rate": 5e-05, "loss": 0.0053, "step": 3760 }, { "epoch": 0.9997348183505701, "grad_norm": 0.011321503669023514, "learning_rate": 5e-05, "loss": 0.0055, "step": 3770 }, { "epoch": 1.0023866348448687, "grad_norm": 0.012126474641263485, "learning_rate": 5e-05, "loss": 0.0051, "step": 3780 }, { "epoch": 1.0050384513391672, "grad_norm": 0.009338350035250187, "learning_rate": 5e-05, "loss": 0.0052, "step": 3790 }, { "epoch": 1.0076902678334658, "grad_norm": 0.01445938739925623, "learning_rate": 5e-05, "loss": 0.0049, "step": 3800 }, { "epoch": 1.0103420843277646, "grad_norm": 0.010727426037192345, "learning_rate": 5e-05, "loss": 0.0055, "step": 3810 }, { "epoch": 1.0129939008220632, "grad_norm": 0.007477167993783951, "learning_rate": 5e-05, "loss": 0.0054, "step": 3820 }, { "epoch": 1.0156457173163618, "grad_norm": 0.009393922053277493, "learning_rate": 5e-05, "loss": 0.005, "step": 3830 }, { "epoch": 1.0182975338106603, "grad_norm": 0.013007636182010174, "learning_rate": 5e-05, "loss": 0.005, "step": 3840 }, { "epoch": 1.020949350304959, "grad_norm": 0.009218838065862656, "learning_rate": 5e-05, "loss": 0.0051, "step": 3850 }, { "epoch": 1.0236011667992575, "grad_norm": 0.008823463693261147, "learning_rate": 5e-05, "loss": 0.0054, "step": 3860 }, { "epoch": 1.026252983293556, "grad_norm": 0.009299694560468197, "learning_rate": 5e-05, "loss": 0.0058, "step": 3870 }, { "epoch": 1.0289047997878547, "grad_norm": 0.009518041275441647, "learning_rate": 5e-05, "loss": 0.0046, "step": 3880 }, { "epoch": 1.0315566162821532, "grad_norm": 0.013241413980722427, "learning_rate": 5e-05, "loss": 0.0049, "step": 3890 }, { "epoch": 1.0342084327764518, "grad_norm": 0.0133131742477417, "learning_rate": 5e-05, "loss": 0.0049, "step": 3900 }, { "epoch": 1.0368602492707504, "grad_norm": 0.009627291932702065, "learning_rate": 5e-05, "loss": 0.0052, "step": 3910 }, { "epoch": 1.039512065765049, "grad_norm": 0.010804231278598309, "learning_rate": 5e-05, "loss": 0.0048, "step": 3920 }, { "epoch": 1.0421638822593478, "grad_norm": 0.009999709203839302, "learning_rate": 5e-05, "loss": 0.0043, "step": 3930 }, { "epoch": 1.0448156987536463, "grad_norm": 0.011529280804097652, "learning_rate": 5e-05, "loss": 0.0046, "step": 3940 }, { "epoch": 1.047467515247945, "grad_norm": 0.009822256863117218, "learning_rate": 5e-05, "loss": 0.0057, "step": 3950 }, { "epoch": 1.0501193317422435, "grad_norm": 0.009788265451788902, "learning_rate": 5e-05, "loss": 0.0051, "step": 3960 }, { "epoch": 1.052771148236542, "grad_norm": 0.010074404999613762, "learning_rate": 5e-05, "loss": 0.005, "step": 3970 }, { "epoch": 1.0554229647308406, "grad_norm": 0.010969365946948528, "learning_rate": 5e-05, "loss": 0.0048, "step": 3980 }, { "epoch": 1.0580747812251392, "grad_norm": 0.013874899595975876, "learning_rate": 5e-05, "loss": 0.0048, "step": 3990 }, { "epoch": 1.0607265977194378, "grad_norm": 0.007955843582749367, "learning_rate": 5e-05, "loss": 0.0051, "step": 4000 }, { "epoch": 1.0633784142137364, "grad_norm": 0.00910469051450491, "learning_rate": 5e-05, "loss": 0.0049, "step": 4010 }, { "epoch": 1.066030230708035, "grad_norm": 0.011983786709606647, "learning_rate": 5e-05, "loss": 0.005, "step": 4020 }, { "epoch": 1.0686820472023335, "grad_norm": 0.011548668146133423, "learning_rate": 5e-05, "loss": 0.005, "step": 4030 }, { "epoch": 1.071333863696632, "grad_norm": 0.010220183059573174, "learning_rate": 5e-05, "loss": 0.0044, "step": 4040 }, { "epoch": 1.0739856801909309, "grad_norm": 0.008708702400326729, "learning_rate": 5e-05, "loss": 0.0053, "step": 4050 }, { "epoch": 1.0766374966852295, "grad_norm": 0.010379524901509285, "learning_rate": 5e-05, "loss": 0.0048, "step": 4060 }, { "epoch": 1.079289313179528, "grad_norm": 0.013857336714863777, "learning_rate": 5e-05, "loss": 0.0049, "step": 4070 }, { "epoch": 1.0819411296738266, "grad_norm": 0.010432504117488861, "learning_rate": 5e-05, "loss": 0.0052, "step": 4080 }, { "epoch": 1.0845929461681252, "grad_norm": 0.010059886611998081, "learning_rate": 5e-05, "loss": 0.0046, "step": 4090 }, { "epoch": 1.0872447626624238, "grad_norm": 0.016598954796791077, "learning_rate": 5e-05, "loss": 0.0052, "step": 4100 }, { "epoch": 1.0898965791567223, "grad_norm": 0.009067121893167496, "learning_rate": 5e-05, "loss": 0.0047, "step": 4110 }, { "epoch": 1.092548395651021, "grad_norm": 0.011461110785603523, "learning_rate": 5e-05, "loss": 0.0045, "step": 4120 }, { "epoch": 1.0952002121453195, "grad_norm": 0.010566857643425465, "learning_rate": 5e-05, "loss": 0.0049, "step": 4130 }, { "epoch": 1.097852028639618, "grad_norm": 0.0187949538230896, "learning_rate": 5e-05, "loss": 0.0051, "step": 4140 }, { "epoch": 1.1005038451339166, "grad_norm": 0.008556387387216091, "learning_rate": 5e-05, "loss": 0.0046, "step": 4150 }, { "epoch": 1.1031556616282154, "grad_norm": 0.010631304234266281, "learning_rate": 5e-05, "loss": 0.0044, "step": 4160 }, { "epoch": 1.105807478122514, "grad_norm": 0.012137765996158123, "learning_rate": 5e-05, "loss": 0.0044, "step": 4170 }, { "epoch": 1.1084592946168126, "grad_norm": 0.028013162314891815, "learning_rate": 5e-05, "loss": 0.0059, "step": 4180 }, { "epoch": 1.1111111111111112, "grad_norm": 0.010126393288373947, "learning_rate": 5e-05, "loss": 0.0045, "step": 4190 }, { "epoch": 1.1137629276054097, "grad_norm": 0.00909582432359457, "learning_rate": 5e-05, "loss": 0.0055, "step": 4200 }, { "epoch": 1.1164147440997083, "grad_norm": 0.00762931490316987, "learning_rate": 5e-05, "loss": 0.0052, "step": 4210 }, { "epoch": 1.1190665605940069, "grad_norm": 0.012371683493256569, "learning_rate": 5e-05, "loss": 0.0048, "step": 4220 }, { "epoch": 1.1217183770883055, "grad_norm": 0.013934116810560226, "learning_rate": 5e-05, "loss": 0.0054, "step": 4230 }, { "epoch": 1.124370193582604, "grad_norm": 0.014141558669507504, "learning_rate": 5e-05, "loss": 0.0049, "step": 4240 }, { "epoch": 1.1270220100769026, "grad_norm": 0.011837013997137547, "learning_rate": 5e-05, "loss": 0.0044, "step": 4250 }, { "epoch": 1.1296738265712012, "grad_norm": 0.012324878945946693, "learning_rate": 5e-05, "loss": 0.0053, "step": 4260 }, { "epoch": 1.1323256430654998, "grad_norm": 0.015769479796290398, "learning_rate": 5e-05, "loss": 0.0052, "step": 4270 }, { "epoch": 1.1349774595597983, "grad_norm": 0.018003089353442192, "learning_rate": 5e-05, "loss": 0.0043, "step": 4280 }, { "epoch": 1.1376292760540971, "grad_norm": 0.009816902689635754, "learning_rate": 5e-05, "loss": 0.0056, "step": 4290 }, { "epoch": 1.1402810925483957, "grad_norm": 0.011095925234258175, "learning_rate": 5e-05, "loss": 0.0043, "step": 4300 }, { "epoch": 1.1429329090426943, "grad_norm": 0.010123059153556824, "learning_rate": 5e-05, "loss": 0.0057, "step": 4310 }, { "epoch": 1.1455847255369929, "grad_norm": 0.009551813825964928, "learning_rate": 5e-05, "loss": 0.0047, "step": 4320 }, { "epoch": 1.1482365420312914, "grad_norm": 0.009234924800693989, "learning_rate": 5e-05, "loss": 0.005, "step": 4330 }, { "epoch": 1.15088835852559, "grad_norm": 0.010417346842586994, "learning_rate": 5e-05, "loss": 0.0045, "step": 4340 }, { "epoch": 1.1535401750198886, "grad_norm": 0.013145706616342068, "learning_rate": 5e-05, "loss": 0.0043, "step": 4350 }, { "epoch": 1.1561919915141872, "grad_norm": 0.015307405032217503, "learning_rate": 5e-05, "loss": 0.0051, "step": 4360 }, { "epoch": 1.1588438080084857, "grad_norm": 0.007395805791020393, "learning_rate": 5e-05, "loss": 0.0043, "step": 4370 }, { "epoch": 1.1614956245027843, "grad_norm": 0.011004475876688957, "learning_rate": 5e-05, "loss": 0.0041, "step": 4380 }, { "epoch": 1.1641474409970831, "grad_norm": 0.019597826525568962, "learning_rate": 5e-05, "loss": 0.0049, "step": 4390 }, { "epoch": 1.1667992574913817, "grad_norm": 0.009440355002880096, "learning_rate": 5e-05, "loss": 0.0041, "step": 4400 }, { "epoch": 1.1694510739856803, "grad_norm": 0.007653518579900265, "learning_rate": 5e-05, "loss": 0.0056, "step": 4410 }, { "epoch": 1.1721028904799788, "grad_norm": 0.008695983327925205, "learning_rate": 5e-05, "loss": 0.0045, "step": 4420 }, { "epoch": 1.1747547069742774, "grad_norm": 0.011163333430886269, "learning_rate": 5e-05, "loss": 0.0054, "step": 4430 }, { "epoch": 1.177406523468576, "grad_norm": 0.010592016391456127, "learning_rate": 5e-05, "loss": 0.0056, "step": 4440 }, { "epoch": 1.1800583399628746, "grad_norm": 0.01493637915700674, "learning_rate": 5e-05, "loss": 0.0055, "step": 4450 }, { "epoch": 1.1827101564571731, "grad_norm": 0.008601345121860504, "learning_rate": 5e-05, "loss": 0.0042, "step": 4460 }, { "epoch": 1.1853619729514717, "grad_norm": 0.006805775221437216, "learning_rate": 5e-05, "loss": 0.0045, "step": 4470 }, { "epoch": 1.1880137894457703, "grad_norm": 0.008687603287398815, "learning_rate": 5e-05, "loss": 0.0044, "step": 4480 }, { "epoch": 1.1906656059400689, "grad_norm": 0.011491890996694565, "learning_rate": 5e-05, "loss": 0.0048, "step": 4490 }, { "epoch": 1.1933174224343674, "grad_norm": 0.014226673170924187, "learning_rate": 5e-05, "loss": 0.0045, "step": 4500 }, { "epoch": 1.195969238928666, "grad_norm": 0.014927077107131481, "learning_rate": 5e-05, "loss": 0.0058, "step": 4510 }, { "epoch": 1.1986210554229648, "grad_norm": 0.005617219023406506, "learning_rate": 5e-05, "loss": 0.0048, "step": 4520 }, { "epoch": 1.2012728719172634, "grad_norm": 0.010061756707727909, "learning_rate": 5e-05, "loss": 0.0046, "step": 4530 }, { "epoch": 1.203924688411562, "grad_norm": 0.014795041643083096, "learning_rate": 5e-05, "loss": 0.0048, "step": 4540 }, { "epoch": 1.2065765049058605, "grad_norm": 0.010011537931859493, "learning_rate": 5e-05, "loss": 0.0042, "step": 4550 }, { "epoch": 1.2092283214001591, "grad_norm": 0.013642709702253342, "learning_rate": 5e-05, "loss": 0.0046, "step": 4560 }, { "epoch": 1.2118801378944577, "grad_norm": 0.014879580587148666, "learning_rate": 5e-05, "loss": 0.0049, "step": 4570 }, { "epoch": 1.2145319543887563, "grad_norm": 0.011336344294250011, "learning_rate": 5e-05, "loss": 0.0046, "step": 4580 }, { "epoch": 1.2171837708830548, "grad_norm": 0.017873499542474747, "learning_rate": 5e-05, "loss": 0.0047, "step": 4590 }, { "epoch": 1.2198355873773534, "grad_norm": 0.009109721519052982, "learning_rate": 5e-05, "loss": 0.005, "step": 4600 }, { "epoch": 1.222487403871652, "grad_norm": 0.009435832500457764, "learning_rate": 5e-05, "loss": 0.0044, "step": 4610 }, { "epoch": 1.2251392203659508, "grad_norm": 0.013777959160506725, "learning_rate": 5e-05, "loss": 0.0048, "step": 4620 }, { "epoch": 1.2277910368602494, "grad_norm": 0.011369801126420498, "learning_rate": 5e-05, "loss": 0.004, "step": 4630 }, { "epoch": 1.230442853354548, "grad_norm": 0.013237404637038708, "learning_rate": 5e-05, "loss": 0.0038, "step": 4640 }, { "epoch": 1.2330946698488465, "grad_norm": 0.011461567133665085, "learning_rate": 5e-05, "loss": 0.0049, "step": 4650 }, { "epoch": 1.235746486343145, "grad_norm": 0.009827972389757633, "learning_rate": 5e-05, "loss": 0.0039, "step": 4660 }, { "epoch": 1.2383983028374437, "grad_norm": 0.008755287155508995, "learning_rate": 5e-05, "loss": 0.0055, "step": 4670 }, { "epoch": 1.2410501193317423, "grad_norm": 0.008098709397017956, "learning_rate": 5e-05, "loss": 0.004, "step": 4680 }, { "epoch": 1.2437019358260408, "grad_norm": 0.010351942852139473, "learning_rate": 5e-05, "loss": 0.005, "step": 4690 }, { "epoch": 1.2463537523203394, "grad_norm": 0.013087980449199677, "learning_rate": 5e-05, "loss": 0.004, "step": 4700 }, { "epoch": 1.249005568814638, "grad_norm": 0.018932638689875603, "learning_rate": 5e-05, "loss": 0.0043, "step": 4710 }, { "epoch": 1.2516573853089366, "grad_norm": 0.013286925852298737, "learning_rate": 5e-05, "loss": 0.004, "step": 4720 }, { "epoch": 1.2543092018032351, "grad_norm": 0.01578126661479473, "learning_rate": 5e-05, "loss": 0.0043, "step": 4730 }, { "epoch": 1.2569610182975337, "grad_norm": 0.010138173587620258, "learning_rate": 5e-05, "loss": 0.0052, "step": 4740 }, { "epoch": 1.2596128347918323, "grad_norm": 0.01364054623991251, "learning_rate": 5e-05, "loss": 0.0051, "step": 4750 }, { "epoch": 1.262264651286131, "grad_norm": 0.011587605811655521, "learning_rate": 5e-05, "loss": 0.0046, "step": 4760 }, { "epoch": 1.2649164677804297, "grad_norm": 0.013903125189244747, "learning_rate": 5e-05, "loss": 0.0044, "step": 4770 }, { "epoch": 1.2675682842747282, "grad_norm": 0.013850158080458641, "learning_rate": 5e-05, "loss": 0.0054, "step": 4780 }, { "epoch": 1.2702201007690268, "grad_norm": 0.013003162108361721, "learning_rate": 5e-05, "loss": 0.0041, "step": 4790 }, { "epoch": 1.2728719172633254, "grad_norm": 0.011354098096489906, "learning_rate": 5e-05, "loss": 0.0047, "step": 4800 }, { "epoch": 1.275523733757624, "grad_norm": 0.006839950103312731, "learning_rate": 5e-05, "loss": 0.0041, "step": 4810 }, { "epoch": 1.2781755502519225, "grad_norm": 0.01674976572394371, "learning_rate": 5e-05, "loss": 0.0042, "step": 4820 }, { "epoch": 1.280827366746221, "grad_norm": 0.01731754280626774, "learning_rate": 5e-05, "loss": 0.0042, "step": 4830 }, { "epoch": 1.2834791832405197, "grad_norm": 0.00932003278285265, "learning_rate": 5e-05, "loss": 0.0046, "step": 4840 }, { "epoch": 1.2861309997348185, "grad_norm": 0.008423437364399433, "learning_rate": 5e-05, "loss": 0.0043, "step": 4850 }, { "epoch": 1.288782816229117, "grad_norm": 0.009784470312297344, "learning_rate": 5e-05, "loss": 0.0045, "step": 4860 }, { "epoch": 1.2914346327234156, "grad_norm": 0.015741558745503426, "learning_rate": 5e-05, "loss": 0.0044, "step": 4870 }, { "epoch": 1.2940864492177142, "grad_norm": 0.013815692625939846, "learning_rate": 5e-05, "loss": 0.0045, "step": 4880 }, { "epoch": 1.2967382657120128, "grad_norm": 0.015917763113975525, "learning_rate": 5e-05, "loss": 0.0042, "step": 4890 }, { "epoch": 1.2993900822063114, "grad_norm": 0.00916313473135233, "learning_rate": 5e-05, "loss": 0.0038, "step": 4900 }, { "epoch": 1.30204189870061, "grad_norm": 0.009397645480930805, "learning_rate": 5e-05, "loss": 0.0044, "step": 4910 }, { "epoch": 1.3046937151949085, "grad_norm": 0.009689433500170708, "learning_rate": 5e-05, "loss": 0.0045, "step": 4920 }, { "epoch": 1.307345531689207, "grad_norm": 0.015068970620632172, "learning_rate": 5e-05, "loss": 0.0051, "step": 4930 }, { "epoch": 1.3099973481835057, "grad_norm": 0.00921731349080801, "learning_rate": 5e-05, "loss": 0.0039, "step": 4940 }, { "epoch": 1.3126491646778042, "grad_norm": 0.010140509344637394, "learning_rate": 5e-05, "loss": 0.0042, "step": 4950 }, { "epoch": 1.3153009811721028, "grad_norm": 0.011633462272584438, "learning_rate": 5e-05, "loss": 0.0044, "step": 4960 }, { "epoch": 1.3179527976664014, "grad_norm": 0.011148120276629925, "learning_rate": 5e-05, "loss": 0.0039, "step": 4970 }, { "epoch": 1.3206046141607, "grad_norm": 0.011639381758868694, "learning_rate": 5e-05, "loss": 0.0056, "step": 4980 }, { "epoch": 1.3232564306549985, "grad_norm": 0.012218561954796314, "learning_rate": 5e-05, "loss": 0.0048, "step": 4990 }, { "epoch": 1.3259082471492973, "grad_norm": 0.01060402113944292, "learning_rate": 5e-05, "loss": 0.0047, "step": 5000 }, { "epoch": 1.328560063643596, "grad_norm": 0.010202805511653423, "learning_rate": 5e-05, "loss": 0.0052, "step": 5010 }, { "epoch": 1.3312118801378945, "grad_norm": 0.013157653622329235, "learning_rate": 5e-05, "loss": 0.0041, "step": 5020 }, { "epoch": 1.333863696632193, "grad_norm": 0.00878073088824749, "learning_rate": 5e-05, "loss": 0.0045, "step": 5030 }, { "epoch": 1.3365155131264916, "grad_norm": 0.00804234854876995, "learning_rate": 5e-05, "loss": 0.0053, "step": 5040 }, { "epoch": 1.3391673296207902, "grad_norm": 0.018773255869746208, "learning_rate": 5e-05, "loss": 0.0048, "step": 5050 }, { "epoch": 1.3418191461150888, "grad_norm": 0.009693160653114319, "learning_rate": 5e-05, "loss": 0.0039, "step": 5060 }, { "epoch": 1.3444709626093874, "grad_norm": 0.01198120042681694, "learning_rate": 5e-05, "loss": 0.004, "step": 5070 }, { "epoch": 1.3471227791036862, "grad_norm": 0.0097303232178092, "learning_rate": 5e-05, "loss": 0.0042, "step": 5080 }, { "epoch": 1.3497745955979847, "grad_norm": 0.009387052617967129, "learning_rate": 5e-05, "loss": 0.0047, "step": 5090 }, { "epoch": 1.3524264120922833, "grad_norm": 0.008239242248237133, "learning_rate": 5e-05, "loss": 0.0046, "step": 5100 }, { "epoch": 1.355078228586582, "grad_norm": 0.010566020384430885, "learning_rate": 5e-05, "loss": 0.0044, "step": 5110 }, { "epoch": 1.3577300450808805, "grad_norm": 0.007877822034060955, "learning_rate": 5e-05, "loss": 0.0044, "step": 5120 }, { "epoch": 1.360381861575179, "grad_norm": 0.01268061250448227, "learning_rate": 5e-05, "loss": 0.0041, "step": 5130 }, { "epoch": 1.3630336780694776, "grad_norm": 0.010969136841595173, "learning_rate": 5e-05, "loss": 0.0041, "step": 5140 }, { "epoch": 1.3656854945637762, "grad_norm": 0.008032144978642464, "learning_rate": 5e-05, "loss": 0.0043, "step": 5150 }, { "epoch": 1.3683373110580748, "grad_norm": 0.014766513369977474, "learning_rate": 5e-05, "loss": 0.0051, "step": 5160 }, { "epoch": 1.3709891275523733, "grad_norm": 0.008528274483978748, "learning_rate": 5e-05, "loss": 0.0046, "step": 5170 }, { "epoch": 1.373640944046672, "grad_norm": 0.009188860654830933, "learning_rate": 5e-05, "loss": 0.0045, "step": 5180 }, { "epoch": 1.3762927605409705, "grad_norm": 0.007338584400713444, "learning_rate": 5e-05, "loss": 0.0037, "step": 5190 }, { "epoch": 1.378944577035269, "grad_norm": 0.016266709193587303, "learning_rate": 5e-05, "loss": 0.0038, "step": 5200 }, { "epoch": 1.3815963935295676, "grad_norm": 0.007714974693953991, "learning_rate": 5e-05, "loss": 0.004, "step": 5210 }, { "epoch": 1.3842482100238662, "grad_norm": 0.010423757135868073, "learning_rate": 5e-05, "loss": 0.0039, "step": 5220 }, { "epoch": 1.386900026518165, "grad_norm": 0.00908859446644783, "learning_rate": 5e-05, "loss": 0.0045, "step": 5230 }, { "epoch": 1.3895518430124636, "grad_norm": 0.007910105399787426, "learning_rate": 5e-05, "loss": 0.0047, "step": 5240 }, { "epoch": 1.3922036595067622, "grad_norm": 0.012212705798447132, "learning_rate": 5e-05, "loss": 0.0049, "step": 5250 }, { "epoch": 1.3948554760010607, "grad_norm": 0.014985023997724056, "learning_rate": 5e-05, "loss": 0.0033, "step": 5260 }, { "epoch": 1.3975072924953593, "grad_norm": 0.0201583132147789, "learning_rate": 5e-05, "loss": 0.0044, "step": 5270 }, { "epoch": 1.400159108989658, "grad_norm": 0.01086405385285616, "learning_rate": 5e-05, "loss": 0.004, "step": 5280 }, { "epoch": 1.4028109254839565, "grad_norm": 0.009956137277185917, "learning_rate": 5e-05, "loss": 0.0038, "step": 5290 }, { "epoch": 1.405462741978255, "grad_norm": 0.009943477809429169, "learning_rate": 5e-05, "loss": 0.0048, "step": 5300 }, { "epoch": 1.4081145584725536, "grad_norm": 0.011242822743952274, "learning_rate": 5e-05, "loss": 0.0042, "step": 5310 }, { "epoch": 1.4107663749668524, "grad_norm": 0.006323426030576229, "learning_rate": 5e-05, "loss": 0.0041, "step": 5320 }, { "epoch": 1.413418191461151, "grad_norm": 0.007764706388115883, "learning_rate": 5e-05, "loss": 0.0046, "step": 5330 }, { "epoch": 1.4160700079554496, "grad_norm": 0.007364222779870033, "learning_rate": 5e-05, "loss": 0.0037, "step": 5340 }, { "epoch": 1.4187218244497481, "grad_norm": 0.007008685730397701, "learning_rate": 5e-05, "loss": 0.004, "step": 5350 }, { "epoch": 1.4213736409440467, "grad_norm": 0.006734980270266533, "learning_rate": 5e-05, "loss": 0.0037, "step": 5360 }, { "epoch": 1.4240254574383453, "grad_norm": 0.016300003975629807, "learning_rate": 5e-05, "loss": 0.0043, "step": 5370 }, { "epoch": 1.4266772739326439, "grad_norm": 0.006626574322581291, "learning_rate": 5e-05, "loss": 0.0045, "step": 5380 }, { "epoch": 1.4293290904269425, "grad_norm": 0.010181017220020294, "learning_rate": 5e-05, "loss": 0.0044, "step": 5390 }, { "epoch": 1.431980906921241, "grad_norm": 0.00703588780015707, "learning_rate": 5e-05, "loss": 0.0047, "step": 5400 }, { "epoch": 1.4346327234155396, "grad_norm": 0.009322666563093662, "learning_rate": 5e-05, "loss": 0.0046, "step": 5410 }, { "epoch": 1.4372845399098382, "grad_norm": 0.010878956876695156, "learning_rate": 5e-05, "loss": 0.0046, "step": 5420 }, { "epoch": 1.4399363564041368, "grad_norm": 0.019013166427612305, "learning_rate": 5e-05, "loss": 0.0039, "step": 5430 }, { "epoch": 1.4425881728984353, "grad_norm": 0.00811670534312725, "learning_rate": 5e-05, "loss": 0.0047, "step": 5440 }, { "epoch": 1.445239989392734, "grad_norm": 0.010026267729699612, "learning_rate": 5e-05, "loss": 0.0036, "step": 5450 }, { "epoch": 1.4478918058870325, "grad_norm": 0.010472642257809639, "learning_rate": 5e-05, "loss": 0.0039, "step": 5460 }, { "epoch": 1.4505436223813313, "grad_norm": 0.008591456338763237, "learning_rate": 5e-05, "loss": 0.0039, "step": 5470 }, { "epoch": 1.4531954388756299, "grad_norm": 0.014273119159042835, "learning_rate": 5e-05, "loss": 0.0036, "step": 5480 }, { "epoch": 1.4558472553699284, "grad_norm": 0.012761227786540985, "learning_rate": 5e-05, "loss": 0.0043, "step": 5490 }, { "epoch": 1.458499071864227, "grad_norm": 0.013408991508185863, "learning_rate": 5e-05, "loss": 0.0042, "step": 5500 }, { "epoch": 1.4611508883585256, "grad_norm": 0.013530619442462921, "learning_rate": 5e-05, "loss": 0.0039, "step": 5510 }, { "epoch": 1.4638027048528242, "grad_norm": 0.010833495296537876, "learning_rate": 5e-05, "loss": 0.0037, "step": 5520 }, { "epoch": 1.4664545213471227, "grad_norm": 0.02011292055249214, "learning_rate": 5e-05, "loss": 0.0041, "step": 5530 }, { "epoch": 1.4691063378414213, "grad_norm": 0.012721111997961998, "learning_rate": 5e-05, "loss": 0.0038, "step": 5540 }, { "epoch": 1.47175815433572, "grad_norm": 0.0129472017288208, "learning_rate": 5e-05, "loss": 0.004, "step": 5550 }, { "epoch": 1.4744099708300187, "grad_norm": 0.012788426131010056, "learning_rate": 5e-05, "loss": 0.0042, "step": 5560 }, { "epoch": 1.4770617873243173, "grad_norm": 0.008336531929671764, "learning_rate": 5e-05, "loss": 0.0039, "step": 5570 }, { "epoch": 1.4797136038186158, "grad_norm": 0.00840145256370306, "learning_rate": 5e-05, "loss": 0.0042, "step": 5580 }, { "epoch": 1.4823654203129144, "grad_norm": 0.01572095789015293, "learning_rate": 5e-05, "loss": 0.004, "step": 5590 }, { "epoch": 1.485017236807213, "grad_norm": 0.019700800999999046, "learning_rate": 5e-05, "loss": 0.0041, "step": 5600 }, { "epoch": 1.4876690533015116, "grad_norm": 0.008834819309413433, "learning_rate": 5e-05, "loss": 0.0044, "step": 5610 }, { "epoch": 1.4903208697958101, "grad_norm": 0.01310776174068451, "learning_rate": 5e-05, "loss": 0.0042, "step": 5620 }, { "epoch": 1.4929726862901087, "grad_norm": 0.010965723544359207, "learning_rate": 5e-05, "loss": 0.0037, "step": 5630 }, { "epoch": 1.4956245027844073, "grad_norm": 0.012463233433663845, "learning_rate": 5e-05, "loss": 0.0037, "step": 5640 }, { "epoch": 1.4982763192787059, "grad_norm": 0.011711183935403824, "learning_rate": 5e-05, "loss": 0.0038, "step": 5650 }, { "epoch": 1.5009281357730044, "grad_norm": 0.012159621343016624, "learning_rate": 5e-05, "loss": 0.0042, "step": 5660 }, { "epoch": 1.503579952267303, "grad_norm": 0.008716187439858913, "learning_rate": 5e-05, "loss": 0.0042, "step": 5670 }, { "epoch": 1.5062317687616016, "grad_norm": 0.011942260898649693, "learning_rate": 5e-05, "loss": 0.0041, "step": 5680 }, { "epoch": 1.5088835852559002, "grad_norm": 0.012850731611251831, "learning_rate": 5e-05, "loss": 0.004, "step": 5690 }, { "epoch": 1.5115354017501987, "grad_norm": 0.013583059422671795, "learning_rate": 5e-05, "loss": 0.004, "step": 5700 }, { "epoch": 1.5141872182444975, "grad_norm": 0.008461887016892433, "learning_rate": 5e-05, "loss": 0.0046, "step": 5710 }, { "epoch": 1.516839034738796, "grad_norm": 0.009111542254686356, "learning_rate": 5e-05, "loss": 0.005, "step": 5720 }, { "epoch": 1.5194908512330947, "grad_norm": 0.007978223264217377, "learning_rate": 5e-05, "loss": 0.0039, "step": 5730 }, { "epoch": 1.5221426677273933, "grad_norm": 0.010834870859980583, "learning_rate": 5e-05, "loss": 0.0037, "step": 5740 }, { "epoch": 1.5247944842216918, "grad_norm": 0.008480457589030266, "learning_rate": 5e-05, "loss": 0.004, "step": 5750 }, { "epoch": 1.5274463007159904, "grad_norm": 0.009284783154726028, "learning_rate": 5e-05, "loss": 0.004, "step": 5760 }, { "epoch": 1.5300981172102892, "grad_norm": 0.008985163643956184, "learning_rate": 5e-05, "loss": 0.0033, "step": 5770 }, { "epoch": 1.5327499337045878, "grad_norm": 0.015456616878509521, "learning_rate": 5e-05, "loss": 0.0047, "step": 5780 }, { "epoch": 1.5354017501988864, "grad_norm": 0.009345540776848793, "learning_rate": 5e-05, "loss": 0.0043, "step": 5790 }, { "epoch": 1.538053566693185, "grad_norm": 0.007293611764907837, "learning_rate": 5e-05, "loss": 0.0035, "step": 5800 }, { "epoch": 1.5407053831874835, "grad_norm": 0.012455802410840988, "learning_rate": 5e-05, "loss": 0.0037, "step": 5810 }, { "epoch": 1.543357199681782, "grad_norm": 0.014543714933097363, "learning_rate": 5e-05, "loss": 0.004, "step": 5820 }, { "epoch": 1.5460090161760807, "grad_norm": 0.013890712521970272, "learning_rate": 5e-05, "loss": 0.0044, "step": 5830 }, { "epoch": 1.5486608326703792, "grad_norm": 0.010773948393762112, "learning_rate": 5e-05, "loss": 0.0041, "step": 5840 }, { "epoch": 1.5513126491646778, "grad_norm": 0.005687897093594074, "learning_rate": 5e-05, "loss": 0.0043, "step": 5850 }, { "epoch": 1.5539644656589764, "grad_norm": 0.01086400356143713, "learning_rate": 5e-05, "loss": 0.0038, "step": 5860 }, { "epoch": 1.556616282153275, "grad_norm": 0.008292458951473236, "learning_rate": 5e-05, "loss": 0.0042, "step": 5870 }, { "epoch": 1.5592680986475735, "grad_norm": 0.007676058914512396, "learning_rate": 5e-05, "loss": 0.0042, "step": 5880 }, { "epoch": 1.5619199151418721, "grad_norm": 0.0108788525685668, "learning_rate": 5e-05, "loss": 0.0042, "step": 5890 }, { "epoch": 1.5645717316361707, "grad_norm": 0.010611265897750854, "learning_rate": 5e-05, "loss": 0.0044, "step": 5900 }, { "epoch": 1.5672235481304693, "grad_norm": 0.013078942894935608, "learning_rate": 5e-05, "loss": 0.0042, "step": 5910 }, { "epoch": 1.5698753646247678, "grad_norm": 0.007019662298262119, "learning_rate": 5e-05, "loss": 0.0043, "step": 5920 }, { "epoch": 1.5725271811190664, "grad_norm": 0.0123084532096982, "learning_rate": 5e-05, "loss": 0.0046, "step": 5930 }, { "epoch": 1.575178997613365, "grad_norm": 0.009862853214144707, "learning_rate": 5e-05, "loss": 0.0041, "step": 5940 }, { "epoch": 1.5778308141076638, "grad_norm": 0.00972562376409769, "learning_rate": 5e-05, "loss": 0.0036, "step": 5950 }, { "epoch": 1.5804826306019624, "grad_norm": 0.012724102474749088, "learning_rate": 5e-05, "loss": 0.0038, "step": 5960 }, { "epoch": 1.583134447096261, "grad_norm": 0.009794866666197777, "learning_rate": 5e-05, "loss": 0.0038, "step": 5970 }, { "epoch": 1.5857862635905595, "grad_norm": 0.01264490932226181, "learning_rate": 5e-05, "loss": 0.0037, "step": 5980 }, { "epoch": 1.588438080084858, "grad_norm": 0.00610125157982111, "learning_rate": 5e-05, "loss": 0.0043, "step": 5990 }, { "epoch": 1.5910898965791567, "grad_norm": 0.007982159964740276, "learning_rate": 5e-05, "loss": 0.0037, "step": 6000 }, { "epoch": 1.5937417130734555, "grad_norm": 0.008273666724562645, "learning_rate": 5e-05, "loss": 0.0039, "step": 6010 }, { "epoch": 1.596393529567754, "grad_norm": 0.015302873216569424, "learning_rate": 5e-05, "loss": 0.0035, "step": 6020 }, { "epoch": 1.5990453460620526, "grad_norm": 0.011706732213497162, "learning_rate": 5e-05, "loss": 0.0041, "step": 6030 }, { "epoch": 1.6016971625563512, "grad_norm": 0.008869534358382225, "learning_rate": 5e-05, "loss": 0.0034, "step": 6040 }, { "epoch": 1.6043489790506498, "grad_norm": 0.009611327201128006, "learning_rate": 5e-05, "loss": 0.0044, "step": 6050 }, { "epoch": 1.6070007955449483, "grad_norm": 0.01559068076312542, "learning_rate": 5e-05, "loss": 0.0039, "step": 6060 }, { "epoch": 1.609652612039247, "grad_norm": 0.012230784632265568, "learning_rate": 5e-05, "loss": 0.004, "step": 6070 }, { "epoch": 1.6123044285335455, "grad_norm": 0.008492650464177132, "learning_rate": 5e-05, "loss": 0.0047, "step": 6080 }, { "epoch": 1.614956245027844, "grad_norm": 0.009813955053687096, "learning_rate": 5e-05, "loss": 0.0034, "step": 6090 }, { "epoch": 1.6176080615221426, "grad_norm": 0.013513347133994102, "learning_rate": 5e-05, "loss": 0.0049, "step": 6100 }, { "epoch": 1.6202598780164412, "grad_norm": 0.01192953810095787, "learning_rate": 5e-05, "loss": 0.0028, "step": 6110 }, { "epoch": 1.6229116945107398, "grad_norm": 0.012612147256731987, "learning_rate": 5e-05, "loss": 0.0041, "step": 6120 }, { "epoch": 1.6255635110050384, "grad_norm": 0.00930293183773756, "learning_rate": 5e-05, "loss": 0.0034, "step": 6130 }, { "epoch": 1.628215327499337, "grad_norm": 0.01491401344537735, "learning_rate": 5e-05, "loss": 0.0038, "step": 6140 }, { "epoch": 1.6308671439936355, "grad_norm": 0.012734273448586464, "learning_rate": 5e-05, "loss": 0.004, "step": 6150 }, { "epoch": 1.633518960487934, "grad_norm": 0.01410200260579586, "learning_rate": 5e-05, "loss": 0.0039, "step": 6160 }, { "epoch": 1.6361707769822327, "grad_norm": 0.007131983991712332, "learning_rate": 5e-05, "loss": 0.0035, "step": 6170 }, { "epoch": 1.6388225934765315, "grad_norm": 0.012141134589910507, "learning_rate": 5e-05, "loss": 0.0041, "step": 6180 }, { "epoch": 1.64147440997083, "grad_norm": 0.01466628722846508, "learning_rate": 5e-05, "loss": 0.0035, "step": 6190 }, { "epoch": 1.6441262264651286, "grad_norm": 0.008028069511055946, "learning_rate": 5e-05, "loss": 0.0036, "step": 6200 }, { "epoch": 1.6467780429594272, "grad_norm": 0.018056824803352356, "learning_rate": 5e-05, "loss": 0.0034, "step": 6210 }, { "epoch": 1.6494298594537258, "grad_norm": 0.013711776584386826, "learning_rate": 5e-05, "loss": 0.0039, "step": 6220 }, { "epoch": 1.6520816759480244, "grad_norm": 0.011408611200749874, "learning_rate": 5e-05, "loss": 0.004, "step": 6230 }, { "epoch": 1.6547334924423232, "grad_norm": 0.009424285963177681, "learning_rate": 5e-05, "loss": 0.0042, "step": 6240 }, { "epoch": 1.6573853089366217, "grad_norm": 0.012184342369437218, "learning_rate": 5e-05, "loss": 0.0043, "step": 6250 }, { "epoch": 1.6600371254309203, "grad_norm": 0.010949616320431232, "learning_rate": 5e-05, "loss": 0.0041, "step": 6260 }, { "epoch": 1.6626889419252189, "grad_norm": 0.01289514359086752, "learning_rate": 5e-05, "loss": 0.0038, "step": 6270 }, { "epoch": 1.6653407584195175, "grad_norm": 0.01058620773255825, "learning_rate": 5e-05, "loss": 0.004, "step": 6280 }, { "epoch": 1.667992574913816, "grad_norm": 0.008481941185891628, "learning_rate": 5e-05, "loss": 0.0045, "step": 6290 }, { "epoch": 1.6706443914081146, "grad_norm": 0.01348476018756628, "learning_rate": 5e-05, "loss": 0.0044, "step": 6300 }, { "epoch": 1.6732962079024132, "grad_norm": 0.008671422488987446, "learning_rate": 5e-05, "loss": 0.0039, "step": 6310 }, { "epoch": 1.6759480243967118, "grad_norm": 0.007556726690381765, "learning_rate": 5e-05, "loss": 0.0045, "step": 6320 }, { "epoch": 1.6785998408910103, "grad_norm": 0.009814237244427204, "learning_rate": 5e-05, "loss": 0.0035, "step": 6330 }, { "epoch": 1.681251657385309, "grad_norm": 0.01987018994987011, "learning_rate": 5e-05, "loss": 0.0044, "step": 6340 }, { "epoch": 1.6839034738796075, "grad_norm": 0.012590549886226654, "learning_rate": 5e-05, "loss": 0.0036, "step": 6350 }, { "epoch": 1.686555290373906, "grad_norm": 0.015379281714558601, "learning_rate": 5e-05, "loss": 0.0036, "step": 6360 }, { "epoch": 1.6892071068682046, "grad_norm": 0.015043104067444801, "learning_rate": 5e-05, "loss": 0.0034, "step": 6370 }, { "epoch": 1.6918589233625032, "grad_norm": 0.00519393989816308, "learning_rate": 5e-05, "loss": 0.0034, "step": 6380 }, { "epoch": 1.6945107398568018, "grad_norm": 0.015927042812108994, "learning_rate": 5e-05, "loss": 0.0042, "step": 6390 }, { "epoch": 1.6971625563511004, "grad_norm": 0.011063069105148315, "learning_rate": 5e-05, "loss": 0.0043, "step": 6400 }, { "epoch": 1.699814372845399, "grad_norm": 0.010242955759167671, "learning_rate": 5e-05, "loss": 0.0039, "step": 6410 }, { "epoch": 1.7024661893396977, "grad_norm": 0.009621228091418743, "learning_rate": 5e-05, "loss": 0.0034, "step": 6420 }, { "epoch": 1.7051180058339963, "grad_norm": 0.011599643155932426, "learning_rate": 5e-05, "loss": 0.0032, "step": 6430 }, { "epoch": 1.7077698223282949, "grad_norm": 0.016961591318249702, "learning_rate": 5e-05, "loss": 0.0042, "step": 6440 }, { "epoch": 1.7104216388225935, "grad_norm": 0.008318674750626087, "learning_rate": 5e-05, "loss": 0.0042, "step": 6450 }, { "epoch": 1.713073455316892, "grad_norm": 0.011123607866466045, "learning_rate": 5e-05, "loss": 0.0038, "step": 6460 }, { "epoch": 1.7157252718111906, "grad_norm": 0.019876273348927498, "learning_rate": 5e-05, "loss": 0.0038, "step": 6470 }, { "epoch": 1.7183770883054894, "grad_norm": 0.013712113723158836, "learning_rate": 5e-05, "loss": 0.0041, "step": 6480 }, { "epoch": 1.721028904799788, "grad_norm": 0.011023106053471565, "learning_rate": 5e-05, "loss": 0.0039, "step": 6490 }, { "epoch": 1.7236807212940866, "grad_norm": 0.01213430892676115, "learning_rate": 5e-05, "loss": 0.0045, "step": 6500 }, { "epoch": 1.7263325377883851, "grad_norm": 0.008855405263602734, "learning_rate": 5e-05, "loss": 0.0035, "step": 6510 }, { "epoch": 1.7289843542826837, "grad_norm": 0.009468968026340008, "learning_rate": 5e-05, "loss": 0.0037, "step": 6520 }, { "epoch": 1.7316361707769823, "grad_norm": 0.012391138821840286, "learning_rate": 5e-05, "loss": 0.0037, "step": 6530 }, { "epoch": 1.7342879872712809, "grad_norm": 0.007793671451508999, "learning_rate": 5e-05, "loss": 0.0039, "step": 6540 }, { "epoch": 1.7369398037655794, "grad_norm": 0.014003198593854904, "learning_rate": 5e-05, "loss": 0.0043, "step": 6550 }, { "epoch": 1.739591620259878, "grad_norm": 0.010676076635718346, "learning_rate": 5e-05, "loss": 0.0038, "step": 6560 }, { "epoch": 1.7422434367541766, "grad_norm": 0.00809957180172205, "learning_rate": 5e-05, "loss": 0.0037, "step": 6570 }, { "epoch": 1.7448952532484752, "grad_norm": 0.008542346768081188, "learning_rate": 5e-05, "loss": 0.0041, "step": 6580 }, { "epoch": 1.7475470697427737, "grad_norm": 0.013506497256457806, "learning_rate": 5e-05, "loss": 0.0035, "step": 6590 }, { "epoch": 1.7501988862370723, "grad_norm": 0.013208353891968727, "learning_rate": 5e-05, "loss": 0.0036, "step": 6600 }, { "epoch": 1.752850702731371, "grad_norm": 0.010113679803907871, "learning_rate": 5e-05, "loss": 0.0042, "step": 6610 }, { "epoch": 1.7555025192256695, "grad_norm": 0.012135012075304985, "learning_rate": 5e-05, "loss": 0.0041, "step": 6620 }, { "epoch": 1.758154335719968, "grad_norm": 0.00918748788535595, "learning_rate": 5e-05, "loss": 0.0036, "step": 6630 }, { "epoch": 1.7608061522142666, "grad_norm": 0.016533933579921722, "learning_rate": 5e-05, "loss": 0.0043, "step": 6640 }, { "epoch": 1.7634579687085654, "grad_norm": 0.0072473566979169846, "learning_rate": 5e-05, "loss": 0.0034, "step": 6650 }, { "epoch": 1.766109785202864, "grad_norm": 0.012321876361966133, "learning_rate": 5e-05, "loss": 0.0039, "step": 6660 }, { "epoch": 1.7687616016971626, "grad_norm": 0.017444230616092682, "learning_rate": 5e-05, "loss": 0.0038, "step": 6670 }, { "epoch": 1.7714134181914611, "grad_norm": 0.015863321721553802, "learning_rate": 5e-05, "loss": 0.0041, "step": 6680 }, { "epoch": 1.7740652346857597, "grad_norm": 0.00782292615622282, "learning_rate": 5e-05, "loss": 0.0033, "step": 6690 }, { "epoch": 1.7767170511800583, "grad_norm": 0.005016414448618889, "learning_rate": 5e-05, "loss": 0.0035, "step": 6700 }, { "epoch": 1.779368867674357, "grad_norm": 0.012129734270274639, "learning_rate": 5e-05, "loss": 0.0039, "step": 6710 }, { "epoch": 1.7820206841686557, "grad_norm": 0.010917719453573227, "learning_rate": 5e-05, "loss": 0.0036, "step": 6720 }, { "epoch": 1.7846725006629542, "grad_norm": 0.007929924875497818, "learning_rate": 5e-05, "loss": 0.0033, "step": 6730 }, { "epoch": 1.7873243171572528, "grad_norm": 0.013700383715331554, "learning_rate": 5e-05, "loss": 0.0037, "step": 6740 }, { "epoch": 1.7899761336515514, "grad_norm": 0.010211814194917679, "learning_rate": 5e-05, "loss": 0.0038, "step": 6750 }, { "epoch": 1.79262795014585, "grad_norm": 0.011598619632422924, "learning_rate": 5e-05, "loss": 0.0045, "step": 6760 }, { "epoch": 1.7952797666401485, "grad_norm": 0.008464635349810123, "learning_rate": 5e-05, "loss": 0.0039, "step": 6770 }, { "epoch": 1.7979315831344471, "grad_norm": 0.0068101221695542336, "learning_rate": 5e-05, "loss": 0.0033, "step": 6780 }, { "epoch": 1.8005833996287457, "grad_norm": 0.00989547185599804, "learning_rate": 5e-05, "loss": 0.0032, "step": 6790 }, { "epoch": 1.8032352161230443, "grad_norm": 0.015221504494547844, "learning_rate": 5e-05, "loss": 0.0038, "step": 6800 }, { "epoch": 1.8058870326173428, "grad_norm": 0.013327119871973991, "learning_rate": 5e-05, "loss": 0.0043, "step": 6810 }, { "epoch": 1.8085388491116414, "grad_norm": 0.010980576276779175, "learning_rate": 5e-05, "loss": 0.0035, "step": 6820 }, { "epoch": 1.81119066560594, "grad_norm": 0.00987189169973135, "learning_rate": 5e-05, "loss": 0.0034, "step": 6830 }, { "epoch": 1.8138424821002386, "grad_norm": 0.014566825702786446, "learning_rate": 5e-05, "loss": 0.004, "step": 6840 }, { "epoch": 1.8164942985945371, "grad_norm": 0.008532696403563023, "learning_rate": 5e-05, "loss": 0.0037, "step": 6850 }, { "epoch": 1.8191461150888357, "grad_norm": 0.01372421532869339, "learning_rate": 5e-05, "loss": 0.0036, "step": 6860 }, { "epoch": 1.8217979315831343, "grad_norm": 0.013438086956739426, "learning_rate": 5e-05, "loss": 0.0032, "step": 6870 }, { "epoch": 1.8244497480774329, "grad_norm": 0.009550942108035088, "learning_rate": 5e-05, "loss": 0.0032, "step": 6880 }, { "epoch": 1.8271015645717317, "grad_norm": 0.012924582697451115, "learning_rate": 5e-05, "loss": 0.0036, "step": 6890 }, { "epoch": 1.8297533810660302, "grad_norm": 0.007753556128591299, "learning_rate": 5e-05, "loss": 0.0037, "step": 6900 }, { "epoch": 1.8324051975603288, "grad_norm": 0.004740705247968435, "learning_rate": 5e-05, "loss": 0.004, "step": 6910 }, { "epoch": 1.8350570140546274, "grad_norm": 0.007150164805352688, "learning_rate": 5e-05, "loss": 0.0045, "step": 6920 }, { "epoch": 1.837708830548926, "grad_norm": 0.00761279184371233, "learning_rate": 5e-05, "loss": 0.0041, "step": 6930 }, { "epoch": 1.8403606470432248, "grad_norm": 0.013920258730649948, "learning_rate": 5e-05, "loss": 0.0039, "step": 6940 }, { "epoch": 1.8430124635375233, "grad_norm": 0.009099151007831097, "learning_rate": 5e-05, "loss": 0.0039, "step": 6950 }, { "epoch": 1.845664280031822, "grad_norm": 0.01297698263078928, "learning_rate": 5e-05, "loss": 0.004, "step": 6960 }, { "epoch": 1.8483160965261205, "grad_norm": 0.01032144296914339, "learning_rate": 5e-05, "loss": 0.0041, "step": 6970 }, { "epoch": 1.850967913020419, "grad_norm": 0.006401524879038334, "learning_rate": 5e-05, "loss": 0.0038, "step": 6980 }, { "epoch": 1.8536197295147177, "grad_norm": 0.008720487356185913, "learning_rate": 5e-05, "loss": 0.0032, "step": 6990 }, { "epoch": 1.8562715460090162, "grad_norm": 0.00502184871584177, "learning_rate": 5e-05, "loss": 0.0029, "step": 7000 }, { "epoch": 1.8589233625033148, "grad_norm": 0.010808387771248817, "learning_rate": 5e-05, "loss": 0.0033, "step": 7010 }, { "epoch": 1.8615751789976134, "grad_norm": 0.011890877969563007, "learning_rate": 5e-05, "loss": 0.0032, "step": 7020 }, { "epoch": 1.864226995491912, "grad_norm": 0.012545928359031677, "learning_rate": 5e-05, "loss": 0.0038, "step": 7030 }, { "epoch": 1.8668788119862105, "grad_norm": 0.01935751736164093, "learning_rate": 5e-05, "loss": 0.004, "step": 7040 }, { "epoch": 1.869530628480509, "grad_norm": 0.0047897957265377045, "learning_rate": 5e-05, "loss": 0.0037, "step": 7050 }, { "epoch": 1.8721824449748077, "grad_norm": 0.012331346049904823, "learning_rate": 5e-05, "loss": 0.0037, "step": 7060 }, { "epoch": 1.8748342614691063, "grad_norm": 0.009893546812236309, "learning_rate": 5e-05, "loss": 0.0033, "step": 7070 }, { "epoch": 1.8774860779634048, "grad_norm": 0.007137580309063196, "learning_rate": 5e-05, "loss": 0.0033, "step": 7080 }, { "epoch": 1.8801378944577034, "grad_norm": 0.012783384881913662, "learning_rate": 5e-05, "loss": 0.0035, "step": 7090 }, { "epoch": 1.882789710952002, "grad_norm": 0.006944759748876095, "learning_rate": 5e-05, "loss": 0.0042, "step": 7100 }, { "epoch": 1.8854415274463006, "grad_norm": 0.006940097082406282, "learning_rate": 5e-05, "loss": 0.0035, "step": 7110 }, { "epoch": 1.8880933439405994, "grad_norm": 0.00708185276016593, "learning_rate": 5e-05, "loss": 0.004, "step": 7120 }, { "epoch": 1.890745160434898, "grad_norm": 0.00880520511418581, "learning_rate": 5e-05, "loss": 0.0044, "step": 7130 }, { "epoch": 1.8933969769291965, "grad_norm": 0.009258565492928028, "learning_rate": 5e-05, "loss": 0.0047, "step": 7140 }, { "epoch": 1.896048793423495, "grad_norm": 0.006418722216039896, "learning_rate": 5e-05, "loss": 0.0035, "step": 7150 }, { "epoch": 1.8987006099177937, "grad_norm": 0.011125003919005394, "learning_rate": 5e-05, "loss": 0.0037, "step": 7160 }, { "epoch": 1.9013524264120922, "grad_norm": 0.00997136440128088, "learning_rate": 5e-05, "loss": 0.0034, "step": 7170 }, { "epoch": 1.904004242906391, "grad_norm": 0.012521300464868546, "learning_rate": 5e-05, "loss": 0.0035, "step": 7180 }, { "epoch": 1.9066560594006896, "grad_norm": 0.01090375054627657, "learning_rate": 5e-05, "loss": 0.0036, "step": 7190 }, { "epoch": 1.9093078758949882, "grad_norm": 0.010019796900451183, "learning_rate": 5e-05, "loss": 0.0032, "step": 7200 }, { "epoch": 1.9119596923892868, "grad_norm": 0.010484153404831886, "learning_rate": 5e-05, "loss": 0.0034, "step": 7210 }, { "epoch": 1.9146115088835853, "grad_norm": 0.008348500356078148, "learning_rate": 5e-05, "loss": 0.0042, "step": 7220 }, { "epoch": 1.917263325377884, "grad_norm": 0.007793165743350983, "learning_rate": 5e-05, "loss": 0.0034, "step": 7230 }, { "epoch": 1.9199151418721825, "grad_norm": 0.012008682824671268, "learning_rate": 5e-05, "loss": 0.0034, "step": 7240 }, { "epoch": 1.922566958366481, "grad_norm": 0.041275858879089355, "learning_rate": 5e-05, "loss": 0.0038, "step": 7250 }, { "epoch": 1.9252187748607796, "grad_norm": 0.009256097488105297, "learning_rate": 5e-05, "loss": 0.0037, "step": 7260 }, { "epoch": 1.9278705913550782, "grad_norm": 0.008928642608225346, "learning_rate": 5e-05, "loss": 0.0033, "step": 7270 }, { "epoch": 1.9305224078493768, "grad_norm": 0.010500765405595303, "learning_rate": 5e-05, "loss": 0.0036, "step": 7280 }, { "epoch": 1.9331742243436754, "grad_norm": 0.012242295779287815, "learning_rate": 5e-05, "loss": 0.0039, "step": 7290 }, { "epoch": 1.935826040837974, "grad_norm": 0.014756971038877964, "learning_rate": 5e-05, "loss": 0.004, "step": 7300 }, { "epoch": 1.9384778573322725, "grad_norm": 0.012217966839671135, "learning_rate": 5e-05, "loss": 0.0043, "step": 7310 }, { "epoch": 1.941129673826571, "grad_norm": 0.010805630125105381, "learning_rate": 5e-05, "loss": 0.0042, "step": 7320 }, { "epoch": 1.9437814903208697, "grad_norm": 0.00960564985871315, "learning_rate": 5e-05, "loss": 0.0039, "step": 7330 }, { "epoch": 1.9464333068151682, "grad_norm": 0.009439690969884396, "learning_rate": 5e-05, "loss": 0.0034, "step": 7340 }, { "epoch": 1.949085123309467, "grad_norm": 0.010525006800889969, "learning_rate": 5e-05, "loss": 0.0032, "step": 7350 }, { "epoch": 1.9517369398037656, "grad_norm": 0.008997218683362007, "learning_rate": 5e-05, "loss": 0.0031, "step": 7360 }, { "epoch": 1.9543887562980642, "grad_norm": 0.0146209467202425, "learning_rate": 5e-05, "loss": 0.0037, "step": 7370 }, { "epoch": 1.9570405727923628, "grad_norm": 0.009325198829174042, "learning_rate": 5e-05, "loss": 0.0037, "step": 7380 }, { "epoch": 1.9596923892866613, "grad_norm": 0.007849776186048985, "learning_rate": 5e-05, "loss": 0.0035, "step": 7390 }, { "epoch": 1.96234420578096, "grad_norm": 0.014128265902400017, "learning_rate": 5e-05, "loss": 0.003, "step": 7400 }, { "epoch": 1.9649960222752587, "grad_norm": 0.0083721112459898, "learning_rate": 5e-05, "loss": 0.0035, "step": 7410 }, { "epoch": 1.9676478387695573, "grad_norm": 0.010255949571728706, "learning_rate": 5e-05, "loss": 0.004, "step": 7420 }, { "epoch": 1.9702996552638559, "grad_norm": 0.01154076587408781, "learning_rate": 5e-05, "loss": 0.0036, "step": 7430 }, { "epoch": 1.9729514717581544, "grad_norm": 0.01305315364152193, "learning_rate": 5e-05, "loss": 0.0042, "step": 7440 }, { "epoch": 1.975603288252453, "grad_norm": 0.011516467668116093, "learning_rate": 5e-05, "loss": 0.0042, "step": 7450 }, { "epoch": 1.9782551047467516, "grad_norm": 0.008437003940343857, "learning_rate": 5e-05, "loss": 0.0036, "step": 7460 }, { "epoch": 1.9809069212410502, "grad_norm": 0.01239365991204977, "learning_rate": 5e-05, "loss": 0.0034, "step": 7470 }, { "epoch": 1.9835587377353487, "grad_norm": 0.009785588830709457, "learning_rate": 5e-05, "loss": 0.0034, "step": 7480 }, { "epoch": 1.9862105542296473, "grad_norm": 0.017179014161229134, "learning_rate": 5e-05, "loss": 0.0034, "step": 7490 }, { "epoch": 1.988862370723946, "grad_norm": 0.011737877503037453, "learning_rate": 5e-05, "loss": 0.0051, "step": 7500 }, { "epoch": 1.9915141872182445, "grad_norm": 0.012457642704248428, "learning_rate": 5e-05, "loss": 0.0039, "step": 7510 }, { "epoch": 1.994166003712543, "grad_norm": 0.008222060278058052, "learning_rate": 5e-05, "loss": 0.0035, "step": 7520 }, { "epoch": 1.9968178202068416, "grad_norm": 0.006167420651763678, "learning_rate": 5e-05, "loss": 0.0035, "step": 7530 }, { "epoch": 1.9994696367011402, "grad_norm": 0.015268982388079166, "learning_rate": 5e-05, "loss": 0.0036, "step": 7540 }, { "epoch": 2.0021214531954388, "grad_norm": 0.011415797285735607, "learning_rate": 5e-05, "loss": 0.0032, "step": 7550 }, { "epoch": 2.0047732696897373, "grad_norm": 0.008936595171689987, "learning_rate": 5e-05, "loss": 0.0035, "step": 7560 }, { "epoch": 2.007425086184036, "grad_norm": 0.014987941831350327, "learning_rate": 5e-05, "loss": 0.0034, "step": 7570 }, { "epoch": 2.0100769026783345, "grad_norm": 0.014553535729646683, "learning_rate": 5e-05, "loss": 0.0041, "step": 7580 }, { "epoch": 2.012728719172633, "grad_norm": 0.009288553148508072, "learning_rate": 5e-05, "loss": 0.004, "step": 7590 }, { "epoch": 2.0153805356669316, "grad_norm": 0.012278278358280659, "learning_rate": 5e-05, "loss": 0.0038, "step": 7600 }, { "epoch": 2.0180323521612302, "grad_norm": 0.010612528771162033, "learning_rate": 5e-05, "loss": 0.0037, "step": 7610 }, { "epoch": 2.0206841686555292, "grad_norm": 0.009088178165256977, "learning_rate": 5e-05, "loss": 0.0038, "step": 7620 }, { "epoch": 2.023335985149828, "grad_norm": 0.01498050894588232, "learning_rate": 5e-05, "loss": 0.0038, "step": 7630 }, { "epoch": 2.0259878016441264, "grad_norm": 0.009779920801520348, "learning_rate": 5e-05, "loss": 0.004, "step": 7640 }, { "epoch": 2.028639618138425, "grad_norm": 0.009464571252465248, "learning_rate": 5e-05, "loss": 0.0033, "step": 7650 }, { "epoch": 2.0312914346327235, "grad_norm": 0.006360730621963739, "learning_rate": 5e-05, "loss": 0.0032, "step": 7660 }, { "epoch": 2.033943251127022, "grad_norm": 0.0038492109160870314, "learning_rate": 5e-05, "loss": 0.0035, "step": 7670 }, { "epoch": 2.0365950676213207, "grad_norm": 0.00952677521854639, "learning_rate": 5e-05, "loss": 0.0039, "step": 7680 }, { "epoch": 2.0392468841156193, "grad_norm": 0.014355471357703209, "learning_rate": 5e-05, "loss": 0.0036, "step": 7690 }, { "epoch": 2.041898700609918, "grad_norm": 0.008904846385121346, "learning_rate": 5e-05, "loss": 0.0029, "step": 7700 }, { "epoch": 2.0445505171042164, "grad_norm": 0.007917567156255245, "learning_rate": 5e-05, "loss": 0.0033, "step": 7710 }, { "epoch": 2.047202333598515, "grad_norm": 0.009953659027814865, "learning_rate": 5e-05, "loss": 0.0039, "step": 7720 }, { "epoch": 2.0498541500928136, "grad_norm": 0.013684802688658237, "learning_rate": 5e-05, "loss": 0.0038, "step": 7730 }, { "epoch": 2.052505966587112, "grad_norm": 0.006868042051792145, "learning_rate": 5e-05, "loss": 0.0036, "step": 7740 }, { "epoch": 2.0551577830814107, "grad_norm": 0.00794382207095623, "learning_rate": 5e-05, "loss": 0.0037, "step": 7750 }, { "epoch": 2.0578095995757093, "grad_norm": 0.010748165659606457, "learning_rate": 5e-05, "loss": 0.0037, "step": 7760 }, { "epoch": 2.060461416070008, "grad_norm": 0.013346575200557709, "learning_rate": 5e-05, "loss": 0.0038, "step": 7770 }, { "epoch": 2.0631132325643065, "grad_norm": 0.017504857853055, "learning_rate": 5e-05, "loss": 0.0035, "step": 7780 }, { "epoch": 2.065765049058605, "grad_norm": 0.02126402221620083, "learning_rate": 5e-05, "loss": 0.0029, "step": 7790 }, { "epoch": 2.0684168655529036, "grad_norm": 0.014936577528715134, "learning_rate": 5e-05, "loss": 0.0036, "step": 7800 }, { "epoch": 2.071068682047202, "grad_norm": 0.010300401598215103, "learning_rate": 5e-05, "loss": 0.0032, "step": 7810 }, { "epoch": 2.0737204985415008, "grad_norm": 0.010759874247014523, "learning_rate": 5e-05, "loss": 0.0037, "step": 7820 }, { "epoch": 2.0763723150357993, "grad_norm": 0.011617331765592098, "learning_rate": 5e-05, "loss": 0.0031, "step": 7830 }, { "epoch": 2.079024131530098, "grad_norm": 0.006507335230708122, "learning_rate": 5e-05, "loss": 0.0036, "step": 7840 }, { "epoch": 2.0816759480243965, "grad_norm": 0.012249298393726349, "learning_rate": 5e-05, "loss": 0.004, "step": 7850 }, { "epoch": 2.0843277645186955, "grad_norm": 0.009742509573698044, "learning_rate": 5e-05, "loss": 0.0033, "step": 7860 }, { "epoch": 2.086979581012994, "grad_norm": 0.008740321733057499, "learning_rate": 5e-05, "loss": 0.0039, "step": 7870 }, { "epoch": 2.0896313975072927, "grad_norm": 0.011481222696602345, "learning_rate": 5e-05, "loss": 0.0035, "step": 7880 }, { "epoch": 2.0922832140015912, "grad_norm": 0.0070774490013718605, "learning_rate": 5e-05, "loss": 0.003, "step": 7890 }, { "epoch": 2.09493503049589, "grad_norm": 0.009654768742620945, "learning_rate": 5e-05, "loss": 0.0038, "step": 7900 }, { "epoch": 2.0975868469901884, "grad_norm": 0.014048945158720016, "learning_rate": 5e-05, "loss": 0.0041, "step": 7910 }, { "epoch": 2.100238663484487, "grad_norm": 0.012830098159611225, "learning_rate": 5e-05, "loss": 0.0031, "step": 7920 }, { "epoch": 2.1028904799787855, "grad_norm": 0.00925034936517477, "learning_rate": 5e-05, "loss": 0.0031, "step": 7930 }, { "epoch": 2.105542296473084, "grad_norm": 0.011425439268350601, "learning_rate": 5e-05, "loss": 0.0034, "step": 7940 }, { "epoch": 2.1081941129673827, "grad_norm": 0.012159310281276703, "learning_rate": 5e-05, "loss": 0.0042, "step": 7950 }, { "epoch": 2.1108459294616813, "grad_norm": 0.005153105594217777, "learning_rate": 5e-05, "loss": 0.0032, "step": 7960 }, { "epoch": 2.11349774595598, "grad_norm": 0.009789319708943367, "learning_rate": 5e-05, "loss": 0.0039, "step": 7970 }, { "epoch": 2.1161495624502784, "grad_norm": 0.012226143851876259, "learning_rate": 5e-05, "loss": 0.004, "step": 7980 }, { "epoch": 2.118801378944577, "grad_norm": 0.009878990240395069, "learning_rate": 5e-05, "loss": 0.003, "step": 7990 }, { "epoch": 2.1214531954388756, "grad_norm": 0.010955754667520523, "learning_rate": 5e-05, "loss": 0.0041, "step": 8000 }, { "epoch": 2.124105011933174, "grad_norm": 0.007398007437586784, "learning_rate": 5e-05, "loss": 0.0036, "step": 8010 }, { "epoch": 2.1267568284274727, "grad_norm": 0.006772639695554972, "learning_rate": 5e-05, "loss": 0.0035, "step": 8020 }, { "epoch": 2.1294086449217713, "grad_norm": 0.007351063657552004, "learning_rate": 5e-05, "loss": 0.0041, "step": 8030 }, { "epoch": 2.13206046141607, "grad_norm": 0.012457132339477539, "learning_rate": 5e-05, "loss": 0.0037, "step": 8040 }, { "epoch": 2.1347122779103684, "grad_norm": 0.011119748465716839, "learning_rate": 5e-05, "loss": 0.0039, "step": 8050 }, { "epoch": 2.137364094404667, "grad_norm": 0.015500233508646488, "learning_rate": 5e-05, "loss": 0.0038, "step": 8060 }, { "epoch": 2.1400159108989656, "grad_norm": 0.01022540032863617, "learning_rate": 5e-05, "loss": 0.0032, "step": 8070 }, { "epoch": 2.142667727393264, "grad_norm": 0.006994948256760836, "learning_rate": 5e-05, "loss": 0.004, "step": 8080 }, { "epoch": 2.145319543887563, "grad_norm": 0.006835215259343386, "learning_rate": 5e-05, "loss": 0.0037, "step": 8090 }, { "epoch": 2.1479713603818618, "grad_norm": 0.008271201513707638, "learning_rate": 5e-05, "loss": 0.0035, "step": 8100 }, { "epoch": 2.1506231768761603, "grad_norm": 0.007871189154684544, "learning_rate": 5e-05, "loss": 0.0032, "step": 8110 }, { "epoch": 2.153274993370459, "grad_norm": 0.011235035955905914, "learning_rate": 5e-05, "loss": 0.0028, "step": 8120 }, { "epoch": 2.1559268098647575, "grad_norm": 0.014173023402690887, "learning_rate": 5e-05, "loss": 0.0041, "step": 8130 }, { "epoch": 2.158578626359056, "grad_norm": 0.005477243103086948, "learning_rate": 5e-05, "loss": 0.0035, "step": 8140 }, { "epoch": 2.1612304428533546, "grad_norm": 0.018945366144180298, "learning_rate": 5e-05, "loss": 0.0032, "step": 8150 }, { "epoch": 2.163882259347653, "grad_norm": 0.006187544669955969, "learning_rate": 5e-05, "loss": 0.0033, "step": 8160 }, { "epoch": 2.166534075841952, "grad_norm": 0.0091762850061059, "learning_rate": 5e-05, "loss": 0.0028, "step": 8170 }, { "epoch": 2.1691858923362504, "grad_norm": 0.007951530627906322, "learning_rate": 5e-05, "loss": 0.004, "step": 8180 }, { "epoch": 2.171837708830549, "grad_norm": 0.009417155757546425, "learning_rate": 5e-05, "loss": 0.0033, "step": 8190 }, { "epoch": 2.1744895253248475, "grad_norm": 0.011150380596518517, "learning_rate": 5e-05, "loss": 0.0035, "step": 8200 }, { "epoch": 2.177141341819146, "grad_norm": 0.012988967821002007, "learning_rate": 5e-05, "loss": 0.0042, "step": 8210 }, { "epoch": 2.1797931583134447, "grad_norm": 0.010976849123835564, "learning_rate": 5e-05, "loss": 0.0038, "step": 8220 }, { "epoch": 2.1824449748077432, "grad_norm": 0.015874285250902176, "learning_rate": 5e-05, "loss": 0.0035, "step": 8230 }, { "epoch": 2.185096791302042, "grad_norm": 0.007811186369508505, "learning_rate": 5e-05, "loss": 0.0035, "step": 8240 }, { "epoch": 2.1877486077963404, "grad_norm": 0.012513087131083012, "learning_rate": 5e-05, "loss": 0.0033, "step": 8250 }, { "epoch": 2.190400424290639, "grad_norm": 0.009040352888405323, "learning_rate": 5e-05, "loss": 0.0036, "step": 8260 }, { "epoch": 2.1930522407849375, "grad_norm": 0.007143011782318354, "learning_rate": 5e-05, "loss": 0.0037, "step": 8270 }, { "epoch": 2.195704057279236, "grad_norm": 0.02263515442609787, "learning_rate": 5e-05, "loss": 0.0047, "step": 8280 }, { "epoch": 2.1983558737735347, "grad_norm": 0.01607399247586727, "learning_rate": 5e-05, "loss": 0.0037, "step": 8290 }, { "epoch": 2.2010076902678333, "grad_norm": 0.0135713592171669, "learning_rate": 5e-05, "loss": 0.0036, "step": 8300 }, { "epoch": 2.203659506762132, "grad_norm": 0.008566686883568764, "learning_rate": 5e-05, "loss": 0.0037, "step": 8310 }, { "epoch": 2.206311323256431, "grad_norm": 0.013139299117028713, "learning_rate": 5e-05, "loss": 0.0034, "step": 8320 }, { "epoch": 2.2089631397507294, "grad_norm": 0.00784307811409235, "learning_rate": 5e-05, "loss": 0.0031, "step": 8330 }, { "epoch": 2.211614956245028, "grad_norm": 0.006961996667087078, "learning_rate": 5e-05, "loss": 0.0039, "step": 8340 }, { "epoch": 2.2142667727393266, "grad_norm": 0.010873141698539257, "learning_rate": 5e-05, "loss": 0.0036, "step": 8350 }, { "epoch": 2.216918589233625, "grad_norm": 0.009528831578791142, "learning_rate": 5e-05, "loss": 0.0032, "step": 8360 }, { "epoch": 2.2195704057279237, "grad_norm": 0.011752023361623287, "learning_rate": 5e-05, "loss": 0.0035, "step": 8370 }, { "epoch": 2.2222222222222223, "grad_norm": 0.010808750055730343, "learning_rate": 5e-05, "loss": 0.0035, "step": 8380 }, { "epoch": 2.224874038716521, "grad_norm": 0.010668564587831497, "learning_rate": 5e-05, "loss": 0.0035, "step": 8390 }, { "epoch": 2.2275258552108195, "grad_norm": 0.00966513529419899, "learning_rate": 5e-05, "loss": 0.003, "step": 8400 }, { "epoch": 2.230177671705118, "grad_norm": 0.006987038534134626, "learning_rate": 5e-05, "loss": 0.0028, "step": 8410 }, { "epoch": 2.2328294881994166, "grad_norm": 0.010094232857227325, "learning_rate": 5e-05, "loss": 0.0038, "step": 8420 }, { "epoch": 2.235481304693715, "grad_norm": 0.006660869810730219, "learning_rate": 5e-05, "loss": 0.0033, "step": 8430 }, { "epoch": 2.2381331211880138, "grad_norm": 0.011899590492248535, "learning_rate": 5e-05, "loss": 0.0041, "step": 8440 }, { "epoch": 2.2407849376823123, "grad_norm": 0.008373958989977837, "learning_rate": 5e-05, "loss": 0.0033, "step": 8450 }, { "epoch": 2.243436754176611, "grad_norm": 0.012924647890031338, "learning_rate": 5e-05, "loss": 0.004, "step": 8460 }, { "epoch": 2.2460885706709095, "grad_norm": 0.00964029598981142, "learning_rate": 5e-05, "loss": 0.003, "step": 8470 }, { "epoch": 2.248740387165208, "grad_norm": 0.01887417584657669, "learning_rate": 5e-05, "loss": 0.0032, "step": 8480 }, { "epoch": 2.2513922036595067, "grad_norm": 0.012762613594532013, "learning_rate": 5e-05, "loss": 0.0036, "step": 8490 }, { "epoch": 2.2540440201538052, "grad_norm": 0.009652834385633469, "learning_rate": 5e-05, "loss": 0.003, "step": 8500 }, { "epoch": 2.256695836648104, "grad_norm": 0.009354501031339169, "learning_rate": 5e-05, "loss": 0.0037, "step": 8510 }, { "epoch": 2.2593476531424024, "grad_norm": 0.008950895629823208, "learning_rate": 5e-05, "loss": 0.0038, "step": 8520 }, { "epoch": 2.261999469636701, "grad_norm": 0.011087559163570404, "learning_rate": 5e-05, "loss": 0.0038, "step": 8530 }, { "epoch": 2.2646512861309995, "grad_norm": 0.01626458764076233, "learning_rate": 5e-05, "loss": 0.0036, "step": 8540 }, { "epoch": 2.2673031026252985, "grad_norm": 0.01411160547286272, "learning_rate": 5e-05, "loss": 0.004, "step": 8550 }, { "epoch": 2.2699549191195967, "grad_norm": 0.007286636158823967, "learning_rate": 5e-05, "loss": 0.0029, "step": 8560 }, { "epoch": 2.2726067356138957, "grad_norm": 0.008799084462225437, "learning_rate": 5e-05, "loss": 0.0036, "step": 8570 }, { "epoch": 2.2752585521081943, "grad_norm": 0.006380013190209866, "learning_rate": 5e-05, "loss": 0.0031, "step": 8580 }, { "epoch": 2.277910368602493, "grad_norm": 0.008965665474534035, "learning_rate": 5e-05, "loss": 0.0031, "step": 8590 }, { "epoch": 2.2805621850967914, "grad_norm": 0.01934429258108139, "learning_rate": 5e-05, "loss": 0.0033, "step": 8600 }, { "epoch": 2.28321400159109, "grad_norm": 0.0086721982806921, "learning_rate": 5e-05, "loss": 0.0033, "step": 8610 }, { "epoch": 2.2858658180853886, "grad_norm": 0.011028343811631203, "learning_rate": 5e-05, "loss": 0.0034, "step": 8620 }, { "epoch": 2.288517634579687, "grad_norm": 0.008308466523885727, "learning_rate": 5e-05, "loss": 0.003, "step": 8630 }, { "epoch": 2.2911694510739857, "grad_norm": 0.006808287464082241, "learning_rate": 5e-05, "loss": 0.0031, "step": 8640 }, { "epoch": 2.2938212675682843, "grad_norm": 0.010718496516346931, "learning_rate": 5e-05, "loss": 0.0032, "step": 8650 }, { "epoch": 2.296473084062583, "grad_norm": 0.011856146156787872, "learning_rate": 5e-05, "loss": 0.0035, "step": 8660 }, { "epoch": 2.2991249005568815, "grad_norm": 0.007587608881294727, "learning_rate": 5e-05, "loss": 0.0036, "step": 8670 }, { "epoch": 2.30177671705118, "grad_norm": 0.009049585089087486, "learning_rate": 5e-05, "loss": 0.0035, "step": 8680 }, { "epoch": 2.3044285335454786, "grad_norm": 0.008744058199226856, "learning_rate": 5e-05, "loss": 0.0031, "step": 8690 }, { "epoch": 2.307080350039777, "grad_norm": 0.00967977475374937, "learning_rate": 5e-05, "loss": 0.004, "step": 8700 }, { "epoch": 2.3097321665340758, "grad_norm": 0.009122500196099281, "learning_rate": 5e-05, "loss": 0.0031, "step": 8710 }, { "epoch": 2.3123839830283743, "grad_norm": 0.007296338211745024, "learning_rate": 5e-05, "loss": 0.0029, "step": 8720 }, { "epoch": 2.315035799522673, "grad_norm": 0.01111921202391386, "learning_rate": 5e-05, "loss": 0.0034, "step": 8730 }, { "epoch": 2.3176876160169715, "grad_norm": 0.010613585822284222, "learning_rate": 5e-05, "loss": 0.0027, "step": 8740 }, { "epoch": 2.32033943251127, "grad_norm": 0.022785112261772156, "learning_rate": 5e-05, "loss": 0.0038, "step": 8750 }, { "epoch": 2.3229912490055686, "grad_norm": 0.008978509344160557, "learning_rate": 5e-05, "loss": 0.0034, "step": 8760 }, { "epoch": 2.325643065499867, "grad_norm": 0.01058660913258791, "learning_rate": 5e-05, "loss": 0.0034, "step": 8770 }, { "epoch": 2.3282948819941662, "grad_norm": 0.0124271921813488, "learning_rate": 5e-05, "loss": 0.0033, "step": 8780 }, { "epoch": 2.3309466984884644, "grad_norm": 0.007701314520090818, "learning_rate": 5e-05, "loss": 0.0032, "step": 8790 }, { "epoch": 2.3335985149827634, "grad_norm": 0.017071649432182312, "learning_rate": 5e-05, "loss": 0.0031, "step": 8800 }, { "epoch": 2.336250331477062, "grad_norm": 0.011354390531778336, "learning_rate": 5e-05, "loss": 0.0044, "step": 8810 }, { "epoch": 2.3389021479713605, "grad_norm": 0.010670194402337074, "learning_rate": 5e-05, "loss": 0.0034, "step": 8820 }, { "epoch": 2.341553964465659, "grad_norm": 0.010478717274963856, "learning_rate": 5e-05, "loss": 0.003, "step": 8830 }, { "epoch": 2.3442057809599577, "grad_norm": 0.012502174824476242, "learning_rate": 5e-05, "loss": 0.0029, "step": 8840 }, { "epoch": 2.3468575974542563, "grad_norm": 0.01142768282443285, "learning_rate": 5e-05, "loss": 0.003, "step": 8850 }, { "epoch": 2.349509413948555, "grad_norm": 0.010149066336452961, "learning_rate": 5e-05, "loss": 0.004, "step": 8860 }, { "epoch": 2.3521612304428534, "grad_norm": 0.01926625892519951, "learning_rate": 5e-05, "loss": 0.0033, "step": 8870 }, { "epoch": 2.354813046937152, "grad_norm": 0.011835222132503986, "learning_rate": 5e-05, "loss": 0.0029, "step": 8880 }, { "epoch": 2.3574648634314506, "grad_norm": 0.007093705236911774, "learning_rate": 5e-05, "loss": 0.0031, "step": 8890 }, { "epoch": 2.360116679925749, "grad_norm": 0.011290466412901878, "learning_rate": 5e-05, "loss": 0.003, "step": 8900 }, { "epoch": 2.3627684964200477, "grad_norm": 0.00821759644895792, "learning_rate": 5e-05, "loss": 0.0029, "step": 8910 }, { "epoch": 2.3654203129143463, "grad_norm": 0.014491350390017033, "learning_rate": 5e-05, "loss": 0.0034, "step": 8920 }, { "epoch": 2.368072129408645, "grad_norm": 0.00922013632953167, "learning_rate": 5e-05, "loss": 0.0034, "step": 8930 }, { "epoch": 2.3707239459029434, "grad_norm": 0.00916734803467989, "learning_rate": 5e-05, "loss": 0.0041, "step": 8940 }, { "epoch": 2.373375762397242, "grad_norm": 0.011871268041431904, "learning_rate": 5e-05, "loss": 0.0035, "step": 8950 }, { "epoch": 2.3760275788915406, "grad_norm": 0.007687619421631098, "learning_rate": 5e-05, "loss": 0.0031, "step": 8960 }, { "epoch": 2.378679395385839, "grad_norm": 0.010417713783681393, "learning_rate": 5e-05, "loss": 0.003, "step": 8970 }, { "epoch": 2.3813312118801377, "grad_norm": 0.007864427752792835, "learning_rate": 5e-05, "loss": 0.0026, "step": 8980 }, { "epoch": 2.3839830283744363, "grad_norm": 0.012203100137412548, "learning_rate": 5e-05, "loss": 0.0031, "step": 8990 }, { "epoch": 2.386634844868735, "grad_norm": 0.009726407937705517, "learning_rate": 5e-05, "loss": 0.0042, "step": 9000 }, { "epoch": 2.389286661363034, "grad_norm": 0.011470604687929153, "learning_rate": 5e-05, "loss": 0.0035, "step": 9010 }, { "epoch": 2.391938477857332, "grad_norm": 0.012855783104896545, "learning_rate": 5e-05, "loss": 0.0036, "step": 9020 }, { "epoch": 2.394590294351631, "grad_norm": 0.0058795614168047905, "learning_rate": 5e-05, "loss": 0.0028, "step": 9030 }, { "epoch": 2.3972421108459296, "grad_norm": 0.0108746737241745, "learning_rate": 5e-05, "loss": 0.0033, "step": 9040 }, { "epoch": 2.399893927340228, "grad_norm": 0.009743508882820606, "learning_rate": 5e-05, "loss": 0.003, "step": 9050 }, { "epoch": 2.402545743834527, "grad_norm": 0.00515114376321435, "learning_rate": 5e-05, "loss": 0.0033, "step": 9060 }, { "epoch": 2.4051975603288254, "grad_norm": 0.007264383137226105, "learning_rate": 5e-05, "loss": 0.0037, "step": 9070 }, { "epoch": 2.407849376823124, "grad_norm": 0.007139577995985746, "learning_rate": 5e-05, "loss": 0.003, "step": 9080 }, { "epoch": 2.4105011933174225, "grad_norm": 0.012730707414448261, "learning_rate": 5e-05, "loss": 0.0032, "step": 9090 }, { "epoch": 2.413153009811721, "grad_norm": 0.006217214744538069, "learning_rate": 5e-05, "loss": 0.0035, "step": 9100 }, { "epoch": 2.4158048263060197, "grad_norm": 0.0033778015058487654, "learning_rate": 5e-05, "loss": 0.0029, "step": 9110 }, { "epoch": 2.4184566428003182, "grad_norm": 0.006488529033958912, "learning_rate": 5e-05, "loss": 0.0035, "step": 9120 }, { "epoch": 2.421108459294617, "grad_norm": 0.011433531530201435, "learning_rate": 5e-05, "loss": 0.0029, "step": 9130 }, { "epoch": 2.4237602757889154, "grad_norm": 0.009380683302879333, "learning_rate": 5e-05, "loss": 0.0032, "step": 9140 }, { "epoch": 2.426412092283214, "grad_norm": 0.006668803747743368, "learning_rate": 5e-05, "loss": 0.0038, "step": 9150 }, { "epoch": 2.4290639087775125, "grad_norm": 0.007920114323496819, "learning_rate": 5e-05, "loss": 0.0033, "step": 9160 }, { "epoch": 2.431715725271811, "grad_norm": 0.015390190295875072, "learning_rate": 5e-05, "loss": 0.0038, "step": 9170 }, { "epoch": 2.4343675417661097, "grad_norm": 0.012680286541581154, "learning_rate": 5e-05, "loss": 0.0035, "step": 9180 }, { "epoch": 2.4370193582604083, "grad_norm": 0.011429259553551674, "learning_rate": 5e-05, "loss": 0.0034, "step": 9190 }, { "epoch": 2.439671174754707, "grad_norm": 0.007637449540197849, "learning_rate": 5e-05, "loss": 0.0033, "step": 9200 }, { "epoch": 2.4423229912490054, "grad_norm": 0.01120874471962452, "learning_rate": 5e-05, "loss": 0.0036, "step": 9210 }, { "epoch": 2.444974807743304, "grad_norm": 0.008990606293082237, "learning_rate": 5e-05, "loss": 0.0027, "step": 9220 }, { "epoch": 2.4476266242376026, "grad_norm": 0.010240115225315094, "learning_rate": 5e-05, "loss": 0.0031, "step": 9230 }, { "epoch": 2.4502784407319016, "grad_norm": 0.008155914023518562, "learning_rate": 5e-05, "loss": 0.003, "step": 9240 }, { "epoch": 2.4529302572261997, "grad_norm": 0.008066438138484955, "learning_rate": 5e-05, "loss": 0.0026, "step": 9250 }, { "epoch": 2.4555820737204987, "grad_norm": 0.009229475632309914, "learning_rate": 5e-05, "loss": 0.0038, "step": 9260 }, { "epoch": 2.4582338902147973, "grad_norm": 0.007709812372922897, "learning_rate": 5e-05, "loss": 0.0029, "step": 9270 }, { "epoch": 2.460885706709096, "grad_norm": 0.010268161073327065, "learning_rate": 5e-05, "loss": 0.0034, "step": 9280 }, { "epoch": 2.4635375232033945, "grad_norm": 0.016149045899510384, "learning_rate": 5e-05, "loss": 0.0034, "step": 9290 }, { "epoch": 2.466189339697693, "grad_norm": 0.007346258033066988, "learning_rate": 5e-05, "loss": 0.0031, "step": 9300 }, { "epoch": 2.4688411561919916, "grad_norm": 0.005993125028908253, "learning_rate": 5e-05, "loss": 0.0034, "step": 9310 }, { "epoch": 2.47149297268629, "grad_norm": 0.006262932904064655, "learning_rate": 5e-05, "loss": 0.0032, "step": 9320 }, { "epoch": 2.4741447891805888, "grad_norm": 0.010343831032514572, "learning_rate": 5e-05, "loss": 0.0029, "step": 9330 }, { "epoch": 2.4767966056748874, "grad_norm": 0.00988733023405075, "learning_rate": 5e-05, "loss": 0.0035, "step": 9340 }, { "epoch": 2.479448422169186, "grad_norm": 0.013346554711461067, "learning_rate": 5e-05, "loss": 0.003, "step": 9350 }, { "epoch": 2.4821002386634845, "grad_norm": 0.010080711916089058, "learning_rate": 5e-05, "loss": 0.0028, "step": 9360 }, { "epoch": 2.484752055157783, "grad_norm": 0.010317140258848667, "learning_rate": 5e-05, "loss": 0.0031, "step": 9370 }, { "epoch": 2.4874038716520817, "grad_norm": 0.007549201603978872, "learning_rate": 5e-05, "loss": 0.0034, "step": 9380 }, { "epoch": 2.4900556881463802, "grad_norm": 0.013462088070809841, "learning_rate": 5e-05, "loss": 0.0028, "step": 9390 }, { "epoch": 2.492707504640679, "grad_norm": 0.009537049569189548, "learning_rate": 5e-05, "loss": 0.0036, "step": 9400 }, { "epoch": 2.4953593211349774, "grad_norm": 0.008907473646104336, "learning_rate": 5e-05, "loss": 0.0026, "step": 9410 }, { "epoch": 2.498011137629276, "grad_norm": 0.008721532300114632, "learning_rate": 5e-05, "loss": 0.0029, "step": 9420 }, { "epoch": 2.5006629541235745, "grad_norm": 0.008660989813506603, "learning_rate": 5e-05, "loss": 0.0035, "step": 9430 }, { "epoch": 2.503314770617873, "grad_norm": 0.012278860434889793, "learning_rate": 5e-05, "loss": 0.0032, "step": 9440 }, { "epoch": 2.5059665871121717, "grad_norm": 0.01262583862990141, "learning_rate": 5e-05, "loss": 0.0033, "step": 9450 }, { "epoch": 2.5086184036064703, "grad_norm": 0.021372873336076736, "learning_rate": 5e-05, "loss": 0.0033, "step": 9460 }, { "epoch": 2.5112702201007693, "grad_norm": 0.013200702145695686, "learning_rate": 5e-05, "loss": 0.0035, "step": 9470 }, { "epoch": 2.5139220365950674, "grad_norm": 0.019265584647655487, "learning_rate": 5e-05, "loss": 0.0034, "step": 9480 }, { "epoch": 2.5165738530893664, "grad_norm": 0.007920782081782818, "learning_rate": 5e-05, "loss": 0.0042, "step": 9490 }, { "epoch": 2.5192256695836646, "grad_norm": 0.009029662236571312, "learning_rate": 5e-05, "loss": 0.0032, "step": 9500 }, { "epoch": 2.5218774860779636, "grad_norm": 0.004908683709800243, "learning_rate": 5e-05, "loss": 0.003, "step": 9510 }, { "epoch": 2.524529302572262, "grad_norm": 0.010281630791723728, "learning_rate": 5e-05, "loss": 0.003, "step": 9520 }, { "epoch": 2.5271811190665607, "grad_norm": 0.009765415452420712, "learning_rate": 5e-05, "loss": 0.0031, "step": 9530 }, { "epoch": 2.5298329355608593, "grad_norm": 0.009661361575126648, "learning_rate": 5e-05, "loss": 0.0027, "step": 9540 }, { "epoch": 2.532484752055158, "grad_norm": 0.010293838568031788, "learning_rate": 5e-05, "loss": 0.0032, "step": 9550 }, { "epoch": 2.5351365685494565, "grad_norm": 0.0117010697722435, "learning_rate": 5e-05, "loss": 0.0035, "step": 9560 }, { "epoch": 2.537788385043755, "grad_norm": 0.007517325691878796, "learning_rate": 5e-05, "loss": 0.0029, "step": 9570 }, { "epoch": 2.5404402015380536, "grad_norm": 0.008885391056537628, "learning_rate": 5e-05, "loss": 0.0029, "step": 9580 }, { "epoch": 2.543092018032352, "grad_norm": 0.014154150150716305, "learning_rate": 5e-05, "loss": 0.003, "step": 9590 }, { "epoch": 2.5457438345266508, "grad_norm": 0.012642758898437023, "learning_rate": 5e-05, "loss": 0.004, "step": 9600 }, { "epoch": 2.5483956510209493, "grad_norm": 0.0073194680735468864, "learning_rate": 5e-05, "loss": 0.0031, "step": 9610 }, { "epoch": 2.551047467515248, "grad_norm": 0.017527198418974876, "learning_rate": 5e-05, "loss": 0.0032, "step": 9620 }, { "epoch": 2.5536992840095465, "grad_norm": 0.007269429974257946, "learning_rate": 5e-05, "loss": 0.003, "step": 9630 }, { "epoch": 2.556351100503845, "grad_norm": 0.008839873597025871, "learning_rate": 5e-05, "loss": 0.0039, "step": 9640 }, { "epoch": 2.5590029169981436, "grad_norm": 0.008140191435813904, "learning_rate": 5e-05, "loss": 0.0034, "step": 9650 }, { "epoch": 2.561654733492442, "grad_norm": 0.006823945790529251, "learning_rate": 5e-05, "loss": 0.0031, "step": 9660 }, { "epoch": 2.564306549986741, "grad_norm": 0.007753221783787012, "learning_rate": 5e-05, "loss": 0.0034, "step": 9670 }, { "epoch": 2.5669583664810394, "grad_norm": 0.014629178680479527, "learning_rate": 5e-05, "loss": 0.0031, "step": 9680 }, { "epoch": 2.569610182975338, "grad_norm": 0.012456674128770828, "learning_rate": 5e-05, "loss": 0.0034, "step": 9690 }, { "epoch": 2.572261999469637, "grad_norm": 0.01003741379827261, "learning_rate": 5e-05, "loss": 0.003, "step": 9700 }, { "epoch": 2.574913815963935, "grad_norm": 0.008565345779061317, "learning_rate": 5e-05, "loss": 0.0038, "step": 9710 }, { "epoch": 2.577565632458234, "grad_norm": 0.009958758018910885, "learning_rate": 5e-05, "loss": 0.0031, "step": 9720 }, { "epoch": 2.5802174489525322, "grad_norm": 0.008738914504647255, "learning_rate": 5e-05, "loss": 0.0026, "step": 9730 }, { "epoch": 2.5828692654468313, "grad_norm": 0.011310593225061893, "learning_rate": 5e-05, "loss": 0.003, "step": 9740 }, { "epoch": 2.58552108194113, "grad_norm": 0.011224153451621532, "learning_rate": 5e-05, "loss": 0.0031, "step": 9750 }, { "epoch": 2.5881728984354284, "grad_norm": 0.010082468390464783, "learning_rate": 5e-05, "loss": 0.0033, "step": 9760 }, { "epoch": 2.590824714929727, "grad_norm": 0.009013529866933823, "learning_rate": 5e-05, "loss": 0.0024, "step": 9770 }, { "epoch": 2.5934765314240256, "grad_norm": 0.010932885110378265, "learning_rate": 5e-05, "loss": 0.0028, "step": 9780 }, { "epoch": 2.596128347918324, "grad_norm": 0.009222094900906086, "learning_rate": 5e-05, "loss": 0.0025, "step": 9790 }, { "epoch": 2.5987801644126227, "grad_norm": 0.0162618700414896, "learning_rate": 5e-05, "loss": 0.0036, "step": 9800 }, { "epoch": 2.6014319809069213, "grad_norm": 0.008581521920859814, "learning_rate": 5e-05, "loss": 0.0028, "step": 9810 }, { "epoch": 2.60408379740122, "grad_norm": 0.010060912929475307, "learning_rate": 5e-05, "loss": 0.0034, "step": 9820 }, { "epoch": 2.6067356138955184, "grad_norm": 0.010502337478101254, "learning_rate": 5e-05, "loss": 0.0031, "step": 9830 }, { "epoch": 2.609387430389817, "grad_norm": 0.008352628909051418, "learning_rate": 5e-05, "loss": 0.0038, "step": 9840 }, { "epoch": 2.6120392468841156, "grad_norm": 0.011281336657702923, "learning_rate": 5e-05, "loss": 0.0043, "step": 9850 }, { "epoch": 2.614691063378414, "grad_norm": 0.014391069300472736, "learning_rate": 5e-05, "loss": 0.0027, "step": 9860 }, { "epoch": 2.6173428798727127, "grad_norm": 0.006904127541929483, "learning_rate": 5e-05, "loss": 0.0038, "step": 9870 }, { "epoch": 2.6199946963670113, "grad_norm": 0.009598644450306892, "learning_rate": 5e-05, "loss": 0.0026, "step": 9880 }, { "epoch": 2.62264651286131, "grad_norm": 0.008691622875630856, "learning_rate": 5e-05, "loss": 0.0031, "step": 9890 }, { "epoch": 2.6252983293556085, "grad_norm": 0.012788784690201283, "learning_rate": 5e-05, "loss": 0.0026, "step": 9900 }, { "epoch": 2.627950145849907, "grad_norm": 0.007950362749397755, "learning_rate": 5e-05, "loss": 0.0024, "step": 9910 }, { "epoch": 2.6306019623442056, "grad_norm": 0.011486959643661976, "learning_rate": 5e-05, "loss": 0.0033, "step": 9920 }, { "epoch": 2.6332537788385046, "grad_norm": 0.007678466383367777, "learning_rate": 5e-05, "loss": 0.0031, "step": 9930 }, { "epoch": 2.6359055953328028, "grad_norm": 0.012686515226960182, "learning_rate": 5e-05, "loss": 0.0028, "step": 9940 }, { "epoch": 2.638557411827102, "grad_norm": 0.012709353119134903, "learning_rate": 5e-05, "loss": 0.0033, "step": 9950 }, { "epoch": 2.6412092283214, "grad_norm": 0.010673453100025654, "learning_rate": 5e-05, "loss": 0.0026, "step": 9960 }, { "epoch": 2.643861044815699, "grad_norm": 0.007346470840275288, "learning_rate": 5e-05, "loss": 0.0029, "step": 9970 }, { "epoch": 2.646512861309997, "grad_norm": 0.01014084555208683, "learning_rate": 5e-05, "loss": 0.0024, "step": 9980 }, { "epoch": 2.649164677804296, "grad_norm": 0.012180601246654987, "learning_rate": 5e-05, "loss": 0.0029, "step": 9990 }, { "epoch": 2.6518164942985947, "grad_norm": 0.01186351478099823, "learning_rate": 5e-05, "loss": 0.0035, "step": 10000 }, { "epoch": 2.6544683107928932, "grad_norm": 0.01144286710768938, "learning_rate": 5e-05, "loss": 0.0038, "step": 10010 }, { "epoch": 2.657120127287192, "grad_norm": 0.009177569299936295, "learning_rate": 5e-05, "loss": 0.0037, "step": 10020 }, { "epoch": 2.6597719437814904, "grad_norm": 0.011506020091474056, "learning_rate": 5e-05, "loss": 0.0035, "step": 10030 }, { "epoch": 2.662423760275789, "grad_norm": 0.010188901796936989, "learning_rate": 5e-05, "loss": 0.0034, "step": 10040 }, { "epoch": 2.6650755767700876, "grad_norm": 0.008199119940400124, "learning_rate": 5e-05, "loss": 0.0033, "step": 10050 }, { "epoch": 2.667727393264386, "grad_norm": 0.009244361892342567, "learning_rate": 5e-05, "loss": 0.0034, "step": 10060 }, { "epoch": 2.6703792097586847, "grad_norm": 0.009847949258983135, "learning_rate": 5e-05, "loss": 0.0032, "step": 10070 }, { "epoch": 2.6730310262529833, "grad_norm": 0.009883273392915726, "learning_rate": 5e-05, "loss": 0.0032, "step": 10080 }, { "epoch": 2.675682842747282, "grad_norm": 0.00783332996070385, "learning_rate": 5e-05, "loss": 0.0035, "step": 10090 }, { "epoch": 2.6783346592415804, "grad_norm": 0.008840855211019516, "learning_rate": 5e-05, "loss": 0.0028, "step": 10100 }, { "epoch": 2.680986475735879, "grad_norm": 0.008041713386774063, "learning_rate": 5e-05, "loss": 0.0036, "step": 10110 }, { "epoch": 2.6836382922301776, "grad_norm": 0.005123327486217022, "learning_rate": 5e-05, "loss": 0.0029, "step": 10120 }, { "epoch": 2.686290108724476, "grad_norm": 0.007316894363611937, "learning_rate": 5e-05, "loss": 0.003, "step": 10130 }, { "epoch": 2.6889419252187747, "grad_norm": 0.010139225050807, "learning_rate": 5e-05, "loss": 0.0026, "step": 10140 }, { "epoch": 2.6915937417130733, "grad_norm": 0.008890466764569283, "learning_rate": 5e-05, "loss": 0.0029, "step": 10150 }, { "epoch": 2.6942455582073723, "grad_norm": 0.0074600032530725, "learning_rate": 5e-05, "loss": 0.0033, "step": 10160 }, { "epoch": 2.6968973747016705, "grad_norm": 0.006790932733565569, "learning_rate": 5e-05, "loss": 0.0033, "step": 10170 }, { "epoch": 2.6995491911959695, "grad_norm": 0.008338709361851215, "learning_rate": 5e-05, "loss": 0.0033, "step": 10180 }, { "epoch": 2.7022010076902676, "grad_norm": 0.012687193229794502, "learning_rate": 5e-05, "loss": 0.0027, "step": 10190 }, { "epoch": 2.7048528241845666, "grad_norm": 0.007650404702872038, "learning_rate": 5e-05, "loss": 0.0026, "step": 10200 }, { "epoch": 2.7075046406788648, "grad_norm": 0.006182400975376368, "learning_rate": 5e-05, "loss": 0.0031, "step": 10210 }, { "epoch": 2.710156457173164, "grad_norm": 0.019282739609479904, "learning_rate": 5e-05, "loss": 0.0034, "step": 10220 }, { "epoch": 2.7128082736674624, "grad_norm": 0.007319161668419838, "learning_rate": 5e-05, "loss": 0.0032, "step": 10230 }, { "epoch": 2.715460090161761, "grad_norm": 0.0033780676312744617, "learning_rate": 5e-05, "loss": 0.0029, "step": 10240 }, { "epoch": 2.7181119066560595, "grad_norm": 0.008869151584804058, "learning_rate": 5e-05, "loss": 0.0036, "step": 10250 }, { "epoch": 2.720763723150358, "grad_norm": 0.00951290875673294, "learning_rate": 5e-05, "loss": 0.0034, "step": 10260 }, { "epoch": 2.7234155396446567, "grad_norm": 0.0077605885453522205, "learning_rate": 5e-05, "loss": 0.0036, "step": 10270 }, { "epoch": 2.7260673561389552, "grad_norm": 0.00809828657656908, "learning_rate": 5e-05, "loss": 0.0026, "step": 10280 }, { "epoch": 2.728719172633254, "grad_norm": 0.008521676063537598, "learning_rate": 5e-05, "loss": 0.003, "step": 10290 }, { "epoch": 2.7313709891275524, "grad_norm": 0.010032078251242638, "learning_rate": 5e-05, "loss": 0.003, "step": 10300 }, { "epoch": 2.734022805621851, "grad_norm": 0.014851457439363003, "learning_rate": 5e-05, "loss": 0.0037, "step": 10310 }, { "epoch": 2.7366746221161495, "grad_norm": 0.008225297555327415, "learning_rate": 5e-05, "loss": 0.0035, "step": 10320 }, { "epoch": 2.739326438610448, "grad_norm": 0.008410945534706116, "learning_rate": 5e-05, "loss": 0.0033, "step": 10330 }, { "epoch": 2.7419782551047467, "grad_norm": 0.008654527366161346, "learning_rate": 5e-05, "loss": 0.0027, "step": 10340 }, { "epoch": 2.7446300715990453, "grad_norm": 0.013079024851322174, "learning_rate": 5e-05, "loss": 0.0034, "step": 10350 }, { "epoch": 2.747281888093344, "grad_norm": 0.007196380756795406, "learning_rate": 5e-05, "loss": 0.0029, "step": 10360 }, { "epoch": 2.7499337045876424, "grad_norm": 0.01181880198419094, "learning_rate": 5e-05, "loss": 0.0034, "step": 10370 }, { "epoch": 2.752585521081941, "grad_norm": 0.0075484588742256165, "learning_rate": 5e-05, "loss": 0.0036, "step": 10380 }, { "epoch": 2.7552373375762396, "grad_norm": 0.018504725769162178, "learning_rate": 5e-05, "loss": 0.0033, "step": 10390 }, { "epoch": 2.757889154070538, "grad_norm": 0.006315159611403942, "learning_rate": 5e-05, "loss": 0.003, "step": 10400 }, { "epoch": 2.760540970564837, "grad_norm": 0.006599197629839182, "learning_rate": 5e-05, "loss": 0.003, "step": 10410 }, { "epoch": 2.7631927870591353, "grad_norm": 0.008563859388232231, "learning_rate": 5e-05, "loss": 0.0029, "step": 10420 }, { "epoch": 2.7658446035534343, "grad_norm": 0.008424398489296436, "learning_rate": 5e-05, "loss": 0.003, "step": 10430 }, { "epoch": 2.7684964200477324, "grad_norm": 0.011539159342646599, "learning_rate": 5e-05, "loss": 0.0029, "step": 10440 }, { "epoch": 2.7711482365420315, "grad_norm": 0.011557120829820633, "learning_rate": 5e-05, "loss": 0.0036, "step": 10450 }, { "epoch": 2.77380005303633, "grad_norm": 0.011152615770697594, "learning_rate": 5e-05, "loss": 0.0028, "step": 10460 }, { "epoch": 2.7764518695306286, "grad_norm": 0.011444439180195332, "learning_rate": 5e-05, "loss": 0.0032, "step": 10470 }, { "epoch": 2.779103686024927, "grad_norm": 0.014446464367210865, "learning_rate": 5e-05, "loss": 0.0033, "step": 10480 }, { "epoch": 2.7817555025192258, "grad_norm": 0.008268372155725956, "learning_rate": 5e-05, "loss": 0.0026, "step": 10490 }, { "epoch": 2.7844073190135243, "grad_norm": 0.013074197806417942, "learning_rate": 5e-05, "loss": 0.0024, "step": 10500 }, { "epoch": 2.787059135507823, "grad_norm": 0.011859544552862644, "learning_rate": 5e-05, "loss": 0.0029, "step": 10510 }, { "epoch": 2.7897109520021215, "grad_norm": 0.007267489563673735, "learning_rate": 5e-05, "loss": 0.003, "step": 10520 }, { "epoch": 2.79236276849642, "grad_norm": 0.010367272421717644, "learning_rate": 5e-05, "loss": 0.0036, "step": 10530 }, { "epoch": 2.7950145849907186, "grad_norm": 0.012599032372236252, "learning_rate": 5e-05, "loss": 0.0037, "step": 10540 }, { "epoch": 2.797666401485017, "grad_norm": 0.011411398649215698, "learning_rate": 5e-05, "loss": 0.0027, "step": 10550 }, { "epoch": 2.800318217979316, "grad_norm": 0.009426480159163475, "learning_rate": 5e-05, "loss": 0.0025, "step": 10560 }, { "epoch": 2.8029700344736144, "grad_norm": 0.00705475639551878, "learning_rate": 5e-05, "loss": 0.0028, "step": 10570 }, { "epoch": 2.805621850967913, "grad_norm": 0.010897872969508171, "learning_rate": 5e-05, "loss": 0.0032, "step": 10580 }, { "epoch": 2.8082736674622115, "grad_norm": 0.012352865189313889, "learning_rate": 5e-05, "loss": 0.0031, "step": 10590 }, { "epoch": 2.81092548395651, "grad_norm": 0.00810383539646864, "learning_rate": 5e-05, "loss": 0.0022, "step": 10600 }, { "epoch": 2.8135773004508087, "grad_norm": 0.006266561336815357, "learning_rate": 5e-05, "loss": 0.003, "step": 10610 }, { "epoch": 2.8162291169451072, "grad_norm": 0.006467216648161411, "learning_rate": 5e-05, "loss": 0.0028, "step": 10620 }, { "epoch": 2.818880933439406, "grad_norm": 0.007913012057542801, "learning_rate": 5e-05, "loss": 0.0026, "step": 10630 }, { "epoch": 2.821532749933705, "grad_norm": 0.0138782300055027, "learning_rate": 5e-05, "loss": 0.0027, "step": 10640 }, { "epoch": 2.824184566428003, "grad_norm": 0.009950296953320503, "learning_rate": 5e-05, "loss": 0.0023, "step": 10650 }, { "epoch": 2.826836382922302, "grad_norm": 0.012419586069881916, "learning_rate": 5e-05, "loss": 0.003, "step": 10660 }, { "epoch": 2.8294881994166, "grad_norm": 0.009047228842973709, "learning_rate": 5e-05, "loss": 0.0029, "step": 10670 }, { "epoch": 2.832140015910899, "grad_norm": 0.008618469350039959, "learning_rate": 5e-05, "loss": 0.0032, "step": 10680 }, { "epoch": 2.8347918324051977, "grad_norm": 0.0074747176840901375, "learning_rate": 5e-05, "loss": 0.0037, "step": 10690 }, { "epoch": 2.8374436488994963, "grad_norm": 0.012326627038419247, "learning_rate": 5e-05, "loss": 0.0035, "step": 10700 }, { "epoch": 2.840095465393795, "grad_norm": 0.009822036139667034, "learning_rate": 5e-05, "loss": 0.0034, "step": 10710 }, { "epoch": 2.8427472818880934, "grad_norm": 0.011336802504956722, "learning_rate": 5e-05, "loss": 0.0032, "step": 10720 }, { "epoch": 2.845399098382392, "grad_norm": 0.006518637761473656, "learning_rate": 5e-05, "loss": 0.003, "step": 10730 }, { "epoch": 2.8480509148766906, "grad_norm": 0.013575873337686062, "learning_rate": 5e-05, "loss": 0.0034, "step": 10740 }, { "epoch": 2.850702731370989, "grad_norm": 0.01033126749098301, "learning_rate": 5e-05, "loss": 0.0036, "step": 10750 }, { "epoch": 2.8533545478652877, "grad_norm": 0.013805652037262917, "learning_rate": 5e-05, "loss": 0.0027, "step": 10760 }, { "epoch": 2.8560063643595863, "grad_norm": 0.007651107385754585, "learning_rate": 5e-05, "loss": 0.003, "step": 10770 }, { "epoch": 2.858658180853885, "grad_norm": 0.005657474044710398, "learning_rate": 5e-05, "loss": 0.0025, "step": 10780 }, { "epoch": 2.8613099973481835, "grad_norm": 0.007111294195055962, "learning_rate": 5e-05, "loss": 0.0027, "step": 10790 }, { "epoch": 2.863961813842482, "grad_norm": 0.010856878012418747, "learning_rate": 5e-05, "loss": 0.0027, "step": 10800 }, { "epoch": 2.8666136303367806, "grad_norm": 0.01172639150172472, "learning_rate": 5e-05, "loss": 0.0033, "step": 10810 }, { "epoch": 2.869265446831079, "grad_norm": 0.00953640602529049, "learning_rate": 5e-05, "loss": 0.003, "step": 10820 }, { "epoch": 2.871917263325378, "grad_norm": 0.006831949111074209, "learning_rate": 5e-05, "loss": 0.003, "step": 10830 }, { "epoch": 2.8745690798196764, "grad_norm": 0.009326920844614506, "learning_rate": 5e-05, "loss": 0.0028, "step": 10840 }, { "epoch": 2.877220896313975, "grad_norm": 0.00944512989372015, "learning_rate": 5e-05, "loss": 0.0029, "step": 10850 }, { "epoch": 2.8798727128082735, "grad_norm": 0.014380382373929024, "learning_rate": 5e-05, "loss": 0.0026, "step": 10860 }, { "epoch": 2.8825245293025725, "grad_norm": 0.011668446473777294, "learning_rate": 5e-05, "loss": 0.0034, "step": 10870 }, { "epoch": 2.8851763457968707, "grad_norm": 0.007226914633065462, "learning_rate": 5e-05, "loss": 0.0029, "step": 10880 }, { "epoch": 2.8878281622911697, "grad_norm": 0.014663147740066051, "learning_rate": 5e-05, "loss": 0.003, "step": 10890 }, { "epoch": 2.890479978785468, "grad_norm": 0.010098678059875965, "learning_rate": 5e-05, "loss": 0.0029, "step": 10900 }, { "epoch": 2.893131795279767, "grad_norm": 0.01621408201754093, "learning_rate": 5e-05, "loss": 0.0041, "step": 10910 }, { "epoch": 2.895783611774065, "grad_norm": 0.007404005620628595, "learning_rate": 5e-05, "loss": 0.0026, "step": 10920 }, { "epoch": 2.898435428268364, "grad_norm": 0.010805833153426647, "learning_rate": 5e-05, "loss": 0.0031, "step": 10930 }, { "epoch": 2.9010872447626626, "grad_norm": 0.007478088606148958, "learning_rate": 5e-05, "loss": 0.0024, "step": 10940 }, { "epoch": 2.903739061256961, "grad_norm": 0.00870394054800272, "learning_rate": 5e-05, "loss": 0.0034, "step": 10950 }, { "epoch": 2.9063908777512597, "grad_norm": 0.009030320681631565, "learning_rate": 5e-05, "loss": 0.0029, "step": 10960 }, { "epoch": 2.9090426942455583, "grad_norm": 0.006845667026937008, "learning_rate": 5e-05, "loss": 0.0025, "step": 10970 }, { "epoch": 2.911694510739857, "grad_norm": 0.007066364865750074, "learning_rate": 5e-05, "loss": 0.0028, "step": 10980 }, { "epoch": 2.9143463272341554, "grad_norm": 0.0071622091345489025, "learning_rate": 5e-05, "loss": 0.0036, "step": 10990 }, { "epoch": 2.916998143728454, "grad_norm": 0.02155531942844391, "learning_rate": 5e-05, "loss": 0.0026, "step": 11000 }, { "epoch": 2.9196499602227526, "grad_norm": 0.013255936093628407, "learning_rate": 5e-05, "loss": 0.0027, "step": 11010 }, { "epoch": 2.922301776717051, "grad_norm": 0.007347180508077145, "learning_rate": 5e-05, "loss": 0.0026, "step": 11020 }, { "epoch": 2.9249535932113497, "grad_norm": 0.008440878242254257, "learning_rate": 5e-05, "loss": 0.003, "step": 11030 }, { "epoch": 2.9276054097056483, "grad_norm": 0.009175759740173817, "learning_rate": 5e-05, "loss": 0.0026, "step": 11040 }, { "epoch": 2.930257226199947, "grad_norm": 0.007999304682016373, "learning_rate": 5e-05, "loss": 0.0026, "step": 11050 }, { "epoch": 2.9329090426942455, "grad_norm": 0.010778600350022316, "learning_rate": 5e-05, "loss": 0.0033, "step": 11060 }, { "epoch": 2.935560859188544, "grad_norm": 0.0067409975454211235, "learning_rate": 5e-05, "loss": 0.0032, "step": 11070 }, { "epoch": 2.9382126756828426, "grad_norm": 0.014447305351495743, "learning_rate": 5e-05, "loss": 0.0033, "step": 11080 }, { "epoch": 2.940864492177141, "grad_norm": 0.008558184839785099, "learning_rate": 5e-05, "loss": 0.0038, "step": 11090 }, { "epoch": 2.94351630867144, "grad_norm": 0.004552672617137432, "learning_rate": 5e-05, "loss": 0.0032, "step": 11100 }, { "epoch": 2.9461681251657383, "grad_norm": 0.008364318870007992, "learning_rate": 5e-05, "loss": 0.0031, "step": 11110 }, { "epoch": 2.9488199416600374, "grad_norm": 0.006022443529218435, "learning_rate": 5e-05, "loss": 0.0032, "step": 11120 }, { "epoch": 2.9514717581543355, "grad_norm": 0.006488977465778589, "learning_rate": 5e-05, "loss": 0.0027, "step": 11130 }, { "epoch": 2.9541235746486345, "grad_norm": 0.007243592757731676, "learning_rate": 5e-05, "loss": 0.0028, "step": 11140 }, { "epoch": 2.9567753911429326, "grad_norm": 0.008078561164438725, "learning_rate": 5e-05, "loss": 0.0035, "step": 11150 }, { "epoch": 2.9594272076372317, "grad_norm": 0.00878161285072565, "learning_rate": 5e-05, "loss": 0.003, "step": 11160 }, { "epoch": 2.9620790241315302, "grad_norm": 0.007278898265212774, "learning_rate": 5e-05, "loss": 0.0023, "step": 11170 }, { "epoch": 2.964730840625829, "grad_norm": 0.009256538935005665, "learning_rate": 5e-05, "loss": 0.0028, "step": 11180 }, { "epoch": 2.9673826571201274, "grad_norm": 0.006457492709159851, "learning_rate": 5e-05, "loss": 0.0026, "step": 11190 }, { "epoch": 2.970034473614426, "grad_norm": 0.008365978486835957, "learning_rate": 5e-05, "loss": 0.0032, "step": 11200 }, { "epoch": 2.9726862901087245, "grad_norm": 0.012818435207009315, "learning_rate": 5e-05, "loss": 0.0035, "step": 11210 }, { "epoch": 2.975338106603023, "grad_norm": 0.010332811623811722, "learning_rate": 5e-05, "loss": 0.0034, "step": 11220 }, { "epoch": 2.9779899230973217, "grad_norm": 0.005961082875728607, "learning_rate": 5e-05, "loss": 0.0031, "step": 11230 }, { "epoch": 2.9806417395916203, "grad_norm": 0.010245407931506634, "learning_rate": 5e-05, "loss": 0.0029, "step": 11240 }, { "epoch": 2.983293556085919, "grad_norm": 0.007024890277534723, "learning_rate": 5e-05, "loss": 0.0033, "step": 11250 }, { "epoch": 2.9859453725802174, "grad_norm": 0.010778623633086681, "learning_rate": 5e-05, "loss": 0.0029, "step": 11260 }, { "epoch": 2.988597189074516, "grad_norm": 0.009913366287946701, "learning_rate": 5e-05, "loss": 0.0036, "step": 11270 }, { "epoch": 2.9912490055688146, "grad_norm": 0.0058290185406804085, "learning_rate": 5e-05, "loss": 0.0033, "step": 11280 }, { "epoch": 2.993900822063113, "grad_norm": 0.009193207137286663, "learning_rate": 5e-05, "loss": 0.0026, "step": 11290 }, { "epoch": 2.9965526385574117, "grad_norm": 0.010959203355014324, "learning_rate": 5e-05, "loss": 0.003, "step": 11300 }, { "epoch": 2.9992044550517103, "grad_norm": 0.009938686154782772, "learning_rate": 5e-05, "loss": 0.0028, "step": 11310 }, { "epoch": 3.001856271546009, "grad_norm": 0.008492257446050644, "learning_rate": 5e-05, "loss": 0.0028, "step": 11320 }, { "epoch": 3.0045080880403074, "grad_norm": 0.012267207726836205, "learning_rate": 5e-05, "loss": 0.0037, "step": 11330 }, { "epoch": 3.007159904534606, "grad_norm": 0.010339092463254929, "learning_rate": 5e-05, "loss": 0.0032, "step": 11340 }, { "epoch": 3.0098117210289046, "grad_norm": 0.009771511889994144, "learning_rate": 5e-05, "loss": 0.003, "step": 11350 }, { "epoch": 3.012463537523203, "grad_norm": 0.009849435649812222, "learning_rate": 5e-05, "loss": 0.0035, "step": 11360 }, { "epoch": 3.015115354017502, "grad_norm": 0.00960448756814003, "learning_rate": 5e-05, "loss": 0.003, "step": 11370 }, { "epoch": 3.0177671705118008, "grad_norm": 0.007635532878339291, "learning_rate": 5e-05, "loss": 0.0026, "step": 11380 }, { "epoch": 3.0204189870060993, "grad_norm": 0.005928667262196541, "learning_rate": 5e-05, "loss": 0.0034, "step": 11390 }, { "epoch": 3.023070803500398, "grad_norm": 0.014163787476718426, "learning_rate": 5e-05, "loss": 0.0028, "step": 11400 }, { "epoch": 3.0257226199946965, "grad_norm": 0.007289479020982981, "learning_rate": 5e-05, "loss": 0.0039, "step": 11410 }, { "epoch": 3.028374436488995, "grad_norm": 0.006720331497490406, "learning_rate": 5e-05, "loss": 0.003, "step": 11420 }, { "epoch": 3.0310262529832936, "grad_norm": 0.006773029454052448, "learning_rate": 5e-05, "loss": 0.0025, "step": 11430 }, { "epoch": 3.033678069477592, "grad_norm": 0.006890075746923685, "learning_rate": 5e-05, "loss": 0.0032, "step": 11440 }, { "epoch": 3.036329885971891, "grad_norm": 0.009246631525456905, "learning_rate": 5e-05, "loss": 0.003, "step": 11450 }, { "epoch": 3.0389817024661894, "grad_norm": 0.009580313228070736, "learning_rate": 5e-05, "loss": 0.0029, "step": 11460 }, { "epoch": 3.041633518960488, "grad_norm": 0.00896134041249752, "learning_rate": 5e-05, "loss": 0.0024, "step": 11470 }, { "epoch": 3.0442853354547865, "grad_norm": 0.006839429959654808, "learning_rate": 5e-05, "loss": 0.0026, "step": 11480 }, { "epoch": 3.046937151949085, "grad_norm": 0.011411383748054504, "learning_rate": 5e-05, "loss": 0.0033, "step": 11490 }, { "epoch": 3.0495889684433837, "grad_norm": 0.00750200217589736, "learning_rate": 5e-05, "loss": 0.0029, "step": 11500 }, { "epoch": 3.0522407849376822, "grad_norm": 0.007657794281840324, "learning_rate": 5e-05, "loss": 0.0032, "step": 11510 }, { "epoch": 3.054892601431981, "grad_norm": 0.01698107272386551, "learning_rate": 5e-05, "loss": 0.0029, "step": 11520 }, { "epoch": 3.0575444179262794, "grad_norm": 0.012625481933355331, "learning_rate": 5e-05, "loss": 0.0029, "step": 11530 }, { "epoch": 3.060196234420578, "grad_norm": 0.007826467044651508, "learning_rate": 5e-05, "loss": 0.0028, "step": 11540 }, { "epoch": 3.0628480509148766, "grad_norm": 0.008610033430159092, "learning_rate": 5e-05, "loss": 0.0025, "step": 11550 }, { "epoch": 3.065499867409175, "grad_norm": 0.005291426554322243, "learning_rate": 5e-05, "loss": 0.0025, "step": 11560 }, { "epoch": 3.0681516839034737, "grad_norm": 0.014364995062351227, "learning_rate": 5e-05, "loss": 0.003, "step": 11570 }, { "epoch": 3.0708035003977723, "grad_norm": 0.01696774736046791, "learning_rate": 5e-05, "loss": 0.0026, "step": 11580 }, { "epoch": 3.073455316892071, "grad_norm": 0.009543702006340027, "learning_rate": 5e-05, "loss": 0.0032, "step": 11590 }, { "epoch": 3.07610713338637, "grad_norm": 0.01223832182586193, "learning_rate": 5e-05, "loss": 0.0027, "step": 11600 }, { "epoch": 3.0787589498806684, "grad_norm": 0.006617382168769836, "learning_rate": 5e-05, "loss": 0.0033, "step": 11610 }, { "epoch": 3.081410766374967, "grad_norm": 0.01061962079256773, "learning_rate": 5e-05, "loss": 0.0034, "step": 11620 }, { "epoch": 3.0840625828692656, "grad_norm": 0.008108573034405708, "learning_rate": 5e-05, "loss": 0.003, "step": 11630 }, { "epoch": 3.086714399363564, "grad_norm": 0.014010858722031116, "learning_rate": 5e-05, "loss": 0.0032, "step": 11640 }, { "epoch": 3.0893662158578628, "grad_norm": 0.009363891556859016, "learning_rate": 5e-05, "loss": 0.0028, "step": 11650 }, { "epoch": 3.0920180323521613, "grad_norm": 0.006651648785918951, "learning_rate": 5e-05, "loss": 0.0025, "step": 11660 }, { "epoch": 3.09466984884646, "grad_norm": 0.012731166556477547, "learning_rate": 5e-05, "loss": 0.0029, "step": 11670 }, { "epoch": 3.0973216653407585, "grad_norm": 0.010614740662276745, "learning_rate": 5e-05, "loss": 0.0031, "step": 11680 }, { "epoch": 3.099973481835057, "grad_norm": 0.012382113374769688, "learning_rate": 5e-05, "loss": 0.0025, "step": 11690 }, { "epoch": 3.1026252983293556, "grad_norm": 0.007739261724054813, "learning_rate": 5e-05, "loss": 0.0027, "step": 11700 }, { "epoch": 3.105277114823654, "grad_norm": 0.010352324694395065, "learning_rate": 5e-05, "loss": 0.0022, "step": 11710 }, { "epoch": 3.107928931317953, "grad_norm": 0.012624272145330906, "learning_rate": 5e-05, "loss": 0.0035, "step": 11720 }, { "epoch": 3.1105807478122514, "grad_norm": 0.006768427789211273, "learning_rate": 5e-05, "loss": 0.003, "step": 11730 }, { "epoch": 3.11323256430655, "grad_norm": 0.008421593345701694, "learning_rate": 5e-05, "loss": 0.0029, "step": 11740 }, { "epoch": 3.1158843808008485, "grad_norm": 0.009353759698569775, "learning_rate": 5e-05, "loss": 0.0034, "step": 11750 }, { "epoch": 3.118536197295147, "grad_norm": 0.010402653366327286, "learning_rate": 5e-05, "loss": 0.0029, "step": 11760 }, { "epoch": 3.1211880137894457, "grad_norm": 0.010243527591228485, "learning_rate": 5e-05, "loss": 0.0035, "step": 11770 }, { "epoch": 3.1238398302837442, "grad_norm": 0.013422836549580097, "learning_rate": 5e-05, "loss": 0.0031, "step": 11780 }, { "epoch": 3.126491646778043, "grad_norm": 0.004974239971488714, "learning_rate": 5e-05, "loss": 0.0029, "step": 11790 }, { "epoch": 3.1291434632723414, "grad_norm": 0.00817057490348816, "learning_rate": 5e-05, "loss": 0.003, "step": 11800 }, { "epoch": 3.13179527976664, "grad_norm": 0.012935691513121128, "learning_rate": 5e-05, "loss": 0.0027, "step": 11810 }, { "epoch": 3.1344470962609385, "grad_norm": 0.006129916291683912, "learning_rate": 5e-05, "loss": 0.0029, "step": 11820 }, { "epoch": 3.1370989127552376, "grad_norm": 0.008501745760440826, "learning_rate": 5e-05, "loss": 0.0031, "step": 11830 }, { "epoch": 3.139750729249536, "grad_norm": 0.004828463774174452, "learning_rate": 5e-05, "loss": 0.003, "step": 11840 }, { "epoch": 3.1424025457438347, "grad_norm": 0.009735962375998497, "learning_rate": 5e-05, "loss": 0.0037, "step": 11850 }, { "epoch": 3.1450543622381333, "grad_norm": 0.007634418550878763, "learning_rate": 5e-05, "loss": 0.0028, "step": 11860 }, { "epoch": 3.147706178732432, "grad_norm": 0.006656670942902565, "learning_rate": 5e-05, "loss": 0.0029, "step": 11870 }, { "epoch": 3.1503579952267304, "grad_norm": 0.009606346487998962, "learning_rate": 5e-05, "loss": 0.0026, "step": 11880 }, { "epoch": 3.153009811721029, "grad_norm": 0.008410502225160599, "learning_rate": 5e-05, "loss": 0.0027, "step": 11890 }, { "epoch": 3.1556616282153276, "grad_norm": 0.01001001987606287, "learning_rate": 5e-05, "loss": 0.0033, "step": 11900 }, { "epoch": 3.158313444709626, "grad_norm": 0.00777325127273798, "learning_rate": 5e-05, "loss": 0.003, "step": 11910 }, { "epoch": 3.1609652612039247, "grad_norm": 0.0058600157499313354, "learning_rate": 5e-05, "loss": 0.0028, "step": 11920 }, { "epoch": 3.1636170776982233, "grad_norm": 0.006187852006405592, "learning_rate": 5e-05, "loss": 0.0028, "step": 11930 }, { "epoch": 3.166268894192522, "grad_norm": 0.010686635971069336, "learning_rate": 5e-05, "loss": 0.002, "step": 11940 }, { "epoch": 3.1689207106868205, "grad_norm": 0.007299201097339392, "learning_rate": 5e-05, "loss": 0.0033, "step": 11950 }, { "epoch": 3.171572527181119, "grad_norm": 0.003292008303105831, "learning_rate": 5e-05, "loss": 0.0027, "step": 11960 }, { "epoch": 3.1742243436754176, "grad_norm": 0.012169078923761845, "learning_rate": 5e-05, "loss": 0.003, "step": 11970 }, { "epoch": 3.176876160169716, "grad_norm": 0.010134708136320114, "learning_rate": 5e-05, "loss": 0.0031, "step": 11980 }, { "epoch": 3.1795279766640148, "grad_norm": 0.008288051001727581, "learning_rate": 5e-05, "loss": 0.0033, "step": 11990 }, { "epoch": 3.1821797931583133, "grad_norm": 0.020057952031493187, "learning_rate": 5e-05, "loss": 0.0028, "step": 12000 }, { "epoch": 3.184831609652612, "grad_norm": 0.006772251334041357, "learning_rate": 5e-05, "loss": 0.0033, "step": 12010 }, { "epoch": 3.1874834261469105, "grad_norm": 0.008427330292761326, "learning_rate": 5e-05, "loss": 0.0028, "step": 12020 }, { "epoch": 3.190135242641209, "grad_norm": 0.008697639219462872, "learning_rate": 5e-05, "loss": 0.0029, "step": 12030 }, { "epoch": 3.1927870591355076, "grad_norm": 0.008972895331680775, "learning_rate": 5e-05, "loss": 0.0033, "step": 12040 }, { "epoch": 3.195438875629806, "grad_norm": 0.008698650635778904, "learning_rate": 5e-05, "loss": 0.0034, "step": 12050 }, { "epoch": 3.1980906921241052, "grad_norm": 0.007045097649097443, "learning_rate": 5e-05, "loss": 0.0034, "step": 12060 }, { "epoch": 3.200742508618404, "grad_norm": 0.007919251918792725, "learning_rate": 5e-05, "loss": 0.0026, "step": 12070 }, { "epoch": 3.2033943251127024, "grad_norm": 0.008017742075026035, "learning_rate": 5e-05, "loss": 0.0032, "step": 12080 }, { "epoch": 3.206046141607001, "grad_norm": 0.007431503385305405, "learning_rate": 5e-05, "loss": 0.0023, "step": 12090 }, { "epoch": 3.2086979581012995, "grad_norm": 0.008982460014522076, "learning_rate": 5e-05, "loss": 0.0025, "step": 12100 }, { "epoch": 3.211349774595598, "grad_norm": 0.00952233374118805, "learning_rate": 5e-05, "loss": 0.0034, "step": 12110 }, { "epoch": 3.2140015910898967, "grad_norm": 0.01011247094720602, "learning_rate": 5e-05, "loss": 0.0028, "step": 12120 }, { "epoch": 3.2166534075841953, "grad_norm": 0.011318445205688477, "learning_rate": 5e-05, "loss": 0.0028, "step": 12130 }, { "epoch": 3.219305224078494, "grad_norm": 0.008919197134673595, "learning_rate": 5e-05, "loss": 0.0027, "step": 12140 }, { "epoch": 3.2219570405727924, "grad_norm": 0.0074367765337228775, "learning_rate": 5e-05, "loss": 0.0026, "step": 12150 }, { "epoch": 3.224608857067091, "grad_norm": 0.011481928639113903, "learning_rate": 5e-05, "loss": 0.0028, "step": 12160 }, { "epoch": 3.2272606735613896, "grad_norm": 0.007668850477784872, "learning_rate": 5e-05, "loss": 0.0024, "step": 12170 }, { "epoch": 3.229912490055688, "grad_norm": 0.0072609009221196175, "learning_rate": 5e-05, "loss": 0.0027, "step": 12180 }, { "epoch": 3.2325643065499867, "grad_norm": 0.009711334481835365, "learning_rate": 5e-05, "loss": 0.0034, "step": 12190 }, { "epoch": 3.2352161230442853, "grad_norm": 0.0077856192365288734, "learning_rate": 5e-05, "loss": 0.003, "step": 12200 }, { "epoch": 3.237867939538584, "grad_norm": 0.01467285118997097, "learning_rate": 5e-05, "loss": 0.0036, "step": 12210 }, { "epoch": 3.2405197560328824, "grad_norm": 0.009133322164416313, "learning_rate": 5e-05, "loss": 0.0026, "step": 12220 }, { "epoch": 3.243171572527181, "grad_norm": 0.007850880734622478, "learning_rate": 5e-05, "loss": 0.0034, "step": 12230 }, { "epoch": 3.2458233890214796, "grad_norm": 0.0158639345318079, "learning_rate": 5e-05, "loss": 0.0026, "step": 12240 }, { "epoch": 3.248475205515778, "grad_norm": 0.00934574380517006, "learning_rate": 5e-05, "loss": 0.0027, "step": 12250 }, { "epoch": 3.2511270220100768, "grad_norm": 0.008462486788630486, "learning_rate": 5e-05, "loss": 0.0027, "step": 12260 }, { "epoch": 3.2537788385043753, "grad_norm": 0.005278984550386667, "learning_rate": 5e-05, "loss": 0.0027, "step": 12270 }, { "epoch": 3.256430654998674, "grad_norm": 0.013485332019627094, "learning_rate": 5e-05, "loss": 0.003, "step": 12280 }, { "epoch": 3.259082471492973, "grad_norm": 0.010330303572118282, "learning_rate": 5e-05, "loss": 0.0032, "step": 12290 }, { "epoch": 3.261734287987271, "grad_norm": 0.004170497879385948, "learning_rate": 5e-05, "loss": 0.0029, "step": 12300 }, { "epoch": 3.26438610448157, "grad_norm": 0.006552340928465128, "learning_rate": 5e-05, "loss": 0.0027, "step": 12310 }, { "epoch": 3.2670379209758686, "grad_norm": 0.006589491851627827, "learning_rate": 5e-05, "loss": 0.003, "step": 12320 }, { "epoch": 3.2696897374701672, "grad_norm": 0.008516370318830013, "learning_rate": 5e-05, "loss": 0.0027, "step": 12330 }, { "epoch": 3.272341553964466, "grad_norm": 0.014157376252114773, "learning_rate": 5e-05, "loss": 0.003, "step": 12340 }, { "epoch": 3.2749933704587644, "grad_norm": 0.014282504096627235, "learning_rate": 5e-05, "loss": 0.0026, "step": 12350 }, { "epoch": 3.277645186953063, "grad_norm": 0.011779260821640491, "learning_rate": 5e-05, "loss": 0.0031, "step": 12360 }, { "epoch": 3.2802970034473615, "grad_norm": 0.012897745706140995, "learning_rate": 5e-05, "loss": 0.0026, "step": 12370 }, { "epoch": 3.28294881994166, "grad_norm": 0.006072803866118193, "learning_rate": 5e-05, "loss": 0.0028, "step": 12380 }, { "epoch": 3.2856006364359587, "grad_norm": 0.0103487903252244, "learning_rate": 5e-05, "loss": 0.0029, "step": 12390 }, { "epoch": 3.2882524529302573, "grad_norm": 0.006638475228101015, "learning_rate": 5e-05, "loss": 0.0027, "step": 12400 }, { "epoch": 3.290904269424556, "grad_norm": 0.01219526119530201, "learning_rate": 5e-05, "loss": 0.003, "step": 12410 }, { "epoch": 3.2935560859188544, "grad_norm": 0.005177476443350315, "learning_rate": 5e-05, "loss": 0.0027, "step": 12420 }, { "epoch": 3.296207902413153, "grad_norm": 0.009455864317715168, "learning_rate": 5e-05, "loss": 0.0029, "step": 12430 }, { "epoch": 3.2988597189074516, "grad_norm": 0.011621707119047642, "learning_rate": 5e-05, "loss": 0.0028, "step": 12440 }, { "epoch": 3.30151153540175, "grad_norm": 0.0065255784429609776, "learning_rate": 5e-05, "loss": 0.0029, "step": 12450 }, { "epoch": 3.3041633518960487, "grad_norm": 0.011635035276412964, "learning_rate": 5e-05, "loss": 0.0026, "step": 12460 }, { "epoch": 3.3068151683903473, "grad_norm": 0.01766085997223854, "learning_rate": 5e-05, "loss": 0.0034, "step": 12470 }, { "epoch": 3.309466984884646, "grad_norm": 0.010626236908137798, "learning_rate": 5e-05, "loss": 0.0028, "step": 12480 }, { "epoch": 3.3121188013789444, "grad_norm": 0.0038360534235835075, "learning_rate": 5e-05, "loss": 0.0026, "step": 12490 }, { "epoch": 3.314770617873243, "grad_norm": 0.008663657121360302, "learning_rate": 5e-05, "loss": 0.0031, "step": 12500 }, { "epoch": 3.3174224343675416, "grad_norm": 0.013955614529550076, "learning_rate": 5e-05, "loss": 0.0026, "step": 12510 }, { "epoch": 3.3200742508618406, "grad_norm": 0.009811095893383026, "learning_rate": 5e-05, "loss": 0.0028, "step": 12520 }, { "epoch": 3.3227260673561387, "grad_norm": 0.006653600838035345, "learning_rate": 5e-05, "loss": 0.003, "step": 12530 }, { "epoch": 3.3253778838504378, "grad_norm": 0.009711475111544132, "learning_rate": 5e-05, "loss": 0.0032, "step": 12540 }, { "epoch": 3.3280297003447363, "grad_norm": 0.0070326984860002995, "learning_rate": 5e-05, "loss": 0.0031, "step": 12550 }, { "epoch": 3.330681516839035, "grad_norm": 0.0057317647151649, "learning_rate": 5e-05, "loss": 0.0025, "step": 12560 }, { "epoch": 3.3333333333333335, "grad_norm": 0.006956405937671661, "learning_rate": 5e-05, "loss": 0.0024, "step": 12570 }, { "epoch": 3.335985149827632, "grad_norm": 0.01270278263837099, "learning_rate": 5e-05, "loss": 0.0037, "step": 12580 }, { "epoch": 3.3386369663219306, "grad_norm": 0.011067877523601055, "learning_rate": 5e-05, "loss": 0.0032, "step": 12590 }, { "epoch": 3.341288782816229, "grad_norm": 0.007368203718215227, "learning_rate": 5e-05, "loss": 0.0024, "step": 12600 }, { "epoch": 3.343940599310528, "grad_norm": 0.011022108606994152, "learning_rate": 5e-05, "loss": 0.0025, "step": 12610 }, { "epoch": 3.3465924158048264, "grad_norm": 0.008315417915582657, "learning_rate": 5e-05, "loss": 0.0031, "step": 12620 }, { "epoch": 3.349244232299125, "grad_norm": 0.008327306248247623, "learning_rate": 5e-05, "loss": 0.0034, "step": 12630 }, { "epoch": 3.3518960487934235, "grad_norm": 0.00999219436198473, "learning_rate": 5e-05, "loss": 0.0029, "step": 12640 }, { "epoch": 3.354547865287722, "grad_norm": 0.006799385882914066, "learning_rate": 5e-05, "loss": 0.0027, "step": 12650 }, { "epoch": 3.3571996817820207, "grad_norm": 0.01071498915553093, "learning_rate": 5e-05, "loss": 0.0031, "step": 12660 }, { "epoch": 3.3598514982763192, "grad_norm": 0.008339794352650642, "learning_rate": 5e-05, "loss": 0.0026, "step": 12670 }, { "epoch": 3.362503314770618, "grad_norm": 0.013814863748848438, "learning_rate": 5e-05, "loss": 0.0028, "step": 12680 }, { "epoch": 3.3651551312649164, "grad_norm": 0.008044657297432423, "learning_rate": 5e-05, "loss": 0.0028, "step": 12690 }, { "epoch": 3.367806947759215, "grad_norm": 0.009778212755918503, "learning_rate": 5e-05, "loss": 0.0027, "step": 12700 }, { "epoch": 3.3704587642535135, "grad_norm": 0.009851636365056038, "learning_rate": 5e-05, "loss": 0.0034, "step": 12710 }, { "epoch": 3.373110580747812, "grad_norm": 0.009961026720702648, "learning_rate": 5e-05, "loss": 0.003, "step": 12720 }, { "epoch": 3.3757623972421107, "grad_norm": 0.01074157003313303, "learning_rate": 5e-05, "loss": 0.0027, "step": 12730 }, { "epoch": 3.3784142137364093, "grad_norm": 0.010845938697457314, "learning_rate": 5e-05, "loss": 0.0024, "step": 12740 }, { "epoch": 3.381066030230708, "grad_norm": 0.00951968040317297, "learning_rate": 5e-05, "loss": 0.0025, "step": 12750 }, { "epoch": 3.3837178467250064, "grad_norm": 0.013121331110596657, "learning_rate": 5e-05, "loss": 0.0026, "step": 12760 }, { "epoch": 3.3863696632193054, "grad_norm": 0.006081548053771257, "learning_rate": 5e-05, "loss": 0.003, "step": 12770 }, { "epoch": 3.389021479713604, "grad_norm": 0.011374875903129578, "learning_rate": 5e-05, "loss": 0.0033, "step": 12780 }, { "epoch": 3.3916732962079026, "grad_norm": 0.005756790284067392, "learning_rate": 5e-05, "loss": 0.003, "step": 12790 }, { "epoch": 3.394325112702201, "grad_norm": 0.007226353976875544, "learning_rate": 5e-05, "loss": 0.0026, "step": 12800 }, { "epoch": 3.3969769291964997, "grad_norm": 0.009584473446011543, "learning_rate": 5e-05, "loss": 0.0025, "step": 12810 }, { "epoch": 3.3996287456907983, "grad_norm": 0.011390375904738903, "learning_rate": 5e-05, "loss": 0.0025, "step": 12820 }, { "epoch": 3.402280562185097, "grad_norm": 0.006510079372674227, "learning_rate": 5e-05, "loss": 0.003, "step": 12830 }, { "epoch": 3.4049323786793955, "grad_norm": 0.01291276142001152, "learning_rate": 5e-05, "loss": 0.0031, "step": 12840 }, { "epoch": 3.407584195173694, "grad_norm": 0.011421065777540207, "learning_rate": 5e-05, "loss": 0.0028, "step": 12850 }, { "epoch": 3.4102360116679926, "grad_norm": 0.01135805994272232, "learning_rate": 5e-05, "loss": 0.0027, "step": 12860 }, { "epoch": 3.412887828162291, "grad_norm": 0.007829434238374233, "learning_rate": 5e-05, "loss": 0.0032, "step": 12870 }, { "epoch": 3.4155396446565898, "grad_norm": 0.007179883308708668, "learning_rate": 5e-05, "loss": 0.0027, "step": 12880 }, { "epoch": 3.4181914611508883, "grad_norm": 0.007559699472039938, "learning_rate": 5e-05, "loss": 0.0028, "step": 12890 }, { "epoch": 3.420843277645187, "grad_norm": 0.009791441261768341, "learning_rate": 5e-05, "loss": 0.002, "step": 12900 }, { "epoch": 3.4234950941394855, "grad_norm": 0.0099486093968153, "learning_rate": 5e-05, "loss": 0.0026, "step": 12910 }, { "epoch": 3.426146910633784, "grad_norm": 0.006639926694333553, "learning_rate": 5e-05, "loss": 0.0032, "step": 12920 }, { "epoch": 3.4287987271280826, "grad_norm": 0.007623007986694574, "learning_rate": 5e-05, "loss": 0.0027, "step": 12930 }, { "epoch": 3.431450543622381, "grad_norm": 0.008356518112123013, "learning_rate": 5e-05, "loss": 0.0031, "step": 12940 }, { "epoch": 3.43410236011668, "grad_norm": 0.013848883099853992, "learning_rate": 5e-05, "loss": 0.0028, "step": 12950 }, { "epoch": 3.4367541766109784, "grad_norm": 0.012234157882630825, "learning_rate": 5e-05, "loss": 0.0034, "step": 12960 }, { "epoch": 3.439405993105277, "grad_norm": 0.0062556699849665165, "learning_rate": 5e-05, "loss": 0.0029, "step": 12970 }, { "epoch": 3.4420578095995755, "grad_norm": 0.007761972490698099, "learning_rate": 5e-05, "loss": 0.0026, "step": 12980 }, { "epoch": 3.444709626093874, "grad_norm": 0.0053863199427723885, "learning_rate": 5e-05, "loss": 0.0024, "step": 12990 }, { "epoch": 3.447361442588173, "grad_norm": 0.0069814459420740604, "learning_rate": 5e-05, "loss": 0.0025, "step": 13000 }, { "epoch": 3.4500132590824713, "grad_norm": 0.005459206644445658, "learning_rate": 5e-05, "loss": 0.0024, "step": 13010 }, { "epoch": 3.4526650755767703, "grad_norm": 0.010840688832104206, "learning_rate": 5e-05, "loss": 0.0022, "step": 13020 }, { "epoch": 3.455316892071069, "grad_norm": 0.006321368273347616, "learning_rate": 5e-05, "loss": 0.0032, "step": 13030 }, { "epoch": 3.4579687085653674, "grad_norm": 0.008765127509832382, "learning_rate": 5e-05, "loss": 0.0025, "step": 13040 }, { "epoch": 3.460620525059666, "grad_norm": 0.005972926504909992, "learning_rate": 5e-05, "loss": 0.0031, "step": 13050 }, { "epoch": 3.4632723415539646, "grad_norm": 0.006556614767760038, "learning_rate": 5e-05, "loss": 0.0027, "step": 13060 }, { "epoch": 3.465924158048263, "grad_norm": 0.009730293415486813, "learning_rate": 5e-05, "loss": 0.0028, "step": 13070 }, { "epoch": 3.4685759745425617, "grad_norm": 0.011461048386991024, "learning_rate": 5e-05, "loss": 0.0024, "step": 13080 }, { "epoch": 3.4712277910368603, "grad_norm": 0.021718496456742287, "learning_rate": 5e-05, "loss": 0.003, "step": 13090 }, { "epoch": 3.473879607531159, "grad_norm": 0.008683325722813606, "learning_rate": 5e-05, "loss": 0.0023, "step": 13100 }, { "epoch": 3.4765314240254575, "grad_norm": 0.006617540027946234, "learning_rate": 5e-05, "loss": 0.0032, "step": 13110 }, { "epoch": 3.479183240519756, "grad_norm": 0.0072217658162117004, "learning_rate": 5e-05, "loss": 0.0024, "step": 13120 }, { "epoch": 3.4818350570140546, "grad_norm": 0.007083084899932146, "learning_rate": 5e-05, "loss": 0.0025, "step": 13130 }, { "epoch": 3.484486873508353, "grad_norm": 0.008823337033390999, "learning_rate": 5e-05, "loss": 0.0027, "step": 13140 }, { "epoch": 3.4871386900026518, "grad_norm": 0.013336651027202606, "learning_rate": 5e-05, "loss": 0.0027, "step": 13150 }, { "epoch": 3.4897905064969503, "grad_norm": 0.00568110216408968, "learning_rate": 5e-05, "loss": 0.0025, "step": 13160 }, { "epoch": 3.492442322991249, "grad_norm": 0.009824039414525032, "learning_rate": 5e-05, "loss": 0.003, "step": 13170 }, { "epoch": 3.4950941394855475, "grad_norm": 0.006865704897791147, "learning_rate": 5e-05, "loss": 0.0024, "step": 13180 }, { "epoch": 3.497745955979846, "grad_norm": 0.011041556484997272, "learning_rate": 5e-05, "loss": 0.0026, "step": 13190 }, { "epoch": 3.5003977724741446, "grad_norm": 0.004674922209233046, "learning_rate": 5e-05, "loss": 0.003, "step": 13200 }, { "epoch": 3.5030495889684437, "grad_norm": 0.007755803409963846, "learning_rate": 5e-05, "loss": 0.0023, "step": 13210 }, { "epoch": 3.505701405462742, "grad_norm": 0.01479882001876831, "learning_rate": 5e-05, "loss": 0.003, "step": 13220 }, { "epoch": 3.508353221957041, "grad_norm": 0.010927666909992695, "learning_rate": 5e-05, "loss": 0.0028, "step": 13230 }, { "epoch": 3.511005038451339, "grad_norm": 0.00938371755182743, "learning_rate": 5e-05, "loss": 0.0026, "step": 13240 }, { "epoch": 3.513656854945638, "grad_norm": 0.012289544567465782, "learning_rate": 5e-05, "loss": 0.0022, "step": 13250 }, { "epoch": 3.516308671439936, "grad_norm": 0.00919871125370264, "learning_rate": 5e-05, "loss": 0.0039, "step": 13260 }, { "epoch": 3.518960487934235, "grad_norm": 0.007361495867371559, "learning_rate": 5e-05, "loss": 0.0028, "step": 13270 }, { "epoch": 3.5216123044285337, "grad_norm": 0.007766252849251032, "learning_rate": 5e-05, "loss": 0.0028, "step": 13280 }, { "epoch": 3.5242641209228323, "grad_norm": 0.007234795019030571, "learning_rate": 5e-05, "loss": 0.0028, "step": 13290 }, { "epoch": 3.526915937417131, "grad_norm": 0.0043191793374717236, "learning_rate": 5e-05, "loss": 0.0029, "step": 13300 }, { "epoch": 3.5295677539114294, "grad_norm": 0.004127430263906717, "learning_rate": 5e-05, "loss": 0.0023, "step": 13310 }, { "epoch": 3.532219570405728, "grad_norm": 0.008152993395924568, "learning_rate": 5e-05, "loss": 0.0027, "step": 13320 }, { "epoch": 3.5348713869000266, "grad_norm": 0.010146440006792545, "learning_rate": 5e-05, "loss": 0.0029, "step": 13330 }, { "epoch": 3.537523203394325, "grad_norm": 0.0030962838791310787, "learning_rate": 5e-05, "loss": 0.0026, "step": 13340 }, { "epoch": 3.5401750198886237, "grad_norm": 0.008021341636776924, "learning_rate": 5e-05, "loss": 0.0026, "step": 13350 }, { "epoch": 3.5428268363829223, "grad_norm": 0.011787437833845615, "learning_rate": 5e-05, "loss": 0.0026, "step": 13360 }, { "epoch": 3.545478652877221, "grad_norm": 0.008704227395355701, "learning_rate": 5e-05, "loss": 0.0032, "step": 13370 }, { "epoch": 3.5481304693715194, "grad_norm": 0.0193464457988739, "learning_rate": 5e-05, "loss": 0.0028, "step": 13380 }, { "epoch": 3.550782285865818, "grad_norm": 0.006063324864953756, "learning_rate": 5e-05, "loss": 0.0027, "step": 13390 }, { "epoch": 3.5534341023601166, "grad_norm": 0.011250495910644531, "learning_rate": 5e-05, "loss": 0.0027, "step": 13400 }, { "epoch": 3.556085918854415, "grad_norm": 0.008360723964869976, "learning_rate": 5e-05, "loss": 0.0028, "step": 13410 }, { "epoch": 3.5587377353487137, "grad_norm": 0.008057902567088604, "learning_rate": 5e-05, "loss": 0.0028, "step": 13420 }, { "epoch": 3.5613895518430123, "grad_norm": 0.007567374501377344, "learning_rate": 5e-05, "loss": 0.0025, "step": 13430 }, { "epoch": 3.5640413683373113, "grad_norm": 0.008163590915501118, "learning_rate": 5e-05, "loss": 0.0031, "step": 13440 }, { "epoch": 3.5666931848316095, "grad_norm": 0.00689227320253849, "learning_rate": 5e-05, "loss": 0.0026, "step": 13450 }, { "epoch": 3.5693450013259085, "grad_norm": 0.005781299900263548, "learning_rate": 5e-05, "loss": 0.0031, "step": 13460 }, { "epoch": 3.5719968178202066, "grad_norm": 0.008765170350670815, "learning_rate": 5e-05, "loss": 0.0026, "step": 13470 }, { "epoch": 3.5746486343145056, "grad_norm": 0.007878731936216354, "learning_rate": 5e-05, "loss": 0.0032, "step": 13480 }, { "epoch": 3.5773004508088038, "grad_norm": 0.008994423784315586, "learning_rate": 5e-05, "loss": 0.0028, "step": 13490 }, { "epoch": 3.579952267303103, "grad_norm": 0.013594531454145908, "learning_rate": 5e-05, "loss": 0.0027, "step": 13500 }, { "epoch": 3.5826040837974014, "grad_norm": 0.006259510293602943, "learning_rate": 5e-05, "loss": 0.0026, "step": 13510 }, { "epoch": 3.5852559002917, "grad_norm": 0.00677074957638979, "learning_rate": 5e-05, "loss": 0.0024, "step": 13520 }, { "epoch": 3.5879077167859985, "grad_norm": 0.008082320913672447, "learning_rate": 5e-05, "loss": 0.003, "step": 13530 }, { "epoch": 3.590559533280297, "grad_norm": 0.016817554831504822, "learning_rate": 5e-05, "loss": 0.0029, "step": 13540 }, { "epoch": 3.5932113497745957, "grad_norm": 0.009401633404195309, "learning_rate": 5e-05, "loss": 0.0025, "step": 13550 }, { "epoch": 3.5958631662688942, "grad_norm": 0.010330266319215298, "learning_rate": 5e-05, "loss": 0.0024, "step": 13560 }, { "epoch": 3.598514982763193, "grad_norm": 0.008878298103809357, "learning_rate": 5e-05, "loss": 0.0024, "step": 13570 }, { "epoch": 3.6011667992574914, "grad_norm": 0.009746634401381016, "learning_rate": 5e-05, "loss": 0.0026, "step": 13580 }, { "epoch": 3.60381861575179, "grad_norm": 0.011211326345801353, "learning_rate": 5e-05, "loss": 0.003, "step": 13590 }, { "epoch": 3.6064704322460885, "grad_norm": 0.008298727683722973, "learning_rate": 5e-05, "loss": 0.0027, "step": 13600 }, { "epoch": 3.609122248740387, "grad_norm": 0.013551696203649044, "learning_rate": 5e-05, "loss": 0.0035, "step": 13610 }, { "epoch": 3.6117740652346857, "grad_norm": 0.007005871273577213, "learning_rate": 5e-05, "loss": 0.0033, "step": 13620 }, { "epoch": 3.6144258817289843, "grad_norm": 0.007949241437017918, "learning_rate": 5e-05, "loss": 0.0025, "step": 13630 }, { "epoch": 3.617077698223283, "grad_norm": 0.008615352213382721, "learning_rate": 5e-05, "loss": 0.0038, "step": 13640 }, { "epoch": 3.6197295147175814, "grad_norm": 0.004444362595677376, "learning_rate": 5e-05, "loss": 0.002, "step": 13650 }, { "epoch": 3.62238133121188, "grad_norm": 0.0061761499382555485, "learning_rate": 5e-05, "loss": 0.0027, "step": 13660 }, { "epoch": 3.6250331477061786, "grad_norm": 0.006657309830188751, "learning_rate": 5e-05, "loss": 0.0024, "step": 13670 }, { "epoch": 3.627684964200477, "grad_norm": 0.006317457649856806, "learning_rate": 5e-05, "loss": 0.0024, "step": 13680 }, { "epoch": 3.630336780694776, "grad_norm": 0.007162455469369888, "learning_rate": 5e-05, "loss": 0.0027, "step": 13690 }, { "epoch": 3.6329885971890743, "grad_norm": 0.007997745648026466, "learning_rate": 5e-05, "loss": 0.0026, "step": 13700 }, { "epoch": 3.6356404136833733, "grad_norm": 0.008956292644143105, "learning_rate": 5e-05, "loss": 0.0023, "step": 13710 }, { "epoch": 3.6382922301776714, "grad_norm": 0.006985752377659082, "learning_rate": 5e-05, "loss": 0.0027, "step": 13720 }, { "epoch": 3.6409440466719705, "grad_norm": 0.005666147917509079, "learning_rate": 5e-05, "loss": 0.0019, "step": 13730 }, { "epoch": 3.643595863166269, "grad_norm": 0.007769061718136072, "learning_rate": 5e-05, "loss": 0.0026, "step": 13740 }, { "epoch": 3.6462476796605676, "grad_norm": 0.007389887701719999, "learning_rate": 5e-05, "loss": 0.0017, "step": 13750 }, { "epoch": 3.648899496154866, "grad_norm": 0.012733234092593193, "learning_rate": 5e-05, "loss": 0.0022, "step": 13760 }, { "epoch": 3.6515513126491648, "grad_norm": 0.026253964751958847, "learning_rate": 5e-05, "loss": 0.003, "step": 13770 }, { "epoch": 3.6542031291434633, "grad_norm": 0.011754609644412994, "learning_rate": 5e-05, "loss": 0.0029, "step": 13780 }, { "epoch": 3.656854945637762, "grad_norm": 0.014833345077931881, "learning_rate": 5e-05, "loss": 0.0031, "step": 13790 }, { "epoch": 3.6595067621320605, "grad_norm": 0.008133265189826488, "learning_rate": 5e-05, "loss": 0.0032, "step": 13800 }, { "epoch": 3.662158578626359, "grad_norm": 0.01177831832319498, "learning_rate": 5e-05, "loss": 0.0029, "step": 13810 }, { "epoch": 3.6648103951206576, "grad_norm": 0.00914313830435276, "learning_rate": 5e-05, "loss": 0.0029, "step": 13820 }, { "epoch": 3.6674622116149562, "grad_norm": 0.011466903612017632, "learning_rate": 5e-05, "loss": 0.0032, "step": 13830 }, { "epoch": 3.670114028109255, "grad_norm": 0.010285668075084686, "learning_rate": 5e-05, "loss": 0.0026, "step": 13840 }, { "epoch": 3.6727658446035534, "grad_norm": 0.007202278822660446, "learning_rate": 5e-05, "loss": 0.0025, "step": 13850 }, { "epoch": 3.675417661097852, "grad_norm": 0.008771276101469994, "learning_rate": 5e-05, "loss": 0.0036, "step": 13860 }, { "epoch": 3.6780694775921505, "grad_norm": 0.011085055768489838, "learning_rate": 5e-05, "loss": 0.0024, "step": 13870 }, { "epoch": 3.680721294086449, "grad_norm": 0.0061330669559538364, "learning_rate": 5e-05, "loss": 0.0028, "step": 13880 }, { "epoch": 3.6833731105807477, "grad_norm": 0.007196085527539253, "learning_rate": 5e-05, "loss": 0.0027, "step": 13890 }, { "epoch": 3.6860249270750463, "grad_norm": 0.008626720868051052, "learning_rate": 5e-05, "loss": 0.0024, "step": 13900 }, { "epoch": 3.688676743569345, "grad_norm": 0.012340892106294632, "learning_rate": 5e-05, "loss": 0.002, "step": 13910 }, { "epoch": 3.691328560063644, "grad_norm": 0.011493880301713943, "learning_rate": 5e-05, "loss": 0.0028, "step": 13920 }, { "epoch": 3.693980376557942, "grad_norm": 0.0076263281516730785, "learning_rate": 5e-05, "loss": 0.0024, "step": 13930 }, { "epoch": 3.696632193052241, "grad_norm": 0.010294991545379162, "learning_rate": 5e-05, "loss": 0.0026, "step": 13940 }, { "epoch": 3.699284009546539, "grad_norm": 0.012889938428997993, "learning_rate": 5e-05, "loss": 0.0025, "step": 13950 }, { "epoch": 3.701935826040838, "grad_norm": 0.008957677520811558, "learning_rate": 5e-05, "loss": 0.0022, "step": 13960 }, { "epoch": 3.7045876425351367, "grad_norm": 0.011983728036284447, "learning_rate": 5e-05, "loss": 0.0023, "step": 13970 }, { "epoch": 3.7072394590294353, "grad_norm": 0.012618215754628181, "learning_rate": 5e-05, "loss": 0.0027, "step": 13980 }, { "epoch": 3.709891275523734, "grad_norm": 0.01327463798224926, "learning_rate": 5e-05, "loss": 0.0027, "step": 13990 }, { "epoch": 3.7125430920180325, "grad_norm": 0.016127271577715874, "learning_rate": 5e-05, "loss": 0.003, "step": 14000 }, { "epoch": 3.715194908512331, "grad_norm": 0.00799216516315937, "learning_rate": 5e-05, "loss": 0.0027, "step": 14010 }, { "epoch": 3.7178467250066296, "grad_norm": 0.006950966082513332, "learning_rate": 5e-05, "loss": 0.0026, "step": 14020 }, { "epoch": 3.720498541500928, "grad_norm": 0.007451845332980156, "learning_rate": 5e-05, "loss": 0.0029, "step": 14030 }, { "epoch": 3.7231503579952268, "grad_norm": 0.006339040584862232, "learning_rate": 5e-05, "loss": 0.0027, "step": 14040 }, { "epoch": 3.7258021744895253, "grad_norm": 0.007045057136565447, "learning_rate": 5e-05, "loss": 0.0023, "step": 14050 }, { "epoch": 3.728453990983824, "grad_norm": 0.005639287177473307, "learning_rate": 5e-05, "loss": 0.0025, "step": 14060 }, { "epoch": 3.7311058074781225, "grad_norm": 0.019121933728456497, "learning_rate": 5e-05, "loss": 0.0029, "step": 14070 }, { "epoch": 3.733757623972421, "grad_norm": 0.007082813885062933, "learning_rate": 5e-05, "loss": 0.0029, "step": 14080 }, { "epoch": 3.7364094404667196, "grad_norm": 0.005047230049967766, "learning_rate": 5e-05, "loss": 0.0029, "step": 14090 }, { "epoch": 3.739061256961018, "grad_norm": 0.009307501837611198, "learning_rate": 5e-05, "loss": 0.0025, "step": 14100 }, { "epoch": 3.741713073455317, "grad_norm": 0.005744463764131069, "learning_rate": 5e-05, "loss": 0.002, "step": 14110 }, { "epoch": 3.7443648899496154, "grad_norm": 0.003777736099436879, "learning_rate": 5e-05, "loss": 0.0025, "step": 14120 }, { "epoch": 3.747016706443914, "grad_norm": 0.00836227834224701, "learning_rate": 5e-05, "loss": 0.0031, "step": 14130 }, { "epoch": 3.7496685229382125, "grad_norm": 0.00854493211954832, "learning_rate": 5e-05, "loss": 0.0027, "step": 14140 }, { "epoch": 3.7523203394325115, "grad_norm": 0.01055374275892973, "learning_rate": 5e-05, "loss": 0.0031, "step": 14150 }, { "epoch": 3.7549721559268097, "grad_norm": 0.009920302778482437, "learning_rate": 5e-05, "loss": 0.0023, "step": 14160 }, { "epoch": 3.7576239724211087, "grad_norm": 0.011379816569387913, "learning_rate": 5e-05, "loss": 0.0027, "step": 14170 }, { "epoch": 3.760275788915407, "grad_norm": 0.007068165112286806, "learning_rate": 5e-05, "loss": 0.003, "step": 14180 }, { "epoch": 3.762927605409706, "grad_norm": 0.00448951218277216, "learning_rate": 5e-05, "loss": 0.0024, "step": 14190 }, { "epoch": 3.7655794219040044, "grad_norm": 0.0050430926494300365, "learning_rate": 5e-05, "loss": 0.0023, "step": 14200 }, { "epoch": 3.768231238398303, "grad_norm": 0.011368288658559322, "learning_rate": 5e-05, "loss": 0.0022, "step": 14210 }, { "epoch": 3.7708830548926016, "grad_norm": 0.006268114782869816, "learning_rate": 5e-05, "loss": 0.0034, "step": 14220 }, { "epoch": 3.7735348713869, "grad_norm": 0.009626205079257488, "learning_rate": 5e-05, "loss": 0.0025, "step": 14230 }, { "epoch": 3.7761866878811987, "grad_norm": 0.0053503126837313175, "learning_rate": 5e-05, "loss": 0.0025, "step": 14240 }, { "epoch": 3.7788385043754973, "grad_norm": 0.008039800450205803, "learning_rate": 5e-05, "loss": 0.0029, "step": 14250 }, { "epoch": 3.781490320869796, "grad_norm": 0.0069915554486215115, "learning_rate": 5e-05, "loss": 0.0021, "step": 14260 }, { "epoch": 3.7841421373640944, "grad_norm": 0.009076796472072601, "learning_rate": 5e-05, "loss": 0.0023, "step": 14270 }, { "epoch": 3.786793953858393, "grad_norm": 0.009993322193622589, "learning_rate": 5e-05, "loss": 0.0024, "step": 14280 }, { "epoch": 3.7894457703526916, "grad_norm": 0.009041983634233475, "learning_rate": 5e-05, "loss": 0.0027, "step": 14290 }, { "epoch": 3.79209758684699, "grad_norm": 0.01106687355786562, "learning_rate": 5e-05, "loss": 0.0028, "step": 14300 }, { "epoch": 3.7947494033412887, "grad_norm": 0.014278717339038849, "learning_rate": 5e-05, "loss": 0.0028, "step": 14310 }, { "epoch": 3.7974012198355873, "grad_norm": 0.005345839541405439, "learning_rate": 5e-05, "loss": 0.0024, "step": 14320 }, { "epoch": 3.800053036329886, "grad_norm": 0.007211613934487104, "learning_rate": 5e-05, "loss": 0.0021, "step": 14330 }, { "epoch": 3.8027048528241845, "grad_norm": 0.009021852165460587, "learning_rate": 5e-05, "loss": 0.0025, "step": 14340 }, { "epoch": 3.805356669318483, "grad_norm": 0.009036186151206493, "learning_rate": 5e-05, "loss": 0.0026, "step": 14350 }, { "epoch": 3.8080084858127816, "grad_norm": 0.008518347516655922, "learning_rate": 5e-05, "loss": 0.0025, "step": 14360 }, { "epoch": 3.81066030230708, "grad_norm": 0.009148101322352886, "learning_rate": 5e-05, "loss": 0.0019, "step": 14370 }, { "epoch": 3.813312118801379, "grad_norm": 0.010991228744387627, "learning_rate": 5e-05, "loss": 0.0023, "step": 14380 }, { "epoch": 3.8159639352956773, "grad_norm": 0.01238295529037714, "learning_rate": 5e-05, "loss": 0.0023, "step": 14390 }, { "epoch": 3.8186157517899764, "grad_norm": 0.009698028676211834, "learning_rate": 5e-05, "loss": 0.0025, "step": 14400 }, { "epoch": 3.8212675682842745, "grad_norm": 0.00523020327091217, "learning_rate": 5e-05, "loss": 0.0022, "step": 14410 }, { "epoch": 3.8239193847785735, "grad_norm": 0.011558160185813904, "learning_rate": 5e-05, "loss": 0.0023, "step": 14420 }, { "epoch": 3.8265712012728716, "grad_norm": 0.008126907981932163, "learning_rate": 5e-05, "loss": 0.0025, "step": 14430 }, { "epoch": 3.8292230177671707, "grad_norm": 0.010546782054007053, "learning_rate": 5e-05, "loss": 0.0026, "step": 14440 }, { "epoch": 3.8318748342614692, "grad_norm": 0.01129871141165495, "learning_rate": 5e-05, "loss": 0.0029, "step": 14450 }, { "epoch": 3.834526650755768, "grad_norm": 0.00996595248579979, "learning_rate": 5e-05, "loss": 0.0031, "step": 14460 }, { "epoch": 3.8371784672500664, "grad_norm": 0.01996619999408722, "learning_rate": 5e-05, "loss": 0.0031, "step": 14470 }, { "epoch": 3.839830283744365, "grad_norm": 0.007121739909052849, "learning_rate": 5e-05, "loss": 0.0029, "step": 14480 }, { "epoch": 3.8424821002386635, "grad_norm": 0.007823626510798931, "learning_rate": 5e-05, "loss": 0.0028, "step": 14490 }, { "epoch": 3.845133916732962, "grad_norm": 0.0067743947729468346, "learning_rate": 5e-05, "loss": 0.0031, "step": 14500 }, { "epoch": 3.8477857332272607, "grad_norm": 0.006484656594693661, "learning_rate": 5e-05, "loss": 0.0027, "step": 14510 }, { "epoch": 3.8504375497215593, "grad_norm": 0.0059370421804487705, "learning_rate": 5e-05, "loss": 0.003, "step": 14520 }, { "epoch": 3.853089366215858, "grad_norm": 0.00716768391430378, "learning_rate": 5e-05, "loss": 0.0025, "step": 14530 }, { "epoch": 3.8557411827101564, "grad_norm": 0.010216071270406246, "learning_rate": 5e-05, "loss": 0.0024, "step": 14540 }, { "epoch": 3.858392999204455, "grad_norm": 0.008456237614154816, "learning_rate": 5e-05, "loss": 0.0021, "step": 14550 }, { "epoch": 3.8610448156987536, "grad_norm": 0.010795687325298786, "learning_rate": 5e-05, "loss": 0.0023, "step": 14560 }, { "epoch": 3.863696632193052, "grad_norm": 0.005440107546746731, "learning_rate": 5e-05, "loss": 0.0028, "step": 14570 }, { "epoch": 3.8663484486873507, "grad_norm": 0.009270248003304005, "learning_rate": 5e-05, "loss": 0.0027, "step": 14580 }, { "epoch": 3.8690002651816493, "grad_norm": 0.006497440859675407, "learning_rate": 5e-05, "loss": 0.0031, "step": 14590 }, { "epoch": 3.871652081675948, "grad_norm": 0.00893011037260294, "learning_rate": 5e-05, "loss": 0.0026, "step": 14600 }, { "epoch": 3.874303898170247, "grad_norm": 0.006699581630527973, "learning_rate": 5e-05, "loss": 0.0025, "step": 14610 }, { "epoch": 3.876955714664545, "grad_norm": 0.011388921178877354, "learning_rate": 5e-05, "loss": 0.0026, "step": 14620 }, { "epoch": 3.879607531158844, "grad_norm": 0.006864545401185751, "learning_rate": 5e-05, "loss": 0.0023, "step": 14630 }, { "epoch": 3.882259347653142, "grad_norm": 0.005453717429190874, "learning_rate": 5e-05, "loss": 0.0032, "step": 14640 }, { "epoch": 3.884911164147441, "grad_norm": 0.008967090398073196, "learning_rate": 5e-05, "loss": 0.0023, "step": 14650 }, { "epoch": 3.8875629806417393, "grad_norm": 0.025389712303876877, "learning_rate": 5e-05, "loss": 0.0023, "step": 14660 }, { "epoch": 3.8902147971360383, "grad_norm": 0.007407363038510084, "learning_rate": 5e-05, "loss": 0.003, "step": 14670 }, { "epoch": 3.892866613630337, "grad_norm": 0.010996345430612564, "learning_rate": 5e-05, "loss": 0.0032, "step": 14680 }, { "epoch": 3.8955184301246355, "grad_norm": 0.0037422156892716885, "learning_rate": 5e-05, "loss": 0.0025, "step": 14690 }, { "epoch": 3.898170246618934, "grad_norm": 0.010920009575784206, "learning_rate": 5e-05, "loss": 0.0024, "step": 14700 }, { "epoch": 3.9008220631132327, "grad_norm": 0.008523516356945038, "learning_rate": 5e-05, "loss": 0.0023, "step": 14710 }, { "epoch": 3.9034738796075312, "grad_norm": 0.009959502145648003, "learning_rate": 5e-05, "loss": 0.0028, "step": 14720 }, { "epoch": 3.90612569610183, "grad_norm": 0.010574010200798512, "learning_rate": 5e-05, "loss": 0.0024, "step": 14730 }, { "epoch": 3.9087775125961284, "grad_norm": 0.007226879708468914, "learning_rate": 5e-05, "loss": 0.002, "step": 14740 }, { "epoch": 3.911429329090427, "grad_norm": 0.010911266319453716, "learning_rate": 5e-05, "loss": 0.0025, "step": 14750 }, { "epoch": 3.9140811455847255, "grad_norm": 0.014098417945206165, "learning_rate": 5e-05, "loss": 0.0031, "step": 14760 }, { "epoch": 3.916732962079024, "grad_norm": 0.006065277848392725, "learning_rate": 5e-05, "loss": 0.002, "step": 14770 }, { "epoch": 3.9193847785733227, "grad_norm": 0.0066675469279289246, "learning_rate": 5e-05, "loss": 0.0024, "step": 14780 }, { "epoch": 3.9220365950676213, "grad_norm": 0.009463622234761715, "learning_rate": 5e-05, "loss": 0.0023, "step": 14790 }, { "epoch": 3.92468841156192, "grad_norm": 0.006484018638730049, "learning_rate": 5e-05, "loss": 0.0026, "step": 14800 }, { "epoch": 3.9273402280562184, "grad_norm": 0.007998033426702023, "learning_rate": 5e-05, "loss": 0.0022, "step": 14810 }, { "epoch": 3.929992044550517, "grad_norm": 0.008118566125631332, "learning_rate": 5e-05, "loss": 0.0024, "step": 14820 }, { "epoch": 3.9326438610448156, "grad_norm": 0.010084705427289009, "learning_rate": 5e-05, "loss": 0.0025, "step": 14830 }, { "epoch": 3.935295677539114, "grad_norm": 0.02206442318856716, "learning_rate": 5e-05, "loss": 0.0031, "step": 14840 }, { "epoch": 3.9379474940334127, "grad_norm": 0.006801731884479523, "learning_rate": 5e-05, "loss": 0.0029, "step": 14850 }, { "epoch": 3.9405993105277117, "grad_norm": 0.008452722802758217, "learning_rate": 5e-05, "loss": 0.0034, "step": 14860 }, { "epoch": 3.94325112702201, "grad_norm": 0.01026077102869749, "learning_rate": 5e-05, "loss": 0.003, "step": 14870 }, { "epoch": 3.945902943516309, "grad_norm": 0.006398217286914587, "learning_rate": 5e-05, "loss": 0.0025, "step": 14880 }, { "epoch": 3.948554760010607, "grad_norm": 0.009717120788991451, "learning_rate": 5e-05, "loss": 0.0025, "step": 14890 }, { "epoch": 3.951206576504906, "grad_norm": 0.007192777469754219, "learning_rate": 5e-05, "loss": 0.0026, "step": 14900 }, { "epoch": 3.9538583929992046, "grad_norm": 0.007811115123331547, "learning_rate": 5e-05, "loss": 0.0025, "step": 14910 }, { "epoch": 3.956510209493503, "grad_norm": 0.00871477834880352, "learning_rate": 5e-05, "loss": 0.0033, "step": 14920 }, { "epoch": 3.9591620259878018, "grad_norm": 0.006688418798148632, "learning_rate": 5e-05, "loss": 0.0025, "step": 14930 }, { "epoch": 3.9618138424821003, "grad_norm": 0.004463114310055971, "learning_rate": 5e-05, "loss": 0.0021, "step": 14940 }, { "epoch": 3.964465658976399, "grad_norm": 0.006771722808480263, "learning_rate": 5e-05, "loss": 0.0023, "step": 14950 }, { "epoch": 3.9671174754706975, "grad_norm": 0.009392993524670601, "learning_rate": 5e-05, "loss": 0.0025, "step": 14960 }, { "epoch": 3.969769291964996, "grad_norm": 0.010050095617771149, "learning_rate": 5e-05, "loss": 0.0026, "step": 14970 }, { "epoch": 3.9724211084592946, "grad_norm": 0.008699995465576649, "learning_rate": 5e-05, "loss": 0.0029, "step": 14980 }, { "epoch": 3.975072924953593, "grad_norm": 0.007309725508093834, "learning_rate": 5e-05, "loss": 0.0029, "step": 14990 }, { "epoch": 3.977724741447892, "grad_norm": 0.009145733900368214, "learning_rate": 5e-05, "loss": 0.0029, "step": 15000 }, { "epoch": 3.9803765579421904, "grad_norm": 0.008233997970819473, "learning_rate": 5e-05, "loss": 0.0024, "step": 15010 }, { "epoch": 3.983028374436489, "grad_norm": 0.010516766458749771, "learning_rate": 5e-05, "loss": 0.0029, "step": 15020 }, { "epoch": 3.9856801909307875, "grad_norm": 0.009537981823086739, "learning_rate": 5e-05, "loss": 0.0025, "step": 15030 }, { "epoch": 3.988332007425086, "grad_norm": 0.024754511192440987, "learning_rate": 5e-05, "loss": 0.0026, "step": 15040 }, { "epoch": 3.9909838239193847, "grad_norm": 0.009900420904159546, "learning_rate": 5e-05, "loss": 0.0029, "step": 15050 }, { "epoch": 3.9936356404136832, "grad_norm": 0.019124867394566536, "learning_rate": 5e-05, "loss": 0.0022, "step": 15060 }, { "epoch": 3.996287456907982, "grad_norm": 0.011227266862988472, "learning_rate": 5e-05, "loss": 0.0027, "step": 15070 }, { "epoch": 3.9989392734022804, "grad_norm": 0.009516655467450619, "learning_rate": 5e-05, "loss": 0.0026, "step": 15080 }, { "epoch": 4.001591089896579, "grad_norm": 0.005022477358579636, "learning_rate": 5e-05, "loss": 0.0024, "step": 15090 }, { "epoch": 4.0042429063908775, "grad_norm": 0.00815173052251339, "learning_rate": 5e-05, "loss": 0.0027, "step": 15100 }, { "epoch": 4.006894722885177, "grad_norm": 0.008042416535317898, "learning_rate": 5e-05, "loss": 0.0026, "step": 15110 }, { "epoch": 4.009546539379475, "grad_norm": 0.013812507502734661, "learning_rate": 5e-05, "loss": 0.0026, "step": 15120 }, { "epoch": 4.012198355873774, "grad_norm": 0.00687458785250783, "learning_rate": 5e-05, "loss": 0.0029, "step": 15130 }, { "epoch": 4.014850172368072, "grad_norm": 0.005765493027865887, "learning_rate": 5e-05, "loss": 0.0024, "step": 15140 }, { "epoch": 4.017501988862371, "grad_norm": 0.011101383715867996, "learning_rate": 5e-05, "loss": 0.0024, "step": 15150 }, { "epoch": 4.020153805356669, "grad_norm": 0.00815488863736391, "learning_rate": 5e-05, "loss": 0.003, "step": 15160 }, { "epoch": 4.022805621850968, "grad_norm": 0.008026650175452232, "learning_rate": 5e-05, "loss": 0.0025, "step": 15170 }, { "epoch": 4.025457438345266, "grad_norm": 0.004814527928829193, "learning_rate": 5e-05, "loss": 0.0028, "step": 15180 }, { "epoch": 4.028109254839565, "grad_norm": 0.005404304247349501, "learning_rate": 5e-05, "loss": 0.0027, "step": 15190 }, { "epoch": 4.030761071333863, "grad_norm": 0.008779503405094147, "learning_rate": 5e-05, "loss": 0.0023, "step": 15200 }, { "epoch": 4.033412887828162, "grad_norm": 0.01807679608464241, "learning_rate": 5e-05, "loss": 0.0024, "step": 15210 }, { "epoch": 4.0360647043224605, "grad_norm": 0.011976487934589386, "learning_rate": 5e-05, "loss": 0.0028, "step": 15220 }, { "epoch": 4.0387165208167595, "grad_norm": 0.009957092814147472, "learning_rate": 5e-05, "loss": 0.0023, "step": 15230 }, { "epoch": 4.0413683373110585, "grad_norm": 0.00926530733704567, "learning_rate": 5e-05, "loss": 0.0027, "step": 15240 }, { "epoch": 4.044020153805357, "grad_norm": 0.01033795066177845, "learning_rate": 5e-05, "loss": 0.0021, "step": 15250 }, { "epoch": 4.046671970299656, "grad_norm": 0.007956873625516891, "learning_rate": 5e-05, "loss": 0.0025, "step": 15260 }, { "epoch": 4.049323786793954, "grad_norm": 0.005751341115683317, "learning_rate": 5e-05, "loss": 0.0026, "step": 15270 }, { "epoch": 4.051975603288253, "grad_norm": 0.010111185722053051, "learning_rate": 5e-05, "loss": 0.0032, "step": 15280 }, { "epoch": 4.054627419782551, "grad_norm": 0.005843948572874069, "learning_rate": 5e-05, "loss": 0.0026, "step": 15290 }, { "epoch": 4.05727923627685, "grad_norm": 0.017574086785316467, "learning_rate": 5e-05, "loss": 0.0025, "step": 15300 }, { "epoch": 4.059931052771148, "grad_norm": 0.010595451109111309, "learning_rate": 5e-05, "loss": 0.0031, "step": 15310 }, { "epoch": 4.062582869265447, "grad_norm": 0.010846746154129505, "learning_rate": 5e-05, "loss": 0.0022, "step": 15320 }, { "epoch": 4.065234685759745, "grad_norm": 0.003765919478610158, "learning_rate": 5e-05, "loss": 0.0022, "step": 15330 }, { "epoch": 4.067886502254044, "grad_norm": 0.008866051211953163, "learning_rate": 5e-05, "loss": 0.0021, "step": 15340 }, { "epoch": 4.070538318748342, "grad_norm": 0.01170317828655243, "learning_rate": 5e-05, "loss": 0.0028, "step": 15350 }, { "epoch": 4.073190135242641, "grad_norm": 0.01966707780957222, "learning_rate": 5e-05, "loss": 0.0028, "step": 15360 }, { "epoch": 4.0758419517369395, "grad_norm": 0.008551125414669514, "learning_rate": 5e-05, "loss": 0.0024, "step": 15370 }, { "epoch": 4.0784937682312385, "grad_norm": 0.008552002720534801, "learning_rate": 5e-05, "loss": 0.0027, "step": 15380 }, { "epoch": 4.081145584725537, "grad_norm": 0.012902392074465752, "learning_rate": 5e-05, "loss": 0.0027, "step": 15390 }, { "epoch": 4.083797401219836, "grad_norm": 0.00695030577480793, "learning_rate": 5e-05, "loss": 0.0025, "step": 15400 }, { "epoch": 4.086449217714134, "grad_norm": 0.008856438100337982, "learning_rate": 5e-05, "loss": 0.0025, "step": 15410 }, { "epoch": 4.089101034208433, "grad_norm": 0.010097301565110683, "learning_rate": 5e-05, "loss": 0.0028, "step": 15420 }, { "epoch": 4.091752850702731, "grad_norm": 0.007564583793282509, "learning_rate": 5e-05, "loss": 0.0022, "step": 15430 }, { "epoch": 4.09440466719703, "grad_norm": 0.011479288339614868, "learning_rate": 5e-05, "loss": 0.0023, "step": 15440 }, { "epoch": 4.097056483691328, "grad_norm": 0.00758014852181077, "learning_rate": 5e-05, "loss": 0.0025, "step": 15450 }, { "epoch": 4.099708300185627, "grad_norm": 0.0074356175027787685, "learning_rate": 5e-05, "loss": 0.0022, "step": 15460 }, { "epoch": 4.102360116679926, "grad_norm": 0.008101812563836575, "learning_rate": 5e-05, "loss": 0.0024, "step": 15470 }, { "epoch": 4.105011933174224, "grad_norm": 0.014048833400011063, "learning_rate": 5e-05, "loss": 0.0024, "step": 15480 }, { "epoch": 4.107663749668523, "grad_norm": 0.010367672890424728, "learning_rate": 5e-05, "loss": 0.0027, "step": 15490 }, { "epoch": 4.1103155661628215, "grad_norm": 0.006588998716324568, "learning_rate": 5e-05, "loss": 0.0027, "step": 15500 }, { "epoch": 4.1129673826571205, "grad_norm": 0.0038325851783156395, "learning_rate": 5e-05, "loss": 0.0024, "step": 15510 }, { "epoch": 4.115619199151419, "grad_norm": 0.013166899792850018, "learning_rate": 5e-05, "loss": 0.0029, "step": 15520 }, { "epoch": 4.118271015645718, "grad_norm": 0.008794999681413174, "learning_rate": 5e-05, "loss": 0.0025, "step": 15530 }, { "epoch": 4.120922832140016, "grad_norm": 0.01400101650506258, "learning_rate": 5e-05, "loss": 0.0026, "step": 15540 }, { "epoch": 4.123574648634315, "grad_norm": 0.007475132588297129, "learning_rate": 5e-05, "loss": 0.0024, "step": 15550 }, { "epoch": 4.126226465128613, "grad_norm": 0.00820545107126236, "learning_rate": 5e-05, "loss": 0.0026, "step": 15560 }, { "epoch": 4.128878281622912, "grad_norm": 0.0073463087901473045, "learning_rate": 5e-05, "loss": 0.0026, "step": 15570 }, { "epoch": 4.13153009811721, "grad_norm": 0.017729012295603752, "learning_rate": 5e-05, "loss": 0.0029, "step": 15580 }, { "epoch": 4.134181914611509, "grad_norm": 0.014985989779233932, "learning_rate": 5e-05, "loss": 0.0028, "step": 15590 }, { "epoch": 4.136833731105807, "grad_norm": 0.006068153772503138, "learning_rate": 5e-05, "loss": 0.0027, "step": 15600 }, { "epoch": 4.139485547600106, "grad_norm": 0.007643744815140963, "learning_rate": 5e-05, "loss": 0.0026, "step": 15610 }, { "epoch": 4.142137364094404, "grad_norm": 0.01598348841071129, "learning_rate": 5e-05, "loss": 0.0033, "step": 15620 }, { "epoch": 4.144789180588703, "grad_norm": 0.008618521504104137, "learning_rate": 5e-05, "loss": 0.0027, "step": 15630 }, { "epoch": 4.1474409970830015, "grad_norm": 0.007251124829053879, "learning_rate": 5e-05, "loss": 0.0024, "step": 15640 }, { "epoch": 4.1500928135773005, "grad_norm": 0.009798114188015461, "learning_rate": 5e-05, "loss": 0.0023, "step": 15650 }, { "epoch": 4.152744630071599, "grad_norm": 0.005940656643360853, "learning_rate": 5e-05, "loss": 0.0024, "step": 15660 }, { "epoch": 4.155396446565898, "grad_norm": 0.010411057621240616, "learning_rate": 5e-05, "loss": 0.0027, "step": 15670 }, { "epoch": 4.158048263060196, "grad_norm": 0.011272044852375984, "learning_rate": 5e-05, "loss": 0.003, "step": 15680 }, { "epoch": 4.160700079554495, "grad_norm": 0.006107686553150415, "learning_rate": 5e-05, "loss": 0.0021, "step": 15690 }, { "epoch": 4.163351896048793, "grad_norm": 0.010294955223798752, "learning_rate": 5e-05, "loss": 0.0021, "step": 15700 }, { "epoch": 4.166003712543092, "grad_norm": 0.005170962307602167, "learning_rate": 5e-05, "loss": 0.0015, "step": 15710 }, { "epoch": 4.168655529037391, "grad_norm": 0.011078094132244587, "learning_rate": 5e-05, "loss": 0.003, "step": 15720 }, { "epoch": 4.171307345531689, "grad_norm": 0.004677261225879192, "learning_rate": 5e-05, "loss": 0.0026, "step": 15730 }, { "epoch": 4.173959162025988, "grad_norm": 0.008776109665632248, "learning_rate": 5e-05, "loss": 0.002, "step": 15740 }, { "epoch": 4.176610978520286, "grad_norm": 0.010209780186414719, "learning_rate": 5e-05, "loss": 0.0028, "step": 15750 }, { "epoch": 4.179262795014585, "grad_norm": 0.006502863951027393, "learning_rate": 5e-05, "loss": 0.0029, "step": 15760 }, { "epoch": 4.181914611508883, "grad_norm": 0.009896485134959221, "learning_rate": 5e-05, "loss": 0.0024, "step": 15770 }, { "epoch": 4.1845664280031825, "grad_norm": 0.008999438025057316, "learning_rate": 5e-05, "loss": 0.0025, "step": 15780 }, { "epoch": 4.187218244497481, "grad_norm": 0.007617855444550514, "learning_rate": 5e-05, "loss": 0.0026, "step": 15790 }, { "epoch": 4.18987006099178, "grad_norm": 0.008191118016839027, "learning_rate": 5e-05, "loss": 0.0029, "step": 15800 }, { "epoch": 4.192521877486078, "grad_norm": 0.010412057861685753, "learning_rate": 5e-05, "loss": 0.0024, "step": 15810 }, { "epoch": 4.195173693980377, "grad_norm": 0.0081082284450531, "learning_rate": 5e-05, "loss": 0.0036, "step": 15820 }, { "epoch": 4.197825510474675, "grad_norm": 0.007017150055617094, "learning_rate": 5e-05, "loss": 0.003, "step": 15830 }, { "epoch": 4.200477326968974, "grad_norm": 0.015287511050701141, "learning_rate": 5e-05, "loss": 0.0024, "step": 15840 }, { "epoch": 4.203129143463272, "grad_norm": 0.010176829062402248, "learning_rate": 5e-05, "loss": 0.0027, "step": 15850 }, { "epoch": 4.205780959957571, "grad_norm": 0.007373983506113291, "learning_rate": 5e-05, "loss": 0.0023, "step": 15860 }, { "epoch": 4.208432776451869, "grad_norm": 0.010654757730662823, "learning_rate": 5e-05, "loss": 0.0021, "step": 15870 }, { "epoch": 4.211084592946168, "grad_norm": 0.0048538437113165855, "learning_rate": 5e-05, "loss": 0.0027, "step": 15880 }, { "epoch": 4.213736409440466, "grad_norm": 0.005359725095331669, "learning_rate": 5e-05, "loss": 0.0026, "step": 15890 }, { "epoch": 4.216388225934765, "grad_norm": 0.012133251875638962, "learning_rate": 5e-05, "loss": 0.0025, "step": 15900 }, { "epoch": 4.2190400424290635, "grad_norm": 0.00618775887414813, "learning_rate": 5e-05, "loss": 0.0025, "step": 15910 }, { "epoch": 4.2216918589233625, "grad_norm": 0.014979773201048374, "learning_rate": 5e-05, "loss": 0.0026, "step": 15920 }, { "epoch": 4.224343675417661, "grad_norm": 0.008881661109626293, "learning_rate": 5e-05, "loss": 0.0022, "step": 15930 }, { "epoch": 4.22699549191196, "grad_norm": 0.010891123674809933, "learning_rate": 5e-05, "loss": 0.0022, "step": 15940 }, { "epoch": 4.229647308406259, "grad_norm": 0.01366396527737379, "learning_rate": 5e-05, "loss": 0.0023, "step": 15950 }, { "epoch": 4.232299124900557, "grad_norm": 0.011937658302485943, "learning_rate": 5e-05, "loss": 0.003, "step": 15960 }, { "epoch": 4.234950941394856, "grad_norm": 0.00983242318034172, "learning_rate": 5e-05, "loss": 0.0027, "step": 15970 }, { "epoch": 4.237602757889154, "grad_norm": 0.007831680588424206, "learning_rate": 5e-05, "loss": 0.0025, "step": 15980 }, { "epoch": 4.240254574383453, "grad_norm": 0.004416955169290304, "learning_rate": 5e-05, "loss": 0.0028, "step": 15990 }, { "epoch": 4.242906390877751, "grad_norm": 0.015489389188587666, "learning_rate": 5e-05, "loss": 0.0027, "step": 16000 }, { "epoch": 4.24555820737205, "grad_norm": 0.026868030428886414, "learning_rate": 5e-05, "loss": 0.0022, "step": 16010 }, { "epoch": 4.248210023866348, "grad_norm": 0.008888555690646172, "learning_rate": 5e-05, "loss": 0.0022, "step": 16020 }, { "epoch": 4.250861840360647, "grad_norm": 0.014179510995745659, "learning_rate": 5e-05, "loss": 0.0026, "step": 16030 }, { "epoch": 4.253513656854945, "grad_norm": 0.005505410488694906, "learning_rate": 5e-05, "loss": 0.0024, "step": 16040 }, { "epoch": 4.256165473349244, "grad_norm": 0.009107704274356365, "learning_rate": 5e-05, "loss": 0.0029, "step": 16050 }, { "epoch": 4.258817289843543, "grad_norm": 0.0065087792463600636, "learning_rate": 5e-05, "loss": 0.0028, "step": 16060 }, { "epoch": 4.261469106337842, "grad_norm": 0.007014525588601828, "learning_rate": 5e-05, "loss": 0.0029, "step": 16070 }, { "epoch": 4.26412092283214, "grad_norm": 0.009081746451556683, "learning_rate": 5e-05, "loss": 0.0028, "step": 16080 }, { "epoch": 4.266772739326439, "grad_norm": 0.005923256743699312, "learning_rate": 5e-05, "loss": 0.0026, "step": 16090 }, { "epoch": 4.269424555820737, "grad_norm": 0.010579952970147133, "learning_rate": 5e-05, "loss": 0.0024, "step": 16100 }, { "epoch": 4.272076372315036, "grad_norm": 0.007529107388108969, "learning_rate": 5e-05, "loss": 0.0026, "step": 16110 }, { "epoch": 4.274728188809334, "grad_norm": 0.008107173256576061, "learning_rate": 5e-05, "loss": 0.0023, "step": 16120 }, { "epoch": 4.277380005303633, "grad_norm": 0.010039577260613441, "learning_rate": 5e-05, "loss": 0.0026, "step": 16130 }, { "epoch": 4.280031821797931, "grad_norm": 0.009119220077991486, "learning_rate": 5e-05, "loss": 0.0022, "step": 16140 }, { "epoch": 4.28268363829223, "grad_norm": 0.0057527110911905766, "learning_rate": 5e-05, "loss": 0.0025, "step": 16150 }, { "epoch": 4.285335454786528, "grad_norm": 0.007705080788582563, "learning_rate": 5e-05, "loss": 0.0022, "step": 16160 }, { "epoch": 4.287987271280827, "grad_norm": 0.011342032812535763, "learning_rate": 5e-05, "loss": 0.0022, "step": 16170 }, { "epoch": 4.290639087775126, "grad_norm": 0.009245144203305244, "learning_rate": 5e-05, "loss": 0.0024, "step": 16180 }, { "epoch": 4.2932909042694245, "grad_norm": 0.01407627947628498, "learning_rate": 5e-05, "loss": 0.0025, "step": 16190 }, { "epoch": 4.2959427207637235, "grad_norm": 0.010243909433484077, "learning_rate": 5e-05, "loss": 0.0023, "step": 16200 }, { "epoch": 4.298594537258022, "grad_norm": 0.006399893201887608, "learning_rate": 5e-05, "loss": 0.0023, "step": 16210 }, { "epoch": 4.301246353752321, "grad_norm": 0.01122941356152296, "learning_rate": 5e-05, "loss": 0.0024, "step": 16220 }, { "epoch": 4.303898170246619, "grad_norm": 0.00825104583054781, "learning_rate": 5e-05, "loss": 0.0026, "step": 16230 }, { "epoch": 4.306549986740918, "grad_norm": 0.009232701733708382, "learning_rate": 5e-05, "loss": 0.0027, "step": 16240 }, { "epoch": 4.309201803235216, "grad_norm": 0.0048955525271594524, "learning_rate": 5e-05, "loss": 0.0027, "step": 16250 }, { "epoch": 4.311853619729515, "grad_norm": 0.009229153394699097, "learning_rate": 5e-05, "loss": 0.0023, "step": 16260 }, { "epoch": 4.314505436223813, "grad_norm": 0.008362018503248692, "learning_rate": 5e-05, "loss": 0.0026, "step": 16270 }, { "epoch": 4.317157252718112, "grad_norm": 0.007550325710326433, "learning_rate": 5e-05, "loss": 0.002, "step": 16280 }, { "epoch": 4.31980906921241, "grad_norm": 0.010534018278121948, "learning_rate": 5e-05, "loss": 0.0025, "step": 16290 }, { "epoch": 4.322460885706709, "grad_norm": 0.005788877606391907, "learning_rate": 5e-05, "loss": 0.0026, "step": 16300 }, { "epoch": 4.325112702201007, "grad_norm": 0.011799757368862629, "learning_rate": 5e-05, "loss": 0.0026, "step": 16310 }, { "epoch": 4.327764518695306, "grad_norm": 0.0066305906511843204, "learning_rate": 5e-05, "loss": 0.0026, "step": 16320 }, { "epoch": 4.330416335189605, "grad_norm": 0.008191617205739021, "learning_rate": 5e-05, "loss": 0.0025, "step": 16330 }, { "epoch": 4.333068151683904, "grad_norm": 0.00969676673412323, "learning_rate": 5e-05, "loss": 0.0022, "step": 16340 }, { "epoch": 4.335719968178202, "grad_norm": 0.010808815248310566, "learning_rate": 5e-05, "loss": 0.0025, "step": 16350 }, { "epoch": 4.338371784672501, "grad_norm": 0.01742013916373253, "learning_rate": 5e-05, "loss": 0.003, "step": 16360 }, { "epoch": 4.341023601166799, "grad_norm": 0.011579358950257301, "learning_rate": 5e-05, "loss": 0.0022, "step": 16370 }, { "epoch": 4.343675417661098, "grad_norm": 0.009403097443282604, "learning_rate": 5e-05, "loss": 0.0021, "step": 16380 }, { "epoch": 4.346327234155396, "grad_norm": 0.005998173728585243, "learning_rate": 5e-05, "loss": 0.0023, "step": 16390 }, { "epoch": 4.348979050649695, "grad_norm": 0.00842077936977148, "learning_rate": 5e-05, "loss": 0.0027, "step": 16400 }, { "epoch": 4.351630867143994, "grad_norm": 0.016417445614933968, "learning_rate": 5e-05, "loss": 0.0029, "step": 16410 }, { "epoch": 4.354282683638292, "grad_norm": 0.006509271450340748, "learning_rate": 5e-05, "loss": 0.0026, "step": 16420 }, { "epoch": 4.356934500132591, "grad_norm": 0.012377278879284859, "learning_rate": 5e-05, "loss": 0.0026, "step": 16430 }, { "epoch": 4.359586316626889, "grad_norm": 0.023760629817843437, "learning_rate": 5e-05, "loss": 0.0025, "step": 16440 }, { "epoch": 4.362238133121188, "grad_norm": 0.006723439320921898, "learning_rate": 5e-05, "loss": 0.0028, "step": 16450 }, { "epoch": 4.3648899496154865, "grad_norm": 0.009623004123568535, "learning_rate": 5e-05, "loss": 0.0024, "step": 16460 }, { "epoch": 4.3675417661097855, "grad_norm": 0.00998237356543541, "learning_rate": 5e-05, "loss": 0.0023, "step": 16470 }, { "epoch": 4.370193582604084, "grad_norm": 0.00886438973248005, "learning_rate": 5e-05, "loss": 0.0032, "step": 16480 }, { "epoch": 4.372845399098383, "grad_norm": 0.008397947065532207, "learning_rate": 5e-05, "loss": 0.0024, "step": 16490 }, { "epoch": 4.375497215592681, "grad_norm": 0.01084205973893404, "learning_rate": 5e-05, "loss": 0.002, "step": 16500 }, { "epoch": 4.37814903208698, "grad_norm": 0.0119198989123106, "learning_rate": 5e-05, "loss": 0.002, "step": 16510 }, { "epoch": 4.380800848581278, "grad_norm": 0.008697298355400562, "learning_rate": 5e-05, "loss": 0.0024, "step": 16520 }, { "epoch": 4.383452665075577, "grad_norm": 0.015554849058389664, "learning_rate": 5e-05, "loss": 0.002, "step": 16530 }, { "epoch": 4.386104481569875, "grad_norm": 0.011053034104406834, "learning_rate": 5e-05, "loss": 0.0028, "step": 16540 }, { "epoch": 4.388756298064174, "grad_norm": 0.007424443494528532, "learning_rate": 5e-05, "loss": 0.0025, "step": 16550 }, { "epoch": 4.391408114558472, "grad_norm": 0.013250579126179218, "learning_rate": 5e-05, "loss": 0.0027, "step": 16560 }, { "epoch": 4.394059931052771, "grad_norm": 0.014538522809743881, "learning_rate": 5e-05, "loss": 0.0024, "step": 16570 }, { "epoch": 4.396711747547069, "grad_norm": 0.010441260412335396, "learning_rate": 5e-05, "loss": 0.0023, "step": 16580 }, { "epoch": 4.399363564041368, "grad_norm": 0.011740516871213913, "learning_rate": 5e-05, "loss": 0.0023, "step": 16590 }, { "epoch": 4.4020153805356665, "grad_norm": 0.012403851374983788, "learning_rate": 5e-05, "loss": 0.0024, "step": 16600 }, { "epoch": 4.404667197029966, "grad_norm": 0.009253264404833317, "learning_rate": 5e-05, "loss": 0.0024, "step": 16610 }, { "epoch": 4.407319013524264, "grad_norm": 0.007247175555676222, "learning_rate": 5e-05, "loss": 0.0025, "step": 16620 }, { "epoch": 4.409970830018563, "grad_norm": 0.00860622152686119, "learning_rate": 5e-05, "loss": 0.0026, "step": 16630 }, { "epoch": 4.412622646512862, "grad_norm": 0.007542745675891638, "learning_rate": 5e-05, "loss": 0.0029, "step": 16640 }, { "epoch": 4.41527446300716, "grad_norm": 0.010870569385588169, "learning_rate": 5e-05, "loss": 0.0027, "step": 16650 }, { "epoch": 4.417926279501459, "grad_norm": 0.007201709318906069, "learning_rate": 5e-05, "loss": 0.0023, "step": 16660 }, { "epoch": 4.420578095995757, "grad_norm": 0.00915071927011013, "learning_rate": 5e-05, "loss": 0.0016, "step": 16670 }, { "epoch": 4.423229912490056, "grad_norm": 0.004696717951446772, "learning_rate": 5e-05, "loss": 0.002, "step": 16680 }, { "epoch": 4.425881728984354, "grad_norm": 0.012144661508500576, "learning_rate": 5e-05, "loss": 0.0031, "step": 16690 }, { "epoch": 4.428533545478653, "grad_norm": 0.007173256017267704, "learning_rate": 5e-05, "loss": 0.0026, "step": 16700 }, { "epoch": 4.431185361972951, "grad_norm": 0.010684410110116005, "learning_rate": 5e-05, "loss": 0.0024, "step": 16710 }, { "epoch": 4.43383717846725, "grad_norm": 0.008377669379115105, "learning_rate": 5e-05, "loss": 0.0024, "step": 16720 }, { "epoch": 4.4364889949615485, "grad_norm": 0.0064465925097465515, "learning_rate": 5e-05, "loss": 0.0028, "step": 16730 }, { "epoch": 4.4391408114558475, "grad_norm": 0.01103215105831623, "learning_rate": 5e-05, "loss": 0.0026, "step": 16740 }, { "epoch": 4.441792627950146, "grad_norm": 0.007930045947432518, "learning_rate": 5e-05, "loss": 0.0025, "step": 16750 }, { "epoch": 4.444444444444445, "grad_norm": 0.0081079276278615, "learning_rate": 5e-05, "loss": 0.002, "step": 16760 }, { "epoch": 4.447096260938743, "grad_norm": 0.007294425740838051, "learning_rate": 5e-05, "loss": 0.0026, "step": 16770 }, { "epoch": 4.449748077433042, "grad_norm": 0.01107637956738472, "learning_rate": 5e-05, "loss": 0.0022, "step": 16780 }, { "epoch": 4.45239989392734, "grad_norm": 0.008090582676231861, "learning_rate": 5e-05, "loss": 0.0015, "step": 16790 }, { "epoch": 4.455051710421639, "grad_norm": 0.011434951797127724, "learning_rate": 5e-05, "loss": 0.003, "step": 16800 }, { "epoch": 4.457703526915937, "grad_norm": 0.005317237693816423, "learning_rate": 5e-05, "loss": 0.0018, "step": 16810 }, { "epoch": 4.460355343410236, "grad_norm": 0.00959730613976717, "learning_rate": 5e-05, "loss": 0.0027, "step": 16820 }, { "epoch": 4.463007159904534, "grad_norm": 0.013586374931037426, "learning_rate": 5e-05, "loss": 0.0026, "step": 16830 }, { "epoch": 4.465658976398833, "grad_norm": 0.0071860686875879765, "learning_rate": 5e-05, "loss": 0.0024, "step": 16840 }, { "epoch": 4.468310792893131, "grad_norm": 0.008671855553984642, "learning_rate": 5e-05, "loss": 0.0023, "step": 16850 }, { "epoch": 4.47096260938743, "grad_norm": 0.011683979071676731, "learning_rate": 5e-05, "loss": 0.0027, "step": 16860 }, { "epoch": 4.473614425881729, "grad_norm": 0.006530901417136192, "learning_rate": 5e-05, "loss": 0.0023, "step": 16870 }, { "epoch": 4.4762662423760275, "grad_norm": 0.013120067305862904, "learning_rate": 5e-05, "loss": 0.0025, "step": 16880 }, { "epoch": 4.478918058870327, "grad_norm": 0.008053500205278397, "learning_rate": 5e-05, "loss": 0.0024, "step": 16890 }, { "epoch": 4.481569875364625, "grad_norm": 0.00954759307205677, "learning_rate": 5e-05, "loss": 0.0021, "step": 16900 }, { "epoch": 4.484221691858924, "grad_norm": 0.009571189060807228, "learning_rate": 5e-05, "loss": 0.0025, "step": 16910 }, { "epoch": 4.486873508353222, "grad_norm": 0.00730912946164608, "learning_rate": 5e-05, "loss": 0.0022, "step": 16920 }, { "epoch": 4.489525324847521, "grad_norm": 0.006479683332145214, "learning_rate": 5e-05, "loss": 0.0025, "step": 16930 }, { "epoch": 4.492177141341819, "grad_norm": 0.009897160343825817, "learning_rate": 5e-05, "loss": 0.0024, "step": 16940 }, { "epoch": 4.494828957836118, "grad_norm": 0.00779009610414505, "learning_rate": 5e-05, "loss": 0.002, "step": 16950 }, { "epoch": 4.497480774330416, "grad_norm": 0.0094199413433671, "learning_rate": 5e-05, "loss": 0.0021, "step": 16960 }, { "epoch": 4.500132590824715, "grad_norm": 0.015873564407229424, "learning_rate": 5e-05, "loss": 0.0027, "step": 16970 }, { "epoch": 4.502784407319013, "grad_norm": 0.005503157619386911, "learning_rate": 5e-05, "loss": 0.0022, "step": 16980 }, { "epoch": 4.505436223813312, "grad_norm": 0.009524544700980186, "learning_rate": 5e-05, "loss": 0.0025, "step": 16990 }, { "epoch": 4.5080880403076105, "grad_norm": 0.008638451807200909, "learning_rate": 5e-05, "loss": 0.0024, "step": 17000 }, { "epoch": 4.5107398568019095, "grad_norm": 0.011376433074474335, "learning_rate": 5e-05, "loss": 0.0023, "step": 17010 }, { "epoch": 4.513391673296208, "grad_norm": 0.008573627099394798, "learning_rate": 5e-05, "loss": 0.002, "step": 17020 }, { "epoch": 4.516043489790507, "grad_norm": 0.007988306693732738, "learning_rate": 5e-05, "loss": 0.003, "step": 17030 }, { "epoch": 4.518695306284805, "grad_norm": 0.00742417573928833, "learning_rate": 5e-05, "loss": 0.0025, "step": 17040 }, { "epoch": 4.521347122779104, "grad_norm": 0.008118955418467522, "learning_rate": 5e-05, "loss": 0.0022, "step": 17050 }, { "epoch": 4.523998939273402, "grad_norm": 0.007288120221346617, "learning_rate": 5e-05, "loss": 0.0022, "step": 17060 }, { "epoch": 4.526650755767701, "grad_norm": 0.008039956912398338, "learning_rate": 5e-05, "loss": 0.0024, "step": 17070 }, { "epoch": 4.529302572261999, "grad_norm": 0.008547033183276653, "learning_rate": 5e-05, "loss": 0.0021, "step": 17080 }, { "epoch": 4.531954388756298, "grad_norm": 0.007560960948467255, "learning_rate": 5e-05, "loss": 0.0025, "step": 17090 }, { "epoch": 4.534606205250597, "grad_norm": 0.015920381993055344, "learning_rate": 5e-05, "loss": 0.0029, "step": 17100 }, { "epoch": 4.537258021744895, "grad_norm": 0.008901769295334816, "learning_rate": 5e-05, "loss": 0.0025, "step": 17110 }, { "epoch": 4.539909838239193, "grad_norm": 0.011614211834967136, "learning_rate": 5e-05, "loss": 0.0019, "step": 17120 }, { "epoch": 4.542561654733492, "grad_norm": 0.00521627115085721, "learning_rate": 5e-05, "loss": 0.0023, "step": 17130 }, { "epoch": 4.545213471227791, "grad_norm": 0.013656217604875565, "learning_rate": 5e-05, "loss": 0.0029, "step": 17140 }, { "epoch": 4.5478652877220895, "grad_norm": 0.04105531424283981, "learning_rate": 5e-05, "loss": 0.0026, "step": 17150 }, { "epoch": 4.5505171042163886, "grad_norm": 0.016687462106347084, "learning_rate": 5e-05, "loss": 0.0024, "step": 17160 }, { "epoch": 4.553168920710687, "grad_norm": 0.00868432316929102, "learning_rate": 5e-05, "loss": 0.0025, "step": 17170 }, { "epoch": 4.555820737204986, "grad_norm": 0.009711219929158688, "learning_rate": 5e-05, "loss": 0.0028, "step": 17180 }, { "epoch": 4.558472553699284, "grad_norm": 0.008104918524622917, "learning_rate": 5e-05, "loss": 0.0027, "step": 17190 }, { "epoch": 4.561124370193583, "grad_norm": 0.010512364096939564, "learning_rate": 5e-05, "loss": 0.0023, "step": 17200 }, { "epoch": 4.563776186687881, "grad_norm": 0.010798780247569084, "learning_rate": 5e-05, "loss": 0.0022, "step": 17210 }, { "epoch": 4.56642800318218, "grad_norm": 0.00893179140985012, "learning_rate": 5e-05, "loss": 0.0021, "step": 17220 }, { "epoch": 4.569079819676478, "grad_norm": 0.014848314225673676, "learning_rate": 5e-05, "loss": 0.003, "step": 17230 }, { "epoch": 4.571731636170777, "grad_norm": 0.008952787145972252, "learning_rate": 5e-05, "loss": 0.0026, "step": 17240 }, { "epoch": 4.574383452665075, "grad_norm": 0.005290111992508173, "learning_rate": 5e-05, "loss": 0.0027, "step": 17250 }, { "epoch": 4.577035269159374, "grad_norm": 0.006841118447482586, "learning_rate": 5e-05, "loss": 0.0022, "step": 17260 }, { "epoch": 4.579687085653672, "grad_norm": 0.0063551850616931915, "learning_rate": 5e-05, "loss": 0.002, "step": 17270 }, { "epoch": 4.5823389021479715, "grad_norm": 0.01001470722258091, "learning_rate": 5e-05, "loss": 0.0025, "step": 17280 }, { "epoch": 4.58499071864227, "grad_norm": 0.009360388852655888, "learning_rate": 5e-05, "loss": 0.0021, "step": 17290 }, { "epoch": 4.587642535136569, "grad_norm": 0.011245540343225002, "learning_rate": 5e-05, "loss": 0.0023, "step": 17300 }, { "epoch": 4.590294351630867, "grad_norm": 0.01224070880562067, "learning_rate": 5e-05, "loss": 0.0024, "step": 17310 }, { "epoch": 4.592946168125166, "grad_norm": 0.007543554995208979, "learning_rate": 5e-05, "loss": 0.0024, "step": 17320 }, { "epoch": 4.595597984619465, "grad_norm": 0.012044860050082207, "learning_rate": 5e-05, "loss": 0.0021, "step": 17330 }, { "epoch": 4.598249801113763, "grad_norm": 0.011776200495660305, "learning_rate": 5e-05, "loss": 0.0025, "step": 17340 }, { "epoch": 4.600901617608061, "grad_norm": 0.010950282216072083, "learning_rate": 5e-05, "loss": 0.0022, "step": 17350 }, { "epoch": 4.60355343410236, "grad_norm": 0.010925744660198689, "learning_rate": 5e-05, "loss": 0.0025, "step": 17360 }, { "epoch": 4.606205250596659, "grad_norm": 0.014864250086247921, "learning_rate": 5e-05, "loss": 0.0025, "step": 17370 }, { "epoch": 4.608857067090957, "grad_norm": 0.013101198710501194, "learning_rate": 5e-05, "loss": 0.003, "step": 17380 }, { "epoch": 4.611508883585256, "grad_norm": 0.013338735327124596, "learning_rate": 5e-05, "loss": 0.003, "step": 17390 }, { "epoch": 4.614160700079554, "grad_norm": 0.007085057906806469, "learning_rate": 5e-05, "loss": 0.0021, "step": 17400 }, { "epoch": 4.616812516573853, "grad_norm": 0.009359377436339855, "learning_rate": 5e-05, "loss": 0.003, "step": 17410 }, { "epoch": 4.6194643330681515, "grad_norm": 0.010406458750367165, "learning_rate": 5e-05, "loss": 0.0022, "step": 17420 }, { "epoch": 4.6221161495624505, "grad_norm": 0.012977986596524715, "learning_rate": 5e-05, "loss": 0.0023, "step": 17430 }, { "epoch": 4.624767966056749, "grad_norm": 0.003755610901862383, "learning_rate": 5e-05, "loss": 0.0019, "step": 17440 }, { "epoch": 4.627419782551048, "grad_norm": 0.011827344074845314, "learning_rate": 5e-05, "loss": 0.0021, "step": 17450 }, { "epoch": 4.630071599045346, "grad_norm": 0.009999267756938934, "learning_rate": 5e-05, "loss": 0.0022, "step": 17460 }, { "epoch": 4.632723415539645, "grad_norm": 0.007511702366173267, "learning_rate": 5e-05, "loss": 0.0023, "step": 17470 }, { "epoch": 4.635375232033943, "grad_norm": 0.006017100065946579, "learning_rate": 5e-05, "loss": 0.002, "step": 17480 }, { "epoch": 4.638027048528242, "grad_norm": 0.011882358230650425, "learning_rate": 5e-05, "loss": 0.0024, "step": 17490 }, { "epoch": 4.64067886502254, "grad_norm": 0.021907979622483253, "learning_rate": 5e-05, "loss": 0.002, "step": 17500 }, { "epoch": 4.643330681516839, "grad_norm": 0.010786093771457672, "learning_rate": 5e-05, "loss": 0.002, "step": 17510 }, { "epoch": 4.645982498011137, "grad_norm": 0.0075947605073452, "learning_rate": 5e-05, "loss": 0.0016, "step": 17520 }, { "epoch": 4.648634314505436, "grad_norm": 0.01716807670891285, "learning_rate": 5e-05, "loss": 0.002, "step": 17530 }, { "epoch": 4.651286130999734, "grad_norm": 0.015365085564553738, "learning_rate": 5e-05, "loss": 0.0023, "step": 17540 }, { "epoch": 4.6539379474940334, "grad_norm": 0.007714100182056427, "learning_rate": 5e-05, "loss": 0.0026, "step": 17550 }, { "epoch": 4.6565897639883325, "grad_norm": 0.022539371624588966, "learning_rate": 5e-05, "loss": 0.003, "step": 17560 }, { "epoch": 4.659241580482631, "grad_norm": 0.009364682249724865, "learning_rate": 5e-05, "loss": 0.0026, "step": 17570 }, { "epoch": 4.661893396976929, "grad_norm": 0.012696553952991962, "learning_rate": 5e-05, "loss": 0.0028, "step": 17580 }, { "epoch": 4.664545213471228, "grad_norm": 0.008608188480138779, "learning_rate": 5e-05, "loss": 0.0026, "step": 17590 }, { "epoch": 4.667197029965527, "grad_norm": 0.018880246207118034, "learning_rate": 5e-05, "loss": 0.0025, "step": 17600 }, { "epoch": 4.669848846459825, "grad_norm": 0.015945537015795708, "learning_rate": 5e-05, "loss": 0.0024, "step": 17610 }, { "epoch": 4.672500662954124, "grad_norm": 0.007815002463757992, "learning_rate": 5e-05, "loss": 0.0031, "step": 17620 }, { "epoch": 4.675152479448422, "grad_norm": 0.007628649938851595, "learning_rate": 5e-05, "loss": 0.0026, "step": 17630 }, { "epoch": 4.677804295942721, "grad_norm": 0.009712512604892254, "learning_rate": 5e-05, "loss": 0.0021, "step": 17640 }, { "epoch": 4.680456112437019, "grad_norm": 0.012696007266640663, "learning_rate": 5e-05, "loss": 0.0024, "step": 17650 }, { "epoch": 4.683107928931318, "grad_norm": 0.011976806446909904, "learning_rate": 5e-05, "loss": 0.002, "step": 17660 }, { "epoch": 4.685759745425616, "grad_norm": 0.01108976174145937, "learning_rate": 5e-05, "loss": 0.0022, "step": 17670 }, { "epoch": 4.688411561919915, "grad_norm": 0.005556208547204733, "learning_rate": 5e-05, "loss": 0.0019, "step": 17680 }, { "epoch": 4.6910633784142135, "grad_norm": 0.007990704849362373, "learning_rate": 5e-05, "loss": 0.0022, "step": 17690 }, { "epoch": 4.6937151949085125, "grad_norm": 0.010698648169636726, "learning_rate": 5e-05, "loss": 0.0026, "step": 17700 }, { "epoch": 4.696367011402811, "grad_norm": 0.015230542048811913, "learning_rate": 5e-05, "loss": 0.0019, "step": 17710 }, { "epoch": 4.69901882789711, "grad_norm": 0.0066938623785972595, "learning_rate": 5e-05, "loss": 0.0021, "step": 17720 }, { "epoch": 4.701670644391408, "grad_norm": 0.0038570642936974764, "learning_rate": 5e-05, "loss": 0.0023, "step": 17730 }, { "epoch": 4.704322460885707, "grad_norm": 0.011806166730821133, "learning_rate": 5e-05, "loss": 0.0021, "step": 17740 }, { "epoch": 4.706974277380005, "grad_norm": 0.011627624742686749, "learning_rate": 5e-05, "loss": 0.0022, "step": 17750 }, { "epoch": 4.709626093874304, "grad_norm": 0.00892964843660593, "learning_rate": 5e-05, "loss": 0.0021, "step": 17760 }, { "epoch": 4.712277910368602, "grad_norm": 0.008756861090660095, "learning_rate": 5e-05, "loss": 0.0021, "step": 17770 }, { "epoch": 4.714929726862901, "grad_norm": 0.011889573186635971, "learning_rate": 5e-05, "loss": 0.0025, "step": 17780 }, { "epoch": 4.7175815433572, "grad_norm": 0.006123214028775692, "learning_rate": 5e-05, "loss": 0.0019, "step": 17790 }, { "epoch": 4.720233359851498, "grad_norm": 0.002977800788357854, "learning_rate": 5e-05, "loss": 0.0023, "step": 17800 }, { "epoch": 4.722885176345796, "grad_norm": 0.007765581831336021, "learning_rate": 5e-05, "loss": 0.0023, "step": 17810 }, { "epoch": 4.725536992840095, "grad_norm": 0.0037622975651174784, "learning_rate": 5e-05, "loss": 0.0023, "step": 17820 }, { "epoch": 4.7281888093343944, "grad_norm": 0.012626653537154198, "learning_rate": 5e-05, "loss": 0.0025, "step": 17830 }, { "epoch": 4.730840625828693, "grad_norm": 0.006224280223250389, "learning_rate": 5e-05, "loss": 0.0023, "step": 17840 }, { "epoch": 4.733492442322992, "grad_norm": 0.006588088348507881, "learning_rate": 5e-05, "loss": 0.0025, "step": 17850 }, { "epoch": 4.73614425881729, "grad_norm": 0.010222169570624828, "learning_rate": 5e-05, "loss": 0.0025, "step": 17860 }, { "epoch": 4.738796075311589, "grad_norm": 0.01303170807659626, "learning_rate": 5e-05, "loss": 0.0021, "step": 17870 }, { "epoch": 4.741447891805887, "grad_norm": 0.006802068557590246, "learning_rate": 5e-05, "loss": 0.0021, "step": 17880 }, { "epoch": 4.744099708300186, "grad_norm": 0.005539207253605127, "learning_rate": 5e-05, "loss": 0.0022, "step": 17890 }, { "epoch": 4.746751524794484, "grad_norm": 0.0066566914319992065, "learning_rate": 5e-05, "loss": 0.0024, "step": 17900 }, { "epoch": 4.749403341288783, "grad_norm": 0.006357003469020128, "learning_rate": 5e-05, "loss": 0.0021, "step": 17910 }, { "epoch": 4.752055157783081, "grad_norm": 0.009358861483633518, "learning_rate": 5e-05, "loss": 0.0024, "step": 17920 }, { "epoch": 4.75470697427738, "grad_norm": 0.006505472585558891, "learning_rate": 5e-05, "loss": 0.0023, "step": 17930 }, { "epoch": 4.757358790771678, "grad_norm": 0.008336082100868225, "learning_rate": 5e-05, "loss": 0.0021, "step": 17940 }, { "epoch": 4.760010607265977, "grad_norm": 0.011920350603759289, "learning_rate": 5e-05, "loss": 0.0023, "step": 17950 }, { "epoch": 4.7626624237602755, "grad_norm": 0.014082185924053192, "learning_rate": 5e-05, "loss": 0.0025, "step": 17960 }, { "epoch": 4.7653142402545745, "grad_norm": 0.005958221387118101, "learning_rate": 5e-05, "loss": 0.002, "step": 17970 }, { "epoch": 4.767966056748873, "grad_norm": 0.008960156701505184, "learning_rate": 5e-05, "loss": 0.0016, "step": 17980 }, { "epoch": 4.770617873243172, "grad_norm": 0.011645154096186161, "learning_rate": 5e-05, "loss": 0.0027, "step": 17990 }, { "epoch": 4.77326968973747, "grad_norm": 0.005685538984835148, "learning_rate": 5e-05, "loss": 0.0021, "step": 18000 }, { "epoch": 4.775921506231769, "grad_norm": 0.007106482051312923, "learning_rate": 5e-05, "loss": 0.0022, "step": 18010 }, { "epoch": 4.778573322726068, "grad_norm": 0.022039109840989113, "learning_rate": 5e-05, "loss": 0.0025, "step": 18020 }, { "epoch": 4.781225139220366, "grad_norm": 0.011150696314871311, "learning_rate": 5e-05, "loss": 0.002, "step": 18030 }, { "epoch": 4.783876955714664, "grad_norm": 0.01153763197362423, "learning_rate": 5e-05, "loss": 0.0018, "step": 18040 }, { "epoch": 4.786528772208963, "grad_norm": 0.0068135918118059635, "learning_rate": 5e-05, "loss": 0.0019, "step": 18050 }, { "epoch": 4.789180588703262, "grad_norm": 0.0050367028452456, "learning_rate": 5e-05, "loss": 0.002, "step": 18060 }, { "epoch": 4.79183240519756, "grad_norm": 0.009725788608193398, "learning_rate": 5e-05, "loss": 0.0027, "step": 18070 }, { "epoch": 4.794484221691859, "grad_norm": 0.0068784235045313835, "learning_rate": 5e-05, "loss": 0.0023, "step": 18080 }, { "epoch": 4.797136038186157, "grad_norm": 0.005362240131944418, "learning_rate": 5e-05, "loss": 0.0022, "step": 18090 }, { "epoch": 4.799787854680456, "grad_norm": 0.008044397458434105, "learning_rate": 5e-05, "loss": 0.002, "step": 18100 }, { "epoch": 4.802439671174755, "grad_norm": 0.006384431384503841, "learning_rate": 5e-05, "loss": 0.0023, "step": 18110 }, { "epoch": 4.805091487669054, "grad_norm": 0.008338048122823238, "learning_rate": 5e-05, "loss": 0.002, "step": 18120 }, { "epoch": 4.807743304163352, "grad_norm": 0.00615050969645381, "learning_rate": 5e-05, "loss": 0.0021, "step": 18130 }, { "epoch": 4.810395120657651, "grad_norm": 0.006335535552352667, "learning_rate": 5e-05, "loss": 0.0016, "step": 18140 }, { "epoch": 4.813046937151949, "grad_norm": 0.01287879329174757, "learning_rate": 5e-05, "loss": 0.002, "step": 18150 }, { "epoch": 4.815698753646248, "grad_norm": 0.010444823652505875, "learning_rate": 5e-05, "loss": 0.0022, "step": 18160 }, { "epoch": 4.818350570140546, "grad_norm": 0.010255801491439342, "learning_rate": 5e-05, "loss": 0.002, "step": 18170 }, { "epoch": 4.821002386634845, "grad_norm": 0.008541508577764034, "learning_rate": 5e-05, "loss": 0.0021, "step": 18180 }, { "epoch": 4.823654203129143, "grad_norm": 0.00798971951007843, "learning_rate": 5e-05, "loss": 0.0017, "step": 18190 }, { "epoch": 4.826306019623442, "grad_norm": 0.007130864541977644, "learning_rate": 5e-05, "loss": 0.0023, "step": 18200 }, { "epoch": 4.82895783611774, "grad_norm": 0.006840972229838371, "learning_rate": 5e-05, "loss": 0.0024, "step": 18210 }, { "epoch": 4.831609652612039, "grad_norm": 0.005067253950983286, "learning_rate": 5e-05, "loss": 0.0026, "step": 18220 }, { "epoch": 4.8342614691063375, "grad_norm": 0.010103357024490833, "learning_rate": 5e-05, "loss": 0.0027, "step": 18230 }, { "epoch": 4.8369132856006365, "grad_norm": 0.00645621819421649, "learning_rate": 5e-05, "loss": 0.0027, "step": 18240 }, { "epoch": 4.8395651020949355, "grad_norm": 0.007666268385946751, "learning_rate": 5e-05, "loss": 0.0025, "step": 18250 }, { "epoch": 4.842216918589234, "grad_norm": 0.004389377776533365, "learning_rate": 5e-05, "loss": 0.0026, "step": 18260 }, { "epoch": 4.844868735083532, "grad_norm": 0.00566681707277894, "learning_rate": 5e-05, "loss": 0.0025, "step": 18270 }, { "epoch": 4.847520551577831, "grad_norm": 0.006864502560347319, "learning_rate": 5e-05, "loss": 0.0022, "step": 18280 }, { "epoch": 4.85017236807213, "grad_norm": 0.00860898569226265, "learning_rate": 5e-05, "loss": 0.0026, "step": 18290 }, { "epoch": 4.852824184566428, "grad_norm": 0.00942122284322977, "learning_rate": 5e-05, "loss": 0.002, "step": 18300 }, { "epoch": 4.855476001060727, "grad_norm": 0.009017029777169228, "learning_rate": 5e-05, "loss": 0.0017, "step": 18310 }, { "epoch": 4.858127817555025, "grad_norm": 0.010154861025512218, "learning_rate": 5e-05, "loss": 0.0019, "step": 18320 }, { "epoch": 4.860779634049324, "grad_norm": 0.005521242041140795, "learning_rate": 5e-05, "loss": 0.0018, "step": 18330 }, { "epoch": 4.863431450543622, "grad_norm": 0.011697899550199509, "learning_rate": 5e-05, "loss": 0.0026, "step": 18340 }, { "epoch": 4.866083267037921, "grad_norm": 0.00575700867921114, "learning_rate": 5e-05, "loss": 0.0022, "step": 18350 }, { "epoch": 4.868735083532219, "grad_norm": 0.00866071879863739, "learning_rate": 5e-05, "loss": 0.0026, "step": 18360 }, { "epoch": 4.871386900026518, "grad_norm": 0.010187746956944466, "learning_rate": 5e-05, "loss": 0.0022, "step": 18370 }, { "epoch": 4.8740387165208165, "grad_norm": 0.008153585717082024, "learning_rate": 5e-05, "loss": 0.002, "step": 18380 }, { "epoch": 4.876690533015116, "grad_norm": 0.0158903319388628, "learning_rate": 5e-05, "loss": 0.0019, "step": 18390 }, { "epoch": 4.879342349509414, "grad_norm": 0.0077789416536688805, "learning_rate": 5e-05, "loss": 0.002, "step": 18400 }, { "epoch": 4.881994166003713, "grad_norm": 0.011078652925789356, "learning_rate": 5e-05, "loss": 0.0032, "step": 18410 }, { "epoch": 4.884645982498011, "grad_norm": 0.01700688526034355, "learning_rate": 5e-05, "loss": 0.002, "step": 18420 }, { "epoch": 4.88729779899231, "grad_norm": 0.007208365481346846, "learning_rate": 5e-05, "loss": 0.0021, "step": 18430 }, { "epoch": 4.889949615486608, "grad_norm": 0.010610542260110378, "learning_rate": 5e-05, "loss": 0.0026, "step": 18440 }, { "epoch": 4.892601431980907, "grad_norm": 0.009387544356286526, "learning_rate": 5e-05, "loss": 0.0026, "step": 18450 }, { "epoch": 4.895253248475205, "grad_norm": 0.007007151376456022, "learning_rate": 5e-05, "loss": 0.0028, "step": 18460 }, { "epoch": 4.897905064969504, "grad_norm": 0.008715378120541573, "learning_rate": 5e-05, "loss": 0.0021, "step": 18470 }, { "epoch": 4.900556881463803, "grad_norm": 0.007702070288360119, "learning_rate": 5e-05, "loss": 0.0022, "step": 18480 }, { "epoch": 4.903208697958101, "grad_norm": 0.008719461970031261, "learning_rate": 5e-05, "loss": 0.0028, "step": 18490 }, { "epoch": 4.9058605144523995, "grad_norm": 0.0063592828810215, "learning_rate": 5e-05, "loss": 0.0022, "step": 18500 }, { "epoch": 4.9085123309466985, "grad_norm": 0.00904164370149374, "learning_rate": 5e-05, "loss": 0.0019, "step": 18510 }, { "epoch": 4.9111641474409975, "grad_norm": 0.00833105854690075, "learning_rate": 5e-05, "loss": 0.0019, "step": 18520 }, { "epoch": 4.913815963935296, "grad_norm": 0.007793267723172903, "learning_rate": 5e-05, "loss": 0.0028, "step": 18530 }, { "epoch": 4.916467780429595, "grad_norm": 0.0071108415722846985, "learning_rate": 5e-05, "loss": 0.0021, "step": 18540 }, { "epoch": 4.919119596923893, "grad_norm": 0.008698676712810993, "learning_rate": 5e-05, "loss": 0.002, "step": 18550 }, { "epoch": 4.921771413418192, "grad_norm": 0.0092619014903903, "learning_rate": 5e-05, "loss": 0.0023, "step": 18560 }, { "epoch": 4.92442322991249, "grad_norm": 0.012645019218325615, "learning_rate": 5e-05, "loss": 0.0021, "step": 18570 }, { "epoch": 4.927075046406789, "grad_norm": 0.010353156365454197, "learning_rate": 5e-05, "loss": 0.0024, "step": 18580 }, { "epoch": 4.929726862901087, "grad_norm": 0.01478218287229538, "learning_rate": 5e-05, "loss": 0.002, "step": 18590 }, { "epoch": 4.932378679395386, "grad_norm": 0.007321788463741541, "learning_rate": 5e-05, "loss": 0.0023, "step": 18600 }, { "epoch": 4.935030495889684, "grad_norm": 0.008396975696086884, "learning_rate": 5e-05, "loss": 0.0023, "step": 18610 }, { "epoch": 4.937682312383983, "grad_norm": 0.0028094318695366383, "learning_rate": 5e-05, "loss": 0.0025, "step": 18620 }, { "epoch": 4.940334128878281, "grad_norm": 0.00603881711140275, "learning_rate": 5e-05, "loss": 0.0027, "step": 18630 }, { "epoch": 4.94298594537258, "grad_norm": 0.0032514198683202267, "learning_rate": 5e-05, "loss": 0.0026, "step": 18640 }, { "epoch": 4.9456377618668785, "grad_norm": 0.006055097561329603, "learning_rate": 5e-05, "loss": 0.0021, "step": 18650 }, { "epoch": 4.9482895783611776, "grad_norm": 0.00914507545530796, "learning_rate": 5e-05, "loss": 0.002, "step": 18660 }, { "epoch": 4.950941394855476, "grad_norm": 0.00912519171833992, "learning_rate": 5e-05, "loss": 0.0019, "step": 18670 }, { "epoch": 4.953593211349775, "grad_norm": 0.006467909552156925, "learning_rate": 5e-05, "loss": 0.0019, "step": 18680 }, { "epoch": 4.956245027844073, "grad_norm": 0.028072940185666084, "learning_rate": 5e-05, "loss": 0.0027, "step": 18690 }, { "epoch": 4.958896844338372, "grad_norm": 0.00809249747544527, "learning_rate": 5e-05, "loss": 0.0022, "step": 18700 }, { "epoch": 4.961548660832671, "grad_norm": 0.011884951964020729, "learning_rate": 5e-05, "loss": 0.002, "step": 18710 }, { "epoch": 4.964200477326969, "grad_norm": 0.011508145369589329, "learning_rate": 5e-05, "loss": 0.0021, "step": 18720 }, { "epoch": 4.966852293821267, "grad_norm": 0.007345788646489382, "learning_rate": 5e-05, "loss": 0.002, "step": 18730 }, { "epoch": 4.969504110315566, "grad_norm": 0.005650839768350124, "learning_rate": 5e-05, "loss": 0.0022, "step": 18740 }, { "epoch": 4.972155926809865, "grad_norm": 0.017743993550539017, "learning_rate": 5e-05, "loss": 0.0024, "step": 18750 }, { "epoch": 4.974807743304163, "grad_norm": 0.011714538559317589, "learning_rate": 5e-05, "loss": 0.0026, "step": 18760 }, { "epoch": 4.977459559798462, "grad_norm": 0.005583342630416155, "learning_rate": 5e-05, "loss": 0.0027, "step": 18770 }, { "epoch": 4.9801113762927605, "grad_norm": 0.009029177017509937, "learning_rate": 5e-05, "loss": 0.0021, "step": 18780 }, { "epoch": 4.9827631927870595, "grad_norm": 0.005323822144418955, "learning_rate": 5e-05, "loss": 0.0028, "step": 18790 }, { "epoch": 4.985415009281358, "grad_norm": 0.008516227826476097, "learning_rate": 5e-05, "loss": 0.0021, "step": 18800 }, { "epoch": 4.988066825775657, "grad_norm": 0.016141997650265694, "learning_rate": 5e-05, "loss": 0.0025, "step": 18810 }, { "epoch": 4.990718642269955, "grad_norm": 0.008069918490946293, "learning_rate": 5e-05, "loss": 0.0023, "step": 18820 }, { "epoch": 4.993370458764254, "grad_norm": 0.01588020659983158, "learning_rate": 5e-05, "loss": 0.002, "step": 18830 }, { "epoch": 4.996022275258552, "grad_norm": 0.010942109860479832, "learning_rate": 5e-05, "loss": 0.0019, "step": 18840 }, { "epoch": 4.998674091752851, "grad_norm": 0.007689508143812418, "learning_rate": 5e-05, "loss": 0.002, "step": 18850 }, { "epoch": 5.001325908247149, "grad_norm": 0.011007560417056084, "learning_rate": 5e-05, "loss": 0.0019, "step": 18860 }, { "epoch": 5.003977724741448, "grad_norm": 0.01150535512715578, "learning_rate": 5e-05, "loss": 0.002, "step": 18870 }, { "epoch": 5.006629541235746, "grad_norm": 0.00868306402117014, "learning_rate": 5e-05, "loss": 0.0023, "step": 18880 }, { "epoch": 5.009281357730045, "grad_norm": 0.0107127008959651, "learning_rate": 5e-05, "loss": 0.0021, "step": 18890 }, { "epoch": 5.011933174224343, "grad_norm": 0.013927076943218708, "learning_rate": 5e-05, "loss": 0.0026, "step": 18900 }, { "epoch": 5.014584990718642, "grad_norm": 0.01582954078912735, "learning_rate": 5e-05, "loss": 0.0019, "step": 18910 }, { "epoch": 5.0172368072129405, "grad_norm": 0.009336546063423157, "learning_rate": 5e-05, "loss": 0.0021, "step": 18920 }, { "epoch": 5.0198886237072395, "grad_norm": 0.010342431254684925, "learning_rate": 5e-05, "loss": 0.0024, "step": 18930 }, { "epoch": 5.022540440201538, "grad_norm": 0.007573293522000313, "learning_rate": 5e-05, "loss": 0.002, "step": 18940 }, { "epoch": 5.025192256695837, "grad_norm": 0.01343587040901184, "learning_rate": 5e-05, "loss": 0.0026, "step": 18950 }, { "epoch": 5.027844073190135, "grad_norm": 0.00900188833475113, "learning_rate": 5e-05, "loss": 0.0025, "step": 18960 }, { "epoch": 5.030495889684434, "grad_norm": 0.009238946251571178, "learning_rate": 5e-05, "loss": 0.0019, "step": 18970 }, { "epoch": 5.033147706178733, "grad_norm": 0.009450959041714668, "learning_rate": 5e-05, "loss": 0.0022, "step": 18980 }, { "epoch": 5.035799522673031, "grad_norm": 0.007868419401347637, "learning_rate": 5e-05, "loss": 0.0027, "step": 18990 }, { "epoch": 5.03845133916733, "grad_norm": 0.005700752604752779, "learning_rate": 5e-05, "loss": 0.0019, "step": 19000 }, { "epoch": 5.041103155661628, "grad_norm": 0.012905679643154144, "learning_rate": 5e-05, "loss": 0.0019, "step": 19010 }, { "epoch": 5.043754972155927, "grad_norm": 0.005751166958361864, "learning_rate": 5e-05, "loss": 0.0019, "step": 19020 }, { "epoch": 5.046406788650225, "grad_norm": 0.013630257919430733, "learning_rate": 5e-05, "loss": 0.0026, "step": 19030 }, { "epoch": 5.049058605144524, "grad_norm": 0.007389447186142206, "learning_rate": 5e-05, "loss": 0.0022, "step": 19040 }, { "epoch": 5.0517104216388224, "grad_norm": 0.017706498503684998, "learning_rate": 5e-05, "loss": 0.0027, "step": 19050 }, { "epoch": 5.0543622381331215, "grad_norm": 0.010448137298226357, "learning_rate": 5e-05, "loss": 0.0024, "step": 19060 }, { "epoch": 5.05701405462742, "grad_norm": 0.004312935750931501, "learning_rate": 5e-05, "loss": 0.0021, "step": 19070 }, { "epoch": 5.059665871121719, "grad_norm": 0.006015543360263109, "learning_rate": 5e-05, "loss": 0.0022, "step": 19080 }, { "epoch": 5.062317687616017, "grad_norm": 0.009686710312962532, "learning_rate": 5e-05, "loss": 0.0017, "step": 19090 }, { "epoch": 5.064969504110316, "grad_norm": 0.0035624138545244932, "learning_rate": 5e-05, "loss": 0.0018, "step": 19100 }, { "epoch": 5.067621320604614, "grad_norm": 0.0075892251916229725, "learning_rate": 5e-05, "loss": 0.0016, "step": 19110 }, { "epoch": 5.070273137098913, "grad_norm": 0.003303742269054055, "learning_rate": 5e-05, "loss": 0.0018, "step": 19120 }, { "epoch": 5.072924953593211, "grad_norm": 0.008441607467830181, "learning_rate": 5e-05, "loss": 0.0026, "step": 19130 }, { "epoch": 5.07557677008751, "grad_norm": 0.012244882062077522, "learning_rate": 5e-05, "loss": 0.0023, "step": 19140 }, { "epoch": 5.078228586581808, "grad_norm": 0.010644977912306786, "learning_rate": 5e-05, "loss": 0.003, "step": 19150 }, { "epoch": 5.080880403076107, "grad_norm": 0.007507531903684139, "learning_rate": 5e-05, "loss": 0.0022, "step": 19160 }, { "epoch": 5.083532219570405, "grad_norm": 0.0044964211992919445, "learning_rate": 5e-05, "loss": 0.0024, "step": 19170 }, { "epoch": 5.086184036064704, "grad_norm": 0.005285849794745445, "learning_rate": 5e-05, "loss": 0.0019, "step": 19180 }, { "epoch": 5.0888358525590025, "grad_norm": 0.004471100401133299, "learning_rate": 5e-05, "loss": 0.0024, "step": 19190 }, { "epoch": 5.0914876690533015, "grad_norm": 0.019999168813228607, "learning_rate": 5e-05, "loss": 0.0024, "step": 19200 }, { "epoch": 5.0941394855476005, "grad_norm": 0.005426430143415928, "learning_rate": 5e-05, "loss": 0.0019, "step": 19210 }, { "epoch": 5.096791302041899, "grad_norm": 0.008948256261646748, "learning_rate": 5e-05, "loss": 0.0026, "step": 19220 }, { "epoch": 5.099443118536198, "grad_norm": 0.007438531145453453, "learning_rate": 5e-05, "loss": 0.002, "step": 19230 }, { "epoch": 5.102094935030496, "grad_norm": 0.013367682695388794, "learning_rate": 5e-05, "loss": 0.0024, "step": 19240 }, { "epoch": 5.104746751524795, "grad_norm": 0.011143756099045277, "learning_rate": 5e-05, "loss": 0.0021, "step": 19250 }, { "epoch": 5.107398568019093, "grad_norm": 0.014682364650070667, "learning_rate": 5e-05, "loss": 0.0022, "step": 19260 }, { "epoch": 5.110050384513392, "grad_norm": 0.015891319140791893, "learning_rate": 5e-05, "loss": 0.0023, "step": 19270 }, { "epoch": 5.11270220100769, "grad_norm": 0.008893347345292568, "learning_rate": 5e-05, "loss": 0.0022, "step": 19280 }, { "epoch": 5.115354017501989, "grad_norm": 0.012530235573649406, "learning_rate": 5e-05, "loss": 0.0023, "step": 19290 }, { "epoch": 5.118005833996287, "grad_norm": 0.007689608260989189, "learning_rate": 5e-05, "loss": 0.0021, "step": 19300 }, { "epoch": 5.120657650490586, "grad_norm": 0.010997097007930279, "learning_rate": 5e-05, "loss": 0.0024, "step": 19310 }, { "epoch": 5.123309466984884, "grad_norm": 0.007092935033142567, "learning_rate": 5e-05, "loss": 0.0024, "step": 19320 }, { "epoch": 5.1259612834791835, "grad_norm": 0.010647648014128208, "learning_rate": 5e-05, "loss": 0.0025, "step": 19330 }, { "epoch": 5.128613099973482, "grad_norm": 0.008937565609812737, "learning_rate": 5e-05, "loss": 0.0027, "step": 19340 }, { "epoch": 5.131264916467781, "grad_norm": 0.007508918177336454, "learning_rate": 5e-05, "loss": 0.0021, "step": 19350 }, { "epoch": 5.133916732962079, "grad_norm": 0.008994635194540024, "learning_rate": 5e-05, "loss": 0.0023, "step": 19360 }, { "epoch": 5.136568549456378, "grad_norm": 0.013487657532095909, "learning_rate": 5e-05, "loss": 0.0027, "step": 19370 }, { "epoch": 5.139220365950676, "grad_norm": 0.007586451712995768, "learning_rate": 5e-05, "loss": 0.0019, "step": 19380 }, { "epoch": 5.141872182444975, "grad_norm": 0.008448794484138489, "learning_rate": 5e-05, "loss": 0.0026, "step": 19390 }, { "epoch": 5.144523998939273, "grad_norm": 0.00829700380563736, "learning_rate": 5e-05, "loss": 0.0022, "step": 19400 }, { "epoch": 5.147175815433572, "grad_norm": 0.01205848902463913, "learning_rate": 5e-05, "loss": 0.0024, "step": 19410 }, { "epoch": 5.14982763192787, "grad_norm": 0.009416243061423302, "learning_rate": 5e-05, "loss": 0.0019, "step": 19420 }, { "epoch": 5.152479448422169, "grad_norm": 0.008818073198199272, "learning_rate": 5e-05, "loss": 0.0021, "step": 19430 }, { "epoch": 5.155131264916468, "grad_norm": 0.007919207215309143, "learning_rate": 5e-05, "loss": 0.0023, "step": 19440 }, { "epoch": 5.157783081410766, "grad_norm": 0.009834399446845055, "learning_rate": 5e-05, "loss": 0.0023, "step": 19450 }, { "epoch": 5.160434897905065, "grad_norm": 0.006598184816539288, "learning_rate": 5e-05, "loss": 0.0021, "step": 19460 }, { "epoch": 5.1630867143993635, "grad_norm": 0.01210337970405817, "learning_rate": 5e-05, "loss": 0.0017, "step": 19470 }, { "epoch": 5.1657385308936625, "grad_norm": 0.010526706464588642, "learning_rate": 5e-05, "loss": 0.0015, "step": 19480 }, { "epoch": 5.168390347387961, "grad_norm": 0.008931709453463554, "learning_rate": 5e-05, "loss": 0.0028, "step": 19490 }, { "epoch": 5.17104216388226, "grad_norm": 0.0070275054313242435, "learning_rate": 5e-05, "loss": 0.0025, "step": 19500 }, { "epoch": 5.173693980376558, "grad_norm": 0.008199476636946201, "learning_rate": 5e-05, "loss": 0.0018, "step": 19510 }, { "epoch": 5.176345796870857, "grad_norm": 0.00719004962593317, "learning_rate": 5e-05, "loss": 0.0025, "step": 19520 }, { "epoch": 5.178997613365155, "grad_norm": 0.0059782350435853004, "learning_rate": 5e-05, "loss": 0.0027, "step": 19530 }, { "epoch": 5.181649429859454, "grad_norm": 0.007679115515202284, "learning_rate": 5e-05, "loss": 0.002, "step": 19540 }, { "epoch": 5.184301246353752, "grad_norm": 0.007826339453458786, "learning_rate": 5e-05, "loss": 0.0022, "step": 19550 }, { "epoch": 5.186953062848051, "grad_norm": 0.00450492650270462, "learning_rate": 5e-05, "loss": 0.0019, "step": 19560 }, { "epoch": 5.189604879342349, "grad_norm": 0.013698823750019073, "learning_rate": 5e-05, "loss": 0.002, "step": 19570 }, { "epoch": 5.192256695836648, "grad_norm": 0.009158325381577015, "learning_rate": 5e-05, "loss": 0.0021, "step": 19580 }, { "epoch": 5.194908512330946, "grad_norm": 0.013789533637464046, "learning_rate": 5e-05, "loss": 0.0028, "step": 19590 }, { "epoch": 5.197560328825245, "grad_norm": 0.008155712857842445, "learning_rate": 5e-05, "loss": 0.0028, "step": 19600 }, { "epoch": 5.200212145319544, "grad_norm": 0.01027657650411129, "learning_rate": 5e-05, "loss": 0.0024, "step": 19610 }, { "epoch": 5.202863961813843, "grad_norm": 0.010521236807107925, "learning_rate": 5e-05, "loss": 0.0024, "step": 19620 }, { "epoch": 5.205515778308141, "grad_norm": 0.008681908249855042, "learning_rate": 5e-05, "loss": 0.0021, "step": 19630 }, { "epoch": 5.20816759480244, "grad_norm": 0.007375162094831467, "learning_rate": 5e-05, "loss": 0.0018, "step": 19640 }, { "epoch": 5.210819411296738, "grad_norm": 0.01696680672466755, "learning_rate": 5e-05, "loss": 0.0029, "step": 19650 }, { "epoch": 5.213471227791037, "grad_norm": 0.014512255787849426, "learning_rate": 5e-05, "loss": 0.0021, "step": 19660 }, { "epoch": 5.216123044285336, "grad_norm": 0.007451257202774286, "learning_rate": 5e-05, "loss": 0.002, "step": 19670 }, { "epoch": 5.218774860779634, "grad_norm": 0.010252011008560658, "learning_rate": 5e-05, "loss": 0.0025, "step": 19680 }, { "epoch": 5.221426677273933, "grad_norm": 0.007147873751819134, "learning_rate": 5e-05, "loss": 0.0016, "step": 19690 }, { "epoch": 5.224078493768231, "grad_norm": 0.013160398229956627, "learning_rate": 5e-05, "loss": 0.0023, "step": 19700 }, { "epoch": 5.22673031026253, "grad_norm": 0.006827126257121563, "learning_rate": 5e-05, "loss": 0.0022, "step": 19710 }, { "epoch": 5.229382126756828, "grad_norm": 0.010701554827392101, "learning_rate": 5e-05, "loss": 0.002, "step": 19720 }, { "epoch": 5.232033943251127, "grad_norm": 0.009143390692770481, "learning_rate": 5e-05, "loss": 0.0024, "step": 19730 }, { "epoch": 5.2346857597454255, "grad_norm": 0.011145660653710365, "learning_rate": 5e-05, "loss": 0.0023, "step": 19740 }, { "epoch": 5.2373375762397245, "grad_norm": 0.008439169265329838, "learning_rate": 5e-05, "loss": 0.0024, "step": 19750 }, { "epoch": 5.239989392734023, "grad_norm": 0.011837499216198921, "learning_rate": 5e-05, "loss": 0.0029, "step": 19760 }, { "epoch": 5.242641209228322, "grad_norm": 0.006482951808720827, "learning_rate": 5e-05, "loss": 0.0022, "step": 19770 }, { "epoch": 5.24529302572262, "grad_norm": 0.00430036336183548, "learning_rate": 5e-05, "loss": 0.0017, "step": 19780 }, { "epoch": 5.247944842216919, "grad_norm": 0.008603110909461975, "learning_rate": 5e-05, "loss": 0.002, "step": 19790 }, { "epoch": 5.250596658711217, "grad_norm": 0.004020649939775467, "learning_rate": 5e-05, "loss": 0.002, "step": 19800 }, { "epoch": 5.253248475205516, "grad_norm": 0.012118149548768997, "learning_rate": 5e-05, "loss": 0.0025, "step": 19810 }, { "epoch": 5.255900291699814, "grad_norm": 0.005953253712505102, "learning_rate": 5e-05, "loss": 0.002, "step": 19820 }, { "epoch": 5.258552108194113, "grad_norm": 0.011427442543208599, "learning_rate": 5e-05, "loss": 0.0025, "step": 19830 }, { "epoch": 5.261203924688411, "grad_norm": 0.007139986380934715, "learning_rate": 5e-05, "loss": 0.0023, "step": 19840 }, { "epoch": 5.26385574118271, "grad_norm": 0.004172046668827534, "learning_rate": 5e-05, "loss": 0.0021, "step": 19850 }, { "epoch": 5.266507557677008, "grad_norm": 0.007753309328109026, "learning_rate": 5e-05, "loss": 0.0021, "step": 19860 }, { "epoch": 5.269159374171307, "grad_norm": 0.012585273943841457, "learning_rate": 5e-05, "loss": 0.0019, "step": 19870 }, { "epoch": 5.2718111906656056, "grad_norm": 0.01083324383944273, "learning_rate": 5e-05, "loss": 0.0021, "step": 19880 }, { "epoch": 5.274463007159905, "grad_norm": 0.0048668039962649345, "learning_rate": 5e-05, "loss": 0.002, "step": 19890 }, { "epoch": 5.277114823654204, "grad_norm": 0.006861058063805103, "learning_rate": 5e-05, "loss": 0.002, "step": 19900 }, { "epoch": 5.279766640148502, "grad_norm": 0.009259099140763283, "learning_rate": 5e-05, "loss": 0.0018, "step": 19910 }, { "epoch": 5.2824184566428, "grad_norm": 0.006800658535212278, "learning_rate": 5e-05, "loss": 0.0024, "step": 19920 }, { "epoch": 5.285070273137099, "grad_norm": 0.011738399043679237, "learning_rate": 5e-05, "loss": 0.0018, "step": 19930 }, { "epoch": 5.287722089631398, "grad_norm": 0.01875419355928898, "learning_rate": 5e-05, "loss": 0.002, "step": 19940 }, { "epoch": 5.290373906125696, "grad_norm": 0.005051505286246538, "learning_rate": 5e-05, "loss": 0.0023, "step": 19950 }, { "epoch": 5.293025722619995, "grad_norm": 0.01214070525020361, "learning_rate": 5e-05, "loss": 0.0022, "step": 19960 }, { "epoch": 5.295677539114293, "grad_norm": 0.011808081530034542, "learning_rate": 5e-05, "loss": 0.0023, "step": 19970 }, { "epoch": 5.298329355608592, "grad_norm": 0.01502525806427002, "learning_rate": 5e-05, "loss": 0.0023, "step": 19980 }, { "epoch": 5.30098117210289, "grad_norm": 0.013385587371885777, "learning_rate": 5e-05, "loss": 0.0022, "step": 19990 }, { "epoch": 5.303632988597189, "grad_norm": 0.004715291783213615, "learning_rate": 5e-05, "loss": 0.0022, "step": 20000 } ], "logging_steps": 10, "max_steps": 75420, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.03843772355969e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }