diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4681 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.44468256094294134, + "eval_steps": 500, + "global_step": 6640, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000669702652022502, + "grad_norm": 0.0, + "learning_rate": 6.693440428380188e-07, + "loss": 2.3902, + "step": 10 + }, + { + "epoch": 0.001339405304045004, + "grad_norm": 0.0, + "learning_rate": 1.3386880856760376e-06, + "loss": 2.4415, + "step": 20 + }, + { + "epoch": 0.002009107956067506, + "grad_norm": 0.0, + "learning_rate": 2.0080321285140564e-06, + "loss": 2.3648, + "step": 30 + }, + { + "epoch": 0.002678810608090008, + "grad_norm": 0.0, + "learning_rate": 2.6773761713520752e-06, + "loss": 2.2811, + "step": 40 + }, + { + "epoch": 0.00334851326011251, + "grad_norm": 0.0, + "learning_rate": 3.346720214190094e-06, + "loss": 2.4023, + "step": 50 + }, + { + "epoch": 0.004018215912135012, + "grad_norm": 0.0, + "learning_rate": 4.016064257028113e-06, + "loss": 2.3196, + "step": 60 + }, + { + "epoch": 0.004687918564157514, + "grad_norm": 0.0, + "learning_rate": 4.685408299866132e-06, + "loss": 2.2711, + "step": 70 + }, + { + "epoch": 0.005357621216180016, + "grad_norm": 0.0, + "learning_rate": 5.3547523427041504e-06, + "loss": 2.3224, + "step": 80 + }, + { + "epoch": 0.006027323868202518, + "grad_norm": 0.0, + "learning_rate": 6.024096385542169e-06, + "loss": 2.287, + "step": 90 + }, + { + "epoch": 0.00669702652022502, + "grad_norm": 0.0, + "learning_rate": 6.693440428380188e-06, + "loss": 2.2395, + "step": 100 + }, + { + "epoch": 0.0073667291722475225, + "grad_norm": 0.0, + "learning_rate": 7.362784471218207e-06, + "loss": 2.3105, + "step": 110 + }, + { + "epoch": 0.008036431824270024, + "grad_norm": 0.0, + "learning_rate": 8.032128514056226e-06, + "loss": 2.4276, + "step": 120 + }, + { + "epoch": 0.008706134476292525, + "grad_norm": 0.0, + "learning_rate": 8.701472556894244e-06, + "loss": 2.3688, + "step": 130 + }, + { + "epoch": 0.009375837128315028, + "grad_norm": 0.0, + "learning_rate": 9.370816599732263e-06, + "loss": 2.3826, + "step": 140 + }, + { + "epoch": 0.01004553978033753, + "grad_norm": 0.0, + "learning_rate": 1.0040160642570281e-05, + "loss": 2.383, + "step": 150 + }, + { + "epoch": 0.010715242432360031, + "grad_norm": 0.0, + "learning_rate": 1.0709504685408301e-05, + "loss": 2.3427, + "step": 160 + }, + { + "epoch": 0.011384945084382534, + "grad_norm": 0.0, + "learning_rate": 1.1378848728246319e-05, + "loss": 2.2256, + "step": 170 + }, + { + "epoch": 0.012054647736405036, + "grad_norm": 0.0, + "learning_rate": 1.2048192771084338e-05, + "loss": 2.3393, + "step": 180 + }, + { + "epoch": 0.012724350388427539, + "grad_norm": 0.0, + "learning_rate": 1.2717536813922356e-05, + "loss": 2.3954, + "step": 190 + }, + { + "epoch": 0.01339405304045004, + "grad_norm": 0.0, + "learning_rate": 1.3386880856760376e-05, + "loss": 2.2947, + "step": 200 + }, + { + "epoch": 0.014063755692472542, + "grad_norm": 0.0, + "learning_rate": 1.4056224899598394e-05, + "loss": 2.2528, + "step": 210 + }, + { + "epoch": 0.014733458344495045, + "grad_norm": 0.0, + "learning_rate": 1.4725568942436414e-05, + "loss": 2.2722, + "step": 220 + }, + { + "epoch": 0.015403160996517546, + "grad_norm": 0.0, + "learning_rate": 1.5394912985274433e-05, + "loss": 2.3249, + "step": 230 + }, + { + "epoch": 0.016072863648540048, + "grad_norm": 0.0, + "learning_rate": 1.606425702811245e-05, + "loss": 2.3077, + "step": 240 + }, + { + "epoch": 0.01674256630056255, + "grad_norm": 0.0, + "learning_rate": 1.673360107095047e-05, + "loss": 2.3056, + "step": 250 + }, + { + "epoch": 0.01741226895258505, + "grad_norm": 0.0, + "learning_rate": 1.7402945113788487e-05, + "loss": 2.3655, + "step": 260 + }, + { + "epoch": 0.018081971604607554, + "grad_norm": 0.0, + "learning_rate": 1.8072289156626505e-05, + "loss": 2.2622, + "step": 270 + }, + { + "epoch": 0.018751674256630057, + "grad_norm": 0.0, + "learning_rate": 1.8741633199464527e-05, + "loss": 2.2843, + "step": 280 + }, + { + "epoch": 0.019421376908652557, + "grad_norm": 0.0, + "learning_rate": 1.9410977242302544e-05, + "loss": 2.1858, + "step": 290 + }, + { + "epoch": 0.02009107956067506, + "grad_norm": 0.0, + "learning_rate": 2.0080321285140562e-05, + "loss": 2.3016, + "step": 300 + }, + { + "epoch": 0.020760782212697563, + "grad_norm": 0.0, + "learning_rate": 2.074966532797858e-05, + "loss": 2.2798, + "step": 310 + }, + { + "epoch": 0.021430484864720063, + "grad_norm": 0.0, + "learning_rate": 2.1419009370816602e-05, + "loss": 2.2639, + "step": 320 + }, + { + "epoch": 0.022100187516742566, + "grad_norm": 0.0, + "learning_rate": 2.208835341365462e-05, + "loss": 2.2364, + "step": 330 + }, + { + "epoch": 0.02276989016876507, + "grad_norm": 0.0, + "learning_rate": 2.2757697456492638e-05, + "loss": 2.2894, + "step": 340 + }, + { + "epoch": 0.023439592820787572, + "grad_norm": 0.0, + "learning_rate": 2.3427041499330656e-05, + "loss": 2.3324, + "step": 350 + }, + { + "epoch": 0.02410929547281007, + "grad_norm": 0.0, + "learning_rate": 2.4096385542168677e-05, + "loss": 2.3137, + "step": 360 + }, + { + "epoch": 0.024778998124832575, + "grad_norm": 0.0, + "learning_rate": 2.4765729585006695e-05, + "loss": 2.2814, + "step": 370 + }, + { + "epoch": 0.025448700776855078, + "grad_norm": 0.0, + "learning_rate": 2.5435073627844713e-05, + "loss": 2.3252, + "step": 380 + }, + { + "epoch": 0.026118403428877578, + "grad_norm": 0.0, + "learning_rate": 2.6104417670682734e-05, + "loss": 2.3306, + "step": 390 + }, + { + "epoch": 0.02678810608090008, + "grad_norm": 0.0, + "learning_rate": 2.6773761713520752e-05, + "loss": 2.3344, + "step": 400 + }, + { + "epoch": 0.027457808732922584, + "grad_norm": 0.0, + "learning_rate": 2.7443105756358774e-05, + "loss": 2.2686, + "step": 410 + }, + { + "epoch": 0.028127511384945084, + "grad_norm": 0.0, + "learning_rate": 2.8112449799196788e-05, + "loss": 2.2904, + "step": 420 + }, + { + "epoch": 0.028797214036967587, + "grad_norm": 0.0, + "learning_rate": 2.878179384203481e-05, + "loss": 2.3387, + "step": 430 + }, + { + "epoch": 0.02946691668899009, + "grad_norm": 0.0, + "learning_rate": 2.9451137884872827e-05, + "loss": 2.2865, + "step": 440 + }, + { + "epoch": 0.03013661934101259, + "grad_norm": 0.0, + "learning_rate": 3.012048192771085e-05, + "loss": 2.2472, + "step": 450 + }, + { + "epoch": 0.030806321993035093, + "grad_norm": 0.0, + "learning_rate": 3.078982597054887e-05, + "loss": 2.2773, + "step": 460 + }, + { + "epoch": 0.031476024645057596, + "grad_norm": 0.0, + "learning_rate": 3.1459170013386885e-05, + "loss": 2.3224, + "step": 470 + }, + { + "epoch": 0.032145727297080096, + "grad_norm": 0.0, + "learning_rate": 3.21285140562249e-05, + "loss": 2.1539, + "step": 480 + }, + { + "epoch": 0.032815429949102595, + "grad_norm": 0.0, + "learning_rate": 3.279785809906292e-05, + "loss": 2.3025, + "step": 490 + }, + { + "epoch": 0.0334851326011251, + "grad_norm": 0.0, + "learning_rate": 3.346720214190094e-05, + "loss": 2.2328, + "step": 500 + }, + { + "epoch": 0.0341548352531476, + "grad_norm": 0.0, + "learning_rate": 3.413654618473896e-05, + "loss": 2.2389, + "step": 510 + }, + { + "epoch": 0.0348245379051701, + "grad_norm": 0.0, + "learning_rate": 3.4805890227576974e-05, + "loss": 2.2465, + "step": 520 + }, + { + "epoch": 0.03549424055719261, + "grad_norm": 0.0, + "learning_rate": 3.5475234270415e-05, + "loss": 2.2608, + "step": 530 + }, + { + "epoch": 0.03616394320921511, + "grad_norm": 0.0, + "learning_rate": 3.614457831325301e-05, + "loss": 2.2254, + "step": 540 + }, + { + "epoch": 0.03683364586123761, + "grad_norm": 0.0, + "learning_rate": 3.6813922356091035e-05, + "loss": 2.2041, + "step": 550 + }, + { + "epoch": 0.037503348513260114, + "grad_norm": 0.0, + "learning_rate": 3.748326639892905e-05, + "loss": 2.3156, + "step": 560 + }, + { + "epoch": 0.038173051165282613, + "grad_norm": 0.0, + "learning_rate": 3.815261044176707e-05, + "loss": 2.3493, + "step": 570 + }, + { + "epoch": 0.03884275381730511, + "grad_norm": 0.0, + "learning_rate": 3.882195448460509e-05, + "loss": 2.2966, + "step": 580 + }, + { + "epoch": 0.03951245646932762, + "grad_norm": 0.0, + "learning_rate": 3.949129852744311e-05, + "loss": 2.2741, + "step": 590 + }, + { + "epoch": 0.04018215912135012, + "grad_norm": 0.0, + "learning_rate": 4.0160642570281125e-05, + "loss": 2.198, + "step": 600 + }, + { + "epoch": 0.04085186177337262, + "grad_norm": 0.0, + "learning_rate": 4.082998661311915e-05, + "loss": 2.2987, + "step": 610 + }, + { + "epoch": 0.041521564425395126, + "grad_norm": 0.0, + "learning_rate": 4.149933065595716e-05, + "loss": 2.3219, + "step": 620 + }, + { + "epoch": 0.042191267077417625, + "grad_norm": 0.0, + "learning_rate": 4.2168674698795186e-05, + "loss": 2.2609, + "step": 630 + }, + { + "epoch": 0.042860969729440125, + "grad_norm": 0.0, + "learning_rate": 4.2838018741633203e-05, + "loss": 2.1874, + "step": 640 + }, + { + "epoch": 0.04353067238146263, + "grad_norm": 0.0, + "learning_rate": 4.350736278447122e-05, + "loss": 2.2077, + "step": 650 + }, + { + "epoch": 0.04420037503348513, + "grad_norm": 0.0, + "learning_rate": 4.417670682730924e-05, + "loss": 2.187, + "step": 660 + }, + { + "epoch": 0.04487007768550764, + "grad_norm": 0.0, + "learning_rate": 4.484605087014726e-05, + "loss": 2.2983, + "step": 670 + }, + { + "epoch": 0.04553978033753014, + "grad_norm": 0.0, + "learning_rate": 4.5515394912985275e-05, + "loss": 2.3472, + "step": 680 + }, + { + "epoch": 0.04620948298955264, + "grad_norm": 0.0, + "learning_rate": 4.61847389558233e-05, + "loss": 2.2707, + "step": 690 + }, + { + "epoch": 0.046879185641575144, + "grad_norm": 0.0, + "learning_rate": 4.685408299866131e-05, + "loss": 2.3201, + "step": 700 + }, + { + "epoch": 0.047548888293597644, + "grad_norm": 0.0, + "learning_rate": 4.7523427041499336e-05, + "loss": 2.2628, + "step": 710 + }, + { + "epoch": 0.04821859094562014, + "grad_norm": 0.0, + "learning_rate": 4.8192771084337354e-05, + "loss": 2.3217, + "step": 720 + }, + { + "epoch": 0.04888829359764265, + "grad_norm": 0.0, + "learning_rate": 4.886211512717537e-05, + "loss": 2.3157, + "step": 730 + }, + { + "epoch": 0.04955799624966515, + "grad_norm": 0.0, + "learning_rate": 4.953145917001339e-05, + "loss": 2.2407, + "step": 740 + }, + { + "epoch": 0.05022769890168765, + "grad_norm": 0.0, + "learning_rate": 5.020080321285141e-05, + "loss": 2.2607, + "step": 750 + }, + { + "epoch": 0.050897401553710156, + "grad_norm": 0.0, + "learning_rate": 5.0870147255689426e-05, + "loss": 2.2192, + "step": 760 + }, + { + "epoch": 0.051567104205732656, + "grad_norm": 0.0, + "learning_rate": 5.1539491298527444e-05, + "loss": 2.2501, + "step": 770 + }, + { + "epoch": 0.052236806857755155, + "grad_norm": 0.0, + "learning_rate": 5.220883534136547e-05, + "loss": 2.2901, + "step": 780 + }, + { + "epoch": 0.05290650950977766, + "grad_norm": 0.0, + "learning_rate": 5.2878179384203486e-05, + "loss": 2.2391, + "step": 790 + }, + { + "epoch": 0.05357621216180016, + "grad_norm": 0.0, + "learning_rate": 5.3547523427041504e-05, + "loss": 2.2306, + "step": 800 + }, + { + "epoch": 0.05424591481382266, + "grad_norm": 0.0, + "learning_rate": 5.4216867469879516e-05, + "loss": 2.322, + "step": 810 + }, + { + "epoch": 0.05491561746584517, + "grad_norm": 0.0, + "learning_rate": 5.488621151271755e-05, + "loss": 2.2978, + "step": 820 + }, + { + "epoch": 0.05558532011786767, + "grad_norm": 0.0, + "learning_rate": 5.555555555555556e-05, + "loss": 2.3016, + "step": 830 + }, + { + "epoch": 0.05625502276989017, + "grad_norm": 0.0, + "learning_rate": 5.6224899598393576e-05, + "loss": 2.1525, + "step": 840 + }, + { + "epoch": 0.056924725421912674, + "grad_norm": 0.0, + "learning_rate": 5.6894243641231594e-05, + "loss": 2.2663, + "step": 850 + }, + { + "epoch": 0.057594428073935174, + "grad_norm": 0.0, + "learning_rate": 5.756358768406962e-05, + "loss": 2.2755, + "step": 860 + }, + { + "epoch": 0.05826413072595767, + "grad_norm": 0.0, + "learning_rate": 5.823293172690764e-05, + "loss": 2.2842, + "step": 870 + }, + { + "epoch": 0.05893383337798018, + "grad_norm": 0.0, + "learning_rate": 5.8902275769745655e-05, + "loss": 2.2635, + "step": 880 + }, + { + "epoch": 0.05960353603000268, + "grad_norm": 0.0, + "learning_rate": 5.9571619812583666e-05, + "loss": 2.2726, + "step": 890 + }, + { + "epoch": 0.06027323868202518, + "grad_norm": 0.0, + "learning_rate": 6.02409638554217e-05, + "loss": 2.2318, + "step": 900 + }, + { + "epoch": 0.060942941334047686, + "grad_norm": 0.0, + "learning_rate": 6.091030789825971e-05, + "loss": 2.3523, + "step": 910 + }, + { + "epoch": 0.061612643986070185, + "grad_norm": 0.0, + "learning_rate": 6.157965194109773e-05, + "loss": 2.317, + "step": 920 + }, + { + "epoch": 0.062282346638092685, + "grad_norm": 0.0, + "learning_rate": 6.224899598393574e-05, + "loss": 2.1225, + "step": 930 + }, + { + "epoch": 0.06295204929011519, + "grad_norm": 0.0, + "learning_rate": 6.291834002677377e-05, + "loss": 2.2357, + "step": 940 + }, + { + "epoch": 0.06362175194213769, + "grad_norm": 0.0, + "learning_rate": 6.358768406961179e-05, + "loss": 2.2447, + "step": 950 + }, + { + "epoch": 0.06429145459416019, + "grad_norm": 0.0, + "learning_rate": 6.42570281124498e-05, + "loss": 2.1638, + "step": 960 + }, + { + "epoch": 0.06496115724618269, + "grad_norm": 0.0, + "learning_rate": 6.492637215528782e-05, + "loss": 2.1353, + "step": 970 + }, + { + "epoch": 0.06563085989820519, + "grad_norm": 0.0, + "learning_rate": 6.559571619812584e-05, + "loss": 2.1564, + "step": 980 + }, + { + "epoch": 0.0663005625502277, + "grad_norm": 0.0, + "learning_rate": 6.626506024096386e-05, + "loss": 2.1921, + "step": 990 + }, + { + "epoch": 0.0669702652022502, + "grad_norm": 0.0, + "learning_rate": 6.693440428380188e-05, + "loss": 2.2064, + "step": 1000 + }, + { + "epoch": 0.0676399678542727, + "grad_norm": 0.0, + "learning_rate": 6.76037483266399e-05, + "loss": 2.223, + "step": 1010 + }, + { + "epoch": 0.0683096705062952, + "grad_norm": 0.0, + "learning_rate": 6.827309236947793e-05, + "loss": 2.3054, + "step": 1020 + }, + { + "epoch": 0.0689793731583177, + "grad_norm": 0.0, + "learning_rate": 6.894243641231593e-05, + "loss": 2.2002, + "step": 1030 + }, + { + "epoch": 0.0696490758103402, + "grad_norm": 0.0, + "learning_rate": 6.961178045515395e-05, + "loss": 2.249, + "step": 1040 + }, + { + "epoch": 0.07031877846236272, + "grad_norm": 0.0, + "learning_rate": 7.028112449799197e-05, + "loss": 2.1752, + "step": 1050 + }, + { + "epoch": 0.07098848111438522, + "grad_norm": 0.0, + "learning_rate": 7.095046854083e-05, + "loss": 2.2828, + "step": 1060 + }, + { + "epoch": 0.07165818376640772, + "grad_norm": 0.0, + "learning_rate": 7.161981258366802e-05, + "loss": 2.2456, + "step": 1070 + }, + { + "epoch": 0.07232788641843022, + "grad_norm": 0.0, + "learning_rate": 7.228915662650602e-05, + "loss": 2.3329, + "step": 1080 + }, + { + "epoch": 0.07299758907045271, + "grad_norm": 0.0, + "learning_rate": 7.295850066934404e-05, + "loss": 2.2911, + "step": 1090 + }, + { + "epoch": 0.07366729172247521, + "grad_norm": 0.0, + "learning_rate": 7.362784471218207e-05, + "loss": 2.3092, + "step": 1100 + }, + { + "epoch": 0.07433699437449773, + "grad_norm": 0.0, + "learning_rate": 7.429718875502009e-05, + "loss": 2.3058, + "step": 1110 + }, + { + "epoch": 0.07500669702652023, + "grad_norm": 0.0, + "learning_rate": 7.49665327978581e-05, + "loss": 2.3024, + "step": 1120 + }, + { + "epoch": 0.07567639967854273, + "grad_norm": 0.0, + "learning_rate": 7.563587684069612e-05, + "loss": 2.3491, + "step": 1130 + }, + { + "epoch": 0.07634610233056523, + "grad_norm": 0.0, + "learning_rate": 7.630522088353414e-05, + "loss": 2.3675, + "step": 1140 + }, + { + "epoch": 0.07701580498258773, + "grad_norm": 0.0, + "learning_rate": 7.697456492637216e-05, + "loss": 2.3257, + "step": 1150 + }, + { + "epoch": 0.07768550763461023, + "grad_norm": 0.0, + "learning_rate": 7.764390896921018e-05, + "loss": 2.3037, + "step": 1160 + }, + { + "epoch": 0.07835521028663274, + "grad_norm": 0.0, + "learning_rate": 7.83132530120482e-05, + "loss": 2.2627, + "step": 1170 + }, + { + "epoch": 0.07902491293865524, + "grad_norm": 0.0, + "learning_rate": 7.898259705488621e-05, + "loss": 2.3351, + "step": 1180 + }, + { + "epoch": 0.07969461559067774, + "grad_norm": 0.0, + "learning_rate": 7.965194109772423e-05, + "loss": 2.2829, + "step": 1190 + }, + { + "epoch": 0.08036431824270024, + "grad_norm": 0.0, + "learning_rate": 8.032128514056225e-05, + "loss": 2.2612, + "step": 1200 + }, + { + "epoch": 0.08103402089472274, + "grad_norm": 0.0, + "learning_rate": 8.099062918340027e-05, + "loss": 2.2377, + "step": 1210 + }, + { + "epoch": 0.08170372354674524, + "grad_norm": 0.0, + "learning_rate": 8.16599732262383e-05, + "loss": 2.2353, + "step": 1220 + }, + { + "epoch": 0.08237342619876775, + "grad_norm": 0.0, + "learning_rate": 8.232931726907632e-05, + "loss": 2.2509, + "step": 1230 + }, + { + "epoch": 0.08304312885079025, + "grad_norm": 0.0, + "learning_rate": 8.299866131191432e-05, + "loss": 2.2449, + "step": 1240 + }, + { + "epoch": 0.08371283150281275, + "grad_norm": 0.0, + "learning_rate": 8.366800535475234e-05, + "loss": 2.2076, + "step": 1250 + }, + { + "epoch": 0.08438253415483525, + "grad_norm": 0.0, + "learning_rate": 8.433734939759037e-05, + "loss": 2.2714, + "step": 1260 + }, + { + "epoch": 0.08505223680685775, + "grad_norm": 0.0, + "learning_rate": 8.500669344042839e-05, + "loss": 2.2424, + "step": 1270 + }, + { + "epoch": 0.08572193945888025, + "grad_norm": 0.0, + "learning_rate": 8.567603748326641e-05, + "loss": 2.2711, + "step": 1280 + }, + { + "epoch": 0.08639164211090276, + "grad_norm": 0.0, + "learning_rate": 8.634538152610442e-05, + "loss": 2.2266, + "step": 1290 + }, + { + "epoch": 0.08706134476292526, + "grad_norm": 0.0, + "learning_rate": 8.701472556894244e-05, + "loss": 2.2329, + "step": 1300 + }, + { + "epoch": 0.08773104741494776, + "grad_norm": 0.0, + "learning_rate": 8.768406961178046e-05, + "loss": 2.2388, + "step": 1310 + }, + { + "epoch": 0.08840075006697026, + "grad_norm": 0.0, + "learning_rate": 8.835341365461848e-05, + "loss": 2.3047, + "step": 1320 + }, + { + "epoch": 0.08907045271899276, + "grad_norm": 0.0, + "learning_rate": 8.90227576974565e-05, + "loss": 2.2679, + "step": 1330 + }, + { + "epoch": 0.08974015537101528, + "grad_norm": 0.0, + "learning_rate": 8.969210174029451e-05, + "loss": 2.3128, + "step": 1340 + }, + { + "epoch": 0.09040985802303778, + "grad_norm": 0.0, + "learning_rate": 9.036144578313253e-05, + "loss": 2.2134, + "step": 1350 + }, + { + "epoch": 0.09107956067506028, + "grad_norm": 0.0, + "learning_rate": 9.103078982597055e-05, + "loss": 2.2883, + "step": 1360 + }, + { + "epoch": 0.09174926332708278, + "grad_norm": 0.0, + "learning_rate": 9.170013386880857e-05, + "loss": 2.2464, + "step": 1370 + }, + { + "epoch": 0.09241896597910527, + "grad_norm": 0.0, + "learning_rate": 9.23694779116466e-05, + "loss": 2.2318, + "step": 1380 + }, + { + "epoch": 0.09308866863112777, + "grad_norm": 0.0, + "learning_rate": 9.303882195448462e-05, + "loss": 2.229, + "step": 1390 + }, + { + "epoch": 0.09375837128315029, + "grad_norm": 0.0, + "learning_rate": 9.370816599732262e-05, + "loss": 2.2054, + "step": 1400 + }, + { + "epoch": 0.09442807393517279, + "grad_norm": 0.0, + "learning_rate": 9.437751004016064e-05, + "loss": 2.2667, + "step": 1410 + }, + { + "epoch": 0.09509777658719529, + "grad_norm": 0.0, + "learning_rate": 9.504685408299867e-05, + "loss": 2.2674, + "step": 1420 + }, + { + "epoch": 0.09576747923921779, + "grad_norm": 0.0, + "learning_rate": 9.571619812583669e-05, + "loss": 2.2865, + "step": 1430 + }, + { + "epoch": 0.09643718189124029, + "grad_norm": 0.0, + "learning_rate": 9.638554216867471e-05, + "loss": 2.2337, + "step": 1440 + }, + { + "epoch": 0.09710688454326279, + "grad_norm": 0.0, + "learning_rate": 9.705488621151271e-05, + "loss": 2.291, + "step": 1450 + }, + { + "epoch": 0.0977765871952853, + "grad_norm": 0.0, + "learning_rate": 9.772423025435074e-05, + "loss": 2.3411, + "step": 1460 + }, + { + "epoch": 0.0984462898473078, + "grad_norm": 0.0, + "learning_rate": 9.839357429718876e-05, + "loss": 2.1827, + "step": 1470 + }, + { + "epoch": 0.0991159924993303, + "grad_norm": 0.0, + "learning_rate": 9.906291834002678e-05, + "loss": 2.1987, + "step": 1480 + }, + { + "epoch": 0.0997856951513528, + "grad_norm": 0.0, + "learning_rate": 9.97322623828648e-05, + "loss": 2.2079, + "step": 1490 + }, + { + "epoch": 0.1004553978033753, + "grad_norm": 0.0, + "learning_rate": 9.999995081044314e-05, + "loss": 2.2484, + "step": 1500 + }, + { + "epoch": 0.1011251004553978, + "grad_norm": 0.0, + "learning_rate": 9.999965020794615e-05, + "loss": 2.3123, + "step": 1510 + }, + { + "epoch": 0.10179480310742031, + "grad_norm": 0.0, + "learning_rate": 9.99990763321247e-05, + "loss": 2.2429, + "step": 1520 + }, + { + "epoch": 0.10246450575944281, + "grad_norm": 0.0, + "learning_rate": 9.999822918611533e-05, + "loss": 2.2205, + "step": 1530 + }, + { + "epoch": 0.10313420841146531, + "grad_norm": 0.0, + "learning_rate": 9.999710877454811e-05, + "loss": 2.2077, + "step": 1540 + }, + { + "epoch": 0.10380391106348781, + "grad_norm": 0.0, + "learning_rate": 9.999571510354664e-05, + "loss": 2.3028, + "step": 1550 + }, + { + "epoch": 0.10447361371551031, + "grad_norm": 0.0, + "learning_rate": 9.999404818072808e-05, + "loss": 2.252, + "step": 1560 + }, + { + "epoch": 0.10514331636753281, + "grad_norm": 0.0, + "learning_rate": 9.999210801520296e-05, + "loss": 2.3754, + "step": 1570 + }, + { + "epoch": 0.10581301901955532, + "grad_norm": 0.0, + "learning_rate": 9.998989461757526e-05, + "loss": 2.2761, + "step": 1580 + }, + { + "epoch": 0.10648272167157782, + "grad_norm": 0.0, + "learning_rate": 9.998740799994235e-05, + "loss": 2.2168, + "step": 1590 + }, + { + "epoch": 0.10715242432360032, + "grad_norm": 0.0, + "learning_rate": 9.998464817589484e-05, + "loss": 2.2639, + "step": 1600 + }, + { + "epoch": 0.10782212697562282, + "grad_norm": 0.0, + "learning_rate": 9.998161516051656e-05, + "loss": 2.2687, + "step": 1610 + }, + { + "epoch": 0.10849182962764532, + "grad_norm": 0.0, + "learning_rate": 9.997830897038446e-05, + "loss": 2.2239, + "step": 1620 + }, + { + "epoch": 0.10916153227966782, + "grad_norm": 0.0, + "learning_rate": 9.997472962356854e-05, + "loss": 2.1802, + "step": 1630 + }, + { + "epoch": 0.10983123493169034, + "grad_norm": 0.0, + "learning_rate": 9.997087713963174e-05, + "loss": 2.2154, + "step": 1640 + }, + { + "epoch": 0.11050093758371284, + "grad_norm": 0.0, + "learning_rate": 9.996675153962984e-05, + "loss": 2.212, + "step": 1650 + }, + { + "epoch": 0.11117064023573534, + "grad_norm": 0.0, + "learning_rate": 9.996235284611131e-05, + "loss": 2.251, + "step": 1660 + }, + { + "epoch": 0.11184034288775783, + "grad_norm": 0.0, + "learning_rate": 9.995768108311722e-05, + "loss": 2.299, + "step": 1670 + }, + { + "epoch": 0.11251004553978033, + "grad_norm": 0.0, + "learning_rate": 9.995273627618116e-05, + "loss": 2.2021, + "step": 1680 + }, + { + "epoch": 0.11317974819180283, + "grad_norm": 0.0, + "learning_rate": 9.994751845232894e-05, + "loss": 2.244, + "step": 1690 + }, + { + "epoch": 0.11384945084382535, + "grad_norm": 0.0, + "learning_rate": 9.994202764007865e-05, + "loss": 2.3063, + "step": 1700 + }, + { + "epoch": 0.11451915349584785, + "grad_norm": 0.0, + "learning_rate": 9.993626386944031e-05, + "loss": 2.2809, + "step": 1710 + }, + { + "epoch": 0.11518885614787035, + "grad_norm": 0.0, + "learning_rate": 9.993022717191584e-05, + "loss": 2.2767, + "step": 1720 + }, + { + "epoch": 0.11585855879989285, + "grad_norm": 0.0, + "learning_rate": 9.992391758049889e-05, + "loss": 2.2354, + "step": 1730 + }, + { + "epoch": 0.11652826145191535, + "grad_norm": 0.0, + "learning_rate": 9.99173351296745e-05, + "loss": 2.2889, + "step": 1740 + }, + { + "epoch": 0.11719796410393785, + "grad_norm": 0.0, + "learning_rate": 9.99104798554191e-05, + "loss": 2.22, + "step": 1750 + }, + { + "epoch": 0.11786766675596036, + "grad_norm": 0.0, + "learning_rate": 9.990335179520023e-05, + "loss": 2.2941, + "step": 1760 + }, + { + "epoch": 0.11853736940798286, + "grad_norm": 0.0, + "learning_rate": 9.989595098797635e-05, + "loss": 2.2584, + "step": 1770 + }, + { + "epoch": 0.11920707206000536, + "grad_norm": 0.0, + "learning_rate": 9.988827747419659e-05, + "loss": 2.2408, + "step": 1780 + }, + { + "epoch": 0.11987677471202786, + "grad_norm": 0.0, + "learning_rate": 9.988033129580059e-05, + "loss": 2.1831, + "step": 1790 + }, + { + "epoch": 0.12054647736405036, + "grad_norm": 0.0, + "learning_rate": 9.98721124962182e-05, + "loss": 2.2853, + "step": 1800 + }, + { + "epoch": 0.12121618001607286, + "grad_norm": 0.0, + "learning_rate": 9.986362112036935e-05, + "loss": 2.2537, + "step": 1810 + }, + { + "epoch": 0.12188588266809537, + "grad_norm": 0.0, + "learning_rate": 9.985485721466366e-05, + "loss": 2.2257, + "step": 1820 + }, + { + "epoch": 0.12255558532011787, + "grad_norm": 0.0, + "learning_rate": 9.984582082700029e-05, + "loss": 2.3031, + "step": 1830 + }, + { + "epoch": 0.12322528797214037, + "grad_norm": 0.0, + "learning_rate": 9.98365120067677e-05, + "loss": 2.2435, + "step": 1840 + }, + { + "epoch": 0.12389499062416287, + "grad_norm": 0.0, + "learning_rate": 9.982693080484327e-05, + "loss": 2.1834, + "step": 1850 + }, + { + "epoch": 0.12456469327618537, + "grad_norm": 0.0, + "learning_rate": 9.981707727359308e-05, + "loss": 2.2215, + "step": 1860 + }, + { + "epoch": 0.12523439592820787, + "grad_norm": 0.0, + "learning_rate": 9.98069514668717e-05, + "loss": 2.3128, + "step": 1870 + }, + { + "epoch": 0.12590409858023038, + "grad_norm": 0.0, + "learning_rate": 9.979655344002172e-05, + "loss": 2.2512, + "step": 1880 + }, + { + "epoch": 0.12657380123225287, + "grad_norm": 0.0, + "learning_rate": 9.978588324987365e-05, + "loss": 2.2278, + "step": 1890 + }, + { + "epoch": 0.12724350388427538, + "grad_norm": 0.0, + "learning_rate": 9.977494095474546e-05, + "loss": 2.2445, + "step": 1900 + }, + { + "epoch": 0.1279132065362979, + "grad_norm": 0.0, + "learning_rate": 9.97637266144423e-05, + "loss": 2.2834, + "step": 1910 + }, + { + "epoch": 0.12858290918832038, + "grad_norm": 0.0, + "learning_rate": 9.975224029025619e-05, + "loss": 2.2807, + "step": 1920 + }, + { + "epoch": 0.1292526118403429, + "grad_norm": 0.0, + "learning_rate": 9.974048204496572e-05, + "loss": 2.3837, + "step": 1930 + }, + { + "epoch": 0.12992231449236538, + "grad_norm": 0.0, + "learning_rate": 9.97284519428356e-05, + "loss": 2.2318, + "step": 1940 + }, + { + "epoch": 0.1305920171443879, + "grad_norm": 0.0, + "learning_rate": 9.971615004961645e-05, + "loss": 2.1961, + "step": 1950 + }, + { + "epoch": 0.13126171979641038, + "grad_norm": 0.0, + "learning_rate": 9.970357643254429e-05, + "loss": 2.301, + "step": 1960 + }, + { + "epoch": 0.1319314224484329, + "grad_norm": 0.0, + "learning_rate": 9.96907311603403e-05, + "loss": 2.2454, + "step": 1970 + }, + { + "epoch": 0.1326011251004554, + "grad_norm": 0.0, + "learning_rate": 9.967761430321037e-05, + "loss": 2.2505, + "step": 1980 + }, + { + "epoch": 0.1332708277524779, + "grad_norm": 0.0, + "learning_rate": 9.966422593284474e-05, + "loss": 2.2103, + "step": 1990 + }, + { + "epoch": 0.1339405304045004, + "grad_norm": 0.0, + "learning_rate": 9.965056612241764e-05, + "loss": 2.3467, + "step": 2000 + }, + { + "epoch": 0.1346102330565229, + "grad_norm": 0.0, + "learning_rate": 9.963663494658681e-05, + "loss": 2.174, + "step": 2010 + }, + { + "epoch": 0.1352799357085454, + "grad_norm": 0.0, + "learning_rate": 9.962243248149314e-05, + "loss": 2.3771, + "step": 2020 + }, + { + "epoch": 0.13594963836056792, + "grad_norm": 0.0, + "learning_rate": 9.960795880476029e-05, + "loss": 2.3623, + "step": 2030 + }, + { + "epoch": 0.1366193410125904, + "grad_norm": 0.0, + "learning_rate": 9.95932139954942e-05, + "loss": 2.27, + "step": 2040 + }, + { + "epoch": 0.13728904366461292, + "grad_norm": 0.0, + "learning_rate": 9.95781981342827e-05, + "loss": 2.2534, + "step": 2050 + }, + { + "epoch": 0.1379587463166354, + "grad_norm": 0.0, + "learning_rate": 9.956291130319506e-05, + "loss": 2.2726, + "step": 2060 + }, + { + "epoch": 0.13862844896865792, + "grad_norm": 0.0, + "learning_rate": 9.95473535857815e-05, + "loss": 2.3065, + "step": 2070 + }, + { + "epoch": 0.1392981516206804, + "grad_norm": 0.0, + "learning_rate": 9.953152506707283e-05, + "loss": 2.1472, + "step": 2080 + }, + { + "epoch": 0.13996785427270292, + "grad_norm": 0.0, + "learning_rate": 9.951542583357986e-05, + "loss": 2.2893, + "step": 2090 + }, + { + "epoch": 0.14063755692472543, + "grad_norm": 0.0, + "learning_rate": 9.949905597329306e-05, + "loss": 2.1606, + "step": 2100 + }, + { + "epoch": 0.14130725957674792, + "grad_norm": 0.0, + "learning_rate": 9.948241557568196e-05, + "loss": 2.2155, + "step": 2110 + }, + { + "epoch": 0.14197696222877043, + "grad_norm": 0.0, + "learning_rate": 9.946550473169474e-05, + "loss": 2.2544, + "step": 2120 + }, + { + "epoch": 0.14264666488079292, + "grad_norm": 0.0, + "learning_rate": 9.944832353375769e-05, + "loss": 2.2661, + "step": 2130 + }, + { + "epoch": 0.14331636753281543, + "grad_norm": 0.0, + "learning_rate": 9.943087207577473e-05, + "loss": 2.2873, + "step": 2140 + }, + { + "epoch": 0.14398607018483794, + "grad_norm": 0.0, + "learning_rate": 9.941315045312689e-05, + "loss": 2.2044, + "step": 2150 + }, + { + "epoch": 0.14465577283686043, + "grad_norm": 0.0, + "learning_rate": 9.939515876267178e-05, + "loss": 2.3315, + "step": 2160 + }, + { + "epoch": 0.14532547548888294, + "grad_norm": 0.0, + "learning_rate": 9.937689710274308e-05, + "loss": 2.2413, + "step": 2170 + }, + { + "epoch": 0.14599517814090543, + "grad_norm": 0.0, + "learning_rate": 9.935836557314995e-05, + "loss": 2.2494, + "step": 2180 + }, + { + "epoch": 0.14666488079292794, + "grad_norm": 0.0, + "learning_rate": 9.933956427517657e-05, + "loss": 2.2623, + "step": 2190 + }, + { + "epoch": 0.14733458344495043, + "grad_norm": 0.0, + "learning_rate": 9.932049331158153e-05, + "loss": 2.2775, + "step": 2200 + }, + { + "epoch": 0.14800428609697294, + "grad_norm": 0.0, + "learning_rate": 9.930115278659727e-05, + "loss": 2.1811, + "step": 2210 + }, + { + "epoch": 0.14867398874899546, + "grad_norm": 0.0, + "learning_rate": 9.928154280592952e-05, + "loss": 2.2731, + "step": 2220 + }, + { + "epoch": 0.14934369140101794, + "grad_norm": 0.0, + "learning_rate": 9.92616634767567e-05, + "loss": 2.2588, + "step": 2230 + }, + { + "epoch": 0.15001339405304046, + "grad_norm": 0.0, + "learning_rate": 9.924151490772942e-05, + "loss": 2.2073, + "step": 2240 + }, + { + "epoch": 0.15068309670506294, + "grad_norm": 0.0, + "learning_rate": 9.922109720896973e-05, + "loss": 2.2965, + "step": 2250 + }, + { + "epoch": 0.15135279935708545, + "grad_norm": 0.0, + "learning_rate": 9.92004104920707e-05, + "loss": 2.1766, + "step": 2260 + }, + { + "epoch": 0.15202250200910797, + "grad_norm": 0.0, + "learning_rate": 9.917945487009566e-05, + "loss": 2.2915, + "step": 2270 + }, + { + "epoch": 0.15269220466113045, + "grad_norm": 0.0, + "learning_rate": 9.915823045757765e-05, + "loss": 2.2717, + "step": 2280 + }, + { + "epoch": 0.15336190731315297, + "grad_norm": 0.0, + "learning_rate": 9.913673737051882e-05, + "loss": 2.2301, + "step": 2290 + }, + { + "epoch": 0.15403160996517545, + "grad_norm": 0.0, + "learning_rate": 9.911497572638973e-05, + "loss": 2.2289, + "step": 2300 + }, + { + "epoch": 0.15470131261719797, + "grad_norm": 0.0, + "learning_rate": 9.909294564412874e-05, + "loss": 2.3655, + "step": 2310 + }, + { + "epoch": 0.15537101526922045, + "grad_norm": 0.0, + "learning_rate": 9.907064724414139e-05, + "loss": 2.1503, + "step": 2320 + }, + { + "epoch": 0.15604071792124297, + "grad_norm": 0.0, + "learning_rate": 9.904808064829967e-05, + "loss": 2.2254, + "step": 2330 + }, + { + "epoch": 0.15671042057326548, + "grad_norm": 0.0, + "learning_rate": 9.902524597994143e-05, + "loss": 2.28, + "step": 2340 + }, + { + "epoch": 0.15738012322528797, + "grad_norm": 0.0, + "learning_rate": 9.900214336386964e-05, + "loss": 2.18, + "step": 2350 + }, + { + "epoch": 0.15804982587731048, + "grad_norm": 0.0, + "learning_rate": 9.897877292635179e-05, + "loss": 2.3032, + "step": 2360 + }, + { + "epoch": 0.15871952852933296, + "grad_norm": 0.0, + "learning_rate": 9.895513479511907e-05, + "loss": 2.2472, + "step": 2370 + }, + { + "epoch": 0.15938923118135548, + "grad_norm": 0.0, + "learning_rate": 9.893122909936583e-05, + "loss": 2.1331, + "step": 2380 + }, + { + "epoch": 0.160058933833378, + "grad_norm": 0.0, + "learning_rate": 9.890705596974875e-05, + "loss": 2.1931, + "step": 2390 + }, + { + "epoch": 0.16072863648540048, + "grad_norm": 0.0, + "learning_rate": 9.88826155383862e-05, + "loss": 2.2551, + "step": 2400 + }, + { + "epoch": 0.161398339137423, + "grad_norm": 0.0, + "learning_rate": 9.885790793885746e-05, + "loss": 2.1717, + "step": 2410 + }, + { + "epoch": 0.16206804178944548, + "grad_norm": 0.0, + "learning_rate": 9.883293330620205e-05, + "loss": 2.2582, + "step": 2420 + }, + { + "epoch": 0.162737744441468, + "grad_norm": 0.0, + "learning_rate": 9.880769177691892e-05, + "loss": 2.2466, + "step": 2430 + }, + { + "epoch": 0.16340744709349048, + "grad_norm": 0.0, + "learning_rate": 9.878218348896577e-05, + "loss": 2.2734, + "step": 2440 + }, + { + "epoch": 0.164077149745513, + "grad_norm": 0.0, + "learning_rate": 9.875640858175827e-05, + "loss": 2.3381, + "step": 2450 + }, + { + "epoch": 0.1647468523975355, + "grad_norm": 0.0, + "learning_rate": 9.87303671961693e-05, + "loss": 2.2529, + "step": 2460 + }, + { + "epoch": 0.165416555049558, + "grad_norm": 0.0, + "learning_rate": 9.870405947452819e-05, + "loss": 2.2889, + "step": 2470 + }, + { + "epoch": 0.1660862577015805, + "grad_norm": 0.0, + "learning_rate": 9.86774855606199e-05, + "loss": 2.1831, + "step": 2480 + }, + { + "epoch": 0.166755960353603, + "grad_norm": 0.0, + "learning_rate": 9.86506455996843e-05, + "loss": 2.2241, + "step": 2490 + }, + { + "epoch": 0.1674256630056255, + "grad_norm": 0.0, + "learning_rate": 9.862353973841526e-05, + "loss": 2.2539, + "step": 2500 + }, + { + "epoch": 0.16809536565764802, + "grad_norm": 0.0, + "learning_rate": 9.859616812496008e-05, + "loss": 2.263, + "step": 2510 + }, + { + "epoch": 0.1687650683096705, + "grad_norm": 0.0, + "learning_rate": 9.856853090891843e-05, + "loss": 2.2179, + "step": 2520 + }, + { + "epoch": 0.16943477096169302, + "grad_norm": 0.0, + "learning_rate": 9.854062824134159e-05, + "loss": 2.2326, + "step": 2530 + }, + { + "epoch": 0.1701044736137155, + "grad_norm": 0.0, + "learning_rate": 9.851246027473173e-05, + "loss": 2.3374, + "step": 2540 + }, + { + "epoch": 0.17077417626573801, + "grad_norm": 0.0, + "learning_rate": 9.848402716304106e-05, + "loss": 2.2265, + "step": 2550 + }, + { + "epoch": 0.1714438789177605, + "grad_norm": 0.0, + "learning_rate": 9.845532906167083e-05, + "loss": 2.1868, + "step": 2560 + }, + { + "epoch": 0.17211358156978301, + "grad_norm": 0.0, + "learning_rate": 9.842636612747069e-05, + "loss": 2.2097, + "step": 2570 + }, + { + "epoch": 0.17278328422180553, + "grad_norm": 0.0, + "learning_rate": 9.839713851873766e-05, + "loss": 2.2601, + "step": 2580 + }, + { + "epoch": 0.173452986873828, + "grad_norm": 0.0, + "learning_rate": 9.836764639521539e-05, + "loss": 2.2559, + "step": 2590 + }, + { + "epoch": 0.17412268952585053, + "grad_norm": 0.0, + "learning_rate": 9.833788991809323e-05, + "loss": 2.2896, + "step": 2600 + }, + { + "epoch": 0.174792392177873, + "grad_norm": 0.0, + "learning_rate": 9.830786925000533e-05, + "loss": 2.2095, + "step": 2610 + }, + { + "epoch": 0.17546209482989553, + "grad_norm": 0.0, + "learning_rate": 9.827758455502978e-05, + "loss": 2.206, + "step": 2620 + }, + { + "epoch": 0.17613179748191804, + "grad_norm": 0.0, + "learning_rate": 9.824703599868776e-05, + "loss": 2.3201, + "step": 2630 + }, + { + "epoch": 0.17680150013394053, + "grad_norm": 0.0, + "learning_rate": 9.821622374794253e-05, + "loss": 2.2804, + "step": 2640 + }, + { + "epoch": 0.17747120278596304, + "grad_norm": 0.0, + "learning_rate": 9.81851479711986e-05, + "loss": 2.1435, + "step": 2650 + }, + { + "epoch": 0.17814090543798553, + "grad_norm": 0.0, + "learning_rate": 9.815380883830076e-05, + "loss": 2.1878, + "step": 2660 + }, + { + "epoch": 0.17881060809000804, + "grad_norm": 0.0, + "learning_rate": 9.812220652053318e-05, + "loss": 2.3302, + "step": 2670 + }, + { + "epoch": 0.17948031074203055, + "grad_norm": 0.0, + "learning_rate": 9.80903411906185e-05, + "loss": 2.2151, + "step": 2680 + }, + { + "epoch": 0.18015001339405304, + "grad_norm": 0.0, + "learning_rate": 9.805821302271685e-05, + "loss": 2.2155, + "step": 2690 + }, + { + "epoch": 0.18081971604607555, + "grad_norm": 0.0, + "learning_rate": 9.802582219242484e-05, + "loss": 2.2942, + "step": 2700 + }, + { + "epoch": 0.18148941869809804, + "grad_norm": 0.0, + "learning_rate": 9.799316887677471e-05, + "loss": 2.2229, + "step": 2710 + }, + { + "epoch": 0.18215912135012055, + "grad_norm": 0.0, + "learning_rate": 9.796025325423334e-05, + "loss": 2.2516, + "step": 2720 + }, + { + "epoch": 0.18282882400214304, + "grad_norm": 0.0, + "learning_rate": 9.792707550470122e-05, + "loss": 2.1446, + "step": 2730 + }, + { + "epoch": 0.18349852665416555, + "grad_norm": 0.0, + "learning_rate": 9.789363580951145e-05, + "loss": 2.2074, + "step": 2740 + }, + { + "epoch": 0.18416822930618806, + "grad_norm": 0.0, + "learning_rate": 9.785993435142891e-05, + "loss": 2.236, + "step": 2750 + }, + { + "epoch": 0.18483793195821055, + "grad_norm": 0.0, + "learning_rate": 9.782597131464901e-05, + "loss": 2.2744, + "step": 2760 + }, + { + "epoch": 0.18550763461023306, + "grad_norm": 0.0, + "learning_rate": 9.779174688479693e-05, + "loss": 2.2472, + "step": 2770 + }, + { + "epoch": 0.18617733726225555, + "grad_norm": 0.0, + "learning_rate": 9.775726124892646e-05, + "loss": 2.4463, + "step": 2780 + }, + { + "epoch": 0.18684703991427806, + "grad_norm": 0.0, + "learning_rate": 9.772251459551897e-05, + "loss": 2.1792, + "step": 2790 + }, + { + "epoch": 0.18751674256630058, + "grad_norm": 0.0, + "learning_rate": 9.768750711448249e-05, + "loss": 2.2459, + "step": 2800 + }, + { + "epoch": 0.18818644521832306, + "grad_norm": 0.0, + "learning_rate": 9.765223899715054e-05, + "loss": 2.1656, + "step": 2810 + }, + { + "epoch": 0.18885614787034558, + "grad_norm": 0.0, + "learning_rate": 9.761671043628124e-05, + "loss": 2.3268, + "step": 2820 + }, + { + "epoch": 0.18952585052236806, + "grad_norm": 0.0, + "learning_rate": 9.758092162605604e-05, + "loss": 2.2815, + "step": 2830 + }, + { + "epoch": 0.19019555317439057, + "grad_norm": 0.0, + "learning_rate": 9.754487276207889e-05, + "loss": 2.2414, + "step": 2840 + }, + { + "epoch": 0.19086525582641306, + "grad_norm": 0.0, + "learning_rate": 9.750856404137502e-05, + "loss": 2.1398, + "step": 2850 + }, + { + "epoch": 0.19153495847843557, + "grad_norm": 0.0, + "learning_rate": 9.747199566238991e-05, + "loss": 2.2683, + "step": 2860 + }, + { + "epoch": 0.1922046611304581, + "grad_norm": 0.0, + "learning_rate": 9.743516782498822e-05, + "loss": 2.1707, + "step": 2870 + }, + { + "epoch": 0.19287436378248057, + "grad_norm": 0.0, + "learning_rate": 9.739808073045264e-05, + "loss": 2.3202, + "step": 2880 + }, + { + "epoch": 0.1935440664345031, + "grad_norm": 0.0, + "learning_rate": 9.73607345814829e-05, + "loss": 2.2712, + "step": 2890 + }, + { + "epoch": 0.19421376908652557, + "grad_norm": 0.0, + "learning_rate": 9.732312958219453e-05, + "loss": 2.2749, + "step": 2900 + }, + { + "epoch": 0.1948834717385481, + "grad_norm": 0.0, + "learning_rate": 9.728526593811783e-05, + "loss": 2.1855, + "step": 2910 + }, + { + "epoch": 0.1955531743905706, + "grad_norm": 0.0, + "learning_rate": 9.724714385619673e-05, + "loss": 2.1736, + "step": 2920 + }, + { + "epoch": 0.19622287704259309, + "grad_norm": 0.0, + "learning_rate": 9.720876354478765e-05, + "loss": 2.2247, + "step": 2930 + }, + { + "epoch": 0.1968925796946156, + "grad_norm": 0.0, + "learning_rate": 9.717012521365836e-05, + "loss": 2.2102, + "step": 2940 + }, + { + "epoch": 0.19756228234663809, + "grad_norm": 0.0, + "learning_rate": 9.713122907398686e-05, + "loss": 2.275, + "step": 2950 + }, + { + "epoch": 0.1982319849986606, + "grad_norm": 0.0, + "learning_rate": 9.709207533836016e-05, + "loss": 2.0444, + "step": 2960 + }, + { + "epoch": 0.19890168765068308, + "grad_norm": 0.0, + "learning_rate": 9.70526642207732e-05, + "loss": 2.2226, + "step": 2970 + }, + { + "epoch": 0.1995713903027056, + "grad_norm": 0.0, + "learning_rate": 9.701299593662763e-05, + "loss": 2.2936, + "step": 2980 + }, + { + "epoch": 0.2002410929547281, + "grad_norm": 0.0, + "learning_rate": 9.697307070273062e-05, + "loss": 2.2361, + "step": 2990 + }, + { + "epoch": 0.2009107956067506, + "grad_norm": 0.0, + "learning_rate": 9.693288873729376e-05, + "loss": 2.1928, + "step": 3000 + }, + { + "epoch": 0.2015804982587731, + "grad_norm": 0.0, + "learning_rate": 9.689245025993175e-05, + "loss": 2.185, + "step": 3010 + }, + { + "epoch": 0.2022502009107956, + "grad_norm": 0.0, + "learning_rate": 9.68517554916613e-05, + "loss": 2.1397, + "step": 3020 + }, + { + "epoch": 0.2029199035628181, + "grad_norm": 0.0, + "learning_rate": 9.681080465489983e-05, + "loss": 2.3028, + "step": 3030 + }, + { + "epoch": 0.20358960621484062, + "grad_norm": 0.0, + "learning_rate": 9.676959797346435e-05, + "loss": 2.2324, + "step": 3040 + }, + { + "epoch": 0.2042593088668631, + "grad_norm": 0.0, + "learning_rate": 9.672813567257017e-05, + "loss": 2.2146, + "step": 3050 + }, + { + "epoch": 0.20492901151888562, + "grad_norm": 0.0, + "learning_rate": 9.668641797882969e-05, + "loss": 2.1497, + "step": 3060 + }, + { + "epoch": 0.2055987141709081, + "grad_norm": 0.0, + "learning_rate": 9.664444512025116e-05, + "loss": 2.2361, + "step": 3070 + }, + { + "epoch": 0.20626841682293062, + "grad_norm": 0.0, + "learning_rate": 9.660221732623744e-05, + "loss": 2.2336, + "step": 3080 + }, + { + "epoch": 0.2069381194749531, + "grad_norm": 0.0, + "learning_rate": 9.655973482758473e-05, + "loss": 2.2446, + "step": 3090 + }, + { + "epoch": 0.20760782212697562, + "grad_norm": 0.0, + "learning_rate": 9.651699785648135e-05, + "loss": 2.2724, + "step": 3100 + }, + { + "epoch": 0.20827752477899814, + "grad_norm": 0.0, + "learning_rate": 9.647400664650638e-05, + "loss": 2.2362, + "step": 3110 + }, + { + "epoch": 0.20894722743102062, + "grad_norm": 0.0, + "learning_rate": 9.643076143262851e-05, + "loss": 2.2481, + "step": 3120 + }, + { + "epoch": 0.20961693008304313, + "grad_norm": 0.0, + "learning_rate": 9.638726245120466e-05, + "loss": 2.1208, + "step": 3130 + }, + { + "epoch": 0.21028663273506562, + "grad_norm": 0.0, + "learning_rate": 9.634350993997871e-05, + "loss": 2.2085, + "step": 3140 + }, + { + "epoch": 0.21095633538708813, + "grad_norm": 0.0, + "learning_rate": 9.629950413808022e-05, + "loss": 2.2974, + "step": 3150 + }, + { + "epoch": 0.21162603803911065, + "grad_norm": 0.0, + "learning_rate": 9.62552452860231e-05, + "loss": 2.2781, + "step": 3160 + }, + { + "epoch": 0.21229574069113313, + "grad_norm": 0.0, + "learning_rate": 9.621073362570432e-05, + "loss": 2.3054, + "step": 3170 + }, + { + "epoch": 0.21296544334315565, + "grad_norm": 0.0, + "learning_rate": 9.616596940040257e-05, + "loss": 2.1255, + "step": 3180 + }, + { + "epoch": 0.21363514599517813, + "grad_norm": 0.0, + "learning_rate": 9.612095285477694e-05, + "loss": 2.1918, + "step": 3190 + }, + { + "epoch": 0.21430484864720065, + "grad_norm": 0.0, + "learning_rate": 9.607568423486558e-05, + "loss": 2.2317, + "step": 3200 + }, + { + "epoch": 0.21497455129922313, + "grad_norm": 0.0, + "learning_rate": 9.603016378808432e-05, + "loss": 2.0927, + "step": 3210 + }, + { + "epoch": 0.21564425395124565, + "grad_norm": 0.0, + "learning_rate": 9.59843917632254e-05, + "loss": 2.2694, + "step": 3220 + }, + { + "epoch": 0.21631395660326816, + "grad_norm": 0.0, + "learning_rate": 9.593836841045602e-05, + "loss": 2.2625, + "step": 3230 + }, + { + "epoch": 0.21698365925529065, + "grad_norm": 0.0, + "learning_rate": 9.589209398131706e-05, + "loss": 2.3238, + "step": 3240 + }, + { + "epoch": 0.21765336190731316, + "grad_norm": 0.0, + "learning_rate": 9.584556872872159e-05, + "loss": 2.2813, + "step": 3250 + }, + { + "epoch": 0.21832306455933564, + "grad_norm": 0.0, + "learning_rate": 9.579879290695364e-05, + "loss": 2.2298, + "step": 3260 + }, + { + "epoch": 0.21899276721135816, + "grad_norm": 0.0, + "learning_rate": 9.575176677166667e-05, + "loss": 2.2254, + "step": 3270 + }, + { + "epoch": 0.21966246986338067, + "grad_norm": 0.0, + "learning_rate": 9.570449057988222e-05, + "loss": 2.3366, + "step": 3280 + }, + { + "epoch": 0.22033217251540316, + "grad_norm": 0.0, + "learning_rate": 9.565696458998858e-05, + "loss": 2.2601, + "step": 3290 + }, + { + "epoch": 0.22100187516742567, + "grad_norm": 0.0, + "learning_rate": 9.560918906173923e-05, + "loss": 2.231, + "step": 3300 + }, + { + "epoch": 0.22167157781944816, + "grad_norm": 0.0, + "learning_rate": 9.556116425625159e-05, + "loss": 2.2668, + "step": 3310 + }, + { + "epoch": 0.22234128047147067, + "grad_norm": 0.0, + "learning_rate": 9.551289043600542e-05, + "loss": 2.2073, + "step": 3320 + }, + { + "epoch": 0.22301098312349316, + "grad_norm": 0.0, + "learning_rate": 9.546436786484155e-05, + "loss": 2.3307, + "step": 3330 + }, + { + "epoch": 0.22368068577551567, + "grad_norm": 0.0, + "learning_rate": 9.541559680796029e-05, + "loss": 2.2272, + "step": 3340 + }, + { + "epoch": 0.22435038842753818, + "grad_norm": 0.0, + "learning_rate": 9.536657753192011e-05, + "loss": 2.2672, + "step": 3350 + }, + { + "epoch": 0.22502009107956067, + "grad_norm": 0.0, + "learning_rate": 9.531731030463607e-05, + "loss": 2.2473, + "step": 3360 + }, + { + "epoch": 0.22568979373158318, + "grad_norm": 0.0, + "learning_rate": 9.526779539537845e-05, + "loss": 2.2187, + "step": 3370 + }, + { + "epoch": 0.22635949638360567, + "grad_norm": 0.0, + "learning_rate": 9.52180330747712e-05, + "loss": 2.3373, + "step": 3380 + }, + { + "epoch": 0.22702919903562818, + "grad_norm": 0.0, + "learning_rate": 9.516802361479056e-05, + "loss": 2.1831, + "step": 3390 + }, + { + "epoch": 0.2276989016876507, + "grad_norm": 0.0, + "learning_rate": 9.511776728876341e-05, + "loss": 2.1993, + "step": 3400 + }, + { + "epoch": 0.22836860433967318, + "grad_norm": 0.0, + "learning_rate": 9.506726437136599e-05, + "loss": 2.1827, + "step": 3410 + }, + { + "epoch": 0.2290383069916957, + "grad_norm": 0.0, + "learning_rate": 9.501651513862222e-05, + "loss": 2.2418, + "step": 3420 + }, + { + "epoch": 0.22970800964371818, + "grad_norm": 0.0, + "learning_rate": 9.496551986790225e-05, + "loss": 2.1535, + "step": 3430 + }, + { + "epoch": 0.2303777122957407, + "grad_norm": 0.0, + "learning_rate": 9.4914278837921e-05, + "loss": 2.37, + "step": 3440 + }, + { + "epoch": 0.23104741494776318, + "grad_norm": 0.0, + "learning_rate": 9.486279232873654e-05, + "loss": 2.1839, + "step": 3450 + }, + { + "epoch": 0.2317171175997857, + "grad_norm": 0.0, + "learning_rate": 9.481106062174863e-05, + "loss": 2.2899, + "step": 3460 + }, + { + "epoch": 0.2323868202518082, + "grad_norm": 0.0, + "learning_rate": 9.475908399969718e-05, + "loss": 2.2152, + "step": 3470 + }, + { + "epoch": 0.2330565229038307, + "grad_norm": 0.0, + "learning_rate": 9.470686274666065e-05, + "loss": 2.2035, + "step": 3480 + }, + { + "epoch": 0.2337262255558532, + "grad_norm": 0.0, + "learning_rate": 9.465439714805455e-05, + "loss": 2.2696, + "step": 3490 + }, + { + "epoch": 0.2343959282078757, + "grad_norm": 0.0, + "learning_rate": 9.460168749062985e-05, + "loss": 2.2141, + "step": 3500 + }, + { + "epoch": 0.2350656308598982, + "grad_norm": 0.0, + "learning_rate": 9.454873406247143e-05, + "loss": 2.2786, + "step": 3510 + }, + { + "epoch": 0.23573533351192072, + "grad_norm": 0.0, + "learning_rate": 9.449553715299652e-05, + "loss": 2.2578, + "step": 3520 + }, + { + "epoch": 0.2364050361639432, + "grad_norm": 0.0, + "learning_rate": 9.444209705295305e-05, + "loss": 2.2044, + "step": 3530 + }, + { + "epoch": 0.23707473881596572, + "grad_norm": 0.0, + "learning_rate": 9.438841405441816e-05, + "loss": 2.2658, + "step": 3540 + }, + { + "epoch": 0.2377444414679882, + "grad_norm": 0.0, + "learning_rate": 9.433448845079653e-05, + "loss": 2.2988, + "step": 3550 + }, + { + "epoch": 0.23841414412001072, + "grad_norm": 0.0, + "learning_rate": 9.42803205368188e-05, + "loss": 2.2158, + "step": 3560 + }, + { + "epoch": 0.2390838467720332, + "grad_norm": 0.0, + "learning_rate": 9.422591060853997e-05, + "loss": 2.3033, + "step": 3570 + }, + { + "epoch": 0.23975354942405572, + "grad_norm": 0.0, + "learning_rate": 9.417125896333774e-05, + "loss": 2.3241, + "step": 3580 + }, + { + "epoch": 0.24042325207607823, + "grad_norm": 0.0, + "learning_rate": 9.411636589991095e-05, + "loss": 2.2695, + "step": 3590 + }, + { + "epoch": 0.24109295472810072, + "grad_norm": 0.0, + "learning_rate": 9.406123171827789e-05, + "loss": 2.2711, + "step": 3600 + }, + { + "epoch": 0.24176265738012323, + "grad_norm": 0.0, + "learning_rate": 9.400585671977469e-05, + "loss": 2.1886, + "step": 3610 + }, + { + "epoch": 0.24243236003214572, + "grad_norm": 0.0, + "learning_rate": 9.395024120705367e-05, + "loss": 2.2764, + "step": 3620 + }, + { + "epoch": 0.24310206268416823, + "grad_norm": 0.0, + "learning_rate": 9.389438548408167e-05, + "loss": 2.1141, + "step": 3630 + }, + { + "epoch": 0.24377176533619074, + "grad_norm": 0.0, + "learning_rate": 9.383828985613843e-05, + "loss": 2.2824, + "step": 3640 + }, + { + "epoch": 0.24444146798821323, + "grad_norm": 0.0, + "learning_rate": 9.378195462981484e-05, + "loss": 2.2951, + "step": 3650 + }, + { + "epoch": 0.24511117064023574, + "grad_norm": 0.0, + "learning_rate": 9.372538011301135e-05, + "loss": 2.2363, + "step": 3660 + }, + { + "epoch": 0.24578087329225823, + "grad_norm": 0.0, + "learning_rate": 9.366856661493628e-05, + "loss": 2.2701, + "step": 3670 + }, + { + "epoch": 0.24645057594428074, + "grad_norm": 0.0, + "learning_rate": 9.361151444610404e-05, + "loss": 2.2683, + "step": 3680 + }, + { + "epoch": 0.24712027859630323, + "grad_norm": 0.0, + "learning_rate": 9.355422391833353e-05, + "loss": 2.3463, + "step": 3690 + }, + { + "epoch": 0.24778998124832574, + "grad_norm": 0.0, + "learning_rate": 9.349669534474641e-05, + "loss": 2.2302, + "step": 3700 + }, + { + "epoch": 0.24845968390034825, + "grad_norm": 0.0, + "learning_rate": 9.343892903976539e-05, + "loss": 2.2496, + "step": 3710 + }, + { + "epoch": 0.24912938655237074, + "grad_norm": 0.0, + "learning_rate": 9.338092531911245e-05, + "loss": 2.2797, + "step": 3720 + }, + { + "epoch": 0.24979908920439325, + "grad_norm": 0.0, + "learning_rate": 9.33226844998072e-05, + "loss": 2.2403, + "step": 3730 + }, + { + "epoch": 0.25046879185641574, + "grad_norm": 0.0, + "learning_rate": 9.326420690016513e-05, + "loss": 2.1988, + "step": 3740 + }, + { + "epoch": 0.2511384945084382, + "grad_norm": 0.0, + "learning_rate": 9.320549283979584e-05, + "loss": 2.249, + "step": 3750 + }, + { + "epoch": 0.25180819716046077, + "grad_norm": 0.0, + "learning_rate": 9.314654263960128e-05, + "loss": 2.2201, + "step": 3760 + }, + { + "epoch": 0.25247789981248325, + "grad_norm": 0.0, + "learning_rate": 9.308735662177407e-05, + "loss": 2.2789, + "step": 3770 + }, + { + "epoch": 0.25314760246450574, + "grad_norm": 0.0, + "learning_rate": 9.302793510979568e-05, + "loss": 2.2515, + "step": 3780 + }, + { + "epoch": 0.2538173051165283, + "grad_norm": 0.0, + "learning_rate": 9.296827842843463e-05, + "loss": 2.2134, + "step": 3790 + }, + { + "epoch": 0.25448700776855077, + "grad_norm": 0.0, + "learning_rate": 9.290838690374483e-05, + "loss": 2.2498, + "step": 3800 + }, + { + "epoch": 0.25515671042057325, + "grad_norm": 0.0, + "learning_rate": 9.284826086306366e-05, + "loss": 2.1802, + "step": 3810 + }, + { + "epoch": 0.2558264130725958, + "grad_norm": 0.0, + "learning_rate": 9.278790063501029e-05, + "loss": 2.2067, + "step": 3820 + }, + { + "epoch": 0.2564961157246183, + "grad_norm": 0.0, + "learning_rate": 9.272730654948384e-05, + "loss": 2.2824, + "step": 3830 + }, + { + "epoch": 0.25716581837664076, + "grad_norm": 0.0, + "learning_rate": 9.266647893766157e-05, + "loss": 2.2489, + "step": 3840 + }, + { + "epoch": 0.25783552102866325, + "grad_norm": 0.0, + "learning_rate": 9.260541813199706e-05, + "loss": 2.2607, + "step": 3850 + }, + { + "epoch": 0.2585052236806858, + "grad_norm": 0.0, + "learning_rate": 9.254412446621845e-05, + "loss": 2.3607, + "step": 3860 + }, + { + "epoch": 0.2591749263327083, + "grad_norm": 0.0, + "learning_rate": 9.248259827532656e-05, + "loss": 2.2326, + "step": 3870 + }, + { + "epoch": 0.25984462898473076, + "grad_norm": 0.0, + "learning_rate": 9.242083989559308e-05, + "loss": 2.2679, + "step": 3880 + }, + { + "epoch": 0.2605143316367533, + "grad_norm": 0.0, + "learning_rate": 9.235884966455872e-05, + "loss": 2.1288, + "step": 3890 + }, + { + "epoch": 0.2611840342887758, + "grad_norm": 0.0, + "learning_rate": 9.229662792103137e-05, + "loss": 2.2834, + "step": 3900 + }, + { + "epoch": 0.2618537369407983, + "grad_norm": 0.0, + "learning_rate": 9.223417500508427e-05, + "loss": 2.0785, + "step": 3910 + }, + { + "epoch": 0.26252343959282076, + "grad_norm": 0.0, + "learning_rate": 9.217149125805416e-05, + "loss": 2.2012, + "step": 3920 + }, + { + "epoch": 0.2631931422448433, + "grad_norm": 0.0, + "learning_rate": 9.21085770225393e-05, + "loss": 2.2774, + "step": 3930 + }, + { + "epoch": 0.2638628448968658, + "grad_norm": 0.0, + "learning_rate": 9.204543264239778e-05, + "loss": 2.2201, + "step": 3940 + }, + { + "epoch": 0.2645325475488883, + "grad_norm": 0.0, + "learning_rate": 9.198205846274548e-05, + "loss": 2.3057, + "step": 3950 + }, + { + "epoch": 0.2652022502009108, + "grad_norm": 0.0, + "learning_rate": 9.191845482995431e-05, + "loss": 2.2332, + "step": 3960 + }, + { + "epoch": 0.2658719528529333, + "grad_norm": 0.0, + "learning_rate": 9.185462209165021e-05, + "loss": 2.1952, + "step": 3970 + }, + { + "epoch": 0.2665416555049558, + "grad_norm": 0.0, + "learning_rate": 9.179056059671129e-05, + "loss": 2.3655, + "step": 3980 + }, + { + "epoch": 0.26721135815697833, + "grad_norm": 0.0, + "learning_rate": 9.1726270695266e-05, + "loss": 2.2544, + "step": 3990 + }, + { + "epoch": 0.2678810608090008, + "grad_norm": 0.0, + "learning_rate": 9.166175273869107e-05, + "loss": 2.2741, + "step": 4000 + }, + { + "epoch": 0.2685507634610233, + "grad_norm": 0.0, + "learning_rate": 9.15970070796097e-05, + "loss": 2.2881, + "step": 4010 + }, + { + "epoch": 0.2692204661130458, + "grad_norm": 0.0, + "learning_rate": 9.15320340718896e-05, + "loss": 2.2729, + "step": 4020 + }, + { + "epoch": 0.26989016876506833, + "grad_norm": 0.0, + "learning_rate": 9.146683407064105e-05, + "loss": 2.238, + "step": 4030 + }, + { + "epoch": 0.2705598714170908, + "grad_norm": 0.0, + "learning_rate": 9.140140743221496e-05, + "loss": 2.3336, + "step": 4040 + }, + { + "epoch": 0.2712295740691133, + "grad_norm": 0.0, + "learning_rate": 9.133575451420093e-05, + "loss": 2.2249, + "step": 4050 + }, + { + "epoch": 0.27189927672113584, + "grad_norm": 0.0, + "learning_rate": 9.126987567542532e-05, + "loss": 2.2385, + "step": 4060 + }, + { + "epoch": 0.2725689793731583, + "grad_norm": 0.0, + "learning_rate": 9.12037712759492e-05, + "loss": 2.2343, + "step": 4070 + }, + { + "epoch": 0.2732386820251808, + "grad_norm": 0.0, + "learning_rate": 9.113744167706648e-05, + "loss": 2.2202, + "step": 4080 + }, + { + "epoch": 0.2739083846772033, + "grad_norm": 0.0, + "learning_rate": 9.107088724130192e-05, + "loss": 2.2392, + "step": 4090 + }, + { + "epoch": 0.27457808732922584, + "grad_norm": 0.0, + "learning_rate": 9.100410833240908e-05, + "loss": 2.2371, + "step": 4100 + }, + { + "epoch": 0.2752477899812483, + "grad_norm": 0.0, + "learning_rate": 9.093710531536842e-05, + "loss": 2.3052, + "step": 4110 + }, + { + "epoch": 0.2759174926332708, + "grad_norm": 0.0, + "learning_rate": 9.086987855638525e-05, + "loss": 2.1853, + "step": 4120 + }, + { + "epoch": 0.27658719528529335, + "grad_norm": 0.0, + "learning_rate": 9.08024284228877e-05, + "loss": 2.2073, + "step": 4130 + }, + { + "epoch": 0.27725689793731584, + "grad_norm": 0.0, + "learning_rate": 9.073475528352484e-05, + "loss": 2.1914, + "step": 4140 + }, + { + "epoch": 0.2779266005893383, + "grad_norm": 0.0, + "learning_rate": 9.066685950816451e-05, + "loss": 2.2104, + "step": 4150 + }, + { + "epoch": 0.2785963032413608, + "grad_norm": 0.0, + "learning_rate": 9.059874146789139e-05, + "loss": 2.1314, + "step": 4160 + }, + { + "epoch": 0.27926600589338335, + "grad_norm": 0.0, + "learning_rate": 9.053040153500496e-05, + "loss": 2.1836, + "step": 4170 + }, + { + "epoch": 0.27993570854540584, + "grad_norm": 0.0, + "learning_rate": 9.046184008301743e-05, + "loss": 2.2723, + "step": 4180 + }, + { + "epoch": 0.2806054111974283, + "grad_norm": 0.0, + "learning_rate": 9.039305748665176e-05, + "loss": 2.2827, + "step": 4190 + }, + { + "epoch": 0.28127511384945086, + "grad_norm": 0.0, + "learning_rate": 9.032405412183956e-05, + "loss": 2.1977, + "step": 4200 + }, + { + "epoch": 0.28194481650147335, + "grad_norm": 0.0, + "learning_rate": 9.025483036571902e-05, + "loss": 2.2147, + "step": 4210 + }, + { + "epoch": 0.28261451915349584, + "grad_norm": 0.0, + "learning_rate": 9.018538659663293e-05, + "loss": 2.2783, + "step": 4220 + }, + { + "epoch": 0.2832842218055184, + "grad_norm": 0.0, + "learning_rate": 9.01157231941265e-05, + "loss": 2.1797, + "step": 4230 + }, + { + "epoch": 0.28395392445754086, + "grad_norm": 0.0, + "learning_rate": 9.004584053894545e-05, + "loss": 2.2945, + "step": 4240 + }, + { + "epoch": 0.28462362710956335, + "grad_norm": 0.0, + "learning_rate": 8.997573901303372e-05, + "loss": 2.2139, + "step": 4250 + }, + { + "epoch": 0.28529332976158583, + "grad_norm": 0.0, + "learning_rate": 8.990541899953151e-05, + "loss": 2.2501, + "step": 4260 + }, + { + "epoch": 0.2859630324136084, + "grad_norm": 0.0, + "learning_rate": 8.983488088277323e-05, + "loss": 2.233, + "step": 4270 + }, + { + "epoch": 0.28663273506563086, + "grad_norm": 0.0, + "learning_rate": 8.976412504828526e-05, + "loss": 2.2957, + "step": 4280 + }, + { + "epoch": 0.28730243771765335, + "grad_norm": 0.0, + "learning_rate": 8.969315188278396e-05, + "loss": 2.2512, + "step": 4290 + }, + { + "epoch": 0.2879721403696759, + "grad_norm": 0.0, + "learning_rate": 8.962196177417353e-05, + "loss": 2.1994, + "step": 4300 + }, + { + "epoch": 0.2886418430216984, + "grad_norm": 0.0, + "learning_rate": 8.955055511154378e-05, + "loss": 2.189, + "step": 4310 + }, + { + "epoch": 0.28931154567372086, + "grad_norm": 0.0, + "learning_rate": 8.947893228516821e-05, + "loss": 2.2342, + "step": 4320 + }, + { + "epoch": 0.28998124832574335, + "grad_norm": 0.0, + "learning_rate": 8.940709368650173e-05, + "loss": 2.1649, + "step": 4330 + }, + { + "epoch": 0.2906509509777659, + "grad_norm": 0.0, + "learning_rate": 8.933503970817849e-05, + "loss": 2.232, + "step": 4340 + }, + { + "epoch": 0.2913206536297884, + "grad_norm": 0.0, + "learning_rate": 8.926277074400987e-05, + "loss": 2.2633, + "step": 4350 + }, + { + "epoch": 0.29199035628181086, + "grad_norm": 0.0, + "learning_rate": 8.919028718898226e-05, + "loss": 2.2506, + "step": 4360 + }, + { + "epoch": 0.2926600589338334, + "grad_norm": 0.0, + "learning_rate": 8.911758943925483e-05, + "loss": 2.2919, + "step": 4370 + }, + { + "epoch": 0.2933297615858559, + "grad_norm": 0.0, + "learning_rate": 8.904467789215751e-05, + "loss": 2.2114, + "step": 4380 + }, + { + "epoch": 0.29399946423787837, + "grad_norm": 0.0, + "learning_rate": 8.897155294618869e-05, + "loss": 2.2352, + "step": 4390 + }, + { + "epoch": 0.29466916688990086, + "grad_norm": 0.0, + "learning_rate": 8.88982150010131e-05, + "loss": 2.2521, + "step": 4400 + }, + { + "epoch": 0.2953388695419234, + "grad_norm": 0.0, + "learning_rate": 8.882466445745964e-05, + "loss": 2.2084, + "step": 4410 + }, + { + "epoch": 0.2960085721939459, + "grad_norm": 0.0, + "learning_rate": 8.875090171751915e-05, + "loss": 2.2102, + "step": 4420 + }, + { + "epoch": 0.29667827484596837, + "grad_norm": 0.0, + "learning_rate": 8.867692718434223e-05, + "loss": 2.1578, + "step": 4430 + }, + { + "epoch": 0.2973479774979909, + "grad_norm": 0.0, + "learning_rate": 8.860274126223705e-05, + "loss": 2.2352, + "step": 4440 + }, + { + "epoch": 0.2980176801500134, + "grad_norm": 0.0, + "learning_rate": 8.85283443566671e-05, + "loss": 2.2075, + "step": 4450 + }, + { + "epoch": 0.2986873828020359, + "grad_norm": 0.0, + "learning_rate": 8.845373687424903e-05, + "loss": 2.3332, + "step": 4460 + }, + { + "epoch": 0.2993570854540584, + "grad_norm": 0.0, + "learning_rate": 8.83789192227504e-05, + "loss": 2.1775, + "step": 4470 + }, + { + "epoch": 0.3000267881060809, + "grad_norm": 0.0, + "learning_rate": 8.83038918110874e-05, + "loss": 2.2348, + "step": 4480 + }, + { + "epoch": 0.3006964907581034, + "grad_norm": 0.0, + "learning_rate": 8.822865504932275e-05, + "loss": 2.1897, + "step": 4490 + }, + { + "epoch": 0.3013661934101259, + "grad_norm": 0.0, + "learning_rate": 8.815320934866329e-05, + "loss": 2.2539, + "step": 4500 + }, + { + "epoch": 0.3020358960621484, + "grad_norm": 0.0, + "learning_rate": 8.807755512145788e-05, + "loss": 2.2351, + "step": 4510 + }, + { + "epoch": 0.3027055987141709, + "grad_norm": 0.0, + "learning_rate": 8.800169278119503e-05, + "loss": 2.3542, + "step": 4520 + }, + { + "epoch": 0.3033753013661934, + "grad_norm": 0.0, + "learning_rate": 8.792562274250075e-05, + "loss": 2.2724, + "step": 4530 + }, + { + "epoch": 0.30404500401821594, + "grad_norm": 0.0, + "learning_rate": 8.784934542113617e-05, + "loss": 2.204, + "step": 4540 + }, + { + "epoch": 0.3047147066702384, + "grad_norm": 0.0, + "learning_rate": 8.777286123399536e-05, + "loss": 2.2944, + "step": 4550 + }, + { + "epoch": 0.3053844093222609, + "grad_norm": 0.0, + "learning_rate": 8.7696170599103e-05, + "loss": 2.1318, + "step": 4560 + }, + { + "epoch": 0.3060541119742834, + "grad_norm": 0.0, + "learning_rate": 8.761927393561214e-05, + "loss": 2.1441, + "step": 4570 + }, + { + "epoch": 0.30672381462630594, + "grad_norm": 0.0, + "learning_rate": 8.754217166380184e-05, + "loss": 2.303, + "step": 4580 + }, + { + "epoch": 0.3073935172783284, + "grad_norm": 0.0, + "learning_rate": 8.746486420507491e-05, + "loss": 2.2769, + "step": 4590 + }, + { + "epoch": 0.3080632199303509, + "grad_norm": 0.0, + "learning_rate": 8.738735198195566e-05, + "loss": 2.3048, + "step": 4600 + }, + { + "epoch": 0.30873292258237345, + "grad_norm": 0.0, + "learning_rate": 8.73096354180875e-05, + "loss": 2.2582, + "step": 4610 + }, + { + "epoch": 0.30940262523439593, + "grad_norm": 0.0, + "learning_rate": 8.72317149382307e-05, + "loss": 2.3533, + "step": 4620 + }, + { + "epoch": 0.3100723278864184, + "grad_norm": 0.0, + "learning_rate": 8.715359096825999e-05, + "loss": 2.2417, + "step": 4630 + }, + { + "epoch": 0.3107420305384409, + "grad_norm": 0.0, + "learning_rate": 8.707526393516228e-05, + "loss": 2.215, + "step": 4640 + }, + { + "epoch": 0.31141173319046345, + "grad_norm": 0.0, + "learning_rate": 8.699673426703436e-05, + "loss": 2.1881, + "step": 4650 + }, + { + "epoch": 0.31208143584248593, + "grad_norm": 0.0, + "learning_rate": 8.691800239308052e-05, + "loss": 2.2907, + "step": 4660 + }, + { + "epoch": 0.3127511384945084, + "grad_norm": 0.0, + "learning_rate": 8.683906874361017e-05, + "loss": 2.221, + "step": 4670 + }, + { + "epoch": 0.31342084114653096, + "grad_norm": 0.0, + "learning_rate": 8.675993375003553e-05, + "loss": 2.1868, + "step": 4680 + }, + { + "epoch": 0.31409054379855345, + "grad_norm": 0.0, + "learning_rate": 8.668059784486929e-05, + "loss": 2.244, + "step": 4690 + }, + { + "epoch": 0.31476024645057593, + "grad_norm": 0.0, + "learning_rate": 8.660106146172223e-05, + "loss": 2.2294, + "step": 4700 + }, + { + "epoch": 0.3154299491025985, + "grad_norm": 0.0, + "learning_rate": 8.652132503530082e-05, + "loss": 2.2347, + "step": 4710 + }, + { + "epoch": 0.31609965175462096, + "grad_norm": 0.0, + "learning_rate": 8.644138900140485e-05, + "loss": 2.2633, + "step": 4720 + }, + { + "epoch": 0.31676935440664344, + "grad_norm": 0.0, + "learning_rate": 8.636125379692515e-05, + "loss": 2.2383, + "step": 4730 + }, + { + "epoch": 0.31743905705866593, + "grad_norm": 0.0, + "learning_rate": 8.628091985984099e-05, + "loss": 2.2282, + "step": 4740 + }, + { + "epoch": 0.31810875971068847, + "grad_norm": 0.0, + "learning_rate": 8.620038762921794e-05, + "loss": 2.172, + "step": 4750 + }, + { + "epoch": 0.31877846236271096, + "grad_norm": 0.0, + "learning_rate": 8.611965754520526e-05, + "loss": 2.2849, + "step": 4760 + }, + { + "epoch": 0.31944816501473344, + "grad_norm": 0.0, + "learning_rate": 8.60387300490336e-05, + "loss": 2.3451, + "step": 4770 + }, + { + "epoch": 0.320117867666756, + "grad_norm": 0.0, + "learning_rate": 8.595760558301257e-05, + "loss": 2.2314, + "step": 4780 + }, + { + "epoch": 0.32078757031877847, + "grad_norm": 0.0, + "learning_rate": 8.587628459052834e-05, + "loss": 2.155, + "step": 4790 + }, + { + "epoch": 0.32145727297080096, + "grad_norm": 0.0, + "learning_rate": 8.579476751604119e-05, + "loss": 2.2874, + "step": 4800 + }, + { + "epoch": 0.32212697562282344, + "grad_norm": 0.0, + "learning_rate": 8.571305480508302e-05, + "loss": 2.2094, + "step": 4810 + }, + { + "epoch": 0.322796678274846, + "grad_norm": 0.0, + "learning_rate": 8.563114690425511e-05, + "loss": 2.2631, + "step": 4820 + }, + { + "epoch": 0.32346638092686847, + "grad_norm": 0.0, + "learning_rate": 8.554904426122543e-05, + "loss": 2.287, + "step": 4830 + }, + { + "epoch": 0.32413608357889095, + "grad_norm": 0.0, + "learning_rate": 8.546674732472638e-05, + "loss": 2.2728, + "step": 4840 + }, + { + "epoch": 0.3248057862309135, + "grad_norm": 0.0, + "learning_rate": 8.538425654455225e-05, + "loss": 2.2106, + "step": 4850 + }, + { + "epoch": 0.325475488882936, + "grad_norm": 0.0, + "learning_rate": 8.530157237155681e-05, + "loss": 2.2396, + "step": 4860 + }, + { + "epoch": 0.32614519153495847, + "grad_norm": 0.0, + "learning_rate": 8.521869525765076e-05, + "loss": 2.2975, + "step": 4870 + }, + { + "epoch": 0.32681489418698095, + "grad_norm": 0.0, + "learning_rate": 8.513562565579936e-05, + "loss": 2.2552, + "step": 4880 + }, + { + "epoch": 0.3274845968390035, + "grad_norm": 0.0, + "learning_rate": 8.505236402001996e-05, + "loss": 2.2631, + "step": 4890 + }, + { + "epoch": 0.328154299491026, + "grad_norm": 0.0, + "learning_rate": 8.496891080537939e-05, + "loss": 2.1826, + "step": 4900 + }, + { + "epoch": 0.32882400214304847, + "grad_norm": 0.0, + "learning_rate": 8.488526646799158e-05, + "loss": 2.2127, + "step": 4910 + }, + { + "epoch": 0.329493704795071, + "grad_norm": 0.0, + "learning_rate": 8.480143146501506e-05, + "loss": 2.1991, + "step": 4920 + }, + { + "epoch": 0.3301634074470935, + "grad_norm": 0.0, + "learning_rate": 8.471740625465044e-05, + "loss": 2.2428, + "step": 4930 + }, + { + "epoch": 0.330833110099116, + "grad_norm": 0.0, + "learning_rate": 8.463319129613791e-05, + "loss": 2.2601, + "step": 4940 + }, + { + "epoch": 0.3315028127511385, + "grad_norm": 0.0, + "learning_rate": 8.45487870497547e-05, + "loss": 2.2814, + "step": 4950 + }, + { + "epoch": 0.332172515403161, + "grad_norm": 0.0, + "learning_rate": 8.446419397681265e-05, + "loss": 2.1667, + "step": 4960 + }, + { + "epoch": 0.3328422180551835, + "grad_norm": 0.0, + "learning_rate": 8.437941253965558e-05, + "loss": 2.3395, + "step": 4970 + }, + { + "epoch": 0.333511920707206, + "grad_norm": 0.0, + "learning_rate": 8.429444320165683e-05, + "loss": 2.2234, + "step": 4980 + }, + { + "epoch": 0.3341816233592285, + "grad_norm": 0.0, + "learning_rate": 8.420928642721672e-05, + "loss": 2.3486, + "step": 4990 + }, + { + "epoch": 0.334851326011251, + "grad_norm": 0.0, + "learning_rate": 8.412394268176003e-05, + "loss": 2.2212, + "step": 5000 + }, + { + "epoch": 0.3355210286632735, + "grad_norm": 0.0, + "learning_rate": 8.403841243173338e-05, + "loss": 2.2387, + "step": 5010 + }, + { + "epoch": 0.33619073131529603, + "grad_norm": 0.0, + "learning_rate": 8.395269614460275e-05, + "loss": 2.1898, + "step": 5020 + }, + { + "epoch": 0.3368604339673185, + "grad_norm": 0.0, + "learning_rate": 8.386679428885092e-05, + "loss": 2.245, + "step": 5030 + }, + { + "epoch": 0.337530136619341, + "grad_norm": 0.0, + "learning_rate": 8.37807073339749e-05, + "loss": 2.2294, + "step": 5040 + }, + { + "epoch": 0.3381998392713635, + "grad_norm": 0.0, + "learning_rate": 8.369443575048332e-05, + "loss": 2.2182, + "step": 5050 + }, + { + "epoch": 0.33886954192338603, + "grad_norm": 0.0, + "learning_rate": 8.360798000989394e-05, + "loss": 2.2475, + "step": 5060 + }, + { + "epoch": 0.3395392445754085, + "grad_norm": 0.0, + "learning_rate": 8.352134058473106e-05, + "loss": 2.2286, + "step": 5070 + }, + { + "epoch": 0.340208947227431, + "grad_norm": 0.0, + "learning_rate": 8.343451794852282e-05, + "loss": 2.2071, + "step": 5080 + }, + { + "epoch": 0.34087864987945354, + "grad_norm": 0.0, + "learning_rate": 8.334751257579874e-05, + "loss": 2.2365, + "step": 5090 + }, + { + "epoch": 0.34154835253147603, + "grad_norm": 0.0, + "learning_rate": 8.326032494208713e-05, + "loss": 2.3365, + "step": 5100 + }, + { + "epoch": 0.3422180551834985, + "grad_norm": 0.0, + "learning_rate": 8.31729555239124e-05, + "loss": 2.2691, + "step": 5110 + }, + { + "epoch": 0.342887757835521, + "grad_norm": 0.0, + "learning_rate": 8.308540479879252e-05, + "loss": 2.1833, + "step": 5120 + }, + { + "epoch": 0.34355746048754354, + "grad_norm": 0.0, + "learning_rate": 8.299767324523638e-05, + "loss": 2.1165, + "step": 5130 + }, + { + "epoch": 0.34422716313956603, + "grad_norm": 0.0, + "learning_rate": 8.290976134274123e-05, + "loss": 2.3461, + "step": 5140 + }, + { + "epoch": 0.3448968657915885, + "grad_norm": 0.0, + "learning_rate": 8.282166957178995e-05, + "loss": 2.2149, + "step": 5150 + }, + { + "epoch": 0.34556656844361106, + "grad_norm": 0.0, + "learning_rate": 8.273339841384855e-05, + "loss": 2.2644, + "step": 5160 + }, + { + "epoch": 0.34623627109563354, + "grad_norm": 0.0, + "learning_rate": 8.264494835136347e-05, + "loss": 2.2684, + "step": 5170 + }, + { + "epoch": 0.346905973747656, + "grad_norm": 0.0, + "learning_rate": 8.255631986775894e-05, + "loss": 2.2436, + "step": 5180 + }, + { + "epoch": 0.34757567639967857, + "grad_norm": 0.0, + "learning_rate": 8.246751344743433e-05, + "loss": 2.1691, + "step": 5190 + }, + { + "epoch": 0.34824537905170105, + "grad_norm": 0.0, + "learning_rate": 8.237852957576158e-05, + "loss": 2.2467, + "step": 5200 + }, + { + "epoch": 0.34891508170372354, + "grad_norm": 0.0, + "learning_rate": 8.228936873908244e-05, + "loss": 2.2633, + "step": 5210 + }, + { + "epoch": 0.349584784355746, + "grad_norm": 0.0, + "learning_rate": 8.220003142470592e-05, + "loss": 2.247, + "step": 5220 + }, + { + "epoch": 0.35025448700776857, + "grad_norm": 0.0, + "learning_rate": 8.211051812090548e-05, + "loss": 2.3088, + "step": 5230 + }, + { + "epoch": 0.35092418965979105, + "grad_norm": 0.0, + "learning_rate": 8.202082931691655e-05, + "loss": 2.2103, + "step": 5240 + }, + { + "epoch": 0.35159389231181354, + "grad_norm": 0.0, + "learning_rate": 8.193096550293369e-05, + "loss": 2.0959, + "step": 5250 + }, + { + "epoch": 0.3522635949638361, + "grad_norm": 0.0, + "learning_rate": 8.184092717010801e-05, + "loss": 2.2747, + "step": 5260 + }, + { + "epoch": 0.35293329761585857, + "grad_norm": 0.0, + "learning_rate": 8.175071481054444e-05, + "loss": 2.2294, + "step": 5270 + }, + { + "epoch": 0.35360300026788105, + "grad_norm": 0.0, + "learning_rate": 8.166032891729905e-05, + "loss": 2.1772, + "step": 5280 + }, + { + "epoch": 0.35427270291990354, + "grad_norm": 0.0, + "learning_rate": 8.15697699843764e-05, + "loss": 2.2499, + "step": 5290 + }, + { + "epoch": 0.3549424055719261, + "grad_norm": 0.0, + "learning_rate": 8.147903850672671e-05, + "loss": 2.2, + "step": 5300 + }, + { + "epoch": 0.35561210822394856, + "grad_norm": 0.0, + "learning_rate": 8.138813498024332e-05, + "loss": 2.2583, + "step": 5310 + }, + { + "epoch": 0.35628181087597105, + "grad_norm": 0.0, + "learning_rate": 8.129705990175991e-05, + "loss": 2.2661, + "step": 5320 + }, + { + "epoch": 0.3569515135279936, + "grad_norm": 0.0, + "learning_rate": 8.120581376904773e-05, + "loss": 2.2114, + "step": 5330 + }, + { + "epoch": 0.3576212161800161, + "grad_norm": 0.0, + "learning_rate": 8.111439708081297e-05, + "loss": 2.2011, + "step": 5340 + }, + { + "epoch": 0.35829091883203856, + "grad_norm": 0.0, + "learning_rate": 8.102281033669393e-05, + "loss": 2.252, + "step": 5350 + }, + { + "epoch": 0.3589606214840611, + "grad_norm": 0.0, + "learning_rate": 8.093105403725842e-05, + "loss": 2.1967, + "step": 5360 + }, + { + "epoch": 0.3596303241360836, + "grad_norm": 0.0, + "learning_rate": 8.083912868400094e-05, + "loss": 2.2036, + "step": 5370 + }, + { + "epoch": 0.3603000267881061, + "grad_norm": 0.0, + "learning_rate": 8.074703477933991e-05, + "loss": 2.2294, + "step": 5380 + }, + { + "epoch": 0.36096972944012856, + "grad_norm": 0.0, + "learning_rate": 8.065477282661504e-05, + "loss": 2.2663, + "step": 5390 + }, + { + "epoch": 0.3616394320921511, + "grad_norm": 0.0, + "learning_rate": 8.056234333008445e-05, + "loss": 2.2182, + "step": 5400 + }, + { + "epoch": 0.3623091347441736, + "grad_norm": 0.0, + "learning_rate": 8.046974679492197e-05, + "loss": 2.2519, + "step": 5410 + }, + { + "epoch": 0.3629788373961961, + "grad_norm": 0.0, + "learning_rate": 8.037698372721442e-05, + "loss": 2.1965, + "step": 5420 + }, + { + "epoch": 0.3636485400482186, + "grad_norm": 0.0, + "learning_rate": 8.028405463395878e-05, + "loss": 2.1936, + "step": 5430 + }, + { + "epoch": 0.3643182427002411, + "grad_norm": 0.0, + "learning_rate": 8.019096002305946e-05, + "loss": 2.2061, + "step": 5440 + }, + { + "epoch": 0.3649879453522636, + "grad_norm": 0.0, + "learning_rate": 8.00977004033255e-05, + "loss": 2.2627, + "step": 5450 + }, + { + "epoch": 0.3656576480042861, + "grad_norm": 0.0, + "learning_rate": 8.000427628446776e-05, + "loss": 2.3286, + "step": 5460 + }, + { + "epoch": 0.3663273506563086, + "grad_norm": 0.0, + "learning_rate": 7.991068817709624e-05, + "loss": 2.1929, + "step": 5470 + }, + { + "epoch": 0.3669970533083311, + "grad_norm": 0.0, + "learning_rate": 7.981693659271716e-05, + "loss": 2.2085, + "step": 5480 + }, + { + "epoch": 0.3676667559603536, + "grad_norm": 0.0, + "learning_rate": 7.972302204373024e-05, + "loss": 2.2468, + "step": 5490 + }, + { + "epoch": 0.3683364586123761, + "grad_norm": 0.0, + "learning_rate": 7.962894504342591e-05, + "loss": 2.2503, + "step": 5500 + }, + { + "epoch": 0.3690061612643986, + "grad_norm": 0.0, + "learning_rate": 7.953470610598244e-05, + "loss": 2.2457, + "step": 5510 + }, + { + "epoch": 0.3696758639164211, + "grad_norm": 0.0, + "learning_rate": 7.944030574646323e-05, + "loss": 2.2607, + "step": 5520 + }, + { + "epoch": 0.3703455665684436, + "grad_norm": 0.0, + "learning_rate": 7.934574448081385e-05, + "loss": 2.4296, + "step": 5530 + }, + { + "epoch": 0.3710152692204661, + "grad_norm": 0.0, + "learning_rate": 7.925102282585936e-05, + "loss": 2.2071, + "step": 5540 + }, + { + "epoch": 0.3716849718724886, + "grad_norm": 0.0, + "learning_rate": 7.91561412993014e-05, + "loss": 2.2177, + "step": 5550 + }, + { + "epoch": 0.3723546745245111, + "grad_norm": 0.0, + "learning_rate": 7.906110041971541e-05, + "loss": 2.2693, + "step": 5560 + }, + { + "epoch": 0.37302437717653364, + "grad_norm": 0.0, + "learning_rate": 7.896590070654777e-05, + "loss": 2.1922, + "step": 5570 + }, + { + "epoch": 0.3736940798285561, + "grad_norm": 0.0, + "learning_rate": 7.887054268011297e-05, + "loss": 2.216, + "step": 5580 + }, + { + "epoch": 0.3743637824805786, + "grad_norm": 0.0, + "learning_rate": 7.877502686159074e-05, + "loss": 2.2372, + "step": 5590 + }, + { + "epoch": 0.37503348513260115, + "grad_norm": 0.0, + "learning_rate": 7.867935377302324e-05, + "loss": 2.2107, + "step": 5600 + }, + { + "epoch": 0.37570318778462364, + "grad_norm": 0.0, + "learning_rate": 7.858352393731219e-05, + "loss": 2.2494, + "step": 5610 + }, + { + "epoch": 0.3763728904366461, + "grad_norm": 0.0, + "learning_rate": 7.8487537878216e-05, + "loss": 2.1675, + "step": 5620 + }, + { + "epoch": 0.3770425930886686, + "grad_norm": 0.0, + "learning_rate": 7.839139612034695e-05, + "loss": 2.2335, + "step": 5630 + }, + { + "epoch": 0.37771229574069115, + "grad_norm": 0.0, + "learning_rate": 7.829509918916825e-05, + "loss": 2.2172, + "step": 5640 + }, + { + "epoch": 0.37838199839271364, + "grad_norm": 0.0, + "learning_rate": 7.819864761099125e-05, + "loss": 2.2317, + "step": 5650 + }, + { + "epoch": 0.3790517010447361, + "grad_norm": 0.0, + "learning_rate": 7.81020419129725e-05, + "loss": 2.2232, + "step": 5660 + }, + { + "epoch": 0.37972140369675866, + "grad_norm": 0.0, + "learning_rate": 7.800528262311089e-05, + "loss": 2.2933, + "step": 5670 + }, + { + "epoch": 0.38039110634878115, + "grad_norm": 0.0, + "learning_rate": 7.790837027024478e-05, + "loss": 2.1793, + "step": 5680 + }, + { + "epoch": 0.38106080900080364, + "grad_norm": 0.0, + "learning_rate": 7.781130538404911e-05, + "loss": 2.1571, + "step": 5690 + }, + { + "epoch": 0.3817305116528261, + "grad_norm": 0.0, + "learning_rate": 7.771408849503245e-05, + "loss": 2.1802, + "step": 5700 + }, + { + "epoch": 0.38240021430484866, + "grad_norm": 0.0, + "learning_rate": 7.761672013453418e-05, + "loss": 2.247, + "step": 5710 + }, + { + "epoch": 0.38306991695687115, + "grad_norm": 0.0, + "learning_rate": 7.751920083472153e-05, + "loss": 2.2638, + "step": 5720 + }, + { + "epoch": 0.38373961960889363, + "grad_norm": 0.0, + "learning_rate": 7.74215311285867e-05, + "loss": 2.2909, + "step": 5730 + }, + { + "epoch": 0.3844093222609162, + "grad_norm": 0.0, + "learning_rate": 7.732371154994392e-05, + "loss": 2.1795, + "step": 5740 + }, + { + "epoch": 0.38507902491293866, + "grad_norm": 0.0, + "learning_rate": 7.722574263342656e-05, + "loss": 2.2015, + "step": 5750 + }, + { + "epoch": 0.38574872756496115, + "grad_norm": 0.0, + "learning_rate": 7.71276249144842e-05, + "loss": 2.2313, + "step": 5760 + }, + { + "epoch": 0.38641843021698363, + "grad_norm": 0.0, + "learning_rate": 7.70293589293797e-05, + "loss": 2.2416, + "step": 5770 + }, + { + "epoch": 0.3870881328690062, + "grad_norm": 0.0, + "learning_rate": 7.693094521518627e-05, + "loss": 2.2684, + "step": 5780 + }, + { + "epoch": 0.38775783552102866, + "grad_norm": 0.0, + "learning_rate": 7.683238430978452e-05, + "loss": 2.237, + "step": 5790 + }, + { + "epoch": 0.38842753817305115, + "grad_norm": 0.0, + "learning_rate": 7.673367675185953e-05, + "loss": 2.053, + "step": 5800 + }, + { + "epoch": 0.3890972408250737, + "grad_norm": 0.0, + "learning_rate": 7.663482308089795e-05, + "loss": 2.228, + "step": 5810 + }, + { + "epoch": 0.3897669434770962, + "grad_norm": 0.0, + "learning_rate": 7.653582383718498e-05, + "loss": 2.1679, + "step": 5820 + }, + { + "epoch": 0.39043664612911866, + "grad_norm": 0.0, + "learning_rate": 7.643667956180146e-05, + "loss": 2.2186, + "step": 5830 + }, + { + "epoch": 0.3911063487811412, + "grad_norm": 0.0, + "learning_rate": 7.633739079662088e-05, + "loss": 2.2065, + "step": 5840 + }, + { + "epoch": 0.3917760514331637, + "grad_norm": 0.0, + "learning_rate": 7.623795808430645e-05, + "loss": 2.2311, + "step": 5850 + }, + { + "epoch": 0.39244575408518617, + "grad_norm": 0.0, + "learning_rate": 7.613838196830816e-05, + "loss": 2.222, + "step": 5860 + }, + { + "epoch": 0.39311545673720866, + "grad_norm": 0.0, + "learning_rate": 7.603866299285972e-05, + "loss": 2.2572, + "step": 5870 + }, + { + "epoch": 0.3937851593892312, + "grad_norm": 0.0, + "learning_rate": 7.593880170297564e-05, + "loss": 2.209, + "step": 5880 + }, + { + "epoch": 0.3944548620412537, + "grad_norm": 0.0, + "learning_rate": 7.583879864444832e-05, + "loss": 2.259, + "step": 5890 + }, + { + "epoch": 0.39512456469327617, + "grad_norm": 0.0, + "learning_rate": 7.573865436384491e-05, + "loss": 2.1778, + "step": 5900 + }, + { + "epoch": 0.3957942673452987, + "grad_norm": 0.0, + "learning_rate": 7.563836940850449e-05, + "loss": 2.2173, + "step": 5910 + }, + { + "epoch": 0.3964639699973212, + "grad_norm": 0.0, + "learning_rate": 7.55379443265349e-05, + "loss": 2.2584, + "step": 5920 + }, + { + "epoch": 0.3971336726493437, + "grad_norm": 0.0, + "learning_rate": 7.543737966680994e-05, + "loss": 2.2348, + "step": 5930 + }, + { + "epoch": 0.39780337530136617, + "grad_norm": 0.0, + "learning_rate": 7.533667597896623e-05, + "loss": 2.1596, + "step": 5940 + }, + { + "epoch": 0.3984730779533887, + "grad_norm": 0.0, + "learning_rate": 7.523583381340025e-05, + "loss": 2.2926, + "step": 5950 + }, + { + "epoch": 0.3991427806054112, + "grad_norm": 0.0, + "learning_rate": 7.513485372126531e-05, + "loss": 2.2573, + "step": 5960 + }, + { + "epoch": 0.3998124832574337, + "grad_norm": 0.0, + "learning_rate": 7.503373625446862e-05, + "loss": 2.2771, + "step": 5970 + }, + { + "epoch": 0.4004821859094562, + "grad_norm": 0.0, + "learning_rate": 7.493248196566816e-05, + "loss": 2.2364, + "step": 5980 + }, + { + "epoch": 0.4011518885614787, + "grad_norm": 0.0, + "learning_rate": 7.483109140826968e-05, + "loss": 2.163, + "step": 5990 + }, + { + "epoch": 0.4018215912135012, + "grad_norm": 0.0, + "learning_rate": 7.472956513642379e-05, + "loss": 2.1557, + "step": 6000 + }, + { + "epoch": 0.4024912938655237, + "grad_norm": 0.0, + "learning_rate": 7.462790370502284e-05, + "loss": 2.2514, + "step": 6010 + }, + { + "epoch": 0.4031609965175462, + "grad_norm": 0.0, + "learning_rate": 7.452610766969781e-05, + "loss": 2.1986, + "step": 6020 + }, + { + "epoch": 0.4038306991695687, + "grad_norm": 0.0, + "learning_rate": 7.442417758681542e-05, + "loss": 2.2523, + "step": 6030 + }, + { + "epoch": 0.4045004018215912, + "grad_norm": 0.0, + "learning_rate": 7.432211401347504e-05, + "loss": 2.3855, + "step": 6040 + }, + { + "epoch": 0.40517010447361373, + "grad_norm": 0.0, + "learning_rate": 7.421991750750559e-05, + "loss": 2.1914, + "step": 6050 + }, + { + "epoch": 0.4058398071256362, + "grad_norm": 0.0, + "learning_rate": 7.411758862746258e-05, + "loss": 2.122, + "step": 6060 + }, + { + "epoch": 0.4065095097776587, + "grad_norm": 0.0, + "learning_rate": 7.401512793262496e-05, + "loss": 2.1607, + "step": 6070 + }, + { + "epoch": 0.40717921242968125, + "grad_norm": 0.0, + "learning_rate": 7.391253598299217e-05, + "loss": 2.1664, + "step": 6080 + }, + { + "epoch": 0.40784891508170373, + "grad_norm": 0.0, + "learning_rate": 7.380981333928097e-05, + "loss": 2.2631, + "step": 6090 + }, + { + "epoch": 0.4085186177337262, + "grad_norm": 0.0, + "learning_rate": 7.370696056292249e-05, + "loss": 2.2071, + "step": 6100 + }, + { + "epoch": 0.4091883203857487, + "grad_norm": 0.0, + "learning_rate": 7.360397821605902e-05, + "loss": 2.2336, + "step": 6110 + }, + { + "epoch": 0.40985802303777125, + "grad_norm": 0.0, + "learning_rate": 7.350086686154111e-05, + "loss": 2.2513, + "step": 6120 + }, + { + "epoch": 0.41052772568979373, + "grad_norm": 0.0, + "learning_rate": 7.33976270629243e-05, + "loss": 2.2542, + "step": 6130 + }, + { + "epoch": 0.4111974283418162, + "grad_norm": 0.0, + "learning_rate": 7.329425938446625e-05, + "loss": 2.2058, + "step": 6140 + }, + { + "epoch": 0.41186713099383876, + "grad_norm": 0.0, + "learning_rate": 7.319076439112347e-05, + "loss": 2.2086, + "step": 6150 + }, + { + "epoch": 0.41253683364586125, + "grad_norm": 0.0, + "learning_rate": 7.308714264854833e-05, + "loss": 2.1696, + "step": 6160 + }, + { + "epoch": 0.41320653629788373, + "grad_norm": 0.0, + "learning_rate": 7.298339472308598e-05, + "loss": 2.2503, + "step": 6170 + }, + { + "epoch": 0.4138762389499062, + "grad_norm": 0.0, + "learning_rate": 7.287952118177117e-05, + "loss": 2.1965, + "step": 6180 + }, + { + "epoch": 0.41454594160192876, + "grad_norm": 0.0, + "learning_rate": 7.277552259232522e-05, + "loss": 2.1826, + "step": 6190 + }, + { + "epoch": 0.41521564425395124, + "grad_norm": 0.0, + "learning_rate": 7.267139952315295e-05, + "loss": 2.1862, + "step": 6200 + }, + { + "epoch": 0.41588534690597373, + "grad_norm": 0.0, + "learning_rate": 7.256715254333946e-05, + "loss": 2.1264, + "step": 6210 + }, + { + "epoch": 0.41655504955799627, + "grad_norm": 0.0, + "learning_rate": 7.246278222264713e-05, + "loss": 2.2027, + "step": 6220 + }, + { + "epoch": 0.41722475221001876, + "grad_norm": 0.0, + "learning_rate": 7.235828913151242e-05, + "loss": 2.2236, + "step": 6230 + }, + { + "epoch": 0.41789445486204124, + "grad_norm": 0.0, + "learning_rate": 7.225367384104282e-05, + "loss": 2.2782, + "step": 6240 + }, + { + "epoch": 0.41856415751406373, + "grad_norm": 0.0, + "learning_rate": 7.21489369230137e-05, + "loss": 2.2659, + "step": 6250 + }, + { + "epoch": 0.41923386016608627, + "grad_norm": 0.0, + "learning_rate": 7.204407894986518e-05, + "loss": 2.1987, + "step": 6260 + }, + { + "epoch": 0.41990356281810876, + "grad_norm": 0.0, + "learning_rate": 7.1939100494699e-05, + "loss": 2.2765, + "step": 6270 + }, + { + "epoch": 0.42057326547013124, + "grad_norm": 0.0, + "learning_rate": 7.183400213127543e-05, + "loss": 2.1841, + "step": 6280 + }, + { + "epoch": 0.4212429681221538, + "grad_norm": 0.0, + "learning_rate": 7.172878443401003e-05, + "loss": 2.2169, + "step": 6290 + }, + { + "epoch": 0.42191267077417627, + "grad_norm": 0.0, + "learning_rate": 7.162344797797065e-05, + "loss": 2.1275, + "step": 6300 + }, + { + "epoch": 0.42258237342619875, + "grad_norm": 0.0, + "learning_rate": 7.151799333887415e-05, + "loss": 2.1387, + "step": 6310 + }, + { + "epoch": 0.4232520760782213, + "grad_norm": 0.0, + "learning_rate": 7.141242109308343e-05, + "loss": 2.3116, + "step": 6320 + }, + { + "epoch": 0.4239217787302438, + "grad_norm": 0.0, + "learning_rate": 7.130673181760403e-05, + "loss": 2.2473, + "step": 6330 + }, + { + "epoch": 0.42459148138226627, + "grad_norm": 0.0, + "learning_rate": 7.120092609008122e-05, + "loss": 2.2514, + "step": 6340 + }, + { + "epoch": 0.42526118403428875, + "grad_norm": 0.0, + "learning_rate": 7.10950044887967e-05, + "loss": 2.2596, + "step": 6350 + }, + { + "epoch": 0.4259308866863113, + "grad_norm": 0.0, + "learning_rate": 7.098896759266547e-05, + "loss": 2.1879, + "step": 6360 + }, + { + "epoch": 0.4266005893383338, + "grad_norm": 0.0, + "learning_rate": 7.088281598123267e-05, + "loss": 2.1676, + "step": 6370 + }, + { + "epoch": 0.42727029199035627, + "grad_norm": 0.0, + "learning_rate": 7.077655023467049e-05, + "loss": 2.2725, + "step": 6380 + }, + { + "epoch": 0.4279399946423788, + "grad_norm": 0.0, + "learning_rate": 7.067017093377484e-05, + "loss": 2.1804, + "step": 6390 + }, + { + "epoch": 0.4286096972944013, + "grad_norm": 0.0, + "learning_rate": 7.056367865996226e-05, + "loss": 2.2629, + "step": 6400 + }, + { + "epoch": 0.4292793999464238, + "grad_norm": 0.0, + "learning_rate": 7.045707399526682e-05, + "loss": 2.1842, + "step": 6410 + }, + { + "epoch": 0.42994910259844626, + "grad_norm": 0.0, + "learning_rate": 7.035035752233681e-05, + "loss": 2.217, + "step": 6420 + }, + { + "epoch": 0.4306188052504688, + "grad_norm": 0.0, + "learning_rate": 7.024352982443159e-05, + "loss": 2.2961, + "step": 6430 + }, + { + "epoch": 0.4312885079024913, + "grad_norm": 0.0, + "learning_rate": 7.013659148541849e-05, + "loss": 2.2467, + "step": 6440 + }, + { + "epoch": 0.4319582105545138, + "grad_norm": 0.0, + "learning_rate": 7.002954308976948e-05, + "loss": 2.3547, + "step": 6450 + }, + { + "epoch": 0.4326279132065363, + "grad_norm": 0.0, + "learning_rate": 6.992238522255805e-05, + "loss": 2.2718, + "step": 6460 + }, + { + "epoch": 0.4332976158585588, + "grad_norm": 0.0, + "learning_rate": 6.981511846945608e-05, + "loss": 2.2483, + "step": 6470 + }, + { + "epoch": 0.4339673185105813, + "grad_norm": 0.0, + "learning_rate": 6.97077434167305e-05, + "loss": 2.2501, + "step": 6480 + }, + { + "epoch": 0.43463702116260383, + "grad_norm": 0.0, + "learning_rate": 6.960026065124013e-05, + "loss": 2.3018, + "step": 6490 + }, + { + "epoch": 0.4353067238146263, + "grad_norm": 0.0, + "learning_rate": 6.94926707604326e-05, + "loss": 2.2755, + "step": 6500 + }, + { + "epoch": 0.4359764264666488, + "grad_norm": 0.0, + "learning_rate": 6.938497433234091e-05, + "loss": 2.2152, + "step": 6510 + }, + { + "epoch": 0.4366461291186713, + "grad_norm": 0.0, + "learning_rate": 6.927717195558041e-05, + "loss": 2.2302, + "step": 6520 + }, + { + "epoch": 0.43731583177069383, + "grad_norm": 0.0, + "learning_rate": 6.916926421934553e-05, + "loss": 2.2201, + "step": 6530 + }, + { + "epoch": 0.4379855344227163, + "grad_norm": 0.0, + "learning_rate": 6.906125171340646e-05, + "loss": 2.2863, + "step": 6540 + }, + { + "epoch": 0.4386552370747388, + "grad_norm": 0.0, + "learning_rate": 6.89531350281061e-05, + "loss": 2.2234, + "step": 6550 + }, + { + "epoch": 0.43932493972676134, + "grad_norm": 0.0, + "learning_rate": 6.88449147543567e-05, + "loss": 2.1963, + "step": 6560 + }, + { + "epoch": 0.43999464237878383, + "grad_norm": 0.0, + "learning_rate": 6.873659148363667e-05, + "loss": 2.2119, + "step": 6570 + }, + { + "epoch": 0.4406643450308063, + "grad_norm": 0.0, + "learning_rate": 6.862816580798734e-05, + "loss": 2.2322, + "step": 6580 + }, + { + "epoch": 0.4413340476828288, + "grad_norm": 0.0, + "learning_rate": 6.85196383200098e-05, + "loss": 2.2327, + "step": 6590 + }, + { + "epoch": 0.44200375033485134, + "grad_norm": 0.0, + "learning_rate": 6.841100961286151e-05, + "loss": 2.2449, + "step": 6600 + }, + { + "epoch": 0.4426734529868738, + "grad_norm": 0.0, + "learning_rate": 6.830228028025319e-05, + "loss": 2.1634, + "step": 6610 + }, + { + "epoch": 0.4433431556388963, + "grad_norm": 0.0, + "learning_rate": 6.819345091644552e-05, + "loss": 2.2752, + "step": 6620 + }, + { + "epoch": 0.44401285829091885, + "grad_norm": 0.0, + "learning_rate": 6.80845221162459e-05, + "loss": 2.2577, + "step": 6630 + }, + { + "epoch": 0.44468256094294134, + "grad_norm": 0.0, + "learning_rate": 6.797549447500522e-05, + "loss": 2.2371, + "step": 6640 + } + ], + "logging_steps": 10, + "max_steps": 14932, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 1660, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.449376861654876e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}