diff --git "a/codet5+/q90useseq/checkpoint-16161/trainer_state.json" "b/codet5+/q90useseq/checkpoint-16161/trainer_state.json" new file mode 100644--- /dev/null +++ "b/codet5+/q90useseq/checkpoint-16161/trainer_state.json" @@ -0,0 +1,11340 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9998607824028958, + "eval_steps": 500, + "global_step": 16161, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 22.563289642333984, + "learning_rate": 2.5000000000000004e-07, + "loss": 8.4336, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 62.98283386230469, + "learning_rate": 2.5e-06, + "loss": 8.5697, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 54.93251419067383, + "learning_rate": 5e-06, + "loss": 7.4634, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 9.467167854309082, + "learning_rate": 7.5e-06, + "loss": 5.4256, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 5.084922790527344, + "learning_rate": 1e-05, + "loss": 4.2997, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 25.685922622680664, + "learning_rate": 1.25e-05, + "loss": 3.9332, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 5.632925987243652, + "learning_rate": 1.5e-05, + "loss": 3.7494, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 5.258584499359131, + "learning_rate": 1.75e-05, + "loss": 3.4965, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 5.852085113525391, + "learning_rate": 2e-05, + "loss": 3.5823, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 10.215030670166016, + "learning_rate": 2.25e-05, + "loss": 3.3422, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 5.968944072723389, + "learning_rate": 2.5e-05, + "loss": 3.4434, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 5.891924858093262, + "learning_rate": 2.7500000000000004e-05, + "loss": 3.3898, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 5.870503902435303, + "learning_rate": 3e-05, + "loss": 3.3386, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 12.607165336608887, + "learning_rate": 3.2500000000000004e-05, + "loss": 3.2335, + "step": 130 + }, + { + "epoch": 0.03, + "grad_norm": 5.651923179626465, + "learning_rate": 3.5e-05, + "loss": 3.216, + "step": 140 + }, + { + "epoch": 0.03, + "grad_norm": 4.639860153198242, + "learning_rate": 3.7500000000000003e-05, + "loss": 3.2893, + "step": 150 + }, + { + "epoch": 0.03, + "grad_norm": 5.180785179138184, + "learning_rate": 4e-05, + "loss": 3.1823, + "step": 160 + }, + { + "epoch": 0.03, + "grad_norm": 4.757967472076416, + "learning_rate": 4.25e-05, + "loss": 3.1802, + "step": 170 + }, + { + "epoch": 0.03, + "grad_norm": 5.358530044555664, + "learning_rate": 4.5e-05, + "loss": 3.2204, + "step": 180 + }, + { + "epoch": 0.04, + "grad_norm": 5.176595687866211, + "learning_rate": 4.75e-05, + "loss": 3.1479, + "step": 190 + }, + { + "epoch": 0.04, + "grad_norm": 5.044156551361084, + "learning_rate": 5e-05, + "loss": 3.2019, + "step": 200 + }, + { + "epoch": 0.04, + "grad_norm": 4.459234714508057, + "learning_rate": 4.996867364200238e-05, + "loss": 3.1785, + "step": 210 + }, + { + "epoch": 0.04, + "grad_norm": 4.484199047088623, + "learning_rate": 4.993734728400476e-05, + "loss": 2.9384, + "step": 220 + }, + { + "epoch": 0.04, + "grad_norm": 9.390307426452637, + "learning_rate": 4.990602092600715e-05, + "loss": 3.0402, + "step": 230 + }, + { + "epoch": 0.04, + "grad_norm": 4.658885478973389, + "learning_rate": 4.987469456800953e-05, + "loss": 3.1198, + "step": 240 + }, + { + "epoch": 0.05, + "grad_norm": 4.551351070404053, + "learning_rate": 4.98433682100119e-05, + "loss": 3.0451, + "step": 250 + }, + { + "epoch": 0.05, + "grad_norm": 5.482519149780273, + "learning_rate": 4.981204185201428e-05, + "loss": 3.165, + "step": 260 + }, + { + "epoch": 0.05, + "grad_norm": 4.959535598754883, + "learning_rate": 4.978071549401667e-05, + "loss": 3.156, + "step": 270 + }, + { + "epoch": 0.05, + "grad_norm": 4.811633586883545, + "learning_rate": 4.974938913601905e-05, + "loss": 3.0748, + "step": 280 + }, + { + "epoch": 0.05, + "grad_norm": 4.658722400665283, + "learning_rate": 4.971806277802143e-05, + "loss": 2.9398, + "step": 290 + }, + { + "epoch": 0.06, + "grad_norm": 6.098793983459473, + "learning_rate": 4.968673642002381e-05, + "loss": 2.9919, + "step": 300 + }, + { + "epoch": 0.06, + "grad_norm": 4.794303894042969, + "learning_rate": 4.965541006202619e-05, + "loss": 3.0925, + "step": 310 + }, + { + "epoch": 0.06, + "grad_norm": 4.6707611083984375, + "learning_rate": 4.9624083704028576e-05, + "loss": 3.0318, + "step": 320 + }, + { + "epoch": 0.06, + "grad_norm": 4.9659223556518555, + "learning_rate": 4.9592757346030956e-05, + "loss": 3.0405, + "step": 330 + }, + { + "epoch": 0.06, + "grad_norm": 4.642695426940918, + "learning_rate": 4.9561430988033336e-05, + "loss": 2.9802, + "step": 340 + }, + { + "epoch": 0.06, + "grad_norm": 4.881353378295898, + "learning_rate": 4.953010463003571e-05, + "loss": 2.9958, + "step": 350 + }, + { + "epoch": 0.07, + "grad_norm": 4.710669040679932, + "learning_rate": 4.9498778272038096e-05, + "loss": 2.853, + "step": 360 + }, + { + "epoch": 0.07, + "grad_norm": 3.929610013961792, + "learning_rate": 4.9467451914040476e-05, + "loss": 2.8538, + "step": 370 + }, + { + "epoch": 0.07, + "grad_norm": 4.2416839599609375, + "learning_rate": 4.9436125556042856e-05, + "loss": 2.9904, + "step": 380 + }, + { + "epoch": 0.07, + "grad_norm": 4.125613689422607, + "learning_rate": 4.9404799198045237e-05, + "loss": 3.0157, + "step": 390 + }, + { + "epoch": 0.07, + "grad_norm": 4.323487758636475, + "learning_rate": 4.9373472840047617e-05, + "loss": 3.0734, + "step": 400 + }, + { + "epoch": 0.08, + "grad_norm": 4.283641338348389, + "learning_rate": 4.934214648205e-05, + "loss": 3.0373, + "step": 410 + }, + { + "epoch": 0.08, + "grad_norm": 4.504553318023682, + "learning_rate": 4.931082012405238e-05, + "loss": 3.1041, + "step": 420 + }, + { + "epoch": 0.08, + "grad_norm": 3.953495740890503, + "learning_rate": 4.9279493766054763e-05, + "loss": 2.9039, + "step": 430 + }, + { + "epoch": 0.08, + "grad_norm": 4.179076671600342, + "learning_rate": 4.9248167408057143e-05, + "loss": 3.0008, + "step": 440 + }, + { + "epoch": 0.08, + "grad_norm": 4.270627498626709, + "learning_rate": 4.921684105005952e-05, + "loss": 2.9601, + "step": 450 + }, + { + "epoch": 0.09, + "grad_norm": 4.364182472229004, + "learning_rate": 4.9185514692061904e-05, + "loss": 2.8768, + "step": 460 + }, + { + "epoch": 0.09, + "grad_norm": 3.999711751937866, + "learning_rate": 4.9154188334064284e-05, + "loss": 3.0547, + "step": 470 + }, + { + "epoch": 0.09, + "grad_norm": 4.382208347320557, + "learning_rate": 4.9122861976066664e-05, + "loss": 2.9902, + "step": 480 + }, + { + "epoch": 0.09, + "grad_norm": 5.151576995849609, + "learning_rate": 4.9091535618069044e-05, + "loss": 2.8, + "step": 490 + }, + { + "epoch": 0.09, + "grad_norm": 4.85528039932251, + "learning_rate": 4.9060209260071424e-05, + "loss": 2.8966, + "step": 500 + }, + { + "epoch": 0.09, + "grad_norm": 3.874793767929077, + "learning_rate": 4.902888290207381e-05, + "loss": 3.0284, + "step": 510 + }, + { + "epoch": 0.1, + "grad_norm": 4.303886890411377, + "learning_rate": 4.899755654407619e-05, + "loss": 2.7964, + "step": 520 + }, + { + "epoch": 0.1, + "grad_norm": 4.484199047088623, + "learning_rate": 4.896623018607857e-05, + "loss": 2.9817, + "step": 530 + }, + { + "epoch": 0.1, + "grad_norm": 4.022371768951416, + "learning_rate": 4.893490382808095e-05, + "loss": 2.9067, + "step": 540 + }, + { + "epoch": 0.1, + "grad_norm": 3.7025790214538574, + "learning_rate": 4.890357747008333e-05, + "loss": 2.9264, + "step": 550 + }, + { + "epoch": 0.1, + "grad_norm": 4.232612133026123, + "learning_rate": 4.887225111208571e-05, + "loss": 2.8737, + "step": 560 + }, + { + "epoch": 0.11, + "grad_norm": 4.300458908081055, + "learning_rate": 4.884092475408809e-05, + "loss": 2.9234, + "step": 570 + }, + { + "epoch": 0.11, + "grad_norm": 4.4513115882873535, + "learning_rate": 4.880959839609047e-05, + "loss": 2.8182, + "step": 580 + }, + { + "epoch": 0.11, + "grad_norm": 4.555144786834717, + "learning_rate": 4.877827203809285e-05, + "loss": 2.9132, + "step": 590 + }, + { + "epoch": 0.11, + "grad_norm": 3.582031726837158, + "learning_rate": 4.874694568009524e-05, + "loss": 2.8415, + "step": 600 + }, + { + "epoch": 0.11, + "grad_norm": 4.070923328399658, + "learning_rate": 4.871561932209762e-05, + "loss": 2.9713, + "step": 610 + }, + { + "epoch": 0.12, + "grad_norm": 5.049088954925537, + "learning_rate": 4.86842929641e-05, + "loss": 3.0673, + "step": 620 + }, + { + "epoch": 0.12, + "grad_norm": 4.457193374633789, + "learning_rate": 4.865296660610238e-05, + "loss": 2.9815, + "step": 630 + }, + { + "epoch": 0.12, + "grad_norm": 4.2220258712768555, + "learning_rate": 4.862164024810476e-05, + "loss": 2.969, + "step": 640 + }, + { + "epoch": 0.12, + "grad_norm": 3.977060556411743, + "learning_rate": 4.859031389010714e-05, + "loss": 3.0564, + "step": 650 + }, + { + "epoch": 0.12, + "grad_norm": 4.071286678314209, + "learning_rate": 4.855898753210952e-05, + "loss": 2.9144, + "step": 660 + }, + { + "epoch": 0.12, + "grad_norm": 4.329357147216797, + "learning_rate": 4.85276611741119e-05, + "loss": 2.9175, + "step": 670 + }, + { + "epoch": 0.13, + "grad_norm": 3.701284646987915, + "learning_rate": 4.849633481611428e-05, + "loss": 2.911, + "step": 680 + }, + { + "epoch": 0.13, + "grad_norm": 4.07001256942749, + "learning_rate": 4.8465008458116665e-05, + "loss": 2.8067, + "step": 690 + }, + { + "epoch": 0.13, + "grad_norm": 4.72738790512085, + "learning_rate": 4.8433682100119045e-05, + "loss": 3.0779, + "step": 700 + }, + { + "epoch": 0.13, + "grad_norm": 3.782459259033203, + "learning_rate": 4.8402355742121425e-05, + "loss": 2.9219, + "step": 710 + }, + { + "epoch": 0.13, + "grad_norm": 4.410059452056885, + "learning_rate": 4.8371029384123805e-05, + "loss": 3.0018, + "step": 720 + }, + { + "epoch": 0.14, + "grad_norm": 4.083688259124756, + "learning_rate": 4.8339703026126185e-05, + "loss": 2.9102, + "step": 730 + }, + { + "epoch": 0.14, + "grad_norm": 3.7951762676239014, + "learning_rate": 4.8308376668128565e-05, + "loss": 2.8857, + "step": 740 + }, + { + "epoch": 0.14, + "grad_norm": 3.722205400466919, + "learning_rate": 4.8277050310130945e-05, + "loss": 2.8893, + "step": 750 + }, + { + "epoch": 0.14, + "grad_norm": 4.130901336669922, + "learning_rate": 4.8245723952133325e-05, + "loss": 2.943, + "step": 760 + }, + { + "epoch": 0.14, + "grad_norm": 4.320456027984619, + "learning_rate": 4.8214397594135705e-05, + "loss": 2.8967, + "step": 770 + }, + { + "epoch": 0.14, + "grad_norm": 4.467789649963379, + "learning_rate": 4.818307123613809e-05, + "loss": 2.721, + "step": 780 + }, + { + "epoch": 0.15, + "grad_norm": 3.661444902420044, + "learning_rate": 4.815174487814047e-05, + "loss": 2.8507, + "step": 790 + }, + { + "epoch": 0.15, + "grad_norm": 4.432613372802734, + "learning_rate": 4.812041852014285e-05, + "loss": 2.818, + "step": 800 + }, + { + "epoch": 0.15, + "grad_norm": 3.8605380058288574, + "learning_rate": 4.808909216214523e-05, + "loss": 2.8162, + "step": 810 + }, + { + "epoch": 0.15, + "grad_norm": 4.229771137237549, + "learning_rate": 4.805776580414761e-05, + "loss": 2.9675, + "step": 820 + }, + { + "epoch": 0.15, + "grad_norm": 4.812516212463379, + "learning_rate": 4.802643944615e-05, + "loss": 2.8635, + "step": 830 + }, + { + "epoch": 0.16, + "grad_norm": 4.7779951095581055, + "learning_rate": 4.799511308815237e-05, + "loss": 2.7532, + "step": 840 + }, + { + "epoch": 0.16, + "grad_norm": 3.997580051422119, + "learning_rate": 4.796378673015475e-05, + "loss": 2.8165, + "step": 850 + }, + { + "epoch": 0.16, + "grad_norm": 4.363816738128662, + "learning_rate": 4.793246037215713e-05, + "loss": 2.7508, + "step": 860 + }, + { + "epoch": 0.16, + "grad_norm": 3.727748155593872, + "learning_rate": 4.790113401415952e-05, + "loss": 2.9275, + "step": 870 + }, + { + "epoch": 0.16, + "grad_norm": 3.7258036136627197, + "learning_rate": 4.78698076561619e-05, + "loss": 2.7293, + "step": 880 + }, + { + "epoch": 0.17, + "grad_norm": 4.236034393310547, + "learning_rate": 4.783848129816428e-05, + "loss": 2.7062, + "step": 890 + }, + { + "epoch": 0.17, + "grad_norm": 3.967883348464966, + "learning_rate": 4.780715494016666e-05, + "loss": 2.8896, + "step": 900 + }, + { + "epoch": 0.17, + "grad_norm": 3.3881027698516846, + "learning_rate": 4.777582858216904e-05, + "loss": 2.8254, + "step": 910 + }, + { + "epoch": 0.17, + "grad_norm": 4.047328472137451, + "learning_rate": 4.7744502224171426e-05, + "loss": 2.8455, + "step": 920 + }, + { + "epoch": 0.17, + "grad_norm": 3.7306952476501465, + "learning_rate": 4.77131758661738e-05, + "loss": 2.6811, + "step": 930 + }, + { + "epoch": 0.17, + "grad_norm": 4.450077056884766, + "learning_rate": 4.768184950817618e-05, + "loss": 2.7895, + "step": 940 + }, + { + "epoch": 0.18, + "grad_norm": 4.613541603088379, + "learning_rate": 4.765052315017856e-05, + "loss": 2.8591, + "step": 950 + }, + { + "epoch": 0.18, + "grad_norm": 3.935161828994751, + "learning_rate": 4.7619196792180946e-05, + "loss": 2.8168, + "step": 960 + }, + { + "epoch": 0.18, + "grad_norm": 4.0404486656188965, + "learning_rate": 4.7587870434183326e-05, + "loss": 2.9274, + "step": 970 + }, + { + "epoch": 0.18, + "grad_norm": 3.5758628845214844, + "learning_rate": 4.7556544076185706e-05, + "loss": 2.8127, + "step": 980 + }, + { + "epoch": 0.18, + "grad_norm": 3.76188588142395, + "learning_rate": 4.7525217718188086e-05, + "loss": 2.8306, + "step": 990 + }, + { + "epoch": 0.19, + "grad_norm": 3.9346506595611572, + "learning_rate": 4.7493891360190466e-05, + "loss": 2.8627, + "step": 1000 + }, + { + "epoch": 0.19, + "grad_norm": 3.9877500534057617, + "learning_rate": 4.7462565002192846e-05, + "loss": 2.7633, + "step": 1010 + }, + { + "epoch": 0.19, + "grad_norm": 3.7940495014190674, + "learning_rate": 4.743123864419523e-05, + "loss": 2.9002, + "step": 1020 + }, + { + "epoch": 0.19, + "grad_norm": 3.6916394233703613, + "learning_rate": 4.7399912286197606e-05, + "loss": 2.7947, + "step": 1030 + }, + { + "epoch": 0.19, + "grad_norm": 3.910442352294922, + "learning_rate": 4.7368585928199986e-05, + "loss": 2.7819, + "step": 1040 + }, + { + "epoch": 0.19, + "grad_norm": 3.798110246658325, + "learning_rate": 4.7337259570202366e-05, + "loss": 2.781, + "step": 1050 + }, + { + "epoch": 0.2, + "grad_norm": 3.843815803527832, + "learning_rate": 4.730593321220475e-05, + "loss": 2.8417, + "step": 1060 + }, + { + "epoch": 0.2, + "grad_norm": 4.2749738693237305, + "learning_rate": 4.727460685420713e-05, + "loss": 2.8834, + "step": 1070 + }, + { + "epoch": 0.2, + "grad_norm": 4.073596477508545, + "learning_rate": 4.724328049620951e-05, + "loss": 2.7237, + "step": 1080 + }, + { + "epoch": 0.2, + "grad_norm": 3.9949352741241455, + "learning_rate": 4.721195413821189e-05, + "loss": 2.8605, + "step": 1090 + }, + { + "epoch": 0.2, + "grad_norm": 3.504049062728882, + "learning_rate": 4.718062778021427e-05, + "loss": 2.7565, + "step": 1100 + }, + { + "epoch": 0.21, + "grad_norm": 4.345983982086182, + "learning_rate": 4.714930142221666e-05, + "loss": 2.8596, + "step": 1110 + }, + { + "epoch": 0.21, + "grad_norm": 4.165377616882324, + "learning_rate": 4.711797506421904e-05, + "loss": 2.8525, + "step": 1120 + }, + { + "epoch": 0.21, + "grad_norm": 3.8506295680999756, + "learning_rate": 4.708664870622141e-05, + "loss": 2.8325, + "step": 1130 + }, + { + "epoch": 0.21, + "grad_norm": 3.528014659881592, + "learning_rate": 4.705532234822379e-05, + "loss": 2.7329, + "step": 1140 + }, + { + "epoch": 0.21, + "grad_norm": 4.269871234893799, + "learning_rate": 4.702399599022618e-05, + "loss": 2.7769, + "step": 1150 + }, + { + "epoch": 0.22, + "grad_norm": 3.8873980045318604, + "learning_rate": 4.699266963222856e-05, + "loss": 2.8734, + "step": 1160 + }, + { + "epoch": 0.22, + "grad_norm": 3.6617813110351562, + "learning_rate": 4.696134327423094e-05, + "loss": 2.9458, + "step": 1170 + }, + { + "epoch": 0.22, + "grad_norm": 3.9084973335266113, + "learning_rate": 4.693001691623332e-05, + "loss": 2.9346, + "step": 1180 + }, + { + "epoch": 0.22, + "grad_norm": 3.360800266265869, + "learning_rate": 4.68986905582357e-05, + "loss": 2.8122, + "step": 1190 + }, + { + "epoch": 0.22, + "grad_norm": 3.2493598461151123, + "learning_rate": 4.686736420023809e-05, + "loss": 2.7762, + "step": 1200 + }, + { + "epoch": 0.22, + "grad_norm": 3.779627561569214, + "learning_rate": 4.683603784224047e-05, + "loss": 2.9664, + "step": 1210 + }, + { + "epoch": 0.23, + "grad_norm": 3.850245475769043, + "learning_rate": 4.680471148424285e-05, + "loss": 2.9419, + "step": 1220 + }, + { + "epoch": 0.23, + "grad_norm": 3.702455759048462, + "learning_rate": 4.677338512624522e-05, + "loss": 2.7791, + "step": 1230 + }, + { + "epoch": 0.23, + "grad_norm": 4.315182685852051, + "learning_rate": 4.674205876824761e-05, + "loss": 2.9742, + "step": 1240 + }, + { + "epoch": 0.23, + "grad_norm": 3.8250505924224854, + "learning_rate": 4.671073241024999e-05, + "loss": 2.6065, + "step": 1250 + }, + { + "epoch": 0.23, + "grad_norm": 3.961651563644409, + "learning_rate": 4.667940605225237e-05, + "loss": 2.7905, + "step": 1260 + }, + { + "epoch": 0.24, + "grad_norm": 3.800668239593506, + "learning_rate": 4.664807969425475e-05, + "loss": 2.6825, + "step": 1270 + }, + { + "epoch": 0.24, + "grad_norm": 3.948024272918701, + "learning_rate": 4.661675333625713e-05, + "loss": 2.6316, + "step": 1280 + }, + { + "epoch": 0.24, + "grad_norm": 3.6402764320373535, + "learning_rate": 4.6585426978259514e-05, + "loss": 2.8085, + "step": 1290 + }, + { + "epoch": 0.24, + "grad_norm": 4.029736042022705, + "learning_rate": 4.6554100620261894e-05, + "loss": 2.6869, + "step": 1300 + }, + { + "epoch": 0.24, + "grad_norm": 3.9908854961395264, + "learning_rate": 4.6522774262264274e-05, + "loss": 2.7812, + "step": 1310 + }, + { + "epoch": 0.25, + "grad_norm": 3.9148330688476562, + "learning_rate": 4.649144790426665e-05, + "loss": 2.8214, + "step": 1320 + }, + { + "epoch": 0.25, + "grad_norm": 3.979130744934082, + "learning_rate": 4.6460121546269034e-05, + "loss": 2.9188, + "step": 1330 + }, + { + "epoch": 0.25, + "grad_norm": 4.001101970672607, + "learning_rate": 4.6428795188271414e-05, + "loss": 2.7542, + "step": 1340 + }, + { + "epoch": 0.25, + "grad_norm": 3.8721792697906494, + "learning_rate": 4.6397468830273794e-05, + "loss": 2.8949, + "step": 1350 + }, + { + "epoch": 0.25, + "grad_norm": 3.9208767414093018, + "learning_rate": 4.6366142472276174e-05, + "loss": 2.6644, + "step": 1360 + }, + { + "epoch": 0.25, + "grad_norm": 3.7345130443573, + "learning_rate": 4.6334816114278554e-05, + "loss": 2.7879, + "step": 1370 + }, + { + "epoch": 0.26, + "grad_norm": 4.162068843841553, + "learning_rate": 4.630348975628094e-05, + "loss": 2.8323, + "step": 1380 + }, + { + "epoch": 0.26, + "grad_norm": 4.239302635192871, + "learning_rate": 4.627216339828332e-05, + "loss": 2.8012, + "step": 1390 + }, + { + "epoch": 0.26, + "grad_norm": 4.081932544708252, + "learning_rate": 4.62408370402857e-05, + "loss": 2.7515, + "step": 1400 + }, + { + "epoch": 0.26, + "grad_norm": 3.6898136138916016, + "learning_rate": 4.620951068228808e-05, + "loss": 2.7073, + "step": 1410 + }, + { + "epoch": 0.26, + "grad_norm": 3.468146562576294, + "learning_rate": 4.617818432429046e-05, + "loss": 2.9264, + "step": 1420 + }, + { + "epoch": 0.27, + "grad_norm": 3.6380133628845215, + "learning_rate": 4.614685796629284e-05, + "loss": 2.8197, + "step": 1430 + }, + { + "epoch": 0.27, + "grad_norm": 4.102017402648926, + "learning_rate": 4.611553160829522e-05, + "loss": 2.7289, + "step": 1440 + }, + { + "epoch": 0.27, + "grad_norm": 3.9504294395446777, + "learning_rate": 4.60842052502976e-05, + "loss": 2.9459, + "step": 1450 + }, + { + "epoch": 0.27, + "grad_norm": 4.055880069732666, + "learning_rate": 4.605287889229998e-05, + "loss": 2.776, + "step": 1460 + }, + { + "epoch": 0.27, + "grad_norm": 3.9048073291778564, + "learning_rate": 4.602155253430237e-05, + "loss": 2.7488, + "step": 1470 + }, + { + "epoch": 0.27, + "grad_norm": 3.5023839473724365, + "learning_rate": 4.599022617630475e-05, + "loss": 2.7139, + "step": 1480 + }, + { + "epoch": 0.28, + "grad_norm": 3.604358434677124, + "learning_rate": 4.595889981830713e-05, + "loss": 2.7338, + "step": 1490 + }, + { + "epoch": 0.28, + "grad_norm": 3.7570292949676514, + "learning_rate": 4.592757346030951e-05, + "loss": 2.7606, + "step": 1500 + }, + { + "epoch": 0.28, + "grad_norm": 3.767991065979004, + "learning_rate": 4.589624710231189e-05, + "loss": 2.8058, + "step": 1510 + }, + { + "epoch": 0.28, + "grad_norm": 4.012033939361572, + "learning_rate": 4.586492074431427e-05, + "loss": 2.8896, + "step": 1520 + }, + { + "epoch": 0.28, + "grad_norm": 3.8234939575195312, + "learning_rate": 4.583359438631665e-05, + "loss": 2.7363, + "step": 1530 + }, + { + "epoch": 0.29, + "grad_norm": 3.762657880783081, + "learning_rate": 4.580226802831903e-05, + "loss": 2.9173, + "step": 1540 + }, + { + "epoch": 0.29, + "grad_norm": 3.2961838245391846, + "learning_rate": 4.577094167032141e-05, + "loss": 2.7286, + "step": 1550 + }, + { + "epoch": 0.29, + "grad_norm": 3.9149203300476074, + "learning_rate": 4.573961531232379e-05, + "loss": 2.9862, + "step": 1560 + }, + { + "epoch": 0.29, + "grad_norm": 4.118852138519287, + "learning_rate": 4.5708288954326175e-05, + "loss": 2.8026, + "step": 1570 + }, + { + "epoch": 0.29, + "grad_norm": 3.7272684574127197, + "learning_rate": 4.5676962596328555e-05, + "loss": 2.8653, + "step": 1580 + }, + { + "epoch": 0.3, + "grad_norm": 3.5148510932922363, + "learning_rate": 4.5645636238330935e-05, + "loss": 2.8848, + "step": 1590 + }, + { + "epoch": 0.3, + "grad_norm": 3.6071255207061768, + "learning_rate": 4.5614309880333315e-05, + "loss": 2.8441, + "step": 1600 + }, + { + "epoch": 0.3, + "grad_norm": 3.5002853870391846, + "learning_rate": 4.5582983522335695e-05, + "loss": 2.8536, + "step": 1610 + }, + { + "epoch": 0.3, + "grad_norm": 3.4483225345611572, + "learning_rate": 4.5551657164338075e-05, + "loss": 2.6594, + "step": 1620 + }, + { + "epoch": 0.3, + "grad_norm": 3.165606737136841, + "learning_rate": 4.5520330806340455e-05, + "loss": 2.6541, + "step": 1630 + }, + { + "epoch": 0.3, + "grad_norm": 3.3406383991241455, + "learning_rate": 4.5489004448342835e-05, + "loss": 2.8958, + "step": 1640 + }, + { + "epoch": 0.31, + "grad_norm": 3.9873170852661133, + "learning_rate": 4.5457678090345215e-05, + "loss": 2.7329, + "step": 1650 + }, + { + "epoch": 0.31, + "grad_norm": 4.1082024574279785, + "learning_rate": 4.54263517323476e-05, + "loss": 2.6916, + "step": 1660 + }, + { + "epoch": 0.31, + "grad_norm": 3.557987689971924, + "learning_rate": 4.539502537434998e-05, + "loss": 2.8576, + "step": 1670 + }, + { + "epoch": 0.31, + "grad_norm": 3.5951321125030518, + "learning_rate": 4.536369901635236e-05, + "loss": 2.9174, + "step": 1680 + }, + { + "epoch": 0.31, + "grad_norm": 4.360711097717285, + "learning_rate": 4.533237265835474e-05, + "loss": 2.677, + "step": 1690 + }, + { + "epoch": 0.32, + "grad_norm": 3.6791539192199707, + "learning_rate": 4.530104630035712e-05, + "loss": 2.8051, + "step": 1700 + }, + { + "epoch": 0.32, + "grad_norm": 3.4669108390808105, + "learning_rate": 4.52697199423595e-05, + "loss": 2.751, + "step": 1710 + }, + { + "epoch": 0.32, + "grad_norm": 3.8815035820007324, + "learning_rate": 4.523839358436188e-05, + "loss": 2.832, + "step": 1720 + }, + { + "epoch": 0.32, + "grad_norm": 4.085338115692139, + "learning_rate": 4.520706722636426e-05, + "loss": 2.8625, + "step": 1730 + }, + { + "epoch": 0.32, + "grad_norm": 3.5768237113952637, + "learning_rate": 4.517574086836664e-05, + "loss": 2.8064, + "step": 1740 + }, + { + "epoch": 0.32, + "grad_norm": 3.7768394947052, + "learning_rate": 4.514441451036903e-05, + "loss": 2.8454, + "step": 1750 + }, + { + "epoch": 0.33, + "grad_norm": 3.4390206336975098, + "learning_rate": 4.511308815237141e-05, + "loss": 2.8693, + "step": 1760 + }, + { + "epoch": 0.33, + "grad_norm": 3.720719575881958, + "learning_rate": 4.508176179437379e-05, + "loss": 2.7908, + "step": 1770 + }, + { + "epoch": 0.33, + "grad_norm": 3.6335737705230713, + "learning_rate": 4.505043543637617e-05, + "loss": 2.7125, + "step": 1780 + }, + { + "epoch": 0.33, + "grad_norm": 3.9518234729766846, + "learning_rate": 4.501910907837855e-05, + "loss": 2.6874, + "step": 1790 + }, + { + "epoch": 0.33, + "grad_norm": 4.64328670501709, + "learning_rate": 4.4987782720380936e-05, + "loss": 2.814, + "step": 1800 + }, + { + "epoch": 0.34, + "grad_norm": 4.018370151519775, + "learning_rate": 4.495645636238331e-05, + "loss": 2.8589, + "step": 1810 + }, + { + "epoch": 0.34, + "grad_norm": 3.9419972896575928, + "learning_rate": 4.492513000438569e-05, + "loss": 2.8111, + "step": 1820 + }, + { + "epoch": 0.34, + "grad_norm": 3.3817763328552246, + "learning_rate": 4.489380364638807e-05, + "loss": 2.6824, + "step": 1830 + }, + { + "epoch": 0.34, + "grad_norm": 3.3612987995147705, + "learning_rate": 4.4862477288390456e-05, + "loss": 2.6577, + "step": 1840 + }, + { + "epoch": 0.34, + "grad_norm": 3.4325666427612305, + "learning_rate": 4.4831150930392836e-05, + "loss": 2.7635, + "step": 1850 + }, + { + "epoch": 0.35, + "grad_norm": 3.85490345954895, + "learning_rate": 4.4799824572395216e-05, + "loss": 2.5752, + "step": 1860 + }, + { + "epoch": 0.35, + "grad_norm": 4.271368026733398, + "learning_rate": 4.4768498214397596e-05, + "loss": 2.9514, + "step": 1870 + }, + { + "epoch": 0.35, + "grad_norm": 3.287911891937256, + "learning_rate": 4.4737171856399976e-05, + "loss": 2.7836, + "step": 1880 + }, + { + "epoch": 0.35, + "grad_norm": 4.003190517425537, + "learning_rate": 4.470584549840236e-05, + "loss": 2.845, + "step": 1890 + }, + { + "epoch": 0.35, + "grad_norm": 4.682821750640869, + "learning_rate": 4.467451914040474e-05, + "loss": 2.8079, + "step": 1900 + }, + { + "epoch": 0.35, + "grad_norm": 3.5387868881225586, + "learning_rate": 4.4643192782407116e-05, + "loss": 2.7532, + "step": 1910 + }, + { + "epoch": 0.36, + "grad_norm": 3.777780055999756, + "learning_rate": 4.4611866424409496e-05, + "loss": 2.699, + "step": 1920 + }, + { + "epoch": 0.36, + "grad_norm": 3.1882541179656982, + "learning_rate": 4.458054006641188e-05, + "loss": 2.8145, + "step": 1930 + }, + { + "epoch": 0.36, + "grad_norm": 3.4730749130249023, + "learning_rate": 4.454921370841426e-05, + "loss": 2.7746, + "step": 1940 + }, + { + "epoch": 0.36, + "grad_norm": 3.5697054862976074, + "learning_rate": 4.451788735041664e-05, + "loss": 2.721, + "step": 1950 + }, + { + "epoch": 0.36, + "grad_norm": 3.4670157432556152, + "learning_rate": 4.448656099241902e-05, + "loss": 2.7096, + "step": 1960 + }, + { + "epoch": 0.37, + "grad_norm": 3.8159096240997314, + "learning_rate": 4.44552346344214e-05, + "loss": 2.7708, + "step": 1970 + }, + { + "epoch": 0.37, + "grad_norm": 3.4364335536956787, + "learning_rate": 4.442390827642379e-05, + "loss": 2.7625, + "step": 1980 + }, + { + "epoch": 0.37, + "grad_norm": 3.3506743907928467, + "learning_rate": 4.439258191842617e-05, + "loss": 2.7489, + "step": 1990 + }, + { + "epoch": 0.37, + "grad_norm": 4.412740230560303, + "learning_rate": 4.436125556042854e-05, + "loss": 2.7105, + "step": 2000 + }, + { + "epoch": 0.37, + "grad_norm": 3.519390821456909, + "learning_rate": 4.432992920243092e-05, + "loss": 2.7684, + "step": 2010 + }, + { + "epoch": 0.37, + "grad_norm": 3.9247891902923584, + "learning_rate": 4.429860284443331e-05, + "loss": 2.7872, + "step": 2020 + }, + { + "epoch": 0.38, + "grad_norm": 4.020471572875977, + "learning_rate": 4.426727648643569e-05, + "loss": 2.8781, + "step": 2030 + }, + { + "epoch": 0.38, + "grad_norm": 3.6361448764801025, + "learning_rate": 4.423595012843807e-05, + "loss": 2.8656, + "step": 2040 + }, + { + "epoch": 0.38, + "grad_norm": 3.7794973850250244, + "learning_rate": 4.420462377044045e-05, + "loss": 2.7562, + "step": 2050 + }, + { + "epoch": 0.38, + "grad_norm": 3.0659725666046143, + "learning_rate": 4.417329741244283e-05, + "loss": 2.7344, + "step": 2060 + }, + { + "epoch": 0.38, + "grad_norm": 4.266765594482422, + "learning_rate": 4.414197105444521e-05, + "loss": 2.6772, + "step": 2070 + }, + { + "epoch": 0.39, + "grad_norm": 3.694528102874756, + "learning_rate": 4.41106446964476e-05, + "loss": 2.8256, + "step": 2080 + }, + { + "epoch": 0.39, + "grad_norm": 3.6741867065429688, + "learning_rate": 4.407931833844998e-05, + "loss": 2.6166, + "step": 2090 + }, + { + "epoch": 0.39, + "grad_norm": 3.6495282649993896, + "learning_rate": 4.404799198045235e-05, + "loss": 2.8, + "step": 2100 + }, + { + "epoch": 0.39, + "grad_norm": 3.7345871925354004, + "learning_rate": 4.401666562245473e-05, + "loss": 2.6533, + "step": 2110 + }, + { + "epoch": 0.39, + "grad_norm": 3.817913770675659, + "learning_rate": 4.398533926445712e-05, + "loss": 2.8853, + "step": 2120 + }, + { + "epoch": 0.4, + "grad_norm": 4.097714424133301, + "learning_rate": 4.39540129064595e-05, + "loss": 2.7138, + "step": 2130 + }, + { + "epoch": 0.4, + "grad_norm": 3.885183095932007, + "learning_rate": 4.392268654846188e-05, + "loss": 2.6833, + "step": 2140 + }, + { + "epoch": 0.4, + "grad_norm": 3.780548095703125, + "learning_rate": 4.389136019046426e-05, + "loss": 2.5998, + "step": 2150 + }, + { + "epoch": 0.4, + "grad_norm": 3.925668716430664, + "learning_rate": 4.386003383246664e-05, + "loss": 2.8803, + "step": 2160 + }, + { + "epoch": 0.4, + "grad_norm": 4.053170680999756, + "learning_rate": 4.3828707474469024e-05, + "loss": 2.722, + "step": 2170 + }, + { + "epoch": 0.4, + "grad_norm": 3.3865647315979004, + "learning_rate": 4.3797381116471404e-05, + "loss": 2.7116, + "step": 2180 + }, + { + "epoch": 0.41, + "grad_norm": 3.8181633949279785, + "learning_rate": 4.3766054758473784e-05, + "loss": 2.7195, + "step": 2190 + }, + { + "epoch": 0.41, + "grad_norm": 3.9811198711395264, + "learning_rate": 4.373472840047616e-05, + "loss": 2.7277, + "step": 2200 + }, + { + "epoch": 0.41, + "grad_norm": 4.141151428222656, + "learning_rate": 4.3703402042478544e-05, + "loss": 2.7285, + "step": 2210 + }, + { + "epoch": 0.41, + "grad_norm": 3.5377368927001953, + "learning_rate": 4.3672075684480924e-05, + "loss": 2.8002, + "step": 2220 + }, + { + "epoch": 0.41, + "grad_norm": 3.696154832839966, + "learning_rate": 4.3640749326483304e-05, + "loss": 2.6738, + "step": 2230 + }, + { + "epoch": 0.42, + "grad_norm": 3.39733624458313, + "learning_rate": 4.3609422968485684e-05, + "loss": 2.7906, + "step": 2240 + }, + { + "epoch": 0.42, + "grad_norm": 3.4818873405456543, + "learning_rate": 4.3578096610488064e-05, + "loss": 2.754, + "step": 2250 + }, + { + "epoch": 0.42, + "grad_norm": 3.6994543075561523, + "learning_rate": 4.354677025249045e-05, + "loss": 2.7822, + "step": 2260 + }, + { + "epoch": 0.42, + "grad_norm": 3.5926380157470703, + "learning_rate": 4.351544389449283e-05, + "loss": 2.8609, + "step": 2270 + }, + { + "epoch": 0.42, + "grad_norm": 3.7803244590759277, + "learning_rate": 4.348411753649521e-05, + "loss": 2.691, + "step": 2280 + }, + { + "epoch": 0.43, + "grad_norm": 3.8215737342834473, + "learning_rate": 4.3452791178497584e-05, + "loss": 2.793, + "step": 2290 + }, + { + "epoch": 0.43, + "grad_norm": 3.6718356609344482, + "learning_rate": 4.342146482049997e-05, + "loss": 2.7768, + "step": 2300 + }, + { + "epoch": 0.43, + "grad_norm": 3.3875210285186768, + "learning_rate": 4.339013846250235e-05, + "loss": 2.8338, + "step": 2310 + }, + { + "epoch": 0.43, + "grad_norm": 3.8644275665283203, + "learning_rate": 4.335881210450473e-05, + "loss": 2.6768, + "step": 2320 + }, + { + "epoch": 0.43, + "grad_norm": 3.872359037399292, + "learning_rate": 4.332748574650711e-05, + "loss": 2.7054, + "step": 2330 + }, + { + "epoch": 0.43, + "grad_norm": 3.703859567642212, + "learning_rate": 4.329615938850949e-05, + "loss": 2.7998, + "step": 2340 + }, + { + "epoch": 0.44, + "grad_norm": 3.350034236907959, + "learning_rate": 4.326483303051188e-05, + "loss": 2.7704, + "step": 2350 + }, + { + "epoch": 0.44, + "grad_norm": 4.220928192138672, + "learning_rate": 4.323350667251426e-05, + "loss": 2.8345, + "step": 2360 + }, + { + "epoch": 0.44, + "grad_norm": 3.6803479194641113, + "learning_rate": 4.320218031451664e-05, + "loss": 2.8015, + "step": 2370 + }, + { + "epoch": 0.44, + "grad_norm": 3.12473464012146, + "learning_rate": 4.317085395651902e-05, + "loss": 2.6426, + "step": 2380 + }, + { + "epoch": 0.44, + "grad_norm": 3.914316177368164, + "learning_rate": 4.31395275985214e-05, + "loss": 2.784, + "step": 2390 + }, + { + "epoch": 0.45, + "grad_norm": 3.1893820762634277, + "learning_rate": 4.310820124052378e-05, + "loss": 2.6237, + "step": 2400 + }, + { + "epoch": 0.45, + "grad_norm": 3.594568967819214, + "learning_rate": 4.307687488252616e-05, + "loss": 2.6872, + "step": 2410 + }, + { + "epoch": 0.45, + "grad_norm": 3.504798412322998, + "learning_rate": 4.304554852452854e-05, + "loss": 2.6686, + "step": 2420 + }, + { + "epoch": 0.45, + "grad_norm": 3.518606662750244, + "learning_rate": 4.301422216653092e-05, + "loss": 2.7731, + "step": 2430 + }, + { + "epoch": 0.45, + "grad_norm": 3.7136247158050537, + "learning_rate": 4.2982895808533305e-05, + "loss": 2.7606, + "step": 2440 + }, + { + "epoch": 0.45, + "grad_norm": 3.783796787261963, + "learning_rate": 4.2951569450535685e-05, + "loss": 2.722, + "step": 2450 + }, + { + "epoch": 0.46, + "grad_norm": 3.313835382461548, + "learning_rate": 4.2920243092538065e-05, + "loss": 2.7654, + "step": 2460 + }, + { + "epoch": 0.46, + "grad_norm": 3.9201815128326416, + "learning_rate": 4.2888916734540445e-05, + "loss": 2.7922, + "step": 2470 + }, + { + "epoch": 0.46, + "grad_norm": 4.019741058349609, + "learning_rate": 4.2857590376542825e-05, + "loss": 2.7475, + "step": 2480 + }, + { + "epoch": 0.46, + "grad_norm": 3.6343600749969482, + "learning_rate": 4.2826264018545205e-05, + "loss": 2.7395, + "step": 2490 + }, + { + "epoch": 0.46, + "grad_norm": 3.6151719093322754, + "learning_rate": 4.2794937660547585e-05, + "loss": 2.6704, + "step": 2500 + }, + { + "epoch": 0.47, + "grad_norm": 3.841431140899658, + "learning_rate": 4.2763611302549965e-05, + "loss": 2.7389, + "step": 2510 + }, + { + "epoch": 0.47, + "grad_norm": 3.631673574447632, + "learning_rate": 4.2732284944552346e-05, + "loss": 2.7652, + "step": 2520 + }, + { + "epoch": 0.47, + "grad_norm": 3.611374616622925, + "learning_rate": 4.270095858655473e-05, + "loss": 2.8215, + "step": 2530 + }, + { + "epoch": 0.47, + "grad_norm": 3.66530179977417, + "learning_rate": 4.266963222855711e-05, + "loss": 2.8931, + "step": 2540 + }, + { + "epoch": 0.47, + "grad_norm": 3.8743066787719727, + "learning_rate": 4.263830587055949e-05, + "loss": 2.6281, + "step": 2550 + }, + { + "epoch": 0.48, + "grad_norm": 3.602386951446533, + "learning_rate": 4.260697951256187e-05, + "loss": 2.5779, + "step": 2560 + }, + { + "epoch": 0.48, + "grad_norm": 3.621814489364624, + "learning_rate": 4.257565315456425e-05, + "loss": 2.7075, + "step": 2570 + }, + { + "epoch": 0.48, + "grad_norm": 3.740424871444702, + "learning_rate": 4.254432679656664e-05, + "loss": 2.8009, + "step": 2580 + }, + { + "epoch": 0.48, + "grad_norm": 3.92523193359375, + "learning_rate": 4.251300043856901e-05, + "loss": 2.7026, + "step": 2590 + }, + { + "epoch": 0.48, + "grad_norm": 3.489698886871338, + "learning_rate": 4.248167408057139e-05, + "loss": 2.6533, + "step": 2600 + }, + { + "epoch": 0.48, + "grad_norm": 3.64864444732666, + "learning_rate": 4.245034772257377e-05, + "loss": 2.836, + "step": 2610 + }, + { + "epoch": 0.49, + "grad_norm": 3.7577192783355713, + "learning_rate": 4.241902136457615e-05, + "loss": 2.6508, + "step": 2620 + }, + { + "epoch": 0.49, + "grad_norm": 3.5754554271698, + "learning_rate": 4.238769500657854e-05, + "loss": 2.6652, + "step": 2630 + }, + { + "epoch": 0.49, + "grad_norm": 3.6247026920318604, + "learning_rate": 4.235636864858092e-05, + "loss": 2.7284, + "step": 2640 + }, + { + "epoch": 0.49, + "grad_norm": 3.587557315826416, + "learning_rate": 4.23250422905833e-05, + "loss": 2.8586, + "step": 2650 + }, + { + "epoch": 0.49, + "grad_norm": 3.577183723449707, + "learning_rate": 4.229371593258568e-05, + "loss": 2.7014, + "step": 2660 + }, + { + "epoch": 0.5, + "grad_norm": 3.8624074459075928, + "learning_rate": 4.226238957458806e-05, + "loss": 2.7655, + "step": 2670 + }, + { + "epoch": 0.5, + "grad_norm": 3.9416675567626953, + "learning_rate": 4.223106321659044e-05, + "loss": 2.6485, + "step": 2680 + }, + { + "epoch": 0.5, + "grad_norm": 3.667386293411255, + "learning_rate": 4.219973685859282e-05, + "loss": 2.6066, + "step": 2690 + }, + { + "epoch": 0.5, + "grad_norm": 3.4557063579559326, + "learning_rate": 4.21684105005952e-05, + "loss": 2.7029, + "step": 2700 + }, + { + "epoch": 0.5, + "grad_norm": 3.884753704071045, + "learning_rate": 4.213708414259758e-05, + "loss": 2.657, + "step": 2710 + }, + { + "epoch": 0.5, + "grad_norm": 3.569491147994995, + "learning_rate": 4.2105757784599966e-05, + "loss": 2.7833, + "step": 2720 + }, + { + "epoch": 0.51, + "grad_norm": 3.7412760257720947, + "learning_rate": 4.2074431426602346e-05, + "loss": 2.7846, + "step": 2730 + }, + { + "epoch": 0.51, + "grad_norm": 3.8474833965301514, + "learning_rate": 4.2043105068604726e-05, + "loss": 2.7825, + "step": 2740 + }, + { + "epoch": 0.51, + "grad_norm": 3.9280788898468018, + "learning_rate": 4.2011778710607107e-05, + "loss": 2.8284, + "step": 2750 + }, + { + "epoch": 0.51, + "grad_norm": 4.080498695373535, + "learning_rate": 4.1980452352609487e-05, + "loss": 2.7753, + "step": 2760 + }, + { + "epoch": 0.51, + "grad_norm": 3.6435611248016357, + "learning_rate": 4.194912599461187e-05, + "loss": 2.6859, + "step": 2770 + }, + { + "epoch": 0.52, + "grad_norm": 3.6379106044769287, + "learning_rate": 4.191779963661425e-05, + "loss": 2.6579, + "step": 2780 + }, + { + "epoch": 0.52, + "grad_norm": 3.765613079071045, + "learning_rate": 4.188647327861663e-05, + "loss": 2.6523, + "step": 2790 + }, + { + "epoch": 0.52, + "grad_norm": 4.204902648925781, + "learning_rate": 4.185514692061901e-05, + "loss": 2.5839, + "step": 2800 + }, + { + "epoch": 0.52, + "grad_norm": 3.621143102645874, + "learning_rate": 4.1823820562621393e-05, + "loss": 2.638, + "step": 2810 + }, + { + "epoch": 0.52, + "grad_norm": 4.0735650062561035, + "learning_rate": 4.1792494204623774e-05, + "loss": 2.8022, + "step": 2820 + }, + { + "epoch": 0.53, + "grad_norm": 3.949160099029541, + "learning_rate": 4.1761167846626154e-05, + "loss": 2.647, + "step": 2830 + }, + { + "epoch": 0.53, + "grad_norm": 3.7632877826690674, + "learning_rate": 4.1729841488628534e-05, + "loss": 2.6991, + "step": 2840 + }, + { + "epoch": 0.53, + "grad_norm": 3.5531980991363525, + "learning_rate": 4.1698515130630914e-05, + "loss": 2.7372, + "step": 2850 + }, + { + "epoch": 0.53, + "grad_norm": 3.4067349433898926, + "learning_rate": 4.16671887726333e-05, + "loss": 2.6866, + "step": 2860 + }, + { + "epoch": 0.53, + "grad_norm": 3.5877177715301514, + "learning_rate": 4.163586241463568e-05, + "loss": 2.7729, + "step": 2870 + }, + { + "epoch": 0.53, + "grad_norm": 3.848222255706787, + "learning_rate": 4.1604536056638054e-05, + "loss": 2.658, + "step": 2880 + }, + { + "epoch": 0.54, + "grad_norm": 3.431466817855835, + "learning_rate": 4.1573209698640434e-05, + "loss": 2.8442, + "step": 2890 + }, + { + "epoch": 0.54, + "grad_norm": 3.4475386142730713, + "learning_rate": 4.154188334064282e-05, + "loss": 2.6148, + "step": 2900 + }, + { + "epoch": 0.54, + "grad_norm": 3.8094379901885986, + "learning_rate": 4.15105569826452e-05, + "loss": 2.5793, + "step": 2910 + }, + { + "epoch": 0.54, + "grad_norm": 3.5097644329071045, + "learning_rate": 4.147923062464758e-05, + "loss": 2.6795, + "step": 2920 + }, + { + "epoch": 0.54, + "grad_norm": 3.5722837448120117, + "learning_rate": 4.144790426664996e-05, + "loss": 2.6313, + "step": 2930 + }, + { + "epoch": 0.55, + "grad_norm": 3.841233491897583, + "learning_rate": 4.141657790865234e-05, + "loss": 2.6852, + "step": 2940 + }, + { + "epoch": 0.55, + "grad_norm": 3.675516128540039, + "learning_rate": 4.138525155065473e-05, + "loss": 2.7584, + "step": 2950 + }, + { + "epoch": 0.55, + "grad_norm": 3.3295745849609375, + "learning_rate": 4.135392519265711e-05, + "loss": 2.6703, + "step": 2960 + }, + { + "epoch": 0.55, + "grad_norm": 3.6419780254364014, + "learning_rate": 4.132259883465949e-05, + "loss": 2.7687, + "step": 2970 + }, + { + "epoch": 0.55, + "grad_norm": 3.3836734294891357, + "learning_rate": 4.129127247666186e-05, + "loss": 2.754, + "step": 2980 + }, + { + "epoch": 0.56, + "grad_norm": 4.0170369148254395, + "learning_rate": 4.125994611866425e-05, + "loss": 2.8139, + "step": 2990 + }, + { + "epoch": 0.56, + "grad_norm": 3.121138572692871, + "learning_rate": 4.122861976066663e-05, + "loss": 2.8392, + "step": 3000 + }, + { + "epoch": 0.56, + "grad_norm": 4.2617597579956055, + "learning_rate": 4.119729340266901e-05, + "loss": 2.6012, + "step": 3010 + }, + { + "epoch": 0.56, + "grad_norm": 3.2569031715393066, + "learning_rate": 4.116596704467139e-05, + "loss": 2.7925, + "step": 3020 + }, + { + "epoch": 0.56, + "grad_norm": 3.606506824493408, + "learning_rate": 4.113464068667377e-05, + "loss": 2.7402, + "step": 3030 + }, + { + "epoch": 0.56, + "grad_norm": 3.7875235080718994, + "learning_rate": 4.1103314328676154e-05, + "loss": 2.683, + "step": 3040 + }, + { + "epoch": 0.57, + "grad_norm": 3.055811643600464, + "learning_rate": 4.1071987970678535e-05, + "loss": 2.7721, + "step": 3050 + }, + { + "epoch": 0.57, + "grad_norm": 4.042735576629639, + "learning_rate": 4.1040661612680915e-05, + "loss": 2.6119, + "step": 3060 + }, + { + "epoch": 0.57, + "grad_norm": 3.30137038230896, + "learning_rate": 4.100933525468329e-05, + "loss": 2.7351, + "step": 3070 + }, + { + "epoch": 0.57, + "grad_norm": 3.567495107650757, + "learning_rate": 4.097800889668567e-05, + "loss": 2.6587, + "step": 3080 + }, + { + "epoch": 0.57, + "grad_norm": 3.2337441444396973, + "learning_rate": 4.0946682538688055e-05, + "loss": 2.7335, + "step": 3090 + }, + { + "epoch": 0.58, + "grad_norm": 3.5480425357818604, + "learning_rate": 4.0915356180690435e-05, + "loss": 2.6796, + "step": 3100 + }, + { + "epoch": 0.58, + "grad_norm": 3.805624008178711, + "learning_rate": 4.0884029822692815e-05, + "loss": 2.8585, + "step": 3110 + }, + { + "epoch": 0.58, + "grad_norm": 3.9506654739379883, + "learning_rate": 4.0852703464695195e-05, + "loss": 2.7387, + "step": 3120 + }, + { + "epoch": 0.58, + "grad_norm": 4.039269924163818, + "learning_rate": 4.0821377106697575e-05, + "loss": 2.7157, + "step": 3130 + }, + { + "epoch": 0.58, + "grad_norm": 3.0865540504455566, + "learning_rate": 4.079005074869996e-05, + "loss": 2.8107, + "step": 3140 + }, + { + "epoch": 0.58, + "grad_norm": 3.5151524543762207, + "learning_rate": 4.075872439070234e-05, + "loss": 2.768, + "step": 3150 + }, + { + "epoch": 0.59, + "grad_norm": 3.7833340167999268, + "learning_rate": 4.072739803270472e-05, + "loss": 2.6107, + "step": 3160 + }, + { + "epoch": 0.59, + "grad_norm": 3.864769697189331, + "learning_rate": 4.0696071674707095e-05, + "loss": 2.8684, + "step": 3170 + }, + { + "epoch": 0.59, + "grad_norm": 3.961406707763672, + "learning_rate": 4.066474531670948e-05, + "loss": 2.8007, + "step": 3180 + }, + { + "epoch": 0.59, + "grad_norm": 4.2316365242004395, + "learning_rate": 4.063341895871186e-05, + "loss": 2.8068, + "step": 3190 + }, + { + "epoch": 0.59, + "grad_norm": 3.583223581314087, + "learning_rate": 4.060209260071424e-05, + "loss": 2.6651, + "step": 3200 + }, + { + "epoch": 0.6, + "grad_norm": 3.6090996265411377, + "learning_rate": 4.057076624271662e-05, + "loss": 2.7669, + "step": 3210 + }, + { + "epoch": 0.6, + "grad_norm": 3.1495776176452637, + "learning_rate": 4.0539439884719e-05, + "loss": 2.696, + "step": 3220 + }, + { + "epoch": 0.6, + "grad_norm": 3.555459499359131, + "learning_rate": 4.050811352672139e-05, + "loss": 2.6938, + "step": 3230 + }, + { + "epoch": 0.6, + "grad_norm": 3.707810163497925, + "learning_rate": 4.047678716872377e-05, + "loss": 2.8262, + "step": 3240 + }, + { + "epoch": 0.6, + "grad_norm": 3.3268215656280518, + "learning_rate": 4.044546081072615e-05, + "loss": 2.6679, + "step": 3250 + }, + { + "epoch": 0.61, + "grad_norm": 3.708392858505249, + "learning_rate": 4.041413445272853e-05, + "loss": 2.5789, + "step": 3260 + }, + { + "epoch": 0.61, + "grad_norm": 3.439485788345337, + "learning_rate": 4.038280809473091e-05, + "loss": 2.642, + "step": 3270 + }, + { + "epoch": 0.61, + "grad_norm": 3.641021490097046, + "learning_rate": 4.035148173673329e-05, + "loss": 2.7528, + "step": 3280 + }, + { + "epoch": 0.61, + "grad_norm": 3.777803897857666, + "learning_rate": 4.032015537873567e-05, + "loss": 2.7106, + "step": 3290 + }, + { + "epoch": 0.61, + "grad_norm": 3.6226184368133545, + "learning_rate": 4.028882902073805e-05, + "loss": 2.5178, + "step": 3300 + }, + { + "epoch": 0.61, + "grad_norm": 3.5166571140289307, + "learning_rate": 4.025750266274043e-05, + "loss": 2.7359, + "step": 3310 + }, + { + "epoch": 0.62, + "grad_norm": 3.474928617477417, + "learning_rate": 4.0226176304742816e-05, + "loss": 2.6186, + "step": 3320 + }, + { + "epoch": 0.62, + "grad_norm": 3.6442630290985107, + "learning_rate": 4.0194849946745196e-05, + "loss": 2.8285, + "step": 3330 + }, + { + "epoch": 0.62, + "grad_norm": 3.43820858001709, + "learning_rate": 4.0163523588747576e-05, + "loss": 2.7525, + "step": 3340 + }, + { + "epoch": 0.62, + "grad_norm": 3.47707462310791, + "learning_rate": 4.0132197230749956e-05, + "loss": 2.7491, + "step": 3350 + }, + { + "epoch": 0.62, + "grad_norm": 3.5458624362945557, + "learning_rate": 4.0100870872752336e-05, + "loss": 2.575, + "step": 3360 + }, + { + "epoch": 0.63, + "grad_norm": 7.445703983306885, + "learning_rate": 4.0069544514754716e-05, + "loss": 2.6054, + "step": 3370 + }, + { + "epoch": 0.63, + "grad_norm": 4.1514811515808105, + "learning_rate": 4.0038218156757096e-05, + "loss": 2.5804, + "step": 3380 + }, + { + "epoch": 0.63, + "grad_norm": 3.6534276008605957, + "learning_rate": 4.0006891798759476e-05, + "loss": 2.6048, + "step": 3390 + }, + { + "epoch": 0.63, + "grad_norm": 3.285447120666504, + "learning_rate": 3.9975565440761856e-05, + "loss": 2.7041, + "step": 3400 + }, + { + "epoch": 0.63, + "grad_norm": 3.4054813385009766, + "learning_rate": 3.994423908276424e-05, + "loss": 2.7286, + "step": 3410 + }, + { + "epoch": 0.63, + "grad_norm": 3.6479828357696533, + "learning_rate": 3.991291272476662e-05, + "loss": 2.7448, + "step": 3420 + }, + { + "epoch": 0.64, + "grad_norm": 3.1932740211486816, + "learning_rate": 3.9881586366769e-05, + "loss": 2.6782, + "step": 3430 + }, + { + "epoch": 0.64, + "grad_norm": 3.734436511993408, + "learning_rate": 3.985026000877138e-05, + "loss": 2.5687, + "step": 3440 + }, + { + "epoch": 0.64, + "grad_norm": 3.552328586578369, + "learning_rate": 3.981893365077376e-05, + "loss": 2.6521, + "step": 3450 + }, + { + "epoch": 0.64, + "grad_norm": 3.3161375522613525, + "learning_rate": 3.978760729277614e-05, + "loss": 2.7503, + "step": 3460 + }, + { + "epoch": 0.64, + "grad_norm": 3.588078260421753, + "learning_rate": 3.975628093477852e-05, + "loss": 2.6705, + "step": 3470 + }, + { + "epoch": 0.65, + "grad_norm": 3.963134765625, + "learning_rate": 3.97249545767809e-05, + "loss": 2.6418, + "step": 3480 + }, + { + "epoch": 0.65, + "grad_norm": 3.536984443664551, + "learning_rate": 3.969362821878328e-05, + "loss": 2.6098, + "step": 3490 + }, + { + "epoch": 0.65, + "grad_norm": 4.218698978424072, + "learning_rate": 3.966230186078567e-05, + "loss": 2.6781, + "step": 3500 + }, + { + "epoch": 0.65, + "grad_norm": 3.5511672496795654, + "learning_rate": 3.963097550278805e-05, + "loss": 2.6131, + "step": 3510 + }, + { + "epoch": 0.65, + "grad_norm": 3.55438494682312, + "learning_rate": 3.959964914479043e-05, + "loss": 2.5787, + "step": 3520 + }, + { + "epoch": 0.66, + "grad_norm": 3.8446435928344727, + "learning_rate": 3.956832278679281e-05, + "loss": 2.6975, + "step": 3530 + }, + { + "epoch": 0.66, + "grad_norm": 3.2814626693725586, + "learning_rate": 3.953699642879519e-05, + "loss": 2.5755, + "step": 3540 + }, + { + "epoch": 0.66, + "grad_norm": 3.7546284198760986, + "learning_rate": 3.950567007079758e-05, + "loss": 2.7988, + "step": 3550 + }, + { + "epoch": 0.66, + "grad_norm": 3.575570583343506, + "learning_rate": 3.947434371279995e-05, + "loss": 2.8421, + "step": 3560 + }, + { + "epoch": 0.66, + "grad_norm": 3.8108553886413574, + "learning_rate": 3.944301735480233e-05, + "loss": 2.7151, + "step": 3570 + }, + { + "epoch": 0.66, + "grad_norm": 3.531538963317871, + "learning_rate": 3.941169099680471e-05, + "loss": 2.6352, + "step": 3580 + }, + { + "epoch": 0.67, + "grad_norm": 3.4760665893554688, + "learning_rate": 3.93803646388071e-05, + "loss": 2.6097, + "step": 3590 + }, + { + "epoch": 0.67, + "grad_norm": 3.213020086288452, + "learning_rate": 3.934903828080948e-05, + "loss": 2.6864, + "step": 3600 + }, + { + "epoch": 0.67, + "grad_norm": 3.4527084827423096, + "learning_rate": 3.931771192281186e-05, + "loss": 2.7612, + "step": 3610 + }, + { + "epoch": 0.67, + "grad_norm": 4.189217567443848, + "learning_rate": 3.928638556481424e-05, + "loss": 2.7369, + "step": 3620 + }, + { + "epoch": 0.67, + "grad_norm": 3.6353416442871094, + "learning_rate": 3.925505920681662e-05, + "loss": 2.6292, + "step": 3630 + }, + { + "epoch": 0.68, + "grad_norm": 4.04370641708374, + "learning_rate": 3.9223732848819004e-05, + "loss": 2.6481, + "step": 3640 + }, + { + "epoch": 0.68, + "grad_norm": 3.7645108699798584, + "learning_rate": 3.9192406490821384e-05, + "loss": 2.8236, + "step": 3650 + }, + { + "epoch": 0.68, + "grad_norm": 3.6773457527160645, + "learning_rate": 3.916108013282376e-05, + "loss": 2.8449, + "step": 3660 + }, + { + "epoch": 0.68, + "grad_norm": 3.6739068031311035, + "learning_rate": 3.912975377482614e-05, + "loss": 2.5603, + "step": 3670 + }, + { + "epoch": 0.68, + "grad_norm": 3.2916781902313232, + "learning_rate": 3.909842741682852e-05, + "loss": 2.7602, + "step": 3680 + }, + { + "epoch": 0.68, + "grad_norm": 3.625528573989868, + "learning_rate": 3.9067101058830904e-05, + "loss": 2.6513, + "step": 3690 + }, + { + "epoch": 0.69, + "grad_norm": 3.4369454383850098, + "learning_rate": 3.9035774700833284e-05, + "loss": 2.7553, + "step": 3700 + }, + { + "epoch": 0.69, + "grad_norm": 3.1812028884887695, + "learning_rate": 3.9004448342835664e-05, + "loss": 2.6196, + "step": 3710 + }, + { + "epoch": 0.69, + "grad_norm": 3.635446071624756, + "learning_rate": 3.8973121984838044e-05, + "loss": 2.6613, + "step": 3720 + }, + { + "epoch": 0.69, + "grad_norm": 3.7549827098846436, + "learning_rate": 3.8941795626840424e-05, + "loss": 2.522, + "step": 3730 + }, + { + "epoch": 0.69, + "grad_norm": 3.777796983718872, + "learning_rate": 3.891046926884281e-05, + "loss": 2.8314, + "step": 3740 + }, + { + "epoch": 0.7, + "grad_norm": 3.964322328567505, + "learning_rate": 3.8879142910845184e-05, + "loss": 2.7249, + "step": 3750 + }, + { + "epoch": 0.7, + "grad_norm": 3.494810104370117, + "learning_rate": 3.8847816552847564e-05, + "loss": 2.7195, + "step": 3760 + }, + { + "epoch": 0.7, + "grad_norm": 3.552868127822876, + "learning_rate": 3.8816490194849944e-05, + "loss": 2.5469, + "step": 3770 + }, + { + "epoch": 0.7, + "grad_norm": 3.6302225589752197, + "learning_rate": 3.878516383685233e-05, + "loss": 2.6063, + "step": 3780 + }, + { + "epoch": 0.7, + "grad_norm": 3.940479040145874, + "learning_rate": 3.875383747885471e-05, + "loss": 2.6185, + "step": 3790 + }, + { + "epoch": 0.71, + "grad_norm": 3.674966812133789, + "learning_rate": 3.872251112085709e-05, + "loss": 2.8044, + "step": 3800 + }, + { + "epoch": 0.71, + "grad_norm": 3.3397276401519775, + "learning_rate": 3.869118476285947e-05, + "loss": 2.6704, + "step": 3810 + }, + { + "epoch": 0.71, + "grad_norm": 3.9168059825897217, + "learning_rate": 3.865985840486185e-05, + "loss": 2.7444, + "step": 3820 + }, + { + "epoch": 0.71, + "grad_norm": 3.8766160011291504, + "learning_rate": 3.862853204686424e-05, + "loss": 2.6179, + "step": 3830 + }, + { + "epoch": 0.71, + "grad_norm": 3.8253989219665527, + "learning_rate": 3.859720568886662e-05, + "loss": 2.6412, + "step": 3840 + }, + { + "epoch": 0.71, + "grad_norm": 3.872537612915039, + "learning_rate": 3.856587933086899e-05, + "loss": 2.688, + "step": 3850 + }, + { + "epoch": 0.72, + "grad_norm": 3.495595932006836, + "learning_rate": 3.853455297287137e-05, + "loss": 2.6828, + "step": 3860 + }, + { + "epoch": 0.72, + "grad_norm": 3.5899336338043213, + "learning_rate": 3.850322661487376e-05, + "loss": 2.7323, + "step": 3870 + }, + { + "epoch": 0.72, + "grad_norm": 4.072638034820557, + "learning_rate": 3.847190025687614e-05, + "loss": 2.7366, + "step": 3880 + }, + { + "epoch": 0.72, + "grad_norm": 3.4157538414001465, + "learning_rate": 3.844057389887852e-05, + "loss": 2.6253, + "step": 3890 + }, + { + "epoch": 0.72, + "grad_norm": 3.646038055419922, + "learning_rate": 3.84092475408809e-05, + "loss": 2.6964, + "step": 3900 + }, + { + "epoch": 0.73, + "grad_norm": 4.447880744934082, + "learning_rate": 3.837792118288328e-05, + "loss": 2.6015, + "step": 3910 + }, + { + "epoch": 0.73, + "grad_norm": 3.5513362884521484, + "learning_rate": 3.8346594824885665e-05, + "loss": 2.6274, + "step": 3920 + }, + { + "epoch": 0.73, + "grad_norm": 3.7466373443603516, + "learning_rate": 3.8315268466888045e-05, + "loss": 2.7064, + "step": 3930 + }, + { + "epoch": 0.73, + "grad_norm": 3.1654741764068604, + "learning_rate": 3.8283942108890425e-05, + "loss": 2.7, + "step": 3940 + }, + { + "epoch": 0.73, + "grad_norm": 4.263967514038086, + "learning_rate": 3.82526157508928e-05, + "loss": 2.728, + "step": 3950 + }, + { + "epoch": 0.74, + "grad_norm": 3.407287836074829, + "learning_rate": 3.8221289392895185e-05, + "loss": 2.7616, + "step": 3960 + }, + { + "epoch": 0.74, + "grad_norm": 3.7538676261901855, + "learning_rate": 3.8189963034897565e-05, + "loss": 2.7607, + "step": 3970 + }, + { + "epoch": 0.74, + "grad_norm": 3.369481325149536, + "learning_rate": 3.8158636676899945e-05, + "loss": 2.708, + "step": 3980 + }, + { + "epoch": 0.74, + "grad_norm": 3.2874176502227783, + "learning_rate": 3.8127310318902325e-05, + "loss": 2.5958, + "step": 3990 + }, + { + "epoch": 0.74, + "grad_norm": 3.8853976726531982, + "learning_rate": 3.8095983960904705e-05, + "loss": 2.7041, + "step": 4000 + }, + { + "epoch": 0.74, + "grad_norm": 3.389012336730957, + "learning_rate": 3.806465760290709e-05, + "loss": 2.5207, + "step": 4010 + }, + { + "epoch": 0.75, + "grad_norm": 3.7182796001434326, + "learning_rate": 3.803333124490947e-05, + "loss": 2.6887, + "step": 4020 + }, + { + "epoch": 0.75, + "grad_norm": 3.6546928882598877, + "learning_rate": 3.800200488691185e-05, + "loss": 2.7441, + "step": 4030 + }, + { + "epoch": 0.75, + "grad_norm": 3.5602946281433105, + "learning_rate": 3.7970678528914225e-05, + "loss": 2.6461, + "step": 4040 + }, + { + "epoch": 0.75, + "grad_norm": 3.9851481914520264, + "learning_rate": 3.793935217091661e-05, + "loss": 2.6847, + "step": 4050 + }, + { + "epoch": 0.75, + "grad_norm": 3.431507110595703, + "learning_rate": 3.790802581291899e-05, + "loss": 2.8197, + "step": 4060 + }, + { + "epoch": 0.76, + "grad_norm": 3.4529969692230225, + "learning_rate": 3.787669945492137e-05, + "loss": 2.6065, + "step": 4070 + }, + { + "epoch": 0.76, + "grad_norm": 3.545023202896118, + "learning_rate": 3.784537309692375e-05, + "loss": 2.6978, + "step": 4080 + }, + { + "epoch": 0.76, + "grad_norm": 3.2153308391571045, + "learning_rate": 3.781404673892613e-05, + "loss": 2.5534, + "step": 4090 + }, + { + "epoch": 0.76, + "grad_norm": 3.68514347076416, + "learning_rate": 3.778272038092852e-05, + "loss": 2.6236, + "step": 4100 + }, + { + "epoch": 0.76, + "grad_norm": 3.956491231918335, + "learning_rate": 3.77513940229309e-05, + "loss": 2.8223, + "step": 4110 + }, + { + "epoch": 0.76, + "grad_norm": 3.5437729358673096, + "learning_rate": 3.772006766493328e-05, + "loss": 2.6469, + "step": 4120 + }, + { + "epoch": 0.77, + "grad_norm": 3.1809537410736084, + "learning_rate": 3.768874130693566e-05, + "loss": 2.7147, + "step": 4130 + }, + { + "epoch": 0.77, + "grad_norm": 3.4994957447052, + "learning_rate": 3.765741494893803e-05, + "loss": 2.685, + "step": 4140 + }, + { + "epoch": 0.77, + "grad_norm": 3.3174326419830322, + "learning_rate": 3.762608859094042e-05, + "loss": 2.6743, + "step": 4150 + }, + { + "epoch": 0.77, + "grad_norm": 4.003337383270264, + "learning_rate": 3.75947622329428e-05, + "loss": 2.6788, + "step": 4160 + }, + { + "epoch": 0.77, + "grad_norm": 3.902500867843628, + "learning_rate": 3.756343587494518e-05, + "loss": 2.7302, + "step": 4170 + }, + { + "epoch": 0.78, + "grad_norm": 3.3858609199523926, + "learning_rate": 3.753210951694756e-05, + "loss": 2.6986, + "step": 4180 + }, + { + "epoch": 0.78, + "grad_norm": 4.005223751068115, + "learning_rate": 3.750078315894994e-05, + "loss": 2.6507, + "step": 4190 + }, + { + "epoch": 0.78, + "grad_norm": 3.3084933757781982, + "learning_rate": 3.7469456800952326e-05, + "loss": 2.7117, + "step": 4200 + }, + { + "epoch": 0.78, + "grad_norm": 3.4372692108154297, + "learning_rate": 3.7438130442954706e-05, + "loss": 2.6912, + "step": 4210 + }, + { + "epoch": 0.78, + "grad_norm": 3.3670456409454346, + "learning_rate": 3.7406804084957086e-05, + "loss": 2.6367, + "step": 4220 + }, + { + "epoch": 0.79, + "grad_norm": 3.919041156768799, + "learning_rate": 3.7375477726959466e-05, + "loss": 2.7021, + "step": 4230 + }, + { + "epoch": 0.79, + "grad_norm": 4.321124076843262, + "learning_rate": 3.7344151368961846e-05, + "loss": 2.5745, + "step": 4240 + }, + { + "epoch": 0.79, + "grad_norm": 3.7045040130615234, + "learning_rate": 3.7312825010964226e-05, + "loss": 2.7285, + "step": 4250 + }, + { + "epoch": 0.79, + "grad_norm": 3.5482301712036133, + "learning_rate": 3.7281498652966606e-05, + "loss": 2.6938, + "step": 4260 + }, + { + "epoch": 0.79, + "grad_norm": 3.5309770107269287, + "learning_rate": 3.7250172294968986e-05, + "loss": 2.6954, + "step": 4270 + }, + { + "epoch": 0.79, + "grad_norm": 3.313293218612671, + "learning_rate": 3.7218845936971366e-05, + "loss": 2.7527, + "step": 4280 + }, + { + "epoch": 0.8, + "grad_norm": 3.3939263820648193, + "learning_rate": 3.718751957897375e-05, + "loss": 2.6962, + "step": 4290 + }, + { + "epoch": 0.8, + "grad_norm": 3.4195632934570312, + "learning_rate": 3.715619322097613e-05, + "loss": 2.6265, + "step": 4300 + }, + { + "epoch": 0.8, + "grad_norm": 3.189661741256714, + "learning_rate": 3.712486686297851e-05, + "loss": 2.7842, + "step": 4310 + }, + { + "epoch": 0.8, + "grad_norm": 3.6959714889526367, + "learning_rate": 3.709354050498089e-05, + "loss": 2.6563, + "step": 4320 + }, + { + "epoch": 0.8, + "grad_norm": 3.067373275756836, + "learning_rate": 3.706221414698327e-05, + "loss": 2.7157, + "step": 4330 + }, + { + "epoch": 0.81, + "grad_norm": 3.105192184448242, + "learning_rate": 3.703088778898565e-05, + "loss": 2.7298, + "step": 4340 + }, + { + "epoch": 0.81, + "grad_norm": 2.968775510787964, + "learning_rate": 3.699956143098803e-05, + "loss": 2.6267, + "step": 4350 + }, + { + "epoch": 0.81, + "grad_norm": 3.360661268234253, + "learning_rate": 3.696823507299041e-05, + "loss": 2.7032, + "step": 4360 + }, + { + "epoch": 0.81, + "grad_norm": 4.23892068862915, + "learning_rate": 3.693690871499279e-05, + "loss": 2.6217, + "step": 4370 + }, + { + "epoch": 0.81, + "grad_norm": 3.726921558380127, + "learning_rate": 3.690558235699518e-05, + "loss": 2.634, + "step": 4380 + }, + { + "epoch": 0.81, + "grad_norm": 3.8494482040405273, + "learning_rate": 3.687425599899756e-05, + "loss": 2.6996, + "step": 4390 + }, + { + "epoch": 0.82, + "grad_norm": 3.6485435962677, + "learning_rate": 3.684292964099994e-05, + "loss": 2.6526, + "step": 4400 + }, + { + "epoch": 0.82, + "grad_norm": 3.7774016857147217, + "learning_rate": 3.681160328300232e-05, + "loss": 2.5145, + "step": 4410 + }, + { + "epoch": 0.82, + "grad_norm": 3.6008520126342773, + "learning_rate": 3.67802769250047e-05, + "loss": 2.6462, + "step": 4420 + }, + { + "epoch": 0.82, + "grad_norm": 3.4602198600769043, + "learning_rate": 3.674895056700708e-05, + "loss": 2.6813, + "step": 4430 + }, + { + "epoch": 0.82, + "grad_norm": 3.7059261798858643, + "learning_rate": 3.671762420900946e-05, + "loss": 2.503, + "step": 4440 + }, + { + "epoch": 0.83, + "grad_norm": 3.0723626613616943, + "learning_rate": 3.668629785101184e-05, + "loss": 2.615, + "step": 4450 + }, + { + "epoch": 0.83, + "grad_norm": 3.386141538619995, + "learning_rate": 3.665497149301422e-05, + "loss": 2.4334, + "step": 4460 + }, + { + "epoch": 0.83, + "grad_norm": 3.4753592014312744, + "learning_rate": 3.662364513501661e-05, + "loss": 2.7665, + "step": 4470 + }, + { + "epoch": 0.83, + "grad_norm": 3.745521306991577, + "learning_rate": 3.659231877701899e-05, + "loss": 2.7753, + "step": 4480 + }, + { + "epoch": 0.83, + "grad_norm": 3.5241260528564453, + "learning_rate": 3.656099241902137e-05, + "loss": 2.6306, + "step": 4490 + }, + { + "epoch": 0.84, + "grad_norm": 3.418544292449951, + "learning_rate": 3.652966606102375e-05, + "loss": 2.6334, + "step": 4500 + }, + { + "epoch": 0.84, + "grad_norm": 3.802028179168701, + "learning_rate": 3.649833970302613e-05, + "loss": 2.6841, + "step": 4510 + }, + { + "epoch": 0.84, + "grad_norm": 3.664736747741699, + "learning_rate": 3.6467013345028514e-05, + "loss": 2.4587, + "step": 4520 + }, + { + "epoch": 0.84, + "grad_norm": 3.5815389156341553, + "learning_rate": 3.643568698703089e-05, + "loss": 2.7365, + "step": 4530 + }, + { + "epoch": 0.84, + "grad_norm": 4.934508800506592, + "learning_rate": 3.640436062903327e-05, + "loss": 2.6904, + "step": 4540 + }, + { + "epoch": 0.84, + "grad_norm": 3.8538053035736084, + "learning_rate": 3.637303427103565e-05, + "loss": 2.5488, + "step": 4550 + }, + { + "epoch": 0.85, + "grad_norm": 3.1405739784240723, + "learning_rate": 3.6341707913038034e-05, + "loss": 2.6419, + "step": 4560 + }, + { + "epoch": 0.85, + "grad_norm": 3.7020983695983887, + "learning_rate": 3.6310381555040414e-05, + "loss": 2.7255, + "step": 4570 + }, + { + "epoch": 0.85, + "grad_norm": 4.042298793792725, + "learning_rate": 3.6279055197042794e-05, + "loss": 2.6505, + "step": 4580 + }, + { + "epoch": 0.85, + "grad_norm": 3.720578670501709, + "learning_rate": 3.6247728839045174e-05, + "loss": 2.596, + "step": 4590 + }, + { + "epoch": 0.85, + "grad_norm": 3.2827565670013428, + "learning_rate": 3.6216402481047554e-05, + "loss": 2.7212, + "step": 4600 + }, + { + "epoch": 0.86, + "grad_norm": 3.7220571041107178, + "learning_rate": 3.618507612304994e-05, + "loss": 2.7044, + "step": 4610 + }, + { + "epoch": 0.86, + "grad_norm": 3.455204725265503, + "learning_rate": 3.615374976505232e-05, + "loss": 2.7255, + "step": 4620 + }, + { + "epoch": 0.86, + "grad_norm": 3.557081460952759, + "learning_rate": 3.6122423407054694e-05, + "loss": 2.6022, + "step": 4630 + }, + { + "epoch": 0.86, + "grad_norm": 3.3913400173187256, + "learning_rate": 3.6091097049057074e-05, + "loss": 2.583, + "step": 4640 + }, + { + "epoch": 0.86, + "grad_norm": 3.968672752380371, + "learning_rate": 3.605977069105946e-05, + "loss": 2.6592, + "step": 4650 + }, + { + "epoch": 0.87, + "grad_norm": 3.2826192378997803, + "learning_rate": 3.602844433306184e-05, + "loss": 2.7529, + "step": 4660 + }, + { + "epoch": 0.87, + "grad_norm": 3.4522457122802734, + "learning_rate": 3.599711797506422e-05, + "loss": 2.6546, + "step": 4670 + }, + { + "epoch": 0.87, + "grad_norm": 3.4398117065429688, + "learning_rate": 3.59657916170666e-05, + "loss": 2.7162, + "step": 4680 + }, + { + "epoch": 0.87, + "grad_norm": 3.898144483566284, + "learning_rate": 3.593446525906898e-05, + "loss": 2.7552, + "step": 4690 + }, + { + "epoch": 0.87, + "grad_norm": 3.4324584007263184, + "learning_rate": 3.590313890107136e-05, + "loss": 2.7634, + "step": 4700 + }, + { + "epoch": 0.87, + "grad_norm": 3.674661636352539, + "learning_rate": 3.587181254307375e-05, + "loss": 2.5938, + "step": 4710 + }, + { + "epoch": 0.88, + "grad_norm": 3.56758975982666, + "learning_rate": 3.584048618507613e-05, + "loss": 2.7581, + "step": 4720 + }, + { + "epoch": 0.88, + "grad_norm": 3.8987762928009033, + "learning_rate": 3.58091598270785e-05, + "loss": 2.7237, + "step": 4730 + }, + { + "epoch": 0.88, + "grad_norm": 3.58803653717041, + "learning_rate": 3.577783346908088e-05, + "loss": 2.6871, + "step": 4740 + }, + { + "epoch": 0.88, + "grad_norm": 3.4172046184539795, + "learning_rate": 3.574650711108327e-05, + "loss": 2.5975, + "step": 4750 + }, + { + "epoch": 0.88, + "grad_norm": 3.9828925132751465, + "learning_rate": 3.571518075308565e-05, + "loss": 2.8031, + "step": 4760 + }, + { + "epoch": 0.89, + "grad_norm": 3.6485350131988525, + "learning_rate": 3.568385439508803e-05, + "loss": 2.6183, + "step": 4770 + }, + { + "epoch": 0.89, + "grad_norm": 3.6514647006988525, + "learning_rate": 3.565252803709041e-05, + "loss": 2.7714, + "step": 4780 + }, + { + "epoch": 0.89, + "grad_norm": 3.168116569519043, + "learning_rate": 3.562120167909279e-05, + "loss": 2.6368, + "step": 4790 + }, + { + "epoch": 0.89, + "grad_norm": 3.9774527549743652, + "learning_rate": 3.5589875321095175e-05, + "loss": 2.5899, + "step": 4800 + }, + { + "epoch": 0.89, + "grad_norm": 3.036356210708618, + "learning_rate": 3.5558548963097555e-05, + "loss": 2.5359, + "step": 4810 + }, + { + "epoch": 0.89, + "grad_norm": 4.019016742706299, + "learning_rate": 3.552722260509993e-05, + "loss": 2.5671, + "step": 4820 + }, + { + "epoch": 0.9, + "grad_norm": 3.3434457778930664, + "learning_rate": 3.549589624710231e-05, + "loss": 2.5714, + "step": 4830 + }, + { + "epoch": 0.9, + "grad_norm": 3.958373785018921, + "learning_rate": 3.5464569889104695e-05, + "loss": 2.7045, + "step": 4840 + }, + { + "epoch": 0.9, + "grad_norm": 3.5077250003814697, + "learning_rate": 3.5433243531107075e-05, + "loss": 2.5583, + "step": 4850 + }, + { + "epoch": 0.9, + "grad_norm": 4.2609686851501465, + "learning_rate": 3.5401917173109455e-05, + "loss": 2.5628, + "step": 4860 + }, + { + "epoch": 0.9, + "grad_norm": 3.3074331283569336, + "learning_rate": 3.5370590815111835e-05, + "loss": 2.6041, + "step": 4870 + }, + { + "epoch": 0.91, + "grad_norm": 3.6179749965667725, + "learning_rate": 3.5339264457114216e-05, + "loss": 2.6826, + "step": 4880 + }, + { + "epoch": 0.91, + "grad_norm": 3.8938159942626953, + "learning_rate": 3.53079380991166e-05, + "loss": 2.642, + "step": 4890 + }, + { + "epoch": 0.91, + "grad_norm": 4.054030418395996, + "learning_rate": 3.527661174111898e-05, + "loss": 2.5999, + "step": 4900 + }, + { + "epoch": 0.91, + "grad_norm": 3.450873374938965, + "learning_rate": 3.524528538312136e-05, + "loss": 2.757, + "step": 4910 + }, + { + "epoch": 0.91, + "grad_norm": 3.52093243598938, + "learning_rate": 3.5213959025123736e-05, + "loss": 2.6478, + "step": 4920 + }, + { + "epoch": 0.92, + "grad_norm": 3.4646759033203125, + "learning_rate": 3.518263266712612e-05, + "loss": 2.6925, + "step": 4930 + }, + { + "epoch": 0.92, + "grad_norm": 3.8345937728881836, + "learning_rate": 3.51513063091285e-05, + "loss": 2.5773, + "step": 4940 + }, + { + "epoch": 0.92, + "grad_norm": 3.86470890045166, + "learning_rate": 3.511997995113088e-05, + "loss": 2.7242, + "step": 4950 + }, + { + "epoch": 0.92, + "grad_norm": 3.446380853652954, + "learning_rate": 3.508865359313326e-05, + "loss": 2.6121, + "step": 4960 + }, + { + "epoch": 0.92, + "grad_norm": 3.9185121059417725, + "learning_rate": 3.505732723513564e-05, + "loss": 2.8018, + "step": 4970 + }, + { + "epoch": 0.92, + "grad_norm": 4.135132789611816, + "learning_rate": 3.502600087713803e-05, + "loss": 2.6571, + "step": 4980 + }, + { + "epoch": 0.93, + "grad_norm": 3.8625378608703613, + "learning_rate": 3.499467451914041e-05, + "loss": 2.6498, + "step": 4990 + }, + { + "epoch": 0.93, + "grad_norm": 3.788400650024414, + "learning_rate": 3.496334816114279e-05, + "loss": 2.5997, + "step": 5000 + }, + { + "epoch": 0.93, + "grad_norm": 4.110319137573242, + "learning_rate": 3.493202180314517e-05, + "loss": 2.7054, + "step": 5010 + }, + { + "epoch": 0.93, + "grad_norm": 3.1062545776367188, + "learning_rate": 3.490069544514755e-05, + "loss": 2.6907, + "step": 5020 + }, + { + "epoch": 0.93, + "grad_norm": 3.9589688777923584, + "learning_rate": 3.486936908714993e-05, + "loss": 2.6445, + "step": 5030 + }, + { + "epoch": 0.94, + "grad_norm": 3.5143542289733887, + "learning_rate": 3.483804272915231e-05, + "loss": 2.6587, + "step": 5040 + }, + { + "epoch": 0.94, + "grad_norm": 4.875641822814941, + "learning_rate": 3.480671637115469e-05, + "loss": 2.5477, + "step": 5050 + }, + { + "epoch": 0.94, + "grad_norm": 3.7877542972564697, + "learning_rate": 3.477539001315707e-05, + "loss": 2.7636, + "step": 5060 + }, + { + "epoch": 0.94, + "grad_norm": 4.071690559387207, + "learning_rate": 3.4744063655159456e-05, + "loss": 2.6171, + "step": 5070 + }, + { + "epoch": 0.94, + "grad_norm": 3.912278413772583, + "learning_rate": 3.4712737297161836e-05, + "loss": 2.7926, + "step": 5080 + }, + { + "epoch": 0.94, + "grad_norm": 3.779947519302368, + "learning_rate": 3.4681410939164216e-05, + "loss": 2.7714, + "step": 5090 + }, + { + "epoch": 0.95, + "grad_norm": 3.9601123332977295, + "learning_rate": 3.4650084581166597e-05, + "loss": 2.7388, + "step": 5100 + }, + { + "epoch": 0.95, + "grad_norm": 3.5037424564361572, + "learning_rate": 3.4618758223168977e-05, + "loss": 2.5711, + "step": 5110 + }, + { + "epoch": 0.95, + "grad_norm": 3.4753992557525635, + "learning_rate": 3.4587431865171357e-05, + "loss": 2.6326, + "step": 5120 + }, + { + "epoch": 0.95, + "grad_norm": 4.041739463806152, + "learning_rate": 3.4556105507173737e-05, + "loss": 2.5689, + "step": 5130 + }, + { + "epoch": 0.95, + "grad_norm": 3.938133478164673, + "learning_rate": 3.452477914917612e-05, + "loss": 2.5507, + "step": 5140 + }, + { + "epoch": 0.96, + "grad_norm": 3.775325059890747, + "learning_rate": 3.44934527911785e-05, + "loss": 2.6409, + "step": 5150 + }, + { + "epoch": 0.96, + "grad_norm": 3.3595011234283447, + "learning_rate": 3.4462126433180883e-05, + "loss": 2.6732, + "step": 5160 + }, + { + "epoch": 0.96, + "grad_norm": 3.6374683380126953, + "learning_rate": 3.4430800075183263e-05, + "loss": 2.5409, + "step": 5170 + }, + { + "epoch": 0.96, + "grad_norm": 3.4543371200561523, + "learning_rate": 3.4399473717185644e-05, + "loss": 2.6854, + "step": 5180 + }, + { + "epoch": 0.96, + "grad_norm": 4.120486259460449, + "learning_rate": 3.4368147359188024e-05, + "loss": 2.5458, + "step": 5190 + }, + { + "epoch": 0.97, + "grad_norm": 3.764808177947998, + "learning_rate": 3.4336821001190404e-05, + "loss": 2.6321, + "step": 5200 + }, + { + "epoch": 0.97, + "grad_norm": 3.5418665409088135, + "learning_rate": 3.4305494643192784e-05, + "loss": 2.6508, + "step": 5210 + }, + { + "epoch": 0.97, + "grad_norm": 3.4328222274780273, + "learning_rate": 3.4274168285195164e-05, + "loss": 2.6223, + "step": 5220 + }, + { + "epoch": 0.97, + "grad_norm": 3.281841516494751, + "learning_rate": 3.4242841927197544e-05, + "loss": 2.6083, + "step": 5230 + }, + { + "epoch": 0.97, + "grad_norm": 3.453336238861084, + "learning_rate": 3.4211515569199924e-05, + "loss": 2.7111, + "step": 5240 + }, + { + "epoch": 0.97, + "grad_norm": 3.7157461643218994, + "learning_rate": 3.4180189211202304e-05, + "loss": 2.5758, + "step": 5250 + }, + { + "epoch": 0.98, + "grad_norm": 4.228655815124512, + "learning_rate": 3.414886285320469e-05, + "loss": 2.3715, + "step": 5260 + }, + { + "epoch": 0.98, + "grad_norm": 3.6937756538391113, + "learning_rate": 3.411753649520707e-05, + "loss": 2.68, + "step": 5270 + }, + { + "epoch": 0.98, + "grad_norm": 3.7848565578460693, + "learning_rate": 3.408621013720945e-05, + "loss": 2.5128, + "step": 5280 + }, + { + "epoch": 0.98, + "grad_norm": 3.687631607055664, + "learning_rate": 3.405488377921183e-05, + "loss": 2.6281, + "step": 5290 + }, + { + "epoch": 0.98, + "grad_norm": 3.8058853149414062, + "learning_rate": 3.402355742121421e-05, + "loss": 2.6807, + "step": 5300 + }, + { + "epoch": 0.99, + "grad_norm": 3.656662940979004, + "learning_rate": 3.399223106321659e-05, + "loss": 2.5632, + "step": 5310 + }, + { + "epoch": 0.99, + "grad_norm": 3.496729850769043, + "learning_rate": 3.396090470521897e-05, + "loss": 2.5288, + "step": 5320 + }, + { + "epoch": 0.99, + "grad_norm": 3.4001872539520264, + "learning_rate": 3.392957834722135e-05, + "loss": 2.5592, + "step": 5330 + }, + { + "epoch": 0.99, + "grad_norm": 3.141587495803833, + "learning_rate": 3.389825198922373e-05, + "loss": 2.6705, + "step": 5340 + }, + { + "epoch": 0.99, + "grad_norm": 3.4357247352600098, + "learning_rate": 3.386692563122612e-05, + "loss": 2.6251, + "step": 5350 + }, + { + "epoch": 0.99, + "grad_norm": 3.3765175342559814, + "learning_rate": 3.38355992732285e-05, + "loss": 2.7846, + "step": 5360 + }, + { + "epoch": 1.0, + "grad_norm": 3.75774884223938, + "learning_rate": 3.380427291523088e-05, + "loss": 2.7333, + "step": 5370 + }, + { + "epoch": 1.0, + "grad_norm": 3.415477991104126, + "learning_rate": 3.377294655723326e-05, + "loss": 2.7039, + "step": 5380 + }, + { + "epoch": 1.0, + "grad_norm": 3.3807625770568848, + "learning_rate": 3.374162019923564e-05, + "loss": 2.634, + "step": 5390 + }, + { + "epoch": 1.0, + "grad_norm": 3.7736899852752686, + "learning_rate": 3.3710293841238025e-05, + "loss": 2.3838, + "step": 5400 + }, + { + "epoch": 1.0, + "grad_norm": 3.7280807495117188, + "learning_rate": 3.36789674832404e-05, + "loss": 2.5235, + "step": 5410 + }, + { + "epoch": 1.01, + "grad_norm": 4.045708179473877, + "learning_rate": 3.364764112524278e-05, + "loss": 2.4621, + "step": 5420 + }, + { + "epoch": 1.01, + "grad_norm": 3.7396812438964844, + "learning_rate": 3.361631476724516e-05, + "loss": 2.4619, + "step": 5430 + }, + { + "epoch": 1.01, + "grad_norm": 4.0114898681640625, + "learning_rate": 3.3584988409247545e-05, + "loss": 2.4536, + "step": 5440 + }, + { + "epoch": 1.01, + "grad_norm": 3.227764844894409, + "learning_rate": 3.3553662051249925e-05, + "loss": 2.5467, + "step": 5450 + }, + { + "epoch": 1.01, + "grad_norm": 3.8109536170959473, + "learning_rate": 3.3522335693252305e-05, + "loss": 2.5285, + "step": 5460 + }, + { + "epoch": 1.02, + "grad_norm": 3.8032093048095703, + "learning_rate": 3.3491009335254685e-05, + "loss": 2.5643, + "step": 5470 + }, + { + "epoch": 1.02, + "grad_norm": 3.6691324710845947, + "learning_rate": 3.3459682977257065e-05, + "loss": 2.4793, + "step": 5480 + }, + { + "epoch": 1.02, + "grad_norm": 3.8392367362976074, + "learning_rate": 3.342835661925945e-05, + "loss": 2.4332, + "step": 5490 + }, + { + "epoch": 1.02, + "grad_norm": 3.776864528656006, + "learning_rate": 3.3397030261261825e-05, + "loss": 2.3964, + "step": 5500 + }, + { + "epoch": 1.02, + "grad_norm": 3.805351734161377, + "learning_rate": 3.3365703903264205e-05, + "loss": 2.4288, + "step": 5510 + }, + { + "epoch": 1.02, + "grad_norm": 3.900991439819336, + "learning_rate": 3.3334377545266585e-05, + "loss": 2.535, + "step": 5520 + }, + { + "epoch": 1.03, + "grad_norm": 3.3974769115448, + "learning_rate": 3.330305118726897e-05, + "loss": 2.4969, + "step": 5530 + }, + { + "epoch": 1.03, + "grad_norm": 3.656438112258911, + "learning_rate": 3.327172482927135e-05, + "loss": 2.3034, + "step": 5540 + }, + { + "epoch": 1.03, + "grad_norm": 4.124125957489014, + "learning_rate": 3.324039847127373e-05, + "loss": 2.3805, + "step": 5550 + }, + { + "epoch": 1.03, + "grad_norm": 3.620126247406006, + "learning_rate": 3.320907211327611e-05, + "loss": 2.5268, + "step": 5560 + }, + { + "epoch": 1.03, + "grad_norm": 3.6136839389801025, + "learning_rate": 3.317774575527849e-05, + "loss": 2.478, + "step": 5570 + }, + { + "epoch": 1.04, + "grad_norm": 4.039578914642334, + "learning_rate": 3.314641939728088e-05, + "loss": 2.5355, + "step": 5580 + }, + { + "epoch": 1.04, + "grad_norm": 3.641757011413574, + "learning_rate": 3.311509303928326e-05, + "loss": 2.4235, + "step": 5590 + }, + { + "epoch": 1.04, + "grad_norm": 4.343202590942383, + "learning_rate": 3.308376668128563e-05, + "loss": 2.4022, + "step": 5600 + }, + { + "epoch": 1.04, + "grad_norm": 4.0208258628845215, + "learning_rate": 3.305244032328801e-05, + "loss": 2.5053, + "step": 5610 + }, + { + "epoch": 1.04, + "grad_norm": 4.351205348968506, + "learning_rate": 3.30211139652904e-05, + "loss": 2.5123, + "step": 5620 + }, + { + "epoch": 1.05, + "grad_norm": 3.987982988357544, + "learning_rate": 3.298978760729278e-05, + "loss": 2.5238, + "step": 5630 + }, + { + "epoch": 1.05, + "grad_norm": 4.0602312088012695, + "learning_rate": 3.295846124929516e-05, + "loss": 2.4661, + "step": 5640 + }, + { + "epoch": 1.05, + "grad_norm": 3.855670928955078, + "learning_rate": 3.292713489129754e-05, + "loss": 2.5684, + "step": 5650 + }, + { + "epoch": 1.05, + "grad_norm": 4.313904762268066, + "learning_rate": 3.289580853329992e-05, + "loss": 2.59, + "step": 5660 + }, + { + "epoch": 1.05, + "grad_norm": 4.257472515106201, + "learning_rate": 3.2864482175302306e-05, + "loss": 2.4152, + "step": 5670 + }, + { + "epoch": 1.05, + "grad_norm": 4.372264385223389, + "learning_rate": 3.2833155817304686e-05, + "loss": 2.6152, + "step": 5680 + }, + { + "epoch": 1.06, + "grad_norm": 4.014118194580078, + "learning_rate": 3.2801829459307066e-05, + "loss": 2.4703, + "step": 5690 + }, + { + "epoch": 1.06, + "grad_norm": 4.317049503326416, + "learning_rate": 3.277050310130944e-05, + "loss": 2.4456, + "step": 5700 + }, + { + "epoch": 1.06, + "grad_norm": 4.42697811126709, + "learning_rate": 3.2739176743311826e-05, + "loss": 2.4755, + "step": 5710 + }, + { + "epoch": 1.06, + "grad_norm": 3.7758564949035645, + "learning_rate": 3.2707850385314206e-05, + "loss": 2.4876, + "step": 5720 + }, + { + "epoch": 1.06, + "grad_norm": 3.636563539505005, + "learning_rate": 3.2676524027316586e-05, + "loss": 2.3994, + "step": 5730 + }, + { + "epoch": 1.07, + "grad_norm": 4.024879455566406, + "learning_rate": 3.2645197669318966e-05, + "loss": 2.3539, + "step": 5740 + }, + { + "epoch": 1.07, + "grad_norm": 4.519440650939941, + "learning_rate": 3.2613871311321346e-05, + "loss": 2.4389, + "step": 5750 + }, + { + "epoch": 1.07, + "grad_norm": 3.379619598388672, + "learning_rate": 3.2582544953323726e-05, + "loss": 2.3534, + "step": 5760 + }, + { + "epoch": 1.07, + "grad_norm": 3.5998525619506836, + "learning_rate": 3.255121859532611e-05, + "loss": 2.4591, + "step": 5770 + }, + { + "epoch": 1.07, + "grad_norm": 3.6781418323516846, + "learning_rate": 3.251989223732849e-05, + "loss": 2.4257, + "step": 5780 + }, + { + "epoch": 1.07, + "grad_norm": 4.661477088928223, + "learning_rate": 3.2488565879330866e-05, + "loss": 2.5522, + "step": 5790 + }, + { + "epoch": 1.08, + "grad_norm": 3.9432196617126465, + "learning_rate": 3.2457239521333246e-05, + "loss": 2.414, + "step": 5800 + }, + { + "epoch": 1.08, + "grad_norm": 3.805156707763672, + "learning_rate": 3.242591316333563e-05, + "loss": 2.5464, + "step": 5810 + }, + { + "epoch": 1.08, + "grad_norm": 4.142496109008789, + "learning_rate": 3.239458680533801e-05, + "loss": 2.611, + "step": 5820 + }, + { + "epoch": 1.08, + "grad_norm": 3.8504600524902344, + "learning_rate": 3.236326044734039e-05, + "loss": 2.4074, + "step": 5830 + }, + { + "epoch": 1.08, + "grad_norm": 4.016257286071777, + "learning_rate": 3.233193408934277e-05, + "loss": 2.5023, + "step": 5840 + }, + { + "epoch": 1.09, + "grad_norm": 3.938784599304199, + "learning_rate": 3.230060773134515e-05, + "loss": 2.3188, + "step": 5850 + }, + { + "epoch": 1.09, + "grad_norm": 4.088259696960449, + "learning_rate": 3.226928137334754e-05, + "loss": 2.5935, + "step": 5860 + }, + { + "epoch": 1.09, + "grad_norm": 3.8898637294769287, + "learning_rate": 3.223795501534992e-05, + "loss": 2.4305, + "step": 5870 + }, + { + "epoch": 1.09, + "grad_norm": 6.501204013824463, + "learning_rate": 3.22066286573523e-05, + "loss": 2.4884, + "step": 5880 + }, + { + "epoch": 1.09, + "grad_norm": 3.9667720794677734, + "learning_rate": 3.217530229935467e-05, + "loss": 2.433, + "step": 5890 + }, + { + "epoch": 1.1, + "grad_norm": 4.108957290649414, + "learning_rate": 3.214397594135706e-05, + "loss": 2.604, + "step": 5900 + }, + { + "epoch": 1.1, + "grad_norm": 3.968045234680176, + "learning_rate": 3.211264958335944e-05, + "loss": 2.4587, + "step": 5910 + }, + { + "epoch": 1.1, + "grad_norm": 3.952162742614746, + "learning_rate": 3.208132322536182e-05, + "loss": 2.4285, + "step": 5920 + }, + { + "epoch": 1.1, + "grad_norm": 4.0700178146362305, + "learning_rate": 3.20499968673642e-05, + "loss": 2.6214, + "step": 5930 + }, + { + "epoch": 1.1, + "grad_norm": 3.97859787940979, + "learning_rate": 3.201867050936658e-05, + "loss": 2.5462, + "step": 5940 + }, + { + "epoch": 1.1, + "grad_norm": 4.449424743652344, + "learning_rate": 3.198734415136897e-05, + "loss": 2.4818, + "step": 5950 + }, + { + "epoch": 1.11, + "grad_norm": 4.160023212432861, + "learning_rate": 3.195601779337135e-05, + "loss": 2.4427, + "step": 5960 + }, + { + "epoch": 1.11, + "grad_norm": 3.9818923473358154, + "learning_rate": 3.192469143537373e-05, + "loss": 2.4902, + "step": 5970 + }, + { + "epoch": 1.11, + "grad_norm": 4.702857494354248, + "learning_rate": 3.189336507737611e-05, + "loss": 2.3095, + "step": 5980 + }, + { + "epoch": 1.11, + "grad_norm": 3.913214683532715, + "learning_rate": 3.186203871937849e-05, + "loss": 2.5468, + "step": 5990 + }, + { + "epoch": 1.11, + "grad_norm": 4.280374526977539, + "learning_rate": 3.183071236138087e-05, + "loss": 2.4706, + "step": 6000 + }, + { + "epoch": 1.12, + "grad_norm": 5.013718128204346, + "learning_rate": 3.179938600338325e-05, + "loss": 2.3338, + "step": 6010 + }, + { + "epoch": 1.12, + "grad_norm": 4.405649662017822, + "learning_rate": 3.176805964538563e-05, + "loss": 2.5071, + "step": 6020 + }, + { + "epoch": 1.12, + "grad_norm": 4.047056198120117, + "learning_rate": 3.173673328738801e-05, + "loss": 2.3017, + "step": 6030 + }, + { + "epoch": 1.12, + "grad_norm": 3.8888051509857178, + "learning_rate": 3.1705406929390394e-05, + "loss": 2.4258, + "step": 6040 + }, + { + "epoch": 1.12, + "grad_norm": 4.3164544105529785, + "learning_rate": 3.1674080571392774e-05, + "loss": 2.3265, + "step": 6050 + }, + { + "epoch": 1.12, + "grad_norm": 4.50031042098999, + "learning_rate": 3.1642754213395154e-05, + "loss": 2.4742, + "step": 6060 + }, + { + "epoch": 1.13, + "grad_norm": 3.77244234085083, + "learning_rate": 3.1611427855397534e-05, + "loss": 2.5043, + "step": 6070 + }, + { + "epoch": 1.13, + "grad_norm": 3.641749382019043, + "learning_rate": 3.1580101497399914e-05, + "loss": 2.4812, + "step": 6080 + }, + { + "epoch": 1.13, + "grad_norm": 4.318567752838135, + "learning_rate": 3.1548775139402294e-05, + "loss": 2.513, + "step": 6090 + }, + { + "epoch": 1.13, + "grad_norm": 4.369333744049072, + "learning_rate": 3.1517448781404674e-05, + "loss": 2.3888, + "step": 6100 + }, + { + "epoch": 1.13, + "grad_norm": 4.169533729553223, + "learning_rate": 3.1486122423407054e-05, + "loss": 2.5043, + "step": 6110 + }, + { + "epoch": 1.14, + "grad_norm": 4.253003120422363, + "learning_rate": 3.1454796065409434e-05, + "loss": 2.4792, + "step": 6120 + }, + { + "epoch": 1.14, + "grad_norm": 4.288143634796143, + "learning_rate": 3.142346970741182e-05, + "loss": 2.4291, + "step": 6130 + }, + { + "epoch": 1.14, + "grad_norm": 4.162696361541748, + "learning_rate": 3.13921433494142e-05, + "loss": 2.4592, + "step": 6140 + }, + { + "epoch": 1.14, + "grad_norm": 4.5401482582092285, + "learning_rate": 3.136081699141658e-05, + "loss": 2.5767, + "step": 6150 + }, + { + "epoch": 1.14, + "grad_norm": 4.044467926025391, + "learning_rate": 3.132949063341896e-05, + "loss": 2.4809, + "step": 6160 + }, + { + "epoch": 1.15, + "grad_norm": 4.036025524139404, + "learning_rate": 3.129816427542134e-05, + "loss": 2.608, + "step": 6170 + }, + { + "epoch": 1.15, + "grad_norm": 4.690577983856201, + "learning_rate": 3.126683791742372e-05, + "loss": 2.5302, + "step": 6180 + }, + { + "epoch": 1.15, + "grad_norm": 4.768272399902344, + "learning_rate": 3.12355115594261e-05, + "loss": 2.3404, + "step": 6190 + }, + { + "epoch": 1.15, + "grad_norm": 3.854149580001831, + "learning_rate": 3.120418520142848e-05, + "loss": 2.4891, + "step": 6200 + }, + { + "epoch": 1.15, + "grad_norm": 4.387482643127441, + "learning_rate": 3.117285884343086e-05, + "loss": 2.4742, + "step": 6210 + }, + { + "epoch": 1.15, + "grad_norm": 3.609062671661377, + "learning_rate": 3.114153248543325e-05, + "loss": 2.3332, + "step": 6220 + }, + { + "epoch": 1.16, + "grad_norm": 4.023780822753906, + "learning_rate": 3.111020612743563e-05, + "loss": 2.3789, + "step": 6230 + }, + { + "epoch": 1.16, + "grad_norm": 3.633157253265381, + "learning_rate": 3.107887976943801e-05, + "loss": 2.5869, + "step": 6240 + }, + { + "epoch": 1.16, + "grad_norm": 4.554311275482178, + "learning_rate": 3.104755341144039e-05, + "loss": 2.4892, + "step": 6250 + }, + { + "epoch": 1.16, + "grad_norm": 3.6120660305023193, + "learning_rate": 3.101622705344277e-05, + "loss": 2.3765, + "step": 6260 + }, + { + "epoch": 1.16, + "grad_norm": 4.1312336921691895, + "learning_rate": 3.0984900695445155e-05, + "loss": 2.3906, + "step": 6270 + }, + { + "epoch": 1.17, + "grad_norm": 3.6260271072387695, + "learning_rate": 3.095357433744753e-05, + "loss": 2.4732, + "step": 6280 + }, + { + "epoch": 1.17, + "grad_norm": 3.927614212036133, + "learning_rate": 3.092224797944991e-05, + "loss": 2.4285, + "step": 6290 + }, + { + "epoch": 1.17, + "grad_norm": 4.142879009246826, + "learning_rate": 3.089092162145229e-05, + "loss": 2.5053, + "step": 6300 + }, + { + "epoch": 1.17, + "grad_norm": 4.045203685760498, + "learning_rate": 3.085959526345467e-05, + "loss": 2.3809, + "step": 6310 + }, + { + "epoch": 1.17, + "grad_norm": 4.602118492126465, + "learning_rate": 3.0828268905457055e-05, + "loss": 2.4972, + "step": 6320 + }, + { + "epoch": 1.17, + "grad_norm": 4.50063419342041, + "learning_rate": 3.0796942547459435e-05, + "loss": 2.5565, + "step": 6330 + }, + { + "epoch": 1.18, + "grad_norm": 4.114358901977539, + "learning_rate": 3.0765616189461815e-05, + "loss": 2.3903, + "step": 6340 + }, + { + "epoch": 1.18, + "grad_norm": 3.6293530464172363, + "learning_rate": 3.0734289831464195e-05, + "loss": 2.4091, + "step": 6350 + }, + { + "epoch": 1.18, + "grad_norm": 3.8283889293670654, + "learning_rate": 3.0702963473466575e-05, + "loss": 2.6608, + "step": 6360 + }, + { + "epoch": 1.18, + "grad_norm": 3.8779544830322266, + "learning_rate": 3.067163711546896e-05, + "loss": 2.3576, + "step": 6370 + }, + { + "epoch": 1.18, + "grad_norm": 3.8340346813201904, + "learning_rate": 3.0640310757471335e-05, + "loss": 2.4389, + "step": 6380 + }, + { + "epoch": 1.19, + "grad_norm": 3.959573268890381, + "learning_rate": 3.0608984399473715e-05, + "loss": 2.5481, + "step": 6390 + }, + { + "epoch": 1.19, + "grad_norm": 3.6378843784332275, + "learning_rate": 3.0577658041476095e-05, + "loss": 2.3975, + "step": 6400 + }, + { + "epoch": 1.19, + "grad_norm": 4.129283428192139, + "learning_rate": 3.054633168347848e-05, + "loss": 2.4949, + "step": 6410 + }, + { + "epoch": 1.19, + "grad_norm": 4.536279201507568, + "learning_rate": 3.0515005325480862e-05, + "loss": 2.4757, + "step": 6420 + }, + { + "epoch": 1.19, + "grad_norm": 4.183682918548584, + "learning_rate": 3.0483678967483242e-05, + "loss": 2.4207, + "step": 6430 + }, + { + "epoch": 1.2, + "grad_norm": 3.7908713817596436, + "learning_rate": 3.0452352609485622e-05, + "loss": 2.4127, + "step": 6440 + }, + { + "epoch": 1.2, + "grad_norm": 3.6448137760162354, + "learning_rate": 3.0421026251488006e-05, + "loss": 2.4096, + "step": 6450 + }, + { + "epoch": 1.2, + "grad_norm": 3.8170764446258545, + "learning_rate": 3.0389699893490386e-05, + "loss": 2.6154, + "step": 6460 + }, + { + "epoch": 1.2, + "grad_norm": 4.10953426361084, + "learning_rate": 3.0358373535492762e-05, + "loss": 2.4478, + "step": 6470 + }, + { + "epoch": 1.2, + "grad_norm": 4.442537307739258, + "learning_rate": 3.0327047177495142e-05, + "loss": 2.6607, + "step": 6480 + }, + { + "epoch": 1.2, + "grad_norm": 3.768803834915161, + "learning_rate": 3.0295720819497526e-05, + "loss": 2.5009, + "step": 6490 + }, + { + "epoch": 1.21, + "grad_norm": 3.953761339187622, + "learning_rate": 3.0264394461499906e-05, + "loss": 2.4567, + "step": 6500 + }, + { + "epoch": 1.21, + "grad_norm": 3.973198413848877, + "learning_rate": 3.023306810350229e-05, + "loss": 2.5144, + "step": 6510 + }, + { + "epoch": 1.21, + "grad_norm": 4.184175491333008, + "learning_rate": 3.020174174550467e-05, + "loss": 2.3374, + "step": 6520 + }, + { + "epoch": 1.21, + "grad_norm": 4.105236530303955, + "learning_rate": 3.017041538750705e-05, + "loss": 2.4755, + "step": 6530 + }, + { + "epoch": 1.21, + "grad_norm": 4.367380619049072, + "learning_rate": 3.0139089029509433e-05, + "loss": 2.4868, + "step": 6540 + }, + { + "epoch": 1.22, + "grad_norm": 3.8320703506469727, + "learning_rate": 3.0107762671511813e-05, + "loss": 2.4634, + "step": 6550 + }, + { + "epoch": 1.22, + "grad_norm": 4.322927474975586, + "learning_rate": 3.0076436313514196e-05, + "loss": 2.5089, + "step": 6560 + }, + { + "epoch": 1.22, + "grad_norm": 4.073949813842773, + "learning_rate": 3.004510995551657e-05, + "loss": 2.3301, + "step": 6570 + }, + { + "epoch": 1.22, + "grad_norm": 3.9154913425445557, + "learning_rate": 3.0013783597518953e-05, + "loss": 2.5612, + "step": 6580 + }, + { + "epoch": 1.22, + "grad_norm": 4.1147332191467285, + "learning_rate": 2.9982457239521333e-05, + "loss": 2.4923, + "step": 6590 + }, + { + "epoch": 1.23, + "grad_norm": 4.687198162078857, + "learning_rate": 2.9951130881523716e-05, + "loss": 2.5775, + "step": 6600 + }, + { + "epoch": 1.23, + "grad_norm": 3.9940314292907715, + "learning_rate": 2.9919804523526096e-05, + "loss": 2.489, + "step": 6610 + }, + { + "epoch": 1.23, + "grad_norm": 3.8620872497558594, + "learning_rate": 2.9888478165528476e-05, + "loss": 2.337, + "step": 6620 + }, + { + "epoch": 1.23, + "grad_norm": 3.9539003372192383, + "learning_rate": 2.985715180753086e-05, + "loss": 2.4726, + "step": 6630 + }, + { + "epoch": 1.23, + "grad_norm": 4.662155628204346, + "learning_rate": 2.982582544953324e-05, + "loss": 2.4849, + "step": 6640 + }, + { + "epoch": 1.23, + "grad_norm": 4.394937038421631, + "learning_rate": 2.9794499091535623e-05, + "loss": 2.4574, + "step": 6650 + }, + { + "epoch": 1.24, + "grad_norm": 4.711225509643555, + "learning_rate": 2.9763172733538003e-05, + "loss": 2.5811, + "step": 6660 + }, + { + "epoch": 1.24, + "grad_norm": 4.110074996948242, + "learning_rate": 2.973184637554038e-05, + "loss": 2.4939, + "step": 6670 + }, + { + "epoch": 1.24, + "grad_norm": 4.297110557556152, + "learning_rate": 2.970052001754276e-05, + "loss": 2.4746, + "step": 6680 + }, + { + "epoch": 1.24, + "grad_norm": 3.9598119258880615, + "learning_rate": 2.966919365954514e-05, + "loss": 2.4667, + "step": 6690 + }, + { + "epoch": 1.24, + "grad_norm": 4.400740146636963, + "learning_rate": 2.9637867301547523e-05, + "loss": 2.4556, + "step": 6700 + }, + { + "epoch": 1.25, + "grad_norm": 3.5050714015960693, + "learning_rate": 2.9606540943549903e-05, + "loss": 2.4305, + "step": 6710 + }, + { + "epoch": 1.25, + "grad_norm": 3.708401679992676, + "learning_rate": 2.9575214585552287e-05, + "loss": 2.4872, + "step": 6720 + }, + { + "epoch": 1.25, + "grad_norm": 4.355490207672119, + "learning_rate": 2.9543888227554667e-05, + "loss": 2.4645, + "step": 6730 + }, + { + "epoch": 1.25, + "grad_norm": 3.8205530643463135, + "learning_rate": 2.9512561869557047e-05, + "loss": 2.5601, + "step": 6740 + }, + { + "epoch": 1.25, + "grad_norm": 3.833449363708496, + "learning_rate": 2.948123551155943e-05, + "loss": 2.4713, + "step": 6750 + }, + { + "epoch": 1.25, + "grad_norm": 3.864391803741455, + "learning_rate": 2.944990915356181e-05, + "loss": 2.594, + "step": 6760 + }, + { + "epoch": 1.26, + "grad_norm": 3.996546506881714, + "learning_rate": 2.9418582795564187e-05, + "loss": 2.5044, + "step": 6770 + }, + { + "epoch": 1.26, + "grad_norm": 4.637711048126221, + "learning_rate": 2.9387256437566567e-05, + "loss": 2.4977, + "step": 6780 + }, + { + "epoch": 1.26, + "grad_norm": 4.310656547546387, + "learning_rate": 2.935593007956895e-05, + "loss": 2.4352, + "step": 6790 + }, + { + "epoch": 1.26, + "grad_norm": 4.202852725982666, + "learning_rate": 2.932460372157133e-05, + "loss": 2.4125, + "step": 6800 + }, + { + "epoch": 1.26, + "grad_norm": 3.8896448612213135, + "learning_rate": 2.9293277363573714e-05, + "loss": 2.3875, + "step": 6810 + }, + { + "epoch": 1.27, + "grad_norm": 4.873081684112549, + "learning_rate": 2.9261951005576094e-05, + "loss": 2.4571, + "step": 6820 + }, + { + "epoch": 1.27, + "grad_norm": 4.322564601898193, + "learning_rate": 2.9230624647578474e-05, + "loss": 2.3468, + "step": 6830 + }, + { + "epoch": 1.27, + "grad_norm": 3.952484130859375, + "learning_rate": 2.9199298289580857e-05, + "loss": 2.4347, + "step": 6840 + }, + { + "epoch": 1.27, + "grad_norm": 4.476901531219482, + "learning_rate": 2.9167971931583237e-05, + "loss": 2.6337, + "step": 6850 + }, + { + "epoch": 1.27, + "grad_norm": 3.968850612640381, + "learning_rate": 2.9136645573585614e-05, + "loss": 2.466, + "step": 6860 + }, + { + "epoch": 1.28, + "grad_norm": 4.549422264099121, + "learning_rate": 2.9105319215587994e-05, + "loss": 2.5071, + "step": 6870 + }, + { + "epoch": 1.28, + "grad_norm": 3.74324107170105, + "learning_rate": 2.9073992857590377e-05, + "loss": 2.4761, + "step": 6880 + }, + { + "epoch": 1.28, + "grad_norm": 4.241300106048584, + "learning_rate": 2.9042666499592757e-05, + "loss": 2.403, + "step": 6890 + }, + { + "epoch": 1.28, + "grad_norm": 4.1481428146362305, + "learning_rate": 2.901134014159514e-05, + "loss": 2.3425, + "step": 6900 + }, + { + "epoch": 1.28, + "grad_norm": 4.6267499923706055, + "learning_rate": 2.898001378359752e-05, + "loss": 2.4772, + "step": 6910 + }, + { + "epoch": 1.28, + "grad_norm": 4.275341033935547, + "learning_rate": 2.89486874255999e-05, + "loss": 2.4042, + "step": 6920 + }, + { + "epoch": 1.29, + "grad_norm": 4.284965991973877, + "learning_rate": 2.8917361067602284e-05, + "loss": 2.52, + "step": 6930 + }, + { + "epoch": 1.29, + "grad_norm": 4.403405666351318, + "learning_rate": 2.8886034709604664e-05, + "loss": 2.4763, + "step": 6940 + }, + { + "epoch": 1.29, + "grad_norm": 4.16456413269043, + "learning_rate": 2.8854708351607044e-05, + "loss": 2.485, + "step": 6950 + }, + { + "epoch": 1.29, + "grad_norm": 3.8839504718780518, + "learning_rate": 2.882338199360942e-05, + "loss": 2.4893, + "step": 6960 + }, + { + "epoch": 1.29, + "grad_norm": 4.768056392669678, + "learning_rate": 2.8792055635611804e-05, + "loss": 2.4582, + "step": 6970 + }, + { + "epoch": 1.3, + "grad_norm": 4.094647407531738, + "learning_rate": 2.8760729277614184e-05, + "loss": 2.4824, + "step": 6980 + }, + { + "epoch": 1.3, + "grad_norm": 4.018396377563477, + "learning_rate": 2.8729402919616564e-05, + "loss": 2.5067, + "step": 6990 + }, + { + "epoch": 1.3, + "grad_norm": 3.8706161975860596, + "learning_rate": 2.8698076561618948e-05, + "loss": 2.5126, + "step": 7000 + }, + { + "epoch": 1.3, + "grad_norm": 3.7700846195220947, + "learning_rate": 2.8666750203621328e-05, + "loss": 2.5013, + "step": 7010 + }, + { + "epoch": 1.3, + "grad_norm": 3.8400371074676514, + "learning_rate": 2.863542384562371e-05, + "loss": 2.5477, + "step": 7020 + }, + { + "epoch": 1.3, + "grad_norm": 4.360779285430908, + "learning_rate": 2.860409748762609e-05, + "loss": 2.5295, + "step": 7030 + }, + { + "epoch": 1.31, + "grad_norm": 4.506185531616211, + "learning_rate": 2.857277112962847e-05, + "loss": 2.5114, + "step": 7040 + }, + { + "epoch": 1.31, + "grad_norm": 4.395011901855469, + "learning_rate": 2.8541444771630855e-05, + "loss": 2.508, + "step": 7050 + }, + { + "epoch": 1.31, + "grad_norm": 4.438186168670654, + "learning_rate": 2.851011841363323e-05, + "loss": 2.345, + "step": 7060 + }, + { + "epoch": 1.31, + "grad_norm": 3.920180082321167, + "learning_rate": 2.847879205563561e-05, + "loss": 2.4989, + "step": 7070 + }, + { + "epoch": 1.31, + "grad_norm": 3.980530023574829, + "learning_rate": 2.844746569763799e-05, + "loss": 2.5219, + "step": 7080 + }, + { + "epoch": 1.32, + "grad_norm": 3.988844871520996, + "learning_rate": 2.8416139339640375e-05, + "loss": 2.5288, + "step": 7090 + }, + { + "epoch": 1.32, + "grad_norm": 3.9802753925323486, + "learning_rate": 2.8384812981642755e-05, + "loss": 2.3894, + "step": 7100 + }, + { + "epoch": 1.32, + "grad_norm": 4.512762069702148, + "learning_rate": 2.835348662364514e-05, + "loss": 2.4125, + "step": 7110 + }, + { + "epoch": 1.32, + "grad_norm": 4.419162273406982, + "learning_rate": 2.832216026564752e-05, + "loss": 2.4308, + "step": 7120 + }, + { + "epoch": 1.32, + "grad_norm": 4.053534984588623, + "learning_rate": 2.82908339076499e-05, + "loss": 2.4747, + "step": 7130 + }, + { + "epoch": 1.33, + "grad_norm": 4.382496356964111, + "learning_rate": 2.8259507549652282e-05, + "loss": 2.4205, + "step": 7140 + }, + { + "epoch": 1.33, + "grad_norm": 4.047353267669678, + "learning_rate": 2.8228181191654662e-05, + "loss": 2.4959, + "step": 7150 + }, + { + "epoch": 1.33, + "grad_norm": 4.391164302825928, + "learning_rate": 2.819685483365704e-05, + "loss": 2.4672, + "step": 7160 + }, + { + "epoch": 1.33, + "grad_norm": 4.1165618896484375, + "learning_rate": 2.816552847565942e-05, + "loss": 2.3797, + "step": 7170 + }, + { + "epoch": 1.33, + "grad_norm": 4.1894307136535645, + "learning_rate": 2.8134202117661802e-05, + "loss": 2.5119, + "step": 7180 + }, + { + "epoch": 1.33, + "grad_norm": 4.277898788452148, + "learning_rate": 2.8102875759664182e-05, + "loss": 2.398, + "step": 7190 + }, + { + "epoch": 1.34, + "grad_norm": 5.254609107971191, + "learning_rate": 2.8071549401666562e-05, + "loss": 2.4263, + "step": 7200 + }, + { + "epoch": 1.34, + "grad_norm": 4.395539283752441, + "learning_rate": 2.8040223043668945e-05, + "loss": 2.5494, + "step": 7210 + }, + { + "epoch": 1.34, + "grad_norm": 3.832606792449951, + "learning_rate": 2.8008896685671325e-05, + "loss": 2.4564, + "step": 7220 + }, + { + "epoch": 1.34, + "grad_norm": 4.776379108428955, + "learning_rate": 2.797757032767371e-05, + "loss": 2.4843, + "step": 7230 + }, + { + "epoch": 1.34, + "grad_norm": 3.442601442337036, + "learning_rate": 2.794624396967609e-05, + "loss": 2.3466, + "step": 7240 + }, + { + "epoch": 1.35, + "grad_norm": 3.7504169940948486, + "learning_rate": 2.7914917611678466e-05, + "loss": 2.479, + "step": 7250 + }, + { + "epoch": 1.35, + "grad_norm": 4.110808849334717, + "learning_rate": 2.7883591253680846e-05, + "loss": 2.4306, + "step": 7260 + }, + { + "epoch": 1.35, + "grad_norm": 3.7822816371917725, + "learning_rate": 2.785226489568323e-05, + "loss": 2.4555, + "step": 7270 + }, + { + "epoch": 1.35, + "grad_norm": 4.296886444091797, + "learning_rate": 2.782093853768561e-05, + "loss": 2.4778, + "step": 7280 + }, + { + "epoch": 1.35, + "grad_norm": 4.053720951080322, + "learning_rate": 2.778961217968799e-05, + "loss": 2.4027, + "step": 7290 + }, + { + "epoch": 1.36, + "grad_norm": 4.153025150299072, + "learning_rate": 2.7758285821690372e-05, + "loss": 2.5618, + "step": 7300 + }, + { + "epoch": 1.36, + "grad_norm": 4.224099159240723, + "learning_rate": 2.7726959463692753e-05, + "loss": 2.4304, + "step": 7310 + }, + { + "epoch": 1.36, + "grad_norm": 4.171821594238281, + "learning_rate": 2.7695633105695136e-05, + "loss": 2.4833, + "step": 7320 + }, + { + "epoch": 1.36, + "grad_norm": 3.8762876987457275, + "learning_rate": 2.7664306747697516e-05, + "loss": 2.407, + "step": 7330 + }, + { + "epoch": 1.36, + "grad_norm": 4.1427693367004395, + "learning_rate": 2.7632980389699896e-05, + "loss": 2.4689, + "step": 7340 + }, + { + "epoch": 1.36, + "grad_norm": 4.639638423919678, + "learning_rate": 2.7601654031702273e-05, + "loss": 2.5578, + "step": 7350 + }, + { + "epoch": 1.37, + "grad_norm": 3.923847198486328, + "learning_rate": 2.7570327673704656e-05, + "loss": 2.4735, + "step": 7360 + }, + { + "epoch": 1.37, + "grad_norm": 3.8497817516326904, + "learning_rate": 2.7539001315707036e-05, + "loss": 2.3363, + "step": 7370 + }, + { + "epoch": 1.37, + "grad_norm": 4.234973907470703, + "learning_rate": 2.7507674957709416e-05, + "loss": 2.3606, + "step": 7380 + }, + { + "epoch": 1.37, + "grad_norm": 4.026918411254883, + "learning_rate": 2.74763485997118e-05, + "loss": 2.3884, + "step": 7390 + }, + { + "epoch": 1.37, + "grad_norm": 3.6976168155670166, + "learning_rate": 2.744502224171418e-05, + "loss": 2.508, + "step": 7400 + }, + { + "epoch": 1.38, + "grad_norm": 4.062636852264404, + "learning_rate": 2.7413695883716563e-05, + "loss": 2.5993, + "step": 7410 + }, + { + "epoch": 1.38, + "grad_norm": 9.24471664428711, + "learning_rate": 2.7382369525718943e-05, + "loss": 2.5709, + "step": 7420 + }, + { + "epoch": 1.38, + "grad_norm": 4.350656509399414, + "learning_rate": 2.7351043167721323e-05, + "loss": 2.4863, + "step": 7430 + }, + { + "epoch": 1.38, + "grad_norm": 3.904040813446045, + "learning_rate": 2.7319716809723706e-05, + "loss": 2.4859, + "step": 7440 + }, + { + "epoch": 1.38, + "grad_norm": 3.8008460998535156, + "learning_rate": 2.728839045172608e-05, + "loss": 2.5323, + "step": 7450 + }, + { + "epoch": 1.38, + "grad_norm": 4.462713241577148, + "learning_rate": 2.7257064093728463e-05, + "loss": 2.4299, + "step": 7460 + }, + { + "epoch": 1.39, + "grad_norm": 4.533032417297363, + "learning_rate": 2.7225737735730843e-05, + "loss": 2.4732, + "step": 7470 + }, + { + "epoch": 1.39, + "grad_norm": 3.7745182514190674, + "learning_rate": 2.7194411377733227e-05, + "loss": 2.2655, + "step": 7480 + }, + { + "epoch": 1.39, + "grad_norm": 4.192812919616699, + "learning_rate": 2.7163085019735607e-05, + "loss": 2.415, + "step": 7490 + }, + { + "epoch": 1.39, + "grad_norm": 4.448957443237305, + "learning_rate": 2.7131758661737987e-05, + "loss": 2.4624, + "step": 7500 + }, + { + "epoch": 1.39, + "grad_norm": 4.633139610290527, + "learning_rate": 2.710043230374037e-05, + "loss": 2.3715, + "step": 7510 + }, + { + "epoch": 1.4, + "grad_norm": 4.003988742828369, + "learning_rate": 2.706910594574275e-05, + "loss": 2.5053, + "step": 7520 + }, + { + "epoch": 1.4, + "grad_norm": 4.706925392150879, + "learning_rate": 2.7037779587745133e-05, + "loss": 2.463, + "step": 7530 + }, + { + "epoch": 1.4, + "grad_norm": 4.274069309234619, + "learning_rate": 2.7006453229747507e-05, + "loss": 2.368, + "step": 7540 + }, + { + "epoch": 1.4, + "grad_norm": 3.7519917488098145, + "learning_rate": 2.697512687174989e-05, + "loss": 2.4108, + "step": 7550 + }, + { + "epoch": 1.4, + "grad_norm": 3.982792615890503, + "learning_rate": 2.694380051375227e-05, + "loss": 2.4604, + "step": 7560 + }, + { + "epoch": 1.41, + "grad_norm": 3.7987496852874756, + "learning_rate": 2.6912474155754654e-05, + "loss": 2.3429, + "step": 7570 + }, + { + "epoch": 1.41, + "grad_norm": 4.573733806610107, + "learning_rate": 2.6881147797757034e-05, + "loss": 2.3859, + "step": 7580 + }, + { + "epoch": 1.41, + "grad_norm": 4.3863019943237305, + "learning_rate": 2.6849821439759414e-05, + "loss": 2.5498, + "step": 7590 + }, + { + "epoch": 1.41, + "grad_norm": 4.614287376403809, + "learning_rate": 2.6818495081761797e-05, + "loss": 2.5396, + "step": 7600 + }, + { + "epoch": 1.41, + "grad_norm": 4.737358570098877, + "learning_rate": 2.6787168723764177e-05, + "loss": 2.4409, + "step": 7610 + }, + { + "epoch": 1.41, + "grad_norm": 4.048407554626465, + "learning_rate": 2.675584236576656e-05, + "loss": 2.4081, + "step": 7620 + }, + { + "epoch": 1.42, + "grad_norm": 3.848973274230957, + "learning_rate": 2.672451600776894e-05, + "loss": 2.343, + "step": 7630 + }, + { + "epoch": 1.42, + "grad_norm": 4.802037715911865, + "learning_rate": 2.6693189649771317e-05, + "loss": 2.3406, + "step": 7640 + }, + { + "epoch": 1.42, + "grad_norm": 4.5075483322143555, + "learning_rate": 2.6661863291773697e-05, + "loss": 2.4503, + "step": 7650 + }, + { + "epoch": 1.42, + "grad_norm": 4.523159503936768, + "learning_rate": 2.663053693377608e-05, + "loss": 2.4643, + "step": 7660 + }, + { + "epoch": 1.42, + "grad_norm": 3.8925390243530273, + "learning_rate": 2.659921057577846e-05, + "loss": 2.3056, + "step": 7670 + }, + { + "epoch": 1.43, + "grad_norm": 4.5294060707092285, + "learning_rate": 2.656788421778084e-05, + "loss": 2.4761, + "step": 7680 + }, + { + "epoch": 1.43, + "grad_norm": 4.151029586791992, + "learning_rate": 2.6536557859783224e-05, + "loss": 2.6412, + "step": 7690 + }, + { + "epoch": 1.43, + "grad_norm": 3.9961705207824707, + "learning_rate": 2.6505231501785604e-05, + "loss": 2.4344, + "step": 7700 + }, + { + "epoch": 1.43, + "grad_norm": 4.595808982849121, + "learning_rate": 2.6473905143787988e-05, + "loss": 2.3549, + "step": 7710 + }, + { + "epoch": 1.43, + "grad_norm": 4.139496803283691, + "learning_rate": 2.6442578785790368e-05, + "loss": 2.5274, + "step": 7720 + }, + { + "epoch": 1.43, + "grad_norm": 3.9915213584899902, + "learning_rate": 2.6411252427792748e-05, + "loss": 2.3668, + "step": 7730 + }, + { + "epoch": 1.44, + "grad_norm": 4.448828220367432, + "learning_rate": 2.6379926069795124e-05, + "loss": 2.2979, + "step": 7740 + }, + { + "epoch": 1.44, + "grad_norm": 4.070132255554199, + "learning_rate": 2.6348599711797504e-05, + "loss": 2.3898, + "step": 7750 + }, + { + "epoch": 1.44, + "grad_norm": 4.449703216552734, + "learning_rate": 2.6317273353799888e-05, + "loss": 2.4488, + "step": 7760 + }, + { + "epoch": 1.44, + "grad_norm": 3.869356393814087, + "learning_rate": 2.6285946995802268e-05, + "loss": 2.3731, + "step": 7770 + }, + { + "epoch": 1.44, + "grad_norm": 4.151821136474609, + "learning_rate": 2.625462063780465e-05, + "loss": 2.5609, + "step": 7780 + }, + { + "epoch": 1.45, + "grad_norm": 4.265530109405518, + "learning_rate": 2.622329427980703e-05, + "loss": 2.4425, + "step": 7790 + }, + { + "epoch": 1.45, + "grad_norm": 4.279960632324219, + "learning_rate": 2.619196792180941e-05, + "loss": 2.429, + "step": 7800 + }, + { + "epoch": 1.45, + "grad_norm": 4.096279144287109, + "learning_rate": 2.6160641563811795e-05, + "loss": 2.4357, + "step": 7810 + }, + { + "epoch": 1.45, + "grad_norm": 3.8605334758758545, + "learning_rate": 2.6129315205814175e-05, + "loss": 2.4386, + "step": 7820 + }, + { + "epoch": 1.45, + "grad_norm": 3.625364065170288, + "learning_rate": 2.6097988847816558e-05, + "loss": 2.3049, + "step": 7830 + }, + { + "epoch": 1.46, + "grad_norm": 4.029632091522217, + "learning_rate": 2.606666248981893e-05, + "loss": 2.4797, + "step": 7840 + }, + { + "epoch": 1.46, + "grad_norm": 3.971041202545166, + "learning_rate": 2.6035336131821315e-05, + "loss": 2.2626, + "step": 7850 + }, + { + "epoch": 1.46, + "grad_norm": 4.361209392547607, + "learning_rate": 2.6004009773823695e-05, + "loss": 2.4605, + "step": 7860 + }, + { + "epoch": 1.46, + "grad_norm": 3.867220163345337, + "learning_rate": 2.5972683415826078e-05, + "loss": 2.4137, + "step": 7870 + }, + { + "epoch": 1.46, + "grad_norm": 4.512621879577637, + "learning_rate": 2.5941357057828458e-05, + "loss": 2.3925, + "step": 7880 + }, + { + "epoch": 1.46, + "grad_norm": 4.05349588394165, + "learning_rate": 2.5910030699830838e-05, + "loss": 2.4041, + "step": 7890 + }, + { + "epoch": 1.47, + "grad_norm": 4.112064361572266, + "learning_rate": 2.5878704341833222e-05, + "loss": 2.3245, + "step": 7900 + }, + { + "epoch": 1.47, + "grad_norm": 4.162067413330078, + "learning_rate": 2.5847377983835602e-05, + "loss": 2.3881, + "step": 7910 + }, + { + "epoch": 1.47, + "grad_norm": 3.984915018081665, + "learning_rate": 2.5816051625837985e-05, + "loss": 2.5021, + "step": 7920 + }, + { + "epoch": 1.47, + "grad_norm": 3.4609036445617676, + "learning_rate": 2.578472526784036e-05, + "loss": 2.3664, + "step": 7930 + }, + { + "epoch": 1.47, + "grad_norm": 3.9925968647003174, + "learning_rate": 2.5753398909842742e-05, + "loss": 2.3428, + "step": 7940 + }, + { + "epoch": 1.48, + "grad_norm": 4.392812252044678, + "learning_rate": 2.5722072551845122e-05, + "loss": 2.4966, + "step": 7950 + }, + { + "epoch": 1.48, + "grad_norm": 4.691169261932373, + "learning_rate": 2.5690746193847505e-05, + "loss": 2.4003, + "step": 7960 + }, + { + "epoch": 1.48, + "grad_norm": 4.080898284912109, + "learning_rate": 2.5659419835849885e-05, + "loss": 2.4125, + "step": 7970 + }, + { + "epoch": 1.48, + "grad_norm": 4.430429935455322, + "learning_rate": 2.5628093477852265e-05, + "loss": 2.4005, + "step": 7980 + }, + { + "epoch": 1.48, + "grad_norm": 4.048476696014404, + "learning_rate": 2.559676711985465e-05, + "loss": 2.4376, + "step": 7990 + }, + { + "epoch": 1.48, + "grad_norm": 4.243685722351074, + "learning_rate": 2.556544076185703e-05, + "loss": 2.5779, + "step": 8000 + }, + { + "epoch": 1.49, + "grad_norm": 4.010811805725098, + "learning_rate": 2.553411440385941e-05, + "loss": 2.4601, + "step": 8010 + }, + { + "epoch": 1.49, + "grad_norm": 4.074519634246826, + "learning_rate": 2.5502788045861792e-05, + "loss": 2.4129, + "step": 8020 + }, + { + "epoch": 1.49, + "grad_norm": 4.494532108306885, + "learning_rate": 2.547146168786417e-05, + "loss": 2.4148, + "step": 8030 + }, + { + "epoch": 1.49, + "grad_norm": 5.979499340057373, + "learning_rate": 2.544013532986655e-05, + "loss": 2.4027, + "step": 8040 + }, + { + "epoch": 1.49, + "grad_norm": 4.250908851623535, + "learning_rate": 2.540880897186893e-05, + "loss": 2.3471, + "step": 8050 + }, + { + "epoch": 1.5, + "grad_norm": 4.406290531158447, + "learning_rate": 2.5377482613871312e-05, + "loss": 2.5687, + "step": 8060 + }, + { + "epoch": 1.5, + "grad_norm": 3.947685480117798, + "learning_rate": 2.5346156255873692e-05, + "loss": 2.3511, + "step": 8070 + }, + { + "epoch": 1.5, + "grad_norm": 4.436914920806885, + "learning_rate": 2.5314829897876076e-05, + "loss": 2.4439, + "step": 8080 + }, + { + "epoch": 1.5, + "grad_norm": 3.601666212081909, + "learning_rate": 2.5283503539878456e-05, + "loss": 2.396, + "step": 8090 + }, + { + "epoch": 1.5, + "grad_norm": 4.406430721282959, + "learning_rate": 2.5252177181880836e-05, + "loss": 2.3042, + "step": 8100 + }, + { + "epoch": 1.51, + "grad_norm": 3.84181809425354, + "learning_rate": 2.522085082388322e-05, + "loss": 2.4797, + "step": 8110 + }, + { + "epoch": 1.51, + "grad_norm": 4.066086769104004, + "learning_rate": 2.51895244658856e-05, + "loss": 2.595, + "step": 8120 + }, + { + "epoch": 1.51, + "grad_norm": 4.644285678863525, + "learning_rate": 2.5158198107887976e-05, + "loss": 2.5138, + "step": 8130 + }, + { + "epoch": 1.51, + "grad_norm": 4.4279303550720215, + "learning_rate": 2.5126871749890356e-05, + "loss": 2.4851, + "step": 8140 + }, + { + "epoch": 1.51, + "grad_norm": 3.7944159507751465, + "learning_rate": 2.509554539189274e-05, + "loss": 2.3989, + "step": 8150 + }, + { + "epoch": 1.51, + "grad_norm": 4.593160152435303, + "learning_rate": 2.506421903389512e-05, + "loss": 2.5637, + "step": 8160 + }, + { + "epoch": 1.52, + "grad_norm": 4.316919326782227, + "learning_rate": 2.5032892675897503e-05, + "loss": 2.3603, + "step": 8170 + }, + { + "epoch": 1.52, + "grad_norm": 4.496823787689209, + "learning_rate": 2.5001566317899883e-05, + "loss": 2.4756, + "step": 8180 + }, + { + "epoch": 1.52, + "grad_norm": 4.07798957824707, + "learning_rate": 2.4970239959902263e-05, + "loss": 2.4217, + "step": 8190 + }, + { + "epoch": 1.52, + "grad_norm": 4.062137126922607, + "learning_rate": 2.4938913601904643e-05, + "loss": 2.2771, + "step": 8200 + }, + { + "epoch": 1.52, + "grad_norm": 4.37224817276001, + "learning_rate": 2.4907587243907023e-05, + "loss": 2.4003, + "step": 8210 + }, + { + "epoch": 1.53, + "grad_norm": 4.260566234588623, + "learning_rate": 2.4876260885909406e-05, + "loss": 2.3725, + "step": 8220 + }, + { + "epoch": 1.53, + "grad_norm": 4.098892688751221, + "learning_rate": 2.4844934527911786e-05, + "loss": 2.3704, + "step": 8230 + }, + { + "epoch": 1.53, + "grad_norm": 3.625776767730713, + "learning_rate": 2.481360816991417e-05, + "loss": 2.4508, + "step": 8240 + }, + { + "epoch": 1.53, + "grad_norm": 4.206084728240967, + "learning_rate": 2.4782281811916546e-05, + "loss": 2.4504, + "step": 8250 + }, + { + "epoch": 1.53, + "grad_norm": 4.482570171356201, + "learning_rate": 2.4750955453918927e-05, + "loss": 2.3864, + "step": 8260 + }, + { + "epoch": 1.54, + "grad_norm": 4.5410237312316895, + "learning_rate": 2.471962909592131e-05, + "loss": 2.5781, + "step": 8270 + }, + { + "epoch": 1.54, + "grad_norm": 3.9175031185150146, + "learning_rate": 2.468830273792369e-05, + "loss": 2.3522, + "step": 8280 + }, + { + "epoch": 1.54, + "grad_norm": 4.778284549713135, + "learning_rate": 2.4656976379926073e-05, + "loss": 2.4545, + "step": 8290 + }, + { + "epoch": 1.54, + "grad_norm": 4.8703789710998535, + "learning_rate": 2.462565002192845e-05, + "loss": 2.4581, + "step": 8300 + }, + { + "epoch": 1.54, + "grad_norm": 4.1762847900390625, + "learning_rate": 2.4594323663930833e-05, + "loss": 2.3582, + "step": 8310 + }, + { + "epoch": 1.54, + "grad_norm": 4.193213939666748, + "learning_rate": 2.4562997305933213e-05, + "loss": 2.2968, + "step": 8320 + }, + { + "epoch": 1.55, + "grad_norm": 4.130049705505371, + "learning_rate": 2.4531670947935593e-05, + "loss": 2.5317, + "step": 8330 + }, + { + "epoch": 1.55, + "grad_norm": 4.380124092102051, + "learning_rate": 2.4500344589937977e-05, + "loss": 2.4211, + "step": 8340 + }, + { + "epoch": 1.55, + "grad_norm": 4.307916164398193, + "learning_rate": 2.4469018231940354e-05, + "loss": 2.3788, + "step": 8350 + }, + { + "epoch": 1.55, + "grad_norm": 4.4227166175842285, + "learning_rate": 2.4437691873942737e-05, + "loss": 2.3199, + "step": 8360 + }, + { + "epoch": 1.55, + "grad_norm": 4.434095859527588, + "learning_rate": 2.4406365515945117e-05, + "loss": 2.4138, + "step": 8370 + }, + { + "epoch": 1.56, + "grad_norm": 4.016805171966553, + "learning_rate": 2.43750391579475e-05, + "loss": 2.3639, + "step": 8380 + }, + { + "epoch": 1.56, + "grad_norm": 3.9864070415496826, + "learning_rate": 2.4343712799949877e-05, + "loss": 2.4281, + "step": 8390 + }, + { + "epoch": 1.56, + "grad_norm": 3.868408441543579, + "learning_rate": 2.431238644195226e-05, + "loss": 2.4809, + "step": 8400 + }, + { + "epoch": 1.56, + "grad_norm": 4.65122127532959, + "learning_rate": 2.428106008395464e-05, + "loss": 2.4681, + "step": 8410 + }, + { + "epoch": 1.56, + "grad_norm": 3.935698986053467, + "learning_rate": 2.424973372595702e-05, + "loss": 2.3455, + "step": 8420 + }, + { + "epoch": 1.56, + "grad_norm": 4.314015865325928, + "learning_rate": 2.4218407367959404e-05, + "loss": 2.4527, + "step": 8430 + }, + { + "epoch": 1.57, + "grad_norm": 4.124387264251709, + "learning_rate": 2.418708100996178e-05, + "loss": 2.4122, + "step": 8440 + }, + { + "epoch": 1.57, + "grad_norm": 3.7196121215820312, + "learning_rate": 2.4155754651964164e-05, + "loss": 2.5219, + "step": 8450 + }, + { + "epoch": 1.57, + "grad_norm": 4.566164016723633, + "learning_rate": 2.4124428293966544e-05, + "loss": 2.424, + "step": 8460 + }, + { + "epoch": 1.57, + "grad_norm": 4.448875904083252, + "learning_rate": 2.4093101935968927e-05, + "loss": 2.4276, + "step": 8470 + }, + { + "epoch": 1.57, + "grad_norm": 4.657510280609131, + "learning_rate": 2.4061775577971307e-05, + "loss": 2.4654, + "step": 8480 + }, + { + "epoch": 1.58, + "grad_norm": 3.8274145126342773, + "learning_rate": 2.4030449219973684e-05, + "loss": 2.4654, + "step": 8490 + }, + { + "epoch": 1.58, + "grad_norm": 4.3402228355407715, + "learning_rate": 2.3999122861976068e-05, + "loss": 2.3597, + "step": 8500 + }, + { + "epoch": 1.58, + "grad_norm": 3.7467360496520996, + "learning_rate": 2.3967796503978448e-05, + "loss": 2.4173, + "step": 8510 + }, + { + "epoch": 1.58, + "grad_norm": 4.307443618774414, + "learning_rate": 2.393647014598083e-05, + "loss": 2.4235, + "step": 8520 + }, + { + "epoch": 1.58, + "grad_norm": 4.339209079742432, + "learning_rate": 2.390514378798321e-05, + "loss": 2.5007, + "step": 8530 + }, + { + "epoch": 1.59, + "grad_norm": 4.1376118659973145, + "learning_rate": 2.387381742998559e-05, + "loss": 2.3295, + "step": 8540 + }, + { + "epoch": 1.59, + "grad_norm": 4.090506076812744, + "learning_rate": 2.384249107198797e-05, + "loss": 2.4239, + "step": 8550 + }, + { + "epoch": 1.59, + "grad_norm": 4.034234046936035, + "learning_rate": 2.381116471399035e-05, + "loss": 2.419, + "step": 8560 + }, + { + "epoch": 1.59, + "grad_norm": 3.6677346229553223, + "learning_rate": 2.3779838355992735e-05, + "loss": 2.5574, + "step": 8570 + }, + { + "epoch": 1.59, + "grad_norm": 4.422360897064209, + "learning_rate": 2.3748511997995115e-05, + "loss": 2.3951, + "step": 8580 + }, + { + "epoch": 1.59, + "grad_norm": 4.14889669418335, + "learning_rate": 2.3717185639997495e-05, + "loss": 2.4181, + "step": 8590 + }, + { + "epoch": 1.6, + "grad_norm": 4.343019962310791, + "learning_rate": 2.3685859281999875e-05, + "loss": 2.4356, + "step": 8600 + }, + { + "epoch": 1.6, + "grad_norm": 4.155813217163086, + "learning_rate": 2.3654532924002258e-05, + "loss": 2.4328, + "step": 8610 + }, + { + "epoch": 1.6, + "grad_norm": 4.325819969177246, + "learning_rate": 2.3623206566004638e-05, + "loss": 2.4753, + "step": 8620 + }, + { + "epoch": 1.6, + "grad_norm": 4.4720540046691895, + "learning_rate": 2.3591880208007018e-05, + "loss": 2.4074, + "step": 8630 + }, + { + "epoch": 1.6, + "grad_norm": 4.326481342315674, + "learning_rate": 2.3560553850009398e-05, + "loss": 2.2994, + "step": 8640 + }, + { + "epoch": 1.61, + "grad_norm": 4.1999592781066895, + "learning_rate": 2.3529227492011778e-05, + "loss": 2.276, + "step": 8650 + }, + { + "epoch": 1.61, + "grad_norm": 4.166476726531982, + "learning_rate": 2.349790113401416e-05, + "loss": 2.4736, + "step": 8660 + }, + { + "epoch": 1.61, + "grad_norm": 4.361721515655518, + "learning_rate": 2.346657477601654e-05, + "loss": 2.4374, + "step": 8670 + }, + { + "epoch": 1.61, + "grad_norm": 4.636282920837402, + "learning_rate": 2.3435248418018925e-05, + "loss": 2.3267, + "step": 8680 + }, + { + "epoch": 1.61, + "grad_norm": 4.051875591278076, + "learning_rate": 2.34039220600213e-05, + "loss": 2.3327, + "step": 8690 + }, + { + "epoch": 1.61, + "grad_norm": 4.308687686920166, + "learning_rate": 2.3372595702023685e-05, + "loss": 2.3804, + "step": 8700 + }, + { + "epoch": 1.62, + "grad_norm": 4.162394046783447, + "learning_rate": 2.3341269344026065e-05, + "loss": 2.487, + "step": 8710 + }, + { + "epoch": 1.62, + "grad_norm": 4.165082931518555, + "learning_rate": 2.3309942986028445e-05, + "loss": 2.5037, + "step": 8720 + }, + { + "epoch": 1.62, + "grad_norm": 4.127786636352539, + "learning_rate": 2.3278616628030825e-05, + "loss": 2.3867, + "step": 8730 + }, + { + "epoch": 1.62, + "grad_norm": 5.055369853973389, + "learning_rate": 2.3247290270033205e-05, + "loss": 2.4696, + "step": 8740 + }, + { + "epoch": 1.62, + "grad_norm": 4.151759624481201, + "learning_rate": 2.321596391203559e-05, + "loss": 2.4747, + "step": 8750 + }, + { + "epoch": 1.63, + "grad_norm": 4.65687894821167, + "learning_rate": 2.318463755403797e-05, + "loss": 2.3878, + "step": 8760 + }, + { + "epoch": 1.63, + "grad_norm": 4.260522365570068, + "learning_rate": 2.315331119604035e-05, + "loss": 2.5407, + "step": 8770 + }, + { + "epoch": 1.63, + "grad_norm": 4.139651775360107, + "learning_rate": 2.312198483804273e-05, + "loss": 2.4235, + "step": 8780 + }, + { + "epoch": 1.63, + "grad_norm": 4.276207447052002, + "learning_rate": 2.309065848004511e-05, + "loss": 2.2987, + "step": 8790 + }, + { + "epoch": 1.63, + "grad_norm": 4.2793731689453125, + "learning_rate": 2.3059332122047492e-05, + "loss": 2.4383, + "step": 8800 + }, + { + "epoch": 1.64, + "grad_norm": 3.7700252532958984, + "learning_rate": 2.3028005764049872e-05, + "loss": 2.4058, + "step": 8810 + }, + { + "epoch": 1.64, + "grad_norm": 4.340311527252197, + "learning_rate": 2.2996679406052256e-05, + "loss": 2.4854, + "step": 8820 + }, + { + "epoch": 1.64, + "grad_norm": 4.079639434814453, + "learning_rate": 2.2965353048054632e-05, + "loss": 2.4113, + "step": 8830 + }, + { + "epoch": 1.64, + "grad_norm": 3.728581666946411, + "learning_rate": 2.2934026690057016e-05, + "loss": 2.33, + "step": 8840 + }, + { + "epoch": 1.64, + "grad_norm": 4.095951080322266, + "learning_rate": 2.2902700332059396e-05, + "loss": 2.4925, + "step": 8850 + }, + { + "epoch": 1.64, + "grad_norm": 4.893881320953369, + "learning_rate": 2.2871373974061776e-05, + "loss": 2.5037, + "step": 8860 + }, + { + "epoch": 1.65, + "grad_norm": 4.797602653503418, + "learning_rate": 2.284004761606416e-05, + "loss": 2.4481, + "step": 8870 + }, + { + "epoch": 1.65, + "grad_norm": 3.614184856414795, + "learning_rate": 2.2808721258066536e-05, + "loss": 2.4042, + "step": 8880 + }, + { + "epoch": 1.65, + "grad_norm": 5.491405487060547, + "learning_rate": 2.277739490006892e-05, + "loss": 2.3421, + "step": 8890 + }, + { + "epoch": 1.65, + "grad_norm": 4.135850429534912, + "learning_rate": 2.27460685420713e-05, + "loss": 2.3792, + "step": 8900 + }, + { + "epoch": 1.65, + "grad_norm": 4.752901554107666, + "learning_rate": 2.2714742184073683e-05, + "loss": 2.4197, + "step": 8910 + }, + { + "epoch": 1.66, + "grad_norm": 5.097724914550781, + "learning_rate": 2.2683415826076063e-05, + "loss": 2.5074, + "step": 8920 + }, + { + "epoch": 1.66, + "grad_norm": 4.123813629150391, + "learning_rate": 2.2652089468078443e-05, + "loss": 2.4965, + "step": 8930 + }, + { + "epoch": 1.66, + "grad_norm": 4.163042068481445, + "learning_rate": 2.2620763110080823e-05, + "loss": 2.4533, + "step": 8940 + }, + { + "epoch": 1.66, + "grad_norm": 4.805618762969971, + "learning_rate": 2.2589436752083203e-05, + "loss": 2.3576, + "step": 8950 + }, + { + "epoch": 1.66, + "grad_norm": 4.58022403717041, + "learning_rate": 2.2558110394085586e-05, + "loss": 2.4423, + "step": 8960 + }, + { + "epoch": 1.67, + "grad_norm": 4.180849552154541, + "learning_rate": 2.2526784036087966e-05, + "loss": 2.4075, + "step": 8970 + }, + { + "epoch": 1.67, + "grad_norm": 4.48970365524292, + "learning_rate": 2.2495457678090346e-05, + "loss": 2.4105, + "step": 8980 + }, + { + "epoch": 1.67, + "grad_norm": 4.819067478179932, + "learning_rate": 2.2464131320092726e-05, + "loss": 2.3278, + "step": 8990 + }, + { + "epoch": 1.67, + "grad_norm": 4.588673114776611, + "learning_rate": 2.243280496209511e-05, + "loss": 2.5266, + "step": 9000 + }, + { + "epoch": 1.67, + "grad_norm": 4.151445388793945, + "learning_rate": 2.240147860409749e-05, + "loss": 2.6258, + "step": 9010 + }, + { + "epoch": 1.67, + "grad_norm": 4.227921485900879, + "learning_rate": 2.237015224609987e-05, + "loss": 2.2707, + "step": 9020 + }, + { + "epoch": 1.68, + "grad_norm": 4.091616153717041, + "learning_rate": 2.233882588810225e-05, + "loss": 2.3745, + "step": 9030 + }, + { + "epoch": 1.68, + "grad_norm": 4.434237480163574, + "learning_rate": 2.230749953010463e-05, + "loss": 2.5214, + "step": 9040 + }, + { + "epoch": 1.68, + "grad_norm": 3.7377941608428955, + "learning_rate": 2.2276173172107013e-05, + "loss": 2.2664, + "step": 9050 + }, + { + "epoch": 1.68, + "grad_norm": 3.9352807998657227, + "learning_rate": 2.2244846814109393e-05, + "loss": 2.4196, + "step": 9060 + }, + { + "epoch": 1.68, + "grad_norm": 6.058120250701904, + "learning_rate": 2.2213520456111773e-05, + "loss": 2.3696, + "step": 9070 + }, + { + "epoch": 1.69, + "grad_norm": 3.753314733505249, + "learning_rate": 2.2182194098114153e-05, + "loss": 2.4226, + "step": 9080 + }, + { + "epoch": 1.69, + "grad_norm": 4.035268783569336, + "learning_rate": 2.2150867740116533e-05, + "loss": 2.3935, + "step": 9090 + }, + { + "epoch": 1.69, + "grad_norm": 4.0359649658203125, + "learning_rate": 2.2119541382118917e-05, + "loss": 2.5171, + "step": 9100 + }, + { + "epoch": 1.69, + "grad_norm": 4.143360614776611, + "learning_rate": 2.2088215024121297e-05, + "loss": 2.5488, + "step": 9110 + }, + { + "epoch": 1.69, + "grad_norm": 4.005861759185791, + "learning_rate": 2.2056888666123677e-05, + "loss": 2.5157, + "step": 9120 + }, + { + "epoch": 1.69, + "grad_norm": 5.021326541900635, + "learning_rate": 2.2025562308126057e-05, + "loss": 2.3483, + "step": 9130 + }, + { + "epoch": 1.7, + "grad_norm": 3.683558225631714, + "learning_rate": 2.199423595012844e-05, + "loss": 2.5039, + "step": 9140 + }, + { + "epoch": 1.7, + "grad_norm": 3.9170446395874023, + "learning_rate": 2.196290959213082e-05, + "loss": 2.3853, + "step": 9150 + }, + { + "epoch": 1.7, + "grad_norm": 4.003520965576172, + "learning_rate": 2.19315832341332e-05, + "loss": 2.5109, + "step": 9160 + }, + { + "epoch": 1.7, + "grad_norm": 3.8243813514709473, + "learning_rate": 2.190025687613558e-05, + "loss": 2.4832, + "step": 9170 + }, + { + "epoch": 1.7, + "grad_norm": 4.661709308624268, + "learning_rate": 2.186893051813796e-05, + "loss": 2.3179, + "step": 9180 + }, + { + "epoch": 1.71, + "grad_norm": 4.192775249481201, + "learning_rate": 2.1837604160140344e-05, + "loss": 2.4681, + "step": 9190 + }, + { + "epoch": 1.71, + "grad_norm": 4.422582149505615, + "learning_rate": 2.1806277802142724e-05, + "loss": 2.3914, + "step": 9200 + }, + { + "epoch": 1.71, + "grad_norm": 4.069379806518555, + "learning_rate": 2.1774951444145107e-05, + "loss": 2.4434, + "step": 9210 + }, + { + "epoch": 1.71, + "grad_norm": 4.300215721130371, + "learning_rate": 2.1743625086147484e-05, + "loss": 2.3971, + "step": 9220 + }, + { + "epoch": 1.71, + "grad_norm": 3.8108959197998047, + "learning_rate": 2.1712298728149867e-05, + "loss": 2.4546, + "step": 9230 + }, + { + "epoch": 1.72, + "grad_norm": 3.656400442123413, + "learning_rate": 2.1680972370152247e-05, + "loss": 2.4079, + "step": 9240 + }, + { + "epoch": 1.72, + "grad_norm": 4.267639636993408, + "learning_rate": 2.1649646012154627e-05, + "loss": 2.4183, + "step": 9250 + }, + { + "epoch": 1.72, + "grad_norm": 4.378291130065918, + "learning_rate": 2.161831965415701e-05, + "loss": 2.3521, + "step": 9260 + }, + { + "epoch": 1.72, + "grad_norm": 4.273284435272217, + "learning_rate": 2.1586993296159387e-05, + "loss": 2.3965, + "step": 9270 + }, + { + "epoch": 1.72, + "grad_norm": 4.253837585449219, + "learning_rate": 2.155566693816177e-05, + "loss": 2.4458, + "step": 9280 + }, + { + "epoch": 1.72, + "grad_norm": 4.412284851074219, + "learning_rate": 2.152434058016415e-05, + "loss": 2.5076, + "step": 9290 + }, + { + "epoch": 1.73, + "grad_norm": 4.642493724822998, + "learning_rate": 2.149301422216653e-05, + "loss": 2.4012, + "step": 9300 + }, + { + "epoch": 1.73, + "grad_norm": 3.4505698680877686, + "learning_rate": 2.1461687864168914e-05, + "loss": 2.3663, + "step": 9310 + }, + { + "epoch": 1.73, + "grad_norm": 4.132535934448242, + "learning_rate": 2.143036150617129e-05, + "loss": 2.3603, + "step": 9320 + }, + { + "epoch": 1.73, + "grad_norm": 3.732874870300293, + "learning_rate": 2.1399035148173674e-05, + "loss": 2.2906, + "step": 9330 + }, + { + "epoch": 1.73, + "grad_norm": 4.370763301849365, + "learning_rate": 2.1367708790176054e-05, + "loss": 2.4297, + "step": 9340 + }, + { + "epoch": 1.74, + "grad_norm": 4.205658435821533, + "learning_rate": 2.1336382432178438e-05, + "loss": 2.4445, + "step": 9350 + }, + { + "epoch": 1.74, + "grad_norm": 3.8486382961273193, + "learning_rate": 2.1305056074180818e-05, + "loss": 2.4696, + "step": 9360 + }, + { + "epoch": 1.74, + "grad_norm": 4.221676349639893, + "learning_rate": 2.1273729716183198e-05, + "loss": 2.3604, + "step": 9370 + }, + { + "epoch": 1.74, + "grad_norm": 4.008775234222412, + "learning_rate": 2.1242403358185578e-05, + "loss": 2.5529, + "step": 9380 + }, + { + "epoch": 1.74, + "grad_norm": 5.488894939422607, + "learning_rate": 2.1211077000187958e-05, + "loss": 2.3793, + "step": 9390 + }, + { + "epoch": 1.74, + "grad_norm": 3.996288776397705, + "learning_rate": 2.117975064219034e-05, + "loss": 2.4756, + "step": 9400 + }, + { + "epoch": 1.75, + "grad_norm": 4.614337921142578, + "learning_rate": 2.114842428419272e-05, + "loss": 2.3483, + "step": 9410 + }, + { + "epoch": 1.75, + "grad_norm": 4.302055835723877, + "learning_rate": 2.11170979261951e-05, + "loss": 2.4792, + "step": 9420 + }, + { + "epoch": 1.75, + "grad_norm": 4.433431148529053, + "learning_rate": 2.108577156819748e-05, + "loss": 2.4674, + "step": 9430 + }, + { + "epoch": 1.75, + "grad_norm": 4.374996662139893, + "learning_rate": 2.1054445210199865e-05, + "loss": 2.3494, + "step": 9440 + }, + { + "epoch": 1.75, + "grad_norm": 4.1850361824035645, + "learning_rate": 2.1023118852202245e-05, + "loss": 2.4677, + "step": 9450 + }, + { + "epoch": 1.76, + "grad_norm": 4.117648601531982, + "learning_rate": 2.0991792494204625e-05, + "loss": 2.3398, + "step": 9460 + }, + { + "epoch": 1.76, + "grad_norm": 3.606243848800659, + "learning_rate": 2.0960466136207005e-05, + "loss": 2.3634, + "step": 9470 + }, + { + "epoch": 1.76, + "grad_norm": 4.0432658195495605, + "learning_rate": 2.0929139778209385e-05, + "loss": 2.5315, + "step": 9480 + }, + { + "epoch": 1.76, + "grad_norm": 3.853435516357422, + "learning_rate": 2.089781342021177e-05, + "loss": 2.4228, + "step": 9490 + }, + { + "epoch": 1.76, + "grad_norm": 4.112980365753174, + "learning_rate": 2.086648706221415e-05, + "loss": 2.5621, + "step": 9500 + }, + { + "epoch": 1.77, + "grad_norm": 4.1322808265686035, + "learning_rate": 2.083516070421653e-05, + "loss": 2.3932, + "step": 9510 + }, + { + "epoch": 1.77, + "grad_norm": 3.9664454460144043, + "learning_rate": 2.080383434621891e-05, + "loss": 2.4334, + "step": 9520 + }, + { + "epoch": 1.77, + "grad_norm": 4.619282245635986, + "learning_rate": 2.0772507988221292e-05, + "loss": 2.4343, + "step": 9530 + }, + { + "epoch": 1.77, + "grad_norm": 3.839085578918457, + "learning_rate": 2.0741181630223672e-05, + "loss": 2.4187, + "step": 9540 + }, + { + "epoch": 1.77, + "grad_norm": 4.345643520355225, + "learning_rate": 2.0709855272226052e-05, + "loss": 2.4387, + "step": 9550 + }, + { + "epoch": 1.77, + "grad_norm": 4.0575642585754395, + "learning_rate": 2.0678528914228432e-05, + "loss": 2.4043, + "step": 9560 + }, + { + "epoch": 1.78, + "grad_norm": 4.259551048278809, + "learning_rate": 2.0647202556230812e-05, + "loss": 2.3613, + "step": 9570 + }, + { + "epoch": 1.78, + "grad_norm": 4.213980674743652, + "learning_rate": 2.0615876198233195e-05, + "loss": 2.4696, + "step": 9580 + }, + { + "epoch": 1.78, + "grad_norm": 4.4075446128845215, + "learning_rate": 2.0584549840235576e-05, + "loss": 2.4833, + "step": 9590 + }, + { + "epoch": 1.78, + "grad_norm": 5.569279193878174, + "learning_rate": 2.0553223482237956e-05, + "loss": 2.389, + "step": 9600 + }, + { + "epoch": 1.78, + "grad_norm": 3.837458372116089, + "learning_rate": 2.0521897124240336e-05, + "loss": 2.3882, + "step": 9610 + }, + { + "epoch": 1.79, + "grad_norm": 4.220689296722412, + "learning_rate": 2.0490570766242716e-05, + "loss": 2.5108, + "step": 9620 + }, + { + "epoch": 1.79, + "grad_norm": 4.240020751953125, + "learning_rate": 2.04592444082451e-05, + "loss": 2.5004, + "step": 9630 + }, + { + "epoch": 1.79, + "grad_norm": 4.560367584228516, + "learning_rate": 2.042791805024748e-05, + "loss": 2.5024, + "step": 9640 + }, + { + "epoch": 1.79, + "grad_norm": 4.0892181396484375, + "learning_rate": 2.0396591692249862e-05, + "loss": 2.3545, + "step": 9650 + }, + { + "epoch": 1.79, + "grad_norm": 4.34207010269165, + "learning_rate": 2.036526533425224e-05, + "loss": 2.4782, + "step": 9660 + }, + { + "epoch": 1.79, + "grad_norm": 4.363237380981445, + "learning_rate": 2.0333938976254623e-05, + "loss": 2.206, + "step": 9670 + }, + { + "epoch": 1.8, + "grad_norm": 3.8488094806671143, + "learning_rate": 2.0302612618257003e-05, + "loss": 2.3918, + "step": 9680 + }, + { + "epoch": 1.8, + "grad_norm": 5.453420639038086, + "learning_rate": 2.0271286260259383e-05, + "loss": 2.3648, + "step": 9690 + }, + { + "epoch": 1.8, + "grad_norm": 3.7727625370025635, + "learning_rate": 2.0239959902261766e-05, + "loss": 2.4054, + "step": 9700 + }, + { + "epoch": 1.8, + "grad_norm": 4.444394588470459, + "learning_rate": 2.0208633544264143e-05, + "loss": 2.4831, + "step": 9710 + }, + { + "epoch": 1.8, + "grad_norm": 4.399291038513184, + "learning_rate": 2.0177307186266526e-05, + "loss": 2.4292, + "step": 9720 + }, + { + "epoch": 1.81, + "grad_norm": 4.340801239013672, + "learning_rate": 2.0145980828268906e-05, + "loss": 2.2941, + "step": 9730 + }, + { + "epoch": 1.81, + "grad_norm": 4.915050029754639, + "learning_rate": 2.011465447027129e-05, + "loss": 2.446, + "step": 9740 + }, + { + "epoch": 1.81, + "grad_norm": 3.6448071002960205, + "learning_rate": 2.008332811227367e-05, + "loss": 2.3937, + "step": 9750 + }, + { + "epoch": 1.81, + "grad_norm": 4.50324010848999, + "learning_rate": 2.005200175427605e-05, + "loss": 2.3799, + "step": 9760 + }, + { + "epoch": 1.81, + "grad_norm": 4.91646146774292, + "learning_rate": 2.002067539627843e-05, + "loss": 2.3239, + "step": 9770 + }, + { + "epoch": 1.82, + "grad_norm": 4.287485599517822, + "learning_rate": 1.998934903828081e-05, + "loss": 2.3565, + "step": 9780 + }, + { + "epoch": 1.82, + "grad_norm": 4.101718425750732, + "learning_rate": 1.9958022680283193e-05, + "loss": 2.3881, + "step": 9790 + }, + { + "epoch": 1.82, + "grad_norm": 4.803049087524414, + "learning_rate": 1.992669632228557e-05, + "loss": 2.4375, + "step": 9800 + }, + { + "epoch": 1.82, + "grad_norm": 6.953117370605469, + "learning_rate": 1.9895369964287953e-05, + "loss": 2.3769, + "step": 9810 + }, + { + "epoch": 1.82, + "grad_norm": 4.480542182922363, + "learning_rate": 1.9864043606290333e-05, + "loss": 2.3742, + "step": 9820 + }, + { + "epoch": 1.82, + "grad_norm": 4.368782997131348, + "learning_rate": 1.9832717248292713e-05, + "loss": 2.5032, + "step": 9830 + }, + { + "epoch": 1.83, + "grad_norm": 4.5232367515563965, + "learning_rate": 1.9801390890295097e-05, + "loss": 2.3914, + "step": 9840 + }, + { + "epoch": 1.83, + "grad_norm": 4.795730113983154, + "learning_rate": 1.9770064532297473e-05, + "loss": 2.4551, + "step": 9850 + }, + { + "epoch": 1.83, + "grad_norm": 4.36059045791626, + "learning_rate": 1.9738738174299857e-05, + "loss": 2.426, + "step": 9860 + }, + { + "epoch": 1.83, + "grad_norm": 4.606714725494385, + "learning_rate": 1.9707411816302237e-05, + "loss": 2.3973, + "step": 9870 + }, + { + "epoch": 1.83, + "grad_norm": 4.512914180755615, + "learning_rate": 1.967608545830462e-05, + "loss": 2.4423, + "step": 9880 + }, + { + "epoch": 1.84, + "grad_norm": 3.6559979915618896, + "learning_rate": 1.9644759100307e-05, + "loss": 2.2358, + "step": 9890 + }, + { + "epoch": 1.84, + "grad_norm": 4.248470306396484, + "learning_rate": 1.961343274230938e-05, + "loss": 2.2735, + "step": 9900 + }, + { + "epoch": 1.84, + "grad_norm": 4.629327774047852, + "learning_rate": 1.958210638431176e-05, + "loss": 2.4155, + "step": 9910 + }, + { + "epoch": 1.84, + "grad_norm": 4.973908424377441, + "learning_rate": 1.955078002631414e-05, + "loss": 2.5043, + "step": 9920 + }, + { + "epoch": 1.84, + "grad_norm": 4.580214977264404, + "learning_rate": 1.9519453668316524e-05, + "loss": 2.5079, + "step": 9930 + }, + { + "epoch": 1.85, + "grad_norm": 4.727945804595947, + "learning_rate": 1.9488127310318904e-05, + "loss": 2.3416, + "step": 9940 + }, + { + "epoch": 1.85, + "grad_norm": 5.331240177154541, + "learning_rate": 1.9456800952321284e-05, + "loss": 2.424, + "step": 9950 + }, + { + "epoch": 1.85, + "grad_norm": 4.183077812194824, + "learning_rate": 1.9425474594323664e-05, + "loss": 2.4149, + "step": 9960 + }, + { + "epoch": 1.85, + "grad_norm": 4.234660625457764, + "learning_rate": 1.9394148236326047e-05, + "loss": 2.4433, + "step": 9970 + }, + { + "epoch": 1.85, + "grad_norm": 4.186022758483887, + "learning_rate": 1.9362821878328427e-05, + "loss": 2.559, + "step": 9980 + }, + { + "epoch": 1.85, + "grad_norm": 5.2144856452941895, + "learning_rate": 1.9331495520330807e-05, + "loss": 2.4598, + "step": 9990 + }, + { + "epoch": 1.86, + "grad_norm": 3.981110095977783, + "learning_rate": 1.9300169162333187e-05, + "loss": 2.4261, + "step": 10000 + }, + { + "epoch": 1.86, + "grad_norm": 4.171727180480957, + "learning_rate": 1.9268842804335567e-05, + "loss": 2.3655, + "step": 10010 + }, + { + "epoch": 1.86, + "grad_norm": 4.13902473449707, + "learning_rate": 1.923751644633795e-05, + "loss": 2.5224, + "step": 10020 + }, + { + "epoch": 1.86, + "grad_norm": 4.296168804168701, + "learning_rate": 1.920619008834033e-05, + "loss": 2.3685, + "step": 10030 + }, + { + "epoch": 1.86, + "grad_norm": 4.683605194091797, + "learning_rate": 1.9174863730342714e-05, + "loss": 2.4102, + "step": 10040 + }, + { + "epoch": 1.87, + "grad_norm": 4.063732147216797, + "learning_rate": 1.914353737234509e-05, + "loss": 2.2953, + "step": 10050 + }, + { + "epoch": 1.87, + "grad_norm": 6.8384222984313965, + "learning_rate": 1.9112211014347474e-05, + "loss": 2.501, + "step": 10060 + }, + { + "epoch": 1.87, + "grad_norm": 5.450084209442139, + "learning_rate": 1.9080884656349854e-05, + "loss": 2.4856, + "step": 10070 + }, + { + "epoch": 1.87, + "grad_norm": 4.239566802978516, + "learning_rate": 1.9049558298352234e-05, + "loss": 2.448, + "step": 10080 + }, + { + "epoch": 1.87, + "grad_norm": 4.171853065490723, + "learning_rate": 1.9018231940354618e-05, + "loss": 2.4451, + "step": 10090 + }, + { + "epoch": 1.87, + "grad_norm": 4.816874980926514, + "learning_rate": 1.8986905582356994e-05, + "loss": 2.4613, + "step": 10100 + }, + { + "epoch": 1.88, + "grad_norm": 4.647317886352539, + "learning_rate": 1.8955579224359378e-05, + "loss": 2.5905, + "step": 10110 + }, + { + "epoch": 1.88, + "grad_norm": 4.123816967010498, + "learning_rate": 1.8924252866361758e-05, + "loss": 2.4489, + "step": 10120 + }, + { + "epoch": 1.88, + "grad_norm": 4.358800411224365, + "learning_rate": 1.8892926508364138e-05, + "loss": 2.4604, + "step": 10130 + }, + { + "epoch": 1.88, + "grad_norm": 4.496860980987549, + "learning_rate": 1.8861600150366518e-05, + "loss": 2.376, + "step": 10140 + }, + { + "epoch": 1.88, + "grad_norm": 3.996311902999878, + "learning_rate": 1.8830273792368898e-05, + "loss": 2.5195, + "step": 10150 + }, + { + "epoch": 1.89, + "grad_norm": 4.059746742248535, + "learning_rate": 1.879894743437128e-05, + "loss": 2.346, + "step": 10160 + }, + { + "epoch": 1.89, + "grad_norm": 4.107177257537842, + "learning_rate": 1.876762107637366e-05, + "loss": 2.4174, + "step": 10170 + }, + { + "epoch": 1.89, + "grad_norm": 3.9779505729675293, + "learning_rate": 1.8736294718376045e-05, + "loss": 2.5275, + "step": 10180 + }, + { + "epoch": 1.89, + "grad_norm": 4.001068592071533, + "learning_rate": 1.870496836037842e-05, + "loss": 2.5245, + "step": 10190 + }, + { + "epoch": 1.89, + "grad_norm": 4.435389995574951, + "learning_rate": 1.8673642002380805e-05, + "loss": 2.4051, + "step": 10200 + }, + { + "epoch": 1.9, + "grad_norm": 4.331353664398193, + "learning_rate": 1.8642315644383185e-05, + "loss": 2.4511, + "step": 10210 + }, + { + "epoch": 1.9, + "grad_norm": 4.031054496765137, + "learning_rate": 1.8610989286385565e-05, + "loss": 2.4381, + "step": 10220 + }, + { + "epoch": 1.9, + "grad_norm": 4.013234615325928, + "learning_rate": 1.8579662928387948e-05, + "loss": 2.4026, + "step": 10230 + }, + { + "epoch": 1.9, + "grad_norm": 4.422504901885986, + "learning_rate": 1.8548336570390325e-05, + "loss": 2.5228, + "step": 10240 + }, + { + "epoch": 1.9, + "grad_norm": 4.050678730010986, + "learning_rate": 1.8517010212392708e-05, + "loss": 2.3733, + "step": 10250 + }, + { + "epoch": 1.9, + "grad_norm": 3.986072301864624, + "learning_rate": 1.848568385439509e-05, + "loss": 2.4298, + "step": 10260 + }, + { + "epoch": 1.91, + "grad_norm": 5.056432247161865, + "learning_rate": 1.8454357496397472e-05, + "loss": 2.3867, + "step": 10270 + }, + { + "epoch": 1.91, + "grad_norm": 4.96640682220459, + "learning_rate": 1.8423031138399852e-05, + "loss": 2.3661, + "step": 10280 + }, + { + "epoch": 1.91, + "grad_norm": 4.190069198608398, + "learning_rate": 1.8391704780402232e-05, + "loss": 2.3079, + "step": 10290 + }, + { + "epoch": 1.91, + "grad_norm": 4.016003131866455, + "learning_rate": 1.8360378422404612e-05, + "loss": 2.3572, + "step": 10300 + }, + { + "epoch": 1.91, + "grad_norm": 4.608148097991943, + "learning_rate": 1.8329052064406992e-05, + "loss": 2.4708, + "step": 10310 + }, + { + "epoch": 1.92, + "grad_norm": 4.779339790344238, + "learning_rate": 1.8297725706409375e-05, + "loss": 2.4984, + "step": 10320 + }, + { + "epoch": 1.92, + "grad_norm": 4.29642391204834, + "learning_rate": 1.8266399348411755e-05, + "loss": 2.4389, + "step": 10330 + }, + { + "epoch": 1.92, + "grad_norm": 3.987062692642212, + "learning_rate": 1.8235072990414135e-05, + "loss": 2.5029, + "step": 10340 + }, + { + "epoch": 1.92, + "grad_norm": 4.135943412780762, + "learning_rate": 1.8203746632416515e-05, + "loss": 2.392, + "step": 10350 + }, + { + "epoch": 1.92, + "grad_norm": 4.60791540145874, + "learning_rate": 1.8172420274418895e-05, + "loss": 2.4186, + "step": 10360 + }, + { + "epoch": 1.92, + "grad_norm": 4.076789855957031, + "learning_rate": 1.814109391642128e-05, + "loss": 2.449, + "step": 10370 + }, + { + "epoch": 1.93, + "grad_norm": 4.475488185882568, + "learning_rate": 1.810976755842366e-05, + "loss": 2.4329, + "step": 10380 + }, + { + "epoch": 1.93, + "grad_norm": 4.552157878875732, + "learning_rate": 1.807844120042604e-05, + "loss": 2.4157, + "step": 10390 + }, + { + "epoch": 1.93, + "grad_norm": 3.7248082160949707, + "learning_rate": 1.804711484242842e-05, + "loss": 2.3301, + "step": 10400 + }, + { + "epoch": 1.93, + "grad_norm": 4.07235050201416, + "learning_rate": 1.8015788484430802e-05, + "loss": 2.4807, + "step": 10410 + }, + { + "epoch": 1.93, + "grad_norm": 4.504077911376953, + "learning_rate": 1.7984462126433182e-05, + "loss": 2.371, + "step": 10420 + }, + { + "epoch": 1.94, + "grad_norm": 4.039147853851318, + "learning_rate": 1.7953135768435562e-05, + "loss": 2.4404, + "step": 10430 + }, + { + "epoch": 1.94, + "grad_norm": 4.518662452697754, + "learning_rate": 1.7921809410437942e-05, + "loss": 2.3984, + "step": 10440 + }, + { + "epoch": 1.94, + "grad_norm": 3.8982303142547607, + "learning_rate": 1.7890483052440322e-05, + "loss": 2.4115, + "step": 10450 + }, + { + "epoch": 1.94, + "grad_norm": 4.247034549713135, + "learning_rate": 1.7859156694442706e-05, + "loss": 2.4675, + "step": 10460 + }, + { + "epoch": 1.94, + "grad_norm": 5.116578102111816, + "learning_rate": 1.7827830336445086e-05, + "loss": 2.3421, + "step": 10470 + }, + { + "epoch": 1.95, + "grad_norm": 3.851707696914673, + "learning_rate": 1.7796503978447466e-05, + "loss": 2.4673, + "step": 10480 + }, + { + "epoch": 1.95, + "grad_norm": 4.440132141113281, + "learning_rate": 1.7765177620449846e-05, + "loss": 2.2985, + "step": 10490 + }, + { + "epoch": 1.95, + "grad_norm": 4.295937538146973, + "learning_rate": 1.773385126245223e-05, + "loss": 2.3367, + "step": 10500 + }, + { + "epoch": 1.95, + "grad_norm": 4.4065423011779785, + "learning_rate": 1.770252490445461e-05, + "loss": 2.4196, + "step": 10510 + }, + { + "epoch": 1.95, + "grad_norm": 4.549808502197266, + "learning_rate": 1.767119854645699e-05, + "loss": 2.4587, + "step": 10520 + }, + { + "epoch": 1.95, + "grad_norm": 3.632240056991577, + "learning_rate": 1.763987218845937e-05, + "loss": 2.3999, + "step": 10530 + }, + { + "epoch": 1.96, + "grad_norm": 4.433363914489746, + "learning_rate": 1.760854583046175e-05, + "loss": 2.354, + "step": 10540 + }, + { + "epoch": 1.96, + "grad_norm": 4.315452575683594, + "learning_rate": 1.7577219472464133e-05, + "loss": 2.4668, + "step": 10550 + }, + { + "epoch": 1.96, + "grad_norm": 4.167226791381836, + "learning_rate": 1.7545893114466513e-05, + "loss": 2.3308, + "step": 10560 + }, + { + "epoch": 1.96, + "grad_norm": 5.258768081665039, + "learning_rate": 1.7514566756468896e-05, + "loss": 2.3729, + "step": 10570 + }, + { + "epoch": 1.96, + "grad_norm": 4.293150901794434, + "learning_rate": 1.7483240398471273e-05, + "loss": 2.3616, + "step": 10580 + }, + { + "epoch": 1.97, + "grad_norm": 3.8591740131378174, + "learning_rate": 1.7451914040473656e-05, + "loss": 2.3923, + "step": 10590 + }, + { + "epoch": 1.97, + "grad_norm": 5.937702178955078, + "learning_rate": 1.7420587682476036e-05, + "loss": 2.5261, + "step": 10600 + }, + { + "epoch": 1.97, + "grad_norm": 4.620584964752197, + "learning_rate": 1.7389261324478416e-05, + "loss": 2.3608, + "step": 10610 + }, + { + "epoch": 1.97, + "grad_norm": 3.822798013687134, + "learning_rate": 1.73579349664808e-05, + "loss": 2.3959, + "step": 10620 + }, + { + "epoch": 1.97, + "grad_norm": 4.2800164222717285, + "learning_rate": 1.7326608608483177e-05, + "loss": 2.5203, + "step": 10630 + }, + { + "epoch": 1.98, + "grad_norm": 4.448559284210205, + "learning_rate": 1.729528225048556e-05, + "loss": 2.4006, + "step": 10640 + }, + { + "epoch": 1.98, + "grad_norm": 4.095371723175049, + "learning_rate": 1.726395589248794e-05, + "loss": 2.4556, + "step": 10650 + }, + { + "epoch": 1.98, + "grad_norm": 4.553467750549316, + "learning_rate": 1.723262953449032e-05, + "loss": 2.3279, + "step": 10660 + }, + { + "epoch": 1.98, + "grad_norm": 4.532577991485596, + "learning_rate": 1.7201303176492703e-05, + "loss": 2.4905, + "step": 10670 + }, + { + "epoch": 1.98, + "grad_norm": 4.202332496643066, + "learning_rate": 1.716997681849508e-05, + "loss": 2.3601, + "step": 10680 + }, + { + "epoch": 1.98, + "grad_norm": 4.557814121246338, + "learning_rate": 1.7138650460497463e-05, + "loss": 2.4099, + "step": 10690 + }, + { + "epoch": 1.99, + "grad_norm": 3.7724504470825195, + "learning_rate": 1.7107324102499844e-05, + "loss": 2.4612, + "step": 10700 + }, + { + "epoch": 1.99, + "grad_norm": 4.376640319824219, + "learning_rate": 1.7075997744502227e-05, + "loss": 2.3714, + "step": 10710 + }, + { + "epoch": 1.99, + "grad_norm": 3.8816592693328857, + "learning_rate": 1.7044671386504607e-05, + "loss": 2.383, + "step": 10720 + }, + { + "epoch": 1.99, + "grad_norm": 4.241684913635254, + "learning_rate": 1.7013345028506987e-05, + "loss": 2.3658, + "step": 10730 + }, + { + "epoch": 1.99, + "grad_norm": 5.058201313018799, + "learning_rate": 1.6982018670509367e-05, + "loss": 2.5489, + "step": 10740 + }, + { + "epoch": 2.0, + "grad_norm": 4.433839797973633, + "learning_rate": 1.6950692312511747e-05, + "loss": 2.3589, + "step": 10750 + }, + { + "epoch": 2.0, + "grad_norm": 5.016234397888184, + "learning_rate": 1.691936595451413e-05, + "loss": 2.3439, + "step": 10760 + }, + { + "epoch": 2.0, + "grad_norm": 4.006540298461914, + "learning_rate": 1.688803959651651e-05, + "loss": 2.2962, + "step": 10770 + }, + { + "epoch": 2.0, + "grad_norm": 4.735622406005859, + "learning_rate": 1.685671323851889e-05, + "loss": 2.3675, + "step": 10780 + }, + { + "epoch": 2.0, + "grad_norm": 4.5275349617004395, + "learning_rate": 1.682538688052127e-05, + "loss": 2.2321, + "step": 10790 + }, + { + "epoch": 2.0, + "grad_norm": 4.588350296020508, + "learning_rate": 1.6794060522523654e-05, + "loss": 2.3445, + "step": 10800 + }, + { + "epoch": 2.01, + "grad_norm": 4.578807830810547, + "learning_rate": 1.6762734164526034e-05, + "loss": 2.2022, + "step": 10810 + }, + { + "epoch": 2.01, + "grad_norm": 5.442037582397461, + "learning_rate": 1.6731407806528414e-05, + "loss": 2.1944, + "step": 10820 + }, + { + "epoch": 2.01, + "grad_norm": 5.329452991485596, + "learning_rate": 1.6700081448530794e-05, + "loss": 2.2167, + "step": 10830 + }, + { + "epoch": 2.01, + "grad_norm": 4.357962608337402, + "learning_rate": 1.6668755090533174e-05, + "loss": 2.3061, + "step": 10840 + }, + { + "epoch": 2.01, + "grad_norm": 5.071457862854004, + "learning_rate": 1.6637428732535558e-05, + "loss": 2.1617, + "step": 10850 + }, + { + "epoch": 2.02, + "grad_norm": 4.430005073547363, + "learning_rate": 1.6606102374537938e-05, + "loss": 2.2533, + "step": 10860 + }, + { + "epoch": 2.02, + "grad_norm": 4.683684825897217, + "learning_rate": 1.6574776016540318e-05, + "loss": 2.1427, + "step": 10870 + }, + { + "epoch": 2.02, + "grad_norm": 4.369844436645508, + "learning_rate": 1.6543449658542698e-05, + "loss": 2.2505, + "step": 10880 + }, + { + "epoch": 2.02, + "grad_norm": 4.296201229095459, + "learning_rate": 1.6512123300545078e-05, + "loss": 2.2814, + "step": 10890 + }, + { + "epoch": 2.02, + "grad_norm": 4.347317218780518, + "learning_rate": 1.648079694254746e-05, + "loss": 2.2257, + "step": 10900 + }, + { + "epoch": 2.03, + "grad_norm": 4.395873069763184, + "learning_rate": 1.644947058454984e-05, + "loss": 2.3565, + "step": 10910 + }, + { + "epoch": 2.03, + "grad_norm": 4.883645057678223, + "learning_rate": 1.641814422655222e-05, + "loss": 2.2033, + "step": 10920 + }, + { + "epoch": 2.03, + "grad_norm": 4.499231338500977, + "learning_rate": 1.63868178685546e-05, + "loss": 2.1711, + "step": 10930 + }, + { + "epoch": 2.03, + "grad_norm": 4.7956953048706055, + "learning_rate": 1.6355491510556985e-05, + "loss": 2.141, + "step": 10940 + }, + { + "epoch": 2.03, + "grad_norm": 4.847490310668945, + "learning_rate": 1.6324165152559365e-05, + "loss": 2.3622, + "step": 10950 + }, + { + "epoch": 2.03, + "grad_norm": 5.346663951873779, + "learning_rate": 1.6292838794561745e-05, + "loss": 2.2275, + "step": 10960 + }, + { + "epoch": 2.04, + "grad_norm": 4.818025588989258, + "learning_rate": 1.6261512436564125e-05, + "loss": 2.2773, + "step": 10970 + }, + { + "epoch": 2.04, + "grad_norm": 4.506927967071533, + "learning_rate": 1.6230186078566505e-05, + "loss": 2.2266, + "step": 10980 + }, + { + "epoch": 2.04, + "grad_norm": 4.6733622550964355, + "learning_rate": 1.6198859720568888e-05, + "loss": 2.2539, + "step": 10990 + }, + { + "epoch": 2.04, + "grad_norm": 4.735600471496582, + "learning_rate": 1.6167533362571268e-05, + "loss": 2.2552, + "step": 11000 + }, + { + "epoch": 2.04, + "grad_norm": 5.016603946685791, + "learning_rate": 1.613620700457365e-05, + "loss": 2.252, + "step": 11010 + }, + { + "epoch": 2.05, + "grad_norm": 4.684229850769043, + "learning_rate": 1.6104880646576028e-05, + "loss": 2.2791, + "step": 11020 + }, + { + "epoch": 2.05, + "grad_norm": 4.259072780609131, + "learning_rate": 1.607355428857841e-05, + "loss": 2.2869, + "step": 11030 + }, + { + "epoch": 2.05, + "grad_norm": 4.278809547424316, + "learning_rate": 1.604222793058079e-05, + "loss": 2.3341, + "step": 11040 + }, + { + "epoch": 2.05, + "grad_norm": 5.164812088012695, + "learning_rate": 1.601090157258317e-05, + "loss": 2.2361, + "step": 11050 + }, + { + "epoch": 2.05, + "grad_norm": 4.670204162597656, + "learning_rate": 1.5979575214585555e-05, + "loss": 2.2125, + "step": 11060 + }, + { + "epoch": 2.05, + "grad_norm": 4.744375228881836, + "learning_rate": 1.5948248856587932e-05, + "loss": 2.1317, + "step": 11070 + }, + { + "epoch": 2.06, + "grad_norm": 6.084680080413818, + "learning_rate": 1.5916922498590315e-05, + "loss": 2.1588, + "step": 11080 + }, + { + "epoch": 2.06, + "grad_norm": 4.742985725402832, + "learning_rate": 1.5885596140592695e-05, + "loss": 2.2572, + "step": 11090 + }, + { + "epoch": 2.06, + "grad_norm": 5.529914379119873, + "learning_rate": 1.585426978259508e-05, + "loss": 2.2142, + "step": 11100 + }, + { + "epoch": 2.06, + "grad_norm": 4.66815185546875, + "learning_rate": 1.582294342459746e-05, + "loss": 2.2082, + "step": 11110 + }, + { + "epoch": 2.06, + "grad_norm": 5.613138675689697, + "learning_rate": 1.579161706659984e-05, + "loss": 2.196, + "step": 11120 + }, + { + "epoch": 2.07, + "grad_norm": 4.429781913757324, + "learning_rate": 1.576029070860222e-05, + "loss": 2.3071, + "step": 11130 + }, + { + "epoch": 2.07, + "grad_norm": 4.774982929229736, + "learning_rate": 1.57289643506046e-05, + "loss": 2.2391, + "step": 11140 + }, + { + "epoch": 2.07, + "grad_norm": 4.520074367523193, + "learning_rate": 1.5697637992606982e-05, + "loss": 2.3476, + "step": 11150 + }, + { + "epoch": 2.07, + "grad_norm": 4.699487209320068, + "learning_rate": 1.566631163460936e-05, + "loss": 2.2987, + "step": 11160 + }, + { + "epoch": 2.07, + "grad_norm": 4.9261250495910645, + "learning_rate": 1.5634985276611742e-05, + "loss": 2.2451, + "step": 11170 + }, + { + "epoch": 2.08, + "grad_norm": 4.742948055267334, + "learning_rate": 1.5603658918614122e-05, + "loss": 2.2451, + "step": 11180 + }, + { + "epoch": 2.08, + "grad_norm": 4.750619411468506, + "learning_rate": 1.5572332560616502e-05, + "loss": 2.236, + "step": 11190 + }, + { + "epoch": 2.08, + "grad_norm": 6.201536178588867, + "learning_rate": 1.5541006202618886e-05, + "loss": 2.3287, + "step": 11200 + }, + { + "epoch": 2.08, + "grad_norm": 5.444113731384277, + "learning_rate": 1.5509679844621262e-05, + "loss": 2.1338, + "step": 11210 + }, + { + "epoch": 2.08, + "grad_norm": 4.299532890319824, + "learning_rate": 1.5478353486623646e-05, + "loss": 2.2989, + "step": 11220 + }, + { + "epoch": 2.08, + "grad_norm": 4.273916721343994, + "learning_rate": 1.5447027128626026e-05, + "loss": 2.2331, + "step": 11230 + }, + { + "epoch": 2.09, + "grad_norm": 4.77936315536499, + "learning_rate": 1.541570077062841e-05, + "loss": 2.2602, + "step": 11240 + }, + { + "epoch": 2.09, + "grad_norm": 5.429702281951904, + "learning_rate": 1.538437441263079e-05, + "loss": 2.3563, + "step": 11250 + }, + { + "epoch": 2.09, + "grad_norm": 4.403205871582031, + "learning_rate": 1.535304805463317e-05, + "loss": 2.2532, + "step": 11260 + }, + { + "epoch": 2.09, + "grad_norm": 4.882217884063721, + "learning_rate": 1.532172169663555e-05, + "loss": 2.3309, + "step": 11270 + }, + { + "epoch": 2.09, + "grad_norm": 4.7882232666015625, + "learning_rate": 1.529039533863793e-05, + "loss": 2.1901, + "step": 11280 + }, + { + "epoch": 2.1, + "grad_norm": 5.28114652633667, + "learning_rate": 1.5259068980640313e-05, + "loss": 2.2094, + "step": 11290 + }, + { + "epoch": 2.1, + "grad_norm": 5.084491729736328, + "learning_rate": 1.5227742622642693e-05, + "loss": 2.1454, + "step": 11300 + }, + { + "epoch": 2.1, + "grad_norm": 5.040525913238525, + "learning_rate": 1.5196416264645073e-05, + "loss": 2.1998, + "step": 11310 + }, + { + "epoch": 2.1, + "grad_norm": 4.516851425170898, + "learning_rate": 1.5165089906647453e-05, + "loss": 2.1945, + "step": 11320 + }, + { + "epoch": 2.1, + "grad_norm": 4.610063076019287, + "learning_rate": 1.5133763548649835e-05, + "loss": 2.2231, + "step": 11330 + }, + { + "epoch": 2.1, + "grad_norm": 4.8471598625183105, + "learning_rate": 1.5102437190652216e-05, + "loss": 2.345, + "step": 11340 + }, + { + "epoch": 2.11, + "grad_norm": 4.626014709472656, + "learning_rate": 1.5071110832654598e-05, + "loss": 2.285, + "step": 11350 + }, + { + "epoch": 2.11, + "grad_norm": 5.075950622558594, + "learning_rate": 1.5039784474656976e-05, + "loss": 2.1407, + "step": 11360 + }, + { + "epoch": 2.11, + "grad_norm": 4.894886016845703, + "learning_rate": 1.5008458116659358e-05, + "loss": 2.181, + "step": 11370 + }, + { + "epoch": 2.11, + "grad_norm": 4.974881172180176, + "learning_rate": 1.4977131758661738e-05, + "loss": 2.3302, + "step": 11380 + }, + { + "epoch": 2.11, + "grad_norm": 5.369019508361816, + "learning_rate": 1.494580540066412e-05, + "loss": 2.1679, + "step": 11390 + }, + { + "epoch": 2.12, + "grad_norm": 4.445016860961914, + "learning_rate": 1.4914479042666502e-05, + "loss": 2.2998, + "step": 11400 + }, + { + "epoch": 2.12, + "grad_norm": 4.570034503936768, + "learning_rate": 1.488315268466888e-05, + "loss": 2.2268, + "step": 11410 + }, + { + "epoch": 2.12, + "grad_norm": 4.77443265914917, + "learning_rate": 1.4851826326671262e-05, + "loss": 2.2892, + "step": 11420 + }, + { + "epoch": 2.12, + "grad_norm": 4.94240665435791, + "learning_rate": 1.4820499968673643e-05, + "loss": 2.1972, + "step": 11430 + }, + { + "epoch": 2.12, + "grad_norm": 4.679396152496338, + "learning_rate": 1.4789173610676025e-05, + "loss": 2.2014, + "step": 11440 + }, + { + "epoch": 2.13, + "grad_norm": 4.996157169342041, + "learning_rate": 1.4757847252678405e-05, + "loss": 2.1538, + "step": 11450 + }, + { + "epoch": 2.13, + "grad_norm": 4.940011501312256, + "learning_rate": 1.4726520894680783e-05, + "loss": 2.2863, + "step": 11460 + }, + { + "epoch": 2.13, + "grad_norm": 4.545008659362793, + "learning_rate": 1.4695194536683165e-05, + "loss": 2.2417, + "step": 11470 + }, + { + "epoch": 2.13, + "grad_norm": 5.11240291595459, + "learning_rate": 1.4663868178685547e-05, + "loss": 2.2372, + "step": 11480 + }, + { + "epoch": 2.13, + "grad_norm": 5.07687520980835, + "learning_rate": 1.4632541820687929e-05, + "loss": 2.2018, + "step": 11490 + }, + { + "epoch": 2.13, + "grad_norm": 5.014510631561279, + "learning_rate": 1.460121546269031e-05, + "loss": 2.1412, + "step": 11500 + }, + { + "epoch": 2.14, + "grad_norm": 5.126745223999023, + "learning_rate": 1.4569889104692689e-05, + "loss": 2.3769, + "step": 11510 + }, + { + "epoch": 2.14, + "grad_norm": 5.206657409667969, + "learning_rate": 1.453856274669507e-05, + "loss": 2.2612, + "step": 11520 + }, + { + "epoch": 2.14, + "grad_norm": 4.919262886047363, + "learning_rate": 1.450723638869745e-05, + "loss": 2.4109, + "step": 11530 + }, + { + "epoch": 2.14, + "grad_norm": 4.906914710998535, + "learning_rate": 1.4475910030699832e-05, + "loss": 2.1306, + "step": 11540 + }, + { + "epoch": 2.14, + "grad_norm": 4.850189685821533, + "learning_rate": 1.444458367270221e-05, + "loss": 2.3328, + "step": 11550 + }, + { + "epoch": 2.15, + "grad_norm": 11.900529861450195, + "learning_rate": 1.4413257314704592e-05, + "loss": 2.1918, + "step": 11560 + }, + { + "epoch": 2.15, + "grad_norm": 5.0364990234375, + "learning_rate": 1.4381930956706974e-05, + "loss": 2.3654, + "step": 11570 + }, + { + "epoch": 2.15, + "grad_norm": 4.807575225830078, + "learning_rate": 1.4350604598709356e-05, + "loss": 2.3238, + "step": 11580 + }, + { + "epoch": 2.15, + "grad_norm": 4.882495403289795, + "learning_rate": 1.4319278240711737e-05, + "loss": 2.3126, + "step": 11590 + }, + { + "epoch": 2.15, + "grad_norm": 5.3398118019104, + "learning_rate": 1.4287951882714116e-05, + "loss": 2.3554, + "step": 11600 + }, + { + "epoch": 2.16, + "grad_norm": 4.0508599281311035, + "learning_rate": 1.4256625524716496e-05, + "loss": 2.1841, + "step": 11610 + }, + { + "epoch": 2.16, + "grad_norm": 5.476123809814453, + "learning_rate": 1.4225299166718877e-05, + "loss": 2.1732, + "step": 11620 + }, + { + "epoch": 2.16, + "grad_norm": 5.1260085105896, + "learning_rate": 1.4193972808721259e-05, + "loss": 2.3234, + "step": 11630 + }, + { + "epoch": 2.16, + "grad_norm": 5.232734680175781, + "learning_rate": 1.4162646450723641e-05, + "loss": 2.3772, + "step": 11640 + }, + { + "epoch": 2.16, + "grad_norm": 5.351359844207764, + "learning_rate": 1.413132009272602e-05, + "loss": 2.1584, + "step": 11650 + }, + { + "epoch": 2.16, + "grad_norm": 4.6979146003723145, + "learning_rate": 1.4099993734728401e-05, + "loss": 2.2411, + "step": 11660 + }, + { + "epoch": 2.17, + "grad_norm": 4.614090919494629, + "learning_rate": 1.4068667376730783e-05, + "loss": 2.246, + "step": 11670 + }, + { + "epoch": 2.17, + "grad_norm": 4.733226299285889, + "learning_rate": 1.4037341018733163e-05, + "loss": 2.1933, + "step": 11680 + }, + { + "epoch": 2.17, + "grad_norm": 4.780957221984863, + "learning_rate": 1.4006014660735544e-05, + "loss": 2.1053, + "step": 11690 + }, + { + "epoch": 2.17, + "grad_norm": 5.091309547424316, + "learning_rate": 1.3974688302737923e-05, + "loss": 2.328, + "step": 11700 + }, + { + "epoch": 2.17, + "grad_norm": 4.841605186462402, + "learning_rate": 1.3943361944740304e-05, + "loss": 2.3678, + "step": 11710 + }, + { + "epoch": 2.18, + "grad_norm": 5.011669635772705, + "learning_rate": 1.3912035586742686e-05, + "loss": 2.1706, + "step": 11720 + }, + { + "epoch": 2.18, + "grad_norm": 4.761959552764893, + "learning_rate": 1.3880709228745068e-05, + "loss": 2.3205, + "step": 11730 + }, + { + "epoch": 2.18, + "grad_norm": 5.310210704803467, + "learning_rate": 1.3849382870747448e-05, + "loss": 2.1968, + "step": 11740 + }, + { + "epoch": 2.18, + "grad_norm": 4.438429832458496, + "learning_rate": 1.3818056512749828e-05, + "loss": 2.3005, + "step": 11750 + }, + { + "epoch": 2.18, + "grad_norm": 5.3258771896362305, + "learning_rate": 1.3786730154752208e-05, + "loss": 2.3267, + "step": 11760 + }, + { + "epoch": 2.18, + "grad_norm": 4.707846641540527, + "learning_rate": 1.375540379675459e-05, + "loss": 2.1814, + "step": 11770 + }, + { + "epoch": 2.19, + "grad_norm": 5.156281471252441, + "learning_rate": 1.3724077438756971e-05, + "loss": 2.297, + "step": 11780 + }, + { + "epoch": 2.19, + "grad_norm": 4.802826404571533, + "learning_rate": 1.3692751080759353e-05, + "loss": 2.2419, + "step": 11790 + }, + { + "epoch": 2.19, + "grad_norm": 5.247189044952393, + "learning_rate": 1.3661424722761732e-05, + "loss": 2.2894, + "step": 11800 + }, + { + "epoch": 2.19, + "grad_norm": 4.583905220031738, + "learning_rate": 1.3630098364764113e-05, + "loss": 2.0925, + "step": 11810 + }, + { + "epoch": 2.19, + "grad_norm": 6.071223258972168, + "learning_rate": 1.3598772006766495e-05, + "loss": 2.3123, + "step": 11820 + }, + { + "epoch": 2.2, + "grad_norm": 5.113489627838135, + "learning_rate": 1.3567445648768875e-05, + "loss": 2.2259, + "step": 11830 + }, + { + "epoch": 2.2, + "grad_norm": 5.095146179199219, + "learning_rate": 1.3536119290771257e-05, + "loss": 2.3375, + "step": 11840 + }, + { + "epoch": 2.2, + "grad_norm": 4.298186302185059, + "learning_rate": 1.3504792932773635e-05, + "loss": 2.1844, + "step": 11850 + }, + { + "epoch": 2.2, + "grad_norm": 5.170765399932861, + "learning_rate": 1.3473466574776017e-05, + "loss": 2.138, + "step": 11860 + }, + { + "epoch": 2.2, + "grad_norm": 4.8907246589660645, + "learning_rate": 1.3442140216778399e-05, + "loss": 2.3352, + "step": 11870 + }, + { + "epoch": 2.21, + "grad_norm": 4.390583515167236, + "learning_rate": 1.341081385878078e-05, + "loss": 2.2547, + "step": 11880 + }, + { + "epoch": 2.21, + "grad_norm": 4.003589630126953, + "learning_rate": 1.3379487500783159e-05, + "loss": 2.2183, + "step": 11890 + }, + { + "epoch": 2.21, + "grad_norm": 5.281210899353027, + "learning_rate": 1.334816114278554e-05, + "loss": 2.3186, + "step": 11900 + }, + { + "epoch": 2.21, + "grad_norm": 4.8342461585998535, + "learning_rate": 1.331683478478792e-05, + "loss": 2.2049, + "step": 11910 + }, + { + "epoch": 2.21, + "grad_norm": 5.283405780792236, + "learning_rate": 1.3285508426790302e-05, + "loss": 2.3101, + "step": 11920 + }, + { + "epoch": 2.21, + "grad_norm": 4.727782726287842, + "learning_rate": 1.3254182068792684e-05, + "loss": 2.1859, + "step": 11930 + }, + { + "epoch": 2.22, + "grad_norm": 6.096275329589844, + "learning_rate": 1.3222855710795062e-05, + "loss": 2.2318, + "step": 11940 + }, + { + "epoch": 2.22, + "grad_norm": 4.112122058868408, + "learning_rate": 1.3191529352797444e-05, + "loss": 2.2752, + "step": 11950 + }, + { + "epoch": 2.22, + "grad_norm": 4.975126266479492, + "learning_rate": 1.3160202994799826e-05, + "loss": 2.2632, + "step": 11960 + }, + { + "epoch": 2.22, + "grad_norm": 5.01999044418335, + "learning_rate": 1.3128876636802207e-05, + "loss": 2.2912, + "step": 11970 + }, + { + "epoch": 2.22, + "grad_norm": 4.252249240875244, + "learning_rate": 1.3097550278804587e-05, + "loss": 2.1791, + "step": 11980 + }, + { + "epoch": 2.23, + "grad_norm": 6.292074680328369, + "learning_rate": 1.3066223920806966e-05, + "loss": 2.3141, + "step": 11990 + }, + { + "epoch": 2.23, + "grad_norm": 4.5826921463012695, + "learning_rate": 1.3034897562809347e-05, + "loss": 2.3514, + "step": 12000 + }, + { + "epoch": 2.23, + "grad_norm": 4.747256278991699, + "learning_rate": 1.3003571204811729e-05, + "loss": 2.2335, + "step": 12010 + }, + { + "epoch": 2.23, + "grad_norm": 5.5181779861450195, + "learning_rate": 1.297224484681411e-05, + "loss": 2.3244, + "step": 12020 + }, + { + "epoch": 2.23, + "grad_norm": 5.566039085388184, + "learning_rate": 1.2940918488816493e-05, + "loss": 2.1537, + "step": 12030 + }, + { + "epoch": 2.23, + "grad_norm": 4.884830474853516, + "learning_rate": 1.2909592130818871e-05, + "loss": 2.2966, + "step": 12040 + }, + { + "epoch": 2.24, + "grad_norm": 4.978501319885254, + "learning_rate": 1.2878265772821253e-05, + "loss": 2.2039, + "step": 12050 + }, + { + "epoch": 2.24, + "grad_norm": 5.066789627075195, + "learning_rate": 1.2846939414823633e-05, + "loss": 2.3125, + "step": 12060 + }, + { + "epoch": 2.24, + "grad_norm": 4.925536632537842, + "learning_rate": 1.2815613056826014e-05, + "loss": 2.2723, + "step": 12070 + }, + { + "epoch": 2.24, + "grad_norm": 4.975927352905273, + "learning_rate": 1.2784286698828396e-05, + "loss": 2.3694, + "step": 12080 + }, + { + "epoch": 2.24, + "grad_norm": 4.9011335372924805, + "learning_rate": 1.2752960340830774e-05, + "loss": 2.1171, + "step": 12090 + }, + { + "epoch": 2.25, + "grad_norm": 6.841973304748535, + "learning_rate": 1.2721633982833156e-05, + "loss": 2.3144, + "step": 12100 + }, + { + "epoch": 2.25, + "grad_norm": 5.277383804321289, + "learning_rate": 1.2690307624835538e-05, + "loss": 2.2665, + "step": 12110 + }, + { + "epoch": 2.25, + "grad_norm": 5.377317428588867, + "learning_rate": 1.265898126683792e-05, + "loss": 2.1452, + "step": 12120 + }, + { + "epoch": 2.25, + "grad_norm": 5.7158074378967285, + "learning_rate": 1.26276549088403e-05, + "loss": 2.1965, + "step": 12130 + }, + { + "epoch": 2.25, + "grad_norm": 5.411214351654053, + "learning_rate": 1.2596328550842678e-05, + "loss": 2.2097, + "step": 12140 + }, + { + "epoch": 2.26, + "grad_norm": 5.748459339141846, + "learning_rate": 1.256500219284506e-05, + "loss": 2.303, + "step": 12150 + }, + { + "epoch": 2.26, + "grad_norm": 4.961619853973389, + "learning_rate": 1.2533675834847441e-05, + "loss": 2.2179, + "step": 12160 + }, + { + "epoch": 2.26, + "grad_norm": 4.2982001304626465, + "learning_rate": 1.2502349476849823e-05, + "loss": 2.2276, + "step": 12170 + }, + { + "epoch": 2.26, + "grad_norm": 5.200771808624268, + "learning_rate": 1.2471023118852203e-05, + "loss": 2.2831, + "step": 12180 + }, + { + "epoch": 2.26, + "grad_norm": 4.932064533233643, + "learning_rate": 1.2439696760854585e-05, + "loss": 2.2783, + "step": 12190 + }, + { + "epoch": 2.26, + "grad_norm": 4.722069263458252, + "learning_rate": 1.2408370402856965e-05, + "loss": 2.2223, + "step": 12200 + }, + { + "epoch": 2.27, + "grad_norm": 5.084744453430176, + "learning_rate": 1.2377044044859345e-05, + "loss": 2.3635, + "step": 12210 + }, + { + "epoch": 2.27, + "grad_norm": 5.1085357666015625, + "learning_rate": 1.2345717686861725e-05, + "loss": 2.2819, + "step": 12220 + }, + { + "epoch": 2.27, + "grad_norm": 4.719175338745117, + "learning_rate": 1.2314391328864107e-05, + "loss": 2.3169, + "step": 12230 + }, + { + "epoch": 2.27, + "grad_norm": 4.763276100158691, + "learning_rate": 1.2283064970866488e-05, + "loss": 2.3517, + "step": 12240 + }, + { + "epoch": 2.27, + "grad_norm": 4.76812744140625, + "learning_rate": 1.2251738612868868e-05, + "loss": 2.3041, + "step": 12250 + }, + { + "epoch": 2.28, + "grad_norm": 4.920300006866455, + "learning_rate": 1.222041225487125e-05, + "loss": 2.2292, + "step": 12260 + }, + { + "epoch": 2.28, + "grad_norm": 4.785378932952881, + "learning_rate": 1.218908589687363e-05, + "loss": 2.3244, + "step": 12270 + }, + { + "epoch": 2.28, + "grad_norm": 5.033608913421631, + "learning_rate": 1.215775953887601e-05, + "loss": 2.2818, + "step": 12280 + }, + { + "epoch": 2.28, + "grad_norm": 4.9999775886535645, + "learning_rate": 1.212643318087839e-05, + "loss": 2.1768, + "step": 12290 + }, + { + "epoch": 2.28, + "grad_norm": 5.320832252502441, + "learning_rate": 1.2095106822880772e-05, + "loss": 2.2857, + "step": 12300 + }, + { + "epoch": 2.29, + "grad_norm": 4.435542106628418, + "learning_rate": 1.2063780464883154e-05, + "loss": 2.2643, + "step": 12310 + }, + { + "epoch": 2.29, + "grad_norm": 5.2425336837768555, + "learning_rate": 1.2032454106885534e-05, + "loss": 2.2234, + "step": 12320 + }, + { + "epoch": 2.29, + "grad_norm": 4.80018949508667, + "learning_rate": 1.2001127748887915e-05, + "loss": 2.372, + "step": 12330 + }, + { + "epoch": 2.29, + "grad_norm": 6.5564045906066895, + "learning_rate": 1.1969801390890295e-05, + "loss": 2.239, + "step": 12340 + }, + { + "epoch": 2.29, + "grad_norm": 4.771701335906982, + "learning_rate": 1.1938475032892677e-05, + "loss": 2.3006, + "step": 12350 + }, + { + "epoch": 2.29, + "grad_norm": 5.302359104156494, + "learning_rate": 1.1907148674895057e-05, + "loss": 2.3294, + "step": 12360 + }, + { + "epoch": 2.3, + "grad_norm": 4.32417106628418, + "learning_rate": 1.1875822316897437e-05, + "loss": 2.1954, + "step": 12370 + }, + { + "epoch": 2.3, + "grad_norm": 5.4360456466674805, + "learning_rate": 1.1844495958899819e-05, + "loss": 2.2502, + "step": 12380 + }, + { + "epoch": 2.3, + "grad_norm": 5.270229816436768, + "learning_rate": 1.1813169600902199e-05, + "loss": 2.2166, + "step": 12390 + }, + { + "epoch": 2.3, + "grad_norm": 4.748353004455566, + "learning_rate": 1.178184324290458e-05, + "loss": 2.171, + "step": 12400 + }, + { + "epoch": 2.3, + "grad_norm": 4.804108619689941, + "learning_rate": 1.1750516884906962e-05, + "loss": 2.3239, + "step": 12410 + }, + { + "epoch": 2.31, + "grad_norm": 5.346190452575684, + "learning_rate": 1.1719190526909342e-05, + "loss": 2.1899, + "step": 12420 + }, + { + "epoch": 2.31, + "grad_norm": 5.100567817687988, + "learning_rate": 1.1687864168911723e-05, + "loss": 2.3432, + "step": 12430 + }, + { + "epoch": 2.31, + "grad_norm": 4.597306728363037, + "learning_rate": 1.1656537810914103e-05, + "loss": 2.2212, + "step": 12440 + }, + { + "epoch": 2.31, + "grad_norm": 4.855820655822754, + "learning_rate": 1.1625211452916484e-05, + "loss": 2.2674, + "step": 12450 + }, + { + "epoch": 2.31, + "grad_norm": 4.747831344604492, + "learning_rate": 1.1593885094918864e-05, + "loss": 2.298, + "step": 12460 + }, + { + "epoch": 2.31, + "grad_norm": 4.8098673820495605, + "learning_rate": 1.1562558736921246e-05, + "loss": 2.2259, + "step": 12470 + }, + { + "epoch": 2.32, + "grad_norm": 5.240844249725342, + "learning_rate": 1.1531232378923628e-05, + "loss": 2.2065, + "step": 12480 + }, + { + "epoch": 2.32, + "grad_norm": 5.017106056213379, + "learning_rate": 1.1499906020926008e-05, + "loss": 2.1382, + "step": 12490 + }, + { + "epoch": 2.32, + "grad_norm": 5.257959365844727, + "learning_rate": 1.146857966292839e-05, + "loss": 2.3176, + "step": 12500 + }, + { + "epoch": 2.32, + "grad_norm": 5.328380107879639, + "learning_rate": 1.1437253304930768e-05, + "loss": 2.2397, + "step": 12510 + }, + { + "epoch": 2.32, + "grad_norm": 4.653416156768799, + "learning_rate": 1.140592694693315e-05, + "loss": 2.1877, + "step": 12520 + }, + { + "epoch": 2.33, + "grad_norm": 5.2487592697143555, + "learning_rate": 1.1374600588935531e-05, + "loss": 2.2622, + "step": 12530 + }, + { + "epoch": 2.33, + "grad_norm": 4.720800399780273, + "learning_rate": 1.1343274230937911e-05, + "loss": 2.1956, + "step": 12540 + }, + { + "epoch": 2.33, + "grad_norm": 4.962193489074707, + "learning_rate": 1.1311947872940293e-05, + "loss": 2.1048, + "step": 12550 + }, + { + "epoch": 2.33, + "grad_norm": 5.106245994567871, + "learning_rate": 1.1280621514942673e-05, + "loss": 2.2074, + "step": 12560 + }, + { + "epoch": 2.33, + "grad_norm": 5.294451713562012, + "learning_rate": 1.1249295156945055e-05, + "loss": 2.3578, + "step": 12570 + }, + { + "epoch": 2.34, + "grad_norm": 5.8226847648620605, + "learning_rate": 1.1217968798947435e-05, + "loss": 2.2542, + "step": 12580 + }, + { + "epoch": 2.34, + "grad_norm": 5.236142158508301, + "learning_rate": 1.1186642440949815e-05, + "loss": 2.1268, + "step": 12590 + }, + { + "epoch": 2.34, + "grad_norm": 5.430288314819336, + "learning_rate": 1.1155316082952197e-05, + "loss": 2.1378, + "step": 12600 + }, + { + "epoch": 2.34, + "grad_norm": 5.315874099731445, + "learning_rate": 1.1123989724954577e-05, + "loss": 2.1105, + "step": 12610 + }, + { + "epoch": 2.34, + "grad_norm": 5.95720911026001, + "learning_rate": 1.1092663366956958e-05, + "loss": 2.2494, + "step": 12620 + }, + { + "epoch": 2.34, + "grad_norm": 4.5179667472839355, + "learning_rate": 1.106133700895934e-05, + "loss": 2.2131, + "step": 12630 + }, + { + "epoch": 2.35, + "grad_norm": 4.903148174285889, + "learning_rate": 1.103001065096172e-05, + "loss": 2.1935, + "step": 12640 + }, + { + "epoch": 2.35, + "grad_norm": 5.403289794921875, + "learning_rate": 1.09986842929641e-05, + "loss": 2.1902, + "step": 12650 + }, + { + "epoch": 2.35, + "grad_norm": 5.413578987121582, + "learning_rate": 1.096735793496648e-05, + "loss": 2.3298, + "step": 12660 + }, + { + "epoch": 2.35, + "grad_norm": 5.452661991119385, + "learning_rate": 1.0936031576968862e-05, + "loss": 2.2673, + "step": 12670 + }, + { + "epoch": 2.35, + "grad_norm": 4.752969264984131, + "learning_rate": 1.0904705218971242e-05, + "loss": 2.1987, + "step": 12680 + }, + { + "epoch": 2.36, + "grad_norm": 4.89524507522583, + "learning_rate": 1.0873378860973624e-05, + "loss": 2.2216, + "step": 12690 + }, + { + "epoch": 2.36, + "grad_norm": 4.80157995223999, + "learning_rate": 1.0842052502976005e-05, + "loss": 2.2769, + "step": 12700 + }, + { + "epoch": 2.36, + "grad_norm": 4.679454326629639, + "learning_rate": 1.0810726144978385e-05, + "loss": 2.2917, + "step": 12710 + }, + { + "epoch": 2.36, + "grad_norm": 4.759067058563232, + "learning_rate": 1.0779399786980767e-05, + "loss": 2.2885, + "step": 12720 + }, + { + "epoch": 2.36, + "grad_norm": 4.863231658935547, + "learning_rate": 1.0748073428983147e-05, + "loss": 2.2205, + "step": 12730 + }, + { + "epoch": 2.36, + "grad_norm": 5.668412208557129, + "learning_rate": 1.0716747070985527e-05, + "loss": 2.184, + "step": 12740 + }, + { + "epoch": 2.37, + "grad_norm": 5.075872421264648, + "learning_rate": 1.0685420712987909e-05, + "loss": 2.2732, + "step": 12750 + }, + { + "epoch": 2.37, + "grad_norm": 4.573457717895508, + "learning_rate": 1.0654094354990289e-05, + "loss": 2.1865, + "step": 12760 + }, + { + "epoch": 2.37, + "grad_norm": 5.100333213806152, + "learning_rate": 1.062276799699267e-05, + "loss": 2.1851, + "step": 12770 + }, + { + "epoch": 2.37, + "grad_norm": 4.814767360687256, + "learning_rate": 1.059144163899505e-05, + "loss": 2.2246, + "step": 12780 + }, + { + "epoch": 2.37, + "grad_norm": 5.054508686065674, + "learning_rate": 1.0560115280997432e-05, + "loss": 2.3433, + "step": 12790 + }, + { + "epoch": 2.38, + "grad_norm": 5.447021007537842, + "learning_rate": 1.0528788922999812e-05, + "loss": 2.2906, + "step": 12800 + }, + { + "epoch": 2.38, + "grad_norm": 5.723243713378906, + "learning_rate": 1.0497462565002192e-05, + "loss": 2.1602, + "step": 12810 + }, + { + "epoch": 2.38, + "grad_norm": 4.797729969024658, + "learning_rate": 1.0466136207004574e-05, + "loss": 2.213, + "step": 12820 + }, + { + "epoch": 2.38, + "grad_norm": 5.9997735023498535, + "learning_rate": 1.0434809849006954e-05, + "loss": 2.2403, + "step": 12830 + }, + { + "epoch": 2.38, + "grad_norm": 5.520061492919922, + "learning_rate": 1.0403483491009336e-05, + "loss": 2.2191, + "step": 12840 + }, + { + "epoch": 2.39, + "grad_norm": 4.711124420166016, + "learning_rate": 1.0372157133011716e-05, + "loss": 2.2004, + "step": 12850 + }, + { + "epoch": 2.39, + "grad_norm": 5.090340614318848, + "learning_rate": 1.0340830775014098e-05, + "loss": 2.2455, + "step": 12860 + }, + { + "epoch": 2.39, + "grad_norm": 5.26514196395874, + "learning_rate": 1.030950441701648e-05, + "loss": 2.1776, + "step": 12870 + }, + { + "epoch": 2.39, + "grad_norm": 5.399611473083496, + "learning_rate": 1.027817805901886e-05, + "loss": 2.171, + "step": 12880 + }, + { + "epoch": 2.39, + "grad_norm": 5.001101016998291, + "learning_rate": 1.024685170102124e-05, + "loss": 2.2674, + "step": 12890 + }, + { + "epoch": 2.39, + "grad_norm": 4.122973442077637, + "learning_rate": 1.021552534302362e-05, + "loss": 2.1839, + "step": 12900 + }, + { + "epoch": 2.4, + "grad_norm": 5.108747959136963, + "learning_rate": 1.0184198985026001e-05, + "loss": 2.2297, + "step": 12910 + }, + { + "epoch": 2.4, + "grad_norm": 5.05750036239624, + "learning_rate": 1.0152872627028383e-05, + "loss": 2.3335, + "step": 12920 + }, + { + "epoch": 2.4, + "grad_norm": 5.032607555389404, + "learning_rate": 1.0121546269030763e-05, + "loss": 2.3326, + "step": 12930 + }, + { + "epoch": 2.4, + "grad_norm": 4.749469757080078, + "learning_rate": 1.0090219911033145e-05, + "loss": 2.2638, + "step": 12940 + }, + { + "epoch": 2.4, + "grad_norm": 5.402693748474121, + "learning_rate": 1.0058893553035525e-05, + "loss": 2.2938, + "step": 12950 + }, + { + "epoch": 2.41, + "grad_norm": 4.914982318878174, + "learning_rate": 1.0027567195037905e-05, + "loss": 2.2562, + "step": 12960 + }, + { + "epoch": 2.41, + "grad_norm": 5.372003078460693, + "learning_rate": 9.996240837040286e-06, + "loss": 2.1931, + "step": 12970 + }, + { + "epoch": 2.41, + "grad_norm": 4.706940174102783, + "learning_rate": 9.964914479042667e-06, + "loss": 2.2326, + "step": 12980 + }, + { + "epoch": 2.41, + "grad_norm": 4.209222316741943, + "learning_rate": 9.933588121045048e-06, + "loss": 2.144, + "step": 12990 + }, + { + "epoch": 2.41, + "grad_norm": 5.133508205413818, + "learning_rate": 9.902261763047428e-06, + "loss": 2.3475, + "step": 13000 + }, + { + "epoch": 2.41, + "grad_norm": 4.845363616943359, + "learning_rate": 9.87093540504981e-06, + "loss": 2.2409, + "step": 13010 + }, + { + "epoch": 2.42, + "grad_norm": 4.653754234313965, + "learning_rate": 9.83960904705219e-06, + "loss": 2.3025, + "step": 13020 + }, + { + "epoch": 2.42, + "grad_norm": 5.139420509338379, + "learning_rate": 9.808282689054572e-06, + "loss": 2.2286, + "step": 13030 + }, + { + "epoch": 2.42, + "grad_norm": 5.228572845458984, + "learning_rate": 9.776956331056952e-06, + "loss": 2.4346, + "step": 13040 + }, + { + "epoch": 2.42, + "grad_norm": 5.307019233703613, + "learning_rate": 9.745629973059332e-06, + "loss": 2.2245, + "step": 13050 + }, + { + "epoch": 2.42, + "grad_norm": 5.422297954559326, + "learning_rate": 9.714303615061714e-06, + "loss": 2.1797, + "step": 13060 + }, + { + "epoch": 2.43, + "grad_norm": 5.005941390991211, + "learning_rate": 9.682977257064094e-06, + "loss": 2.2727, + "step": 13070 + }, + { + "epoch": 2.43, + "grad_norm": 5.4763689041137695, + "learning_rate": 9.651650899066475e-06, + "loss": 2.1005, + "step": 13080 + }, + { + "epoch": 2.43, + "grad_norm": 5.4908576011657715, + "learning_rate": 9.620324541068857e-06, + "loss": 2.162, + "step": 13090 + }, + { + "epoch": 2.43, + "grad_norm": 5.450204849243164, + "learning_rate": 9.588998183071237e-06, + "loss": 2.2548, + "step": 13100 + }, + { + "epoch": 2.43, + "grad_norm": 5.938765048980713, + "learning_rate": 9.557671825073617e-06, + "loss": 2.1718, + "step": 13110 + }, + { + "epoch": 2.44, + "grad_norm": 5.608719825744629, + "learning_rate": 9.526345467075997e-06, + "loss": 2.2955, + "step": 13120 + }, + { + "epoch": 2.44, + "grad_norm": 4.325197696685791, + "learning_rate": 9.495019109078379e-06, + "loss": 2.167, + "step": 13130 + }, + { + "epoch": 2.44, + "grad_norm": 4.6594767570495605, + "learning_rate": 9.46369275108076e-06, + "loss": 2.2667, + "step": 13140 + }, + { + "epoch": 2.44, + "grad_norm": 4.628880977630615, + "learning_rate": 9.43236639308314e-06, + "loss": 2.162, + "step": 13150 + }, + { + "epoch": 2.44, + "grad_norm": 5.200460433959961, + "learning_rate": 9.401040035085522e-06, + "loss": 2.2934, + "step": 13160 + }, + { + "epoch": 2.44, + "grad_norm": 5.961476802825928, + "learning_rate": 9.369713677087902e-06, + "loss": 2.3302, + "step": 13170 + }, + { + "epoch": 2.45, + "grad_norm": 5.322465896606445, + "learning_rate": 9.338387319090282e-06, + "loss": 2.1746, + "step": 13180 + }, + { + "epoch": 2.45, + "grad_norm": 5.763502597808838, + "learning_rate": 9.307060961092662e-06, + "loss": 2.2987, + "step": 13190 + }, + { + "epoch": 2.45, + "grad_norm": 5.322182655334473, + "learning_rate": 9.275734603095044e-06, + "loss": 2.326, + "step": 13200 + }, + { + "epoch": 2.45, + "grad_norm": 5.088197708129883, + "learning_rate": 9.244408245097426e-06, + "loss": 2.1486, + "step": 13210 + }, + { + "epoch": 2.45, + "grad_norm": 5.4456787109375, + "learning_rate": 9.213081887099806e-06, + "loss": 2.2645, + "step": 13220 + }, + { + "epoch": 2.46, + "grad_norm": 5.5985260009765625, + "learning_rate": 9.181755529102188e-06, + "loss": 2.3224, + "step": 13230 + }, + { + "epoch": 2.46, + "grad_norm": 5.466270923614502, + "learning_rate": 9.150429171104568e-06, + "loss": 2.3032, + "step": 13240 + }, + { + "epoch": 2.46, + "grad_norm": 5.2544732093811035, + "learning_rate": 9.11910281310695e-06, + "loss": 2.1505, + "step": 13250 + }, + { + "epoch": 2.46, + "grad_norm": 6.213293552398682, + "learning_rate": 9.08777645510933e-06, + "loss": 2.2228, + "step": 13260 + }, + { + "epoch": 2.46, + "grad_norm": 5.244242191314697, + "learning_rate": 9.05645009711171e-06, + "loss": 2.28, + "step": 13270 + }, + { + "epoch": 2.47, + "grad_norm": 5.124194145202637, + "learning_rate": 9.025123739114091e-06, + "loss": 2.3366, + "step": 13280 + }, + { + "epoch": 2.47, + "grad_norm": 4.310089111328125, + "learning_rate": 8.993797381116471e-06, + "loss": 2.2799, + "step": 13290 + }, + { + "epoch": 2.47, + "grad_norm": 5.166079998016357, + "learning_rate": 8.962471023118853e-06, + "loss": 2.2094, + "step": 13300 + }, + { + "epoch": 2.47, + "grad_norm": 5.7795329093933105, + "learning_rate": 8.931144665121235e-06, + "loss": 2.2734, + "step": 13310 + }, + { + "epoch": 2.47, + "grad_norm": 4.956170558929443, + "learning_rate": 8.899818307123615e-06, + "loss": 2.1813, + "step": 13320 + }, + { + "epoch": 2.47, + "grad_norm": 6.007333278656006, + "learning_rate": 8.868491949125995e-06, + "loss": 2.2364, + "step": 13330 + }, + { + "epoch": 2.48, + "grad_norm": 5.24399995803833, + "learning_rate": 8.837165591128375e-06, + "loss": 2.2734, + "step": 13340 + }, + { + "epoch": 2.48, + "grad_norm": 6.286114692687988, + "learning_rate": 8.805839233130756e-06, + "loss": 2.2163, + "step": 13350 + }, + { + "epoch": 2.48, + "grad_norm": 4.692686557769775, + "learning_rate": 8.774512875133136e-06, + "loss": 2.201, + "step": 13360 + }, + { + "epoch": 2.48, + "grad_norm": 4.9664483070373535, + "learning_rate": 8.743186517135518e-06, + "loss": 2.2684, + "step": 13370 + }, + { + "epoch": 2.48, + "grad_norm": 5.770281791687012, + "learning_rate": 8.7118601591379e-06, + "loss": 2.2949, + "step": 13380 + }, + { + "epoch": 2.49, + "grad_norm": 4.857741832733154, + "learning_rate": 8.68053380114028e-06, + "loss": 2.2934, + "step": 13390 + }, + { + "epoch": 2.49, + "grad_norm": 5.026750564575195, + "learning_rate": 8.649207443142662e-06, + "loss": 2.2011, + "step": 13400 + }, + { + "epoch": 2.49, + "grad_norm": 5.276968955993652, + "learning_rate": 8.617881085145042e-06, + "loss": 2.2554, + "step": 13410 + }, + { + "epoch": 2.49, + "grad_norm": 4.570988655090332, + "learning_rate": 8.586554727147422e-06, + "loss": 2.1372, + "step": 13420 + }, + { + "epoch": 2.49, + "grad_norm": 5.223836421966553, + "learning_rate": 8.555228369149803e-06, + "loss": 2.3167, + "step": 13430 + }, + { + "epoch": 2.49, + "grad_norm": 4.428679943084717, + "learning_rate": 8.523902011152183e-06, + "loss": 2.2213, + "step": 13440 + }, + { + "epoch": 2.5, + "grad_norm": 4.6929402351379395, + "learning_rate": 8.492575653154565e-06, + "loss": 2.3149, + "step": 13450 + }, + { + "epoch": 2.5, + "grad_norm": 5.652947902679443, + "learning_rate": 8.461249295156945e-06, + "loss": 2.3203, + "step": 13460 + }, + { + "epoch": 2.5, + "grad_norm": 5.486786842346191, + "learning_rate": 8.429922937159327e-06, + "loss": 2.2709, + "step": 13470 + }, + { + "epoch": 2.5, + "grad_norm": 5.114050388336182, + "learning_rate": 8.398596579161707e-06, + "loss": 2.3454, + "step": 13480 + }, + { + "epoch": 2.5, + "grad_norm": 5.2786173820495605, + "learning_rate": 8.367270221164087e-06, + "loss": 2.2613, + "step": 13490 + }, + { + "epoch": 2.51, + "grad_norm": 4.374701023101807, + "learning_rate": 8.335943863166469e-06, + "loss": 2.1595, + "step": 13500 + }, + { + "epoch": 2.51, + "grad_norm": 4.902259349822998, + "learning_rate": 8.304617505168849e-06, + "loss": 2.3199, + "step": 13510 + }, + { + "epoch": 2.51, + "grad_norm": 4.804900646209717, + "learning_rate": 8.27329114717123e-06, + "loss": 2.2834, + "step": 13520 + }, + { + "epoch": 2.51, + "grad_norm": 5.4029645919799805, + "learning_rate": 8.24196478917361e-06, + "loss": 2.3464, + "step": 13530 + }, + { + "epoch": 2.51, + "grad_norm": 4.877260684967041, + "learning_rate": 8.210638431175992e-06, + "loss": 2.3503, + "step": 13540 + }, + { + "epoch": 2.52, + "grad_norm": 4.730605602264404, + "learning_rate": 8.179312073178374e-06, + "loss": 2.344, + "step": 13550 + }, + { + "epoch": 2.52, + "grad_norm": 4.457608699798584, + "learning_rate": 8.147985715180754e-06, + "loss": 2.1494, + "step": 13560 + }, + { + "epoch": 2.52, + "grad_norm": 5.026783466339111, + "learning_rate": 8.116659357183134e-06, + "loss": 2.3395, + "step": 13570 + }, + { + "epoch": 2.52, + "grad_norm": 4.831313133239746, + "learning_rate": 8.085332999185514e-06, + "loss": 2.1817, + "step": 13580 + }, + { + "epoch": 2.52, + "grad_norm": 5.2338337898254395, + "learning_rate": 8.054006641187896e-06, + "loss": 2.2464, + "step": 13590 + }, + { + "epoch": 2.52, + "grad_norm": 5.4740519523620605, + "learning_rate": 8.022680283190277e-06, + "loss": 2.1953, + "step": 13600 + }, + { + "epoch": 2.53, + "grad_norm": 5.073474884033203, + "learning_rate": 7.991353925192658e-06, + "loss": 2.1982, + "step": 13610 + }, + { + "epoch": 2.53, + "grad_norm": 5.817521572113037, + "learning_rate": 7.96002756719504e-06, + "loss": 2.2207, + "step": 13620 + }, + { + "epoch": 2.53, + "grad_norm": 5.21115779876709, + "learning_rate": 7.92870120919742e-06, + "loss": 2.1872, + "step": 13630 + }, + { + "epoch": 2.53, + "grad_norm": 5.0212931632995605, + "learning_rate": 7.8973748511998e-06, + "loss": 2.2251, + "step": 13640 + }, + { + "epoch": 2.53, + "grad_norm": 4.8617939949035645, + "learning_rate": 7.866048493202181e-06, + "loss": 2.3814, + "step": 13650 + }, + { + "epoch": 2.54, + "grad_norm": 5.384790420532227, + "learning_rate": 7.834722135204561e-06, + "loss": 2.2883, + "step": 13660 + }, + { + "epoch": 2.54, + "grad_norm": 5.272597789764404, + "learning_rate": 7.803395777206943e-06, + "loss": 2.2633, + "step": 13670 + }, + { + "epoch": 2.54, + "grad_norm": 5.676297187805176, + "learning_rate": 7.772069419209323e-06, + "loss": 2.2315, + "step": 13680 + }, + { + "epoch": 2.54, + "grad_norm": 4.78164005279541, + "learning_rate": 7.740743061211705e-06, + "loss": 2.2953, + "step": 13690 + }, + { + "epoch": 2.54, + "grad_norm": 4.913782119750977, + "learning_rate": 7.709416703214085e-06, + "loss": 2.3002, + "step": 13700 + }, + { + "epoch": 2.54, + "grad_norm": 5.161614894866943, + "learning_rate": 7.678090345216465e-06, + "loss": 2.2274, + "step": 13710 + }, + { + "epoch": 2.55, + "grad_norm": 4.582902908325195, + "learning_rate": 7.646763987218846e-06, + "loss": 2.3175, + "step": 13720 + }, + { + "epoch": 2.55, + "grad_norm": 4.013784885406494, + "learning_rate": 7.615437629221226e-06, + "loss": 2.0668, + "step": 13730 + }, + { + "epoch": 2.55, + "grad_norm": 4.877525806427002, + "learning_rate": 7.584111271223608e-06, + "loss": 2.2382, + "step": 13740 + }, + { + "epoch": 2.55, + "grad_norm": 4.978527545928955, + "learning_rate": 7.552784913225988e-06, + "loss": 2.3498, + "step": 13750 + }, + { + "epoch": 2.55, + "grad_norm": 5.610579967498779, + "learning_rate": 7.52145855522837e-06, + "loss": 2.1325, + "step": 13760 + }, + { + "epoch": 2.56, + "grad_norm": 5.137538433074951, + "learning_rate": 7.490132197230751e-06, + "loss": 2.2978, + "step": 13770 + }, + { + "epoch": 2.56, + "grad_norm": 5.440046787261963, + "learning_rate": 7.458805839233131e-06, + "loss": 2.2039, + "step": 13780 + }, + { + "epoch": 2.56, + "grad_norm": 5.702132225036621, + "learning_rate": 7.4274794812355125e-06, + "loss": 2.2908, + "step": 13790 + }, + { + "epoch": 2.56, + "grad_norm": 4.557126998901367, + "learning_rate": 7.3961531232378925e-06, + "loss": 2.2488, + "step": 13800 + }, + { + "epoch": 2.56, + "grad_norm": 4.8882222175598145, + "learning_rate": 7.364826765240273e-06, + "loss": 2.2059, + "step": 13810 + }, + { + "epoch": 2.57, + "grad_norm": 6.6189093589782715, + "learning_rate": 7.333500407242655e-06, + "loss": 2.1869, + "step": 13820 + }, + { + "epoch": 2.57, + "grad_norm": 4.6229400634765625, + "learning_rate": 7.302174049245035e-06, + "loss": 2.2872, + "step": 13830 + }, + { + "epoch": 2.57, + "grad_norm": 4.87022590637207, + "learning_rate": 7.270847691247416e-06, + "loss": 2.2808, + "step": 13840 + }, + { + "epoch": 2.57, + "grad_norm": 4.438892841339111, + "learning_rate": 7.239521333249796e-06, + "loss": 2.3188, + "step": 13850 + }, + { + "epoch": 2.57, + "grad_norm": 4.652724266052246, + "learning_rate": 7.208194975252178e-06, + "loss": 2.2262, + "step": 13860 + }, + { + "epoch": 2.57, + "grad_norm": 5.241449356079102, + "learning_rate": 7.176868617254558e-06, + "loss": 2.1773, + "step": 13870 + }, + { + "epoch": 2.58, + "grad_norm": 5.65318489074707, + "learning_rate": 7.145542259256939e-06, + "loss": 2.2564, + "step": 13880 + }, + { + "epoch": 2.58, + "grad_norm": 6.262519836425781, + "learning_rate": 7.11421590125932e-06, + "loss": 2.3014, + "step": 13890 + }, + { + "epoch": 2.58, + "grad_norm": 4.8109025955200195, + "learning_rate": 7.0828895432617e-06, + "loss": 2.1783, + "step": 13900 + }, + { + "epoch": 2.58, + "grad_norm": 10.613912582397461, + "learning_rate": 7.051563185264082e-06, + "loss": 2.3027, + "step": 13910 + }, + { + "epoch": 2.58, + "grad_norm": 5.484175682067871, + "learning_rate": 7.020236827266462e-06, + "loss": 2.2926, + "step": 13920 + }, + { + "epoch": 2.59, + "grad_norm": 5.171184539794922, + "learning_rate": 6.988910469268843e-06, + "loss": 2.179, + "step": 13930 + }, + { + "epoch": 2.59, + "grad_norm": 5.096653938293457, + "learning_rate": 6.957584111271225e-06, + "loss": 2.1388, + "step": 13940 + }, + { + "epoch": 2.59, + "grad_norm": 6.9391069412231445, + "learning_rate": 6.926257753273605e-06, + "loss": 2.1041, + "step": 13950 + }, + { + "epoch": 2.59, + "grad_norm": 4.7677459716796875, + "learning_rate": 6.894931395275986e-06, + "loss": 2.2272, + "step": 13960 + }, + { + "epoch": 2.59, + "grad_norm": 5.2970147132873535, + "learning_rate": 6.863605037278366e-06, + "loss": 2.1962, + "step": 13970 + }, + { + "epoch": 2.6, + "grad_norm": 5.634983539581299, + "learning_rate": 6.8322786792807474e-06, + "loss": 2.307, + "step": 13980 + }, + { + "epoch": 2.6, + "grad_norm": 5.822270393371582, + "learning_rate": 6.800952321283128e-06, + "loss": 2.0941, + "step": 13990 + }, + { + "epoch": 2.6, + "grad_norm": 4.998510360717773, + "learning_rate": 6.769625963285508e-06, + "loss": 2.2717, + "step": 14000 + }, + { + "epoch": 2.6, + "grad_norm": 4.330641269683838, + "learning_rate": 6.73829960528789e-06, + "loss": 2.2546, + "step": 14010 + }, + { + "epoch": 2.6, + "grad_norm": 5.172261714935303, + "learning_rate": 6.70697324729027e-06, + "loss": 2.2181, + "step": 14020 + }, + { + "epoch": 2.6, + "grad_norm": 5.40900993347168, + "learning_rate": 6.675646889292651e-06, + "loss": 2.3187, + "step": 14030 + }, + { + "epoch": 2.61, + "grad_norm": 5.111061096191406, + "learning_rate": 6.644320531295031e-06, + "loss": 2.1499, + "step": 14040 + }, + { + "epoch": 2.61, + "grad_norm": 5.557284355163574, + "learning_rate": 6.612994173297413e-06, + "loss": 2.3055, + "step": 14050 + }, + { + "epoch": 2.61, + "grad_norm": 5.03141450881958, + "learning_rate": 6.5816678152997944e-06, + "loss": 2.2504, + "step": 14060 + }, + { + "epoch": 2.61, + "grad_norm": 4.888171195983887, + "learning_rate": 6.550341457302174e-06, + "loss": 2.0959, + "step": 14070 + }, + { + "epoch": 2.61, + "grad_norm": 5.806394100189209, + "learning_rate": 6.519015099304555e-06, + "loss": 2.2443, + "step": 14080 + }, + { + "epoch": 2.62, + "grad_norm": 4.673512935638428, + "learning_rate": 6.487688741306935e-06, + "loss": 2.2093, + "step": 14090 + }, + { + "epoch": 2.62, + "grad_norm": 6.377676486968994, + "learning_rate": 6.456362383309317e-06, + "loss": 2.2957, + "step": 14100 + }, + { + "epoch": 2.62, + "grad_norm": 5.2512359619140625, + "learning_rate": 6.425036025311698e-06, + "loss": 2.1859, + "step": 14110 + }, + { + "epoch": 2.62, + "grad_norm": 5.803738594055176, + "learning_rate": 6.393709667314078e-06, + "loss": 2.2449, + "step": 14120 + }, + { + "epoch": 2.62, + "grad_norm": 5.135036945343018, + "learning_rate": 6.36238330931646e-06, + "loss": 2.2144, + "step": 14130 + }, + { + "epoch": 2.62, + "grad_norm": 4.972255706787109, + "learning_rate": 6.33105695131884e-06, + "loss": 2.1646, + "step": 14140 + }, + { + "epoch": 2.63, + "grad_norm": 4.952043533325195, + "learning_rate": 6.299730593321221e-06, + "loss": 2.2383, + "step": 14150 + }, + { + "epoch": 2.63, + "grad_norm": 5.106963157653809, + "learning_rate": 6.268404235323602e-06, + "loss": 2.2635, + "step": 14160 + }, + { + "epoch": 2.63, + "grad_norm": 4.5615129470825195, + "learning_rate": 6.237077877325982e-06, + "loss": 2.3088, + "step": 14170 + }, + { + "epoch": 2.63, + "grad_norm": 4.770695209503174, + "learning_rate": 6.205751519328363e-06, + "loss": 2.2693, + "step": 14180 + }, + { + "epoch": 2.63, + "grad_norm": 5.178565502166748, + "learning_rate": 6.174425161330744e-06, + "loss": 2.3658, + "step": 14190 + }, + { + "epoch": 2.64, + "grad_norm": 4.672639846801758, + "learning_rate": 6.143098803333125e-06, + "loss": 2.206, + "step": 14200 + }, + { + "epoch": 2.64, + "grad_norm": 5.218943119049072, + "learning_rate": 6.111772445335506e-06, + "loss": 2.156, + "step": 14210 + }, + { + "epoch": 2.64, + "grad_norm": 5.135859489440918, + "learning_rate": 6.080446087337886e-06, + "loss": 2.1683, + "step": 14220 + }, + { + "epoch": 2.64, + "grad_norm": 5.083263397216797, + "learning_rate": 6.049119729340267e-06, + "loss": 2.1906, + "step": 14230 + }, + { + "epoch": 2.64, + "grad_norm": 4.857705593109131, + "learning_rate": 6.017793371342648e-06, + "loss": 2.1322, + "step": 14240 + }, + { + "epoch": 2.65, + "grad_norm": 5.133806228637695, + "learning_rate": 5.986467013345029e-06, + "loss": 2.1363, + "step": 14250 + }, + { + "epoch": 2.65, + "grad_norm": 5.316963195800781, + "learning_rate": 5.9551406553474094e-06, + "loss": 2.1679, + "step": 14260 + }, + { + "epoch": 2.65, + "grad_norm": 6.094076156616211, + "learning_rate": 5.92381429734979e-06, + "loss": 2.2329, + "step": 14270 + }, + { + "epoch": 2.65, + "grad_norm": 5.19959020614624, + "learning_rate": 5.892487939352171e-06, + "loss": 2.2172, + "step": 14280 + }, + { + "epoch": 2.65, + "grad_norm": 5.317006587982178, + "learning_rate": 5.861161581354552e-06, + "loss": 2.3854, + "step": 14290 + }, + { + "epoch": 2.65, + "grad_norm": 4.754021644592285, + "learning_rate": 5.829835223356933e-06, + "loss": 2.1969, + "step": 14300 + }, + { + "epoch": 2.66, + "grad_norm": 5.336154937744141, + "learning_rate": 5.798508865359314e-06, + "loss": 2.3282, + "step": 14310 + }, + { + "epoch": 2.66, + "grad_norm": 4.85452938079834, + "learning_rate": 5.767182507361695e-06, + "loss": 2.35, + "step": 14320 + }, + { + "epoch": 2.66, + "grad_norm": 5.174147605895996, + "learning_rate": 5.735856149364075e-06, + "loss": 2.1795, + "step": 14330 + }, + { + "epoch": 2.66, + "grad_norm": 5.9505228996276855, + "learning_rate": 5.704529791366456e-06, + "loss": 2.0833, + "step": 14340 + }, + { + "epoch": 2.66, + "grad_norm": 5.462031364440918, + "learning_rate": 5.6732034333688365e-06, + "loss": 2.2265, + "step": 14350 + }, + { + "epoch": 2.67, + "grad_norm": 5.277505397796631, + "learning_rate": 5.641877075371218e-06, + "loss": 2.3298, + "step": 14360 + }, + { + "epoch": 2.67, + "grad_norm": 5.46626615524292, + "learning_rate": 5.610550717373598e-06, + "loss": 2.2778, + "step": 14370 + }, + { + "epoch": 2.67, + "grad_norm": 5.012364387512207, + "learning_rate": 5.579224359375979e-06, + "loss": 2.2084, + "step": 14380 + }, + { + "epoch": 2.67, + "grad_norm": 4.736668586730957, + "learning_rate": 5.54789800137836e-06, + "loss": 2.2649, + "step": 14390 + }, + { + "epoch": 2.67, + "grad_norm": 4.878015995025635, + "learning_rate": 5.516571643380741e-06, + "loss": 2.199, + "step": 14400 + }, + { + "epoch": 2.67, + "grad_norm": 5.050587177276611, + "learning_rate": 5.485245285383122e-06, + "loss": 2.2238, + "step": 14410 + }, + { + "epoch": 2.68, + "grad_norm": 5.755616664886475, + "learning_rate": 5.453918927385503e-06, + "loss": 2.1919, + "step": 14420 + }, + { + "epoch": 2.68, + "grad_norm": 5.753908157348633, + "learning_rate": 5.4225925693878835e-06, + "loss": 2.2535, + "step": 14430 + }, + { + "epoch": 2.68, + "grad_norm": 5.909355640411377, + "learning_rate": 5.391266211390264e-06, + "loss": 2.1856, + "step": 14440 + }, + { + "epoch": 2.68, + "grad_norm": 4.696736812591553, + "learning_rate": 5.359939853392644e-06, + "loss": 2.1645, + "step": 14450 + }, + { + "epoch": 2.68, + "grad_norm": 4.7360520362854, + "learning_rate": 5.328613495395025e-06, + "loss": 2.2176, + "step": 14460 + }, + { + "epoch": 2.69, + "grad_norm": 4.935107707977295, + "learning_rate": 5.297287137397407e-06, + "loss": 2.3128, + "step": 14470 + }, + { + "epoch": 2.69, + "grad_norm": 4.986132621765137, + "learning_rate": 5.265960779399787e-06, + "loss": 2.0601, + "step": 14480 + }, + { + "epoch": 2.69, + "grad_norm": 5.014797210693359, + "learning_rate": 5.234634421402168e-06, + "loss": 2.1279, + "step": 14490 + }, + { + "epoch": 2.69, + "grad_norm": 5.099488258361816, + "learning_rate": 5.203308063404549e-06, + "loss": 2.157, + "step": 14500 + }, + { + "epoch": 2.69, + "grad_norm": 4.824801445007324, + "learning_rate": 5.17198170540693e-06, + "loss": 2.2426, + "step": 14510 + }, + { + "epoch": 2.7, + "grad_norm": 5.455517768859863, + "learning_rate": 5.14065534740931e-06, + "loss": 2.3237, + "step": 14520 + }, + { + "epoch": 2.7, + "grad_norm": 5.147612571716309, + "learning_rate": 5.1093289894116914e-06, + "loss": 2.157, + "step": 14530 + }, + { + "epoch": 2.7, + "grad_norm": 5.479028224945068, + "learning_rate": 5.078002631414072e-06, + "loss": 2.2894, + "step": 14540 + }, + { + "epoch": 2.7, + "grad_norm": 5.406881809234619, + "learning_rate": 5.046676273416453e-06, + "loss": 2.1629, + "step": 14550 + }, + { + "epoch": 2.7, + "grad_norm": 5.455532073974609, + "learning_rate": 5.015349915418833e-06, + "loss": 2.0312, + "step": 14560 + }, + { + "epoch": 2.7, + "grad_norm": 5.509167194366455, + "learning_rate": 4.984023557421214e-06, + "loss": 2.3442, + "step": 14570 + }, + { + "epoch": 2.71, + "grad_norm": 5.4643330574035645, + "learning_rate": 4.952697199423596e-06, + "loss": 2.1014, + "step": 14580 + }, + { + "epoch": 2.71, + "grad_norm": 4.990197658538818, + "learning_rate": 4.921370841425976e-06, + "loss": 2.2279, + "step": 14590 + }, + { + "epoch": 2.71, + "grad_norm": 5.77827262878418, + "learning_rate": 4.890044483428357e-06, + "loss": 2.1901, + "step": 14600 + }, + { + "epoch": 2.71, + "grad_norm": 5.1118083000183105, + "learning_rate": 4.858718125430738e-06, + "loss": 2.3313, + "step": 14610 + }, + { + "epoch": 2.71, + "grad_norm": 5.493002891540527, + "learning_rate": 4.8273917674331185e-06, + "loss": 2.2621, + "step": 14620 + }, + { + "epoch": 2.72, + "grad_norm": 4.995043754577637, + "learning_rate": 4.796065409435499e-06, + "loss": 2.0892, + "step": 14630 + }, + { + "epoch": 2.72, + "grad_norm": 5.6254119873046875, + "learning_rate": 4.76473905143788e-06, + "loss": 2.1689, + "step": 14640 + }, + { + "epoch": 2.72, + "grad_norm": 4.596071243286133, + "learning_rate": 4.733412693440261e-06, + "loss": 2.2291, + "step": 14650 + }, + { + "epoch": 2.72, + "grad_norm": 5.493969917297363, + "learning_rate": 4.702086335442642e-06, + "loss": 2.0971, + "step": 14660 + }, + { + "epoch": 2.72, + "grad_norm": 5.211254119873047, + "learning_rate": 4.670759977445022e-06, + "loss": 2.3204, + "step": 14670 + }, + { + "epoch": 2.72, + "grad_norm": 5.054335594177246, + "learning_rate": 4.639433619447403e-06, + "loss": 2.2843, + "step": 14680 + }, + { + "epoch": 2.73, + "grad_norm": 4.881555557250977, + "learning_rate": 4.608107261449784e-06, + "loss": 2.212, + "step": 14690 + }, + { + "epoch": 2.73, + "grad_norm": 5.238095760345459, + "learning_rate": 4.5767809034521655e-06, + "loss": 2.2209, + "step": 14700 + }, + { + "epoch": 2.73, + "grad_norm": 5.044524192810059, + "learning_rate": 4.5454545454545455e-06, + "loss": 2.3081, + "step": 14710 + }, + { + "epoch": 2.73, + "grad_norm": 4.570669651031494, + "learning_rate": 4.514128187456926e-06, + "loss": 2.0992, + "step": 14720 + }, + { + "epoch": 2.73, + "grad_norm": 4.995473861694336, + "learning_rate": 4.482801829459307e-06, + "loss": 2.3147, + "step": 14730 + }, + { + "epoch": 2.74, + "grad_norm": 4.873288631439209, + "learning_rate": 4.451475471461688e-06, + "loss": 2.2422, + "step": 14740 + }, + { + "epoch": 2.74, + "grad_norm": 5.107931613922119, + "learning_rate": 4.420149113464069e-06, + "loss": 2.2725, + "step": 14750 + }, + { + "epoch": 2.74, + "grad_norm": 5.3016533851623535, + "learning_rate": 4.38882275546645e-06, + "loss": 2.1693, + "step": 14760 + }, + { + "epoch": 2.74, + "grad_norm": 5.348349571228027, + "learning_rate": 4.357496397468831e-06, + "loss": 2.2864, + "step": 14770 + }, + { + "epoch": 2.74, + "grad_norm": 5.596982002258301, + "learning_rate": 4.326170039471212e-06, + "loss": 2.2719, + "step": 14780 + }, + { + "epoch": 2.75, + "grad_norm": 4.691074371337891, + "learning_rate": 4.294843681473592e-06, + "loss": 2.3477, + "step": 14790 + }, + { + "epoch": 2.75, + "grad_norm": 5.885664463043213, + "learning_rate": 4.2635173234759726e-06, + "loss": 2.079, + "step": 14800 + }, + { + "epoch": 2.75, + "grad_norm": 5.616001129150391, + "learning_rate": 4.232190965478354e-06, + "loss": 2.1574, + "step": 14810 + }, + { + "epoch": 2.75, + "grad_norm": 5.573058605194092, + "learning_rate": 4.200864607480734e-06, + "loss": 2.2285, + "step": 14820 + }, + { + "epoch": 2.75, + "grad_norm": 5.251656532287598, + "learning_rate": 4.169538249483115e-06, + "loss": 2.2224, + "step": 14830 + }, + { + "epoch": 2.75, + "grad_norm": 5.339073657989502, + "learning_rate": 4.138211891485496e-06, + "loss": 2.3068, + "step": 14840 + }, + { + "epoch": 2.76, + "grad_norm": 5.788910865783691, + "learning_rate": 4.106885533487877e-06, + "loss": 2.2796, + "step": 14850 + }, + { + "epoch": 2.76, + "grad_norm": 5.806098461151123, + "learning_rate": 4.075559175490257e-06, + "loss": 2.2256, + "step": 14860 + }, + { + "epoch": 2.76, + "grad_norm": 5.479308128356934, + "learning_rate": 4.044232817492639e-06, + "loss": 2.2174, + "step": 14870 + }, + { + "epoch": 2.76, + "grad_norm": 4.946530342102051, + "learning_rate": 4.01290645949502e-06, + "loss": 2.2028, + "step": 14880 + }, + { + "epoch": 2.76, + "grad_norm": 4.5223822593688965, + "learning_rate": 3.9815801014974005e-06, + "loss": 2.1426, + "step": 14890 + }, + { + "epoch": 2.77, + "grad_norm": 5.463576316833496, + "learning_rate": 3.9502537434997805e-06, + "loss": 2.1199, + "step": 14900 + }, + { + "epoch": 2.77, + "grad_norm": 5.110754013061523, + "learning_rate": 3.918927385502161e-06, + "loss": 2.1892, + "step": 14910 + }, + { + "epoch": 2.77, + "grad_norm": 4.9819512367248535, + "learning_rate": 3.887601027504543e-06, + "loss": 2.1054, + "step": 14920 + }, + { + "epoch": 2.77, + "grad_norm": 4.723372459411621, + "learning_rate": 3.856274669506923e-06, + "loss": 2.2199, + "step": 14930 + }, + { + "epoch": 2.77, + "grad_norm": 5.602808475494385, + "learning_rate": 3.824948311509304e-06, + "loss": 2.1164, + "step": 14940 + }, + { + "epoch": 2.78, + "grad_norm": 5.397231101989746, + "learning_rate": 3.793621953511685e-06, + "loss": 2.3285, + "step": 14950 + }, + { + "epoch": 2.78, + "grad_norm": 4.740065097808838, + "learning_rate": 3.7622955955140657e-06, + "loss": 2.2537, + "step": 14960 + }, + { + "epoch": 2.78, + "grad_norm": 5.875923156738281, + "learning_rate": 3.730969237516446e-06, + "loss": 2.2326, + "step": 14970 + }, + { + "epoch": 2.78, + "grad_norm": 5.001502990722656, + "learning_rate": 3.6996428795188275e-06, + "loss": 2.17, + "step": 14980 + }, + { + "epoch": 2.78, + "grad_norm": 5.4369282722473145, + "learning_rate": 3.6683165215212084e-06, + "loss": 2.1858, + "step": 14990 + }, + { + "epoch": 2.78, + "grad_norm": 4.982198715209961, + "learning_rate": 3.636990163523589e-06, + "loss": 2.1899, + "step": 15000 + }, + { + "epoch": 2.79, + "grad_norm": 4.815186977386475, + "learning_rate": 3.6056638055259697e-06, + "loss": 2.1004, + "step": 15010 + }, + { + "epoch": 2.79, + "grad_norm": 5.177938938140869, + "learning_rate": 3.57433744752835e-06, + "loss": 2.2429, + "step": 15020 + }, + { + "epoch": 2.79, + "grad_norm": 5.127625942230225, + "learning_rate": 3.543011089530731e-06, + "loss": 2.1429, + "step": 15030 + }, + { + "epoch": 2.79, + "grad_norm": 5.616495132446289, + "learning_rate": 3.5116847315331123e-06, + "loss": 2.2848, + "step": 15040 + }, + { + "epoch": 2.79, + "grad_norm": 5.018442153930664, + "learning_rate": 3.4803583735354932e-06, + "loss": 2.2801, + "step": 15050 + }, + { + "epoch": 2.8, + "grad_norm": 4.80119514465332, + "learning_rate": 3.4490320155378737e-06, + "loss": 2.2253, + "step": 15060 + }, + { + "epoch": 2.8, + "grad_norm": 5.015815734863281, + "learning_rate": 3.4177056575402545e-06, + "loss": 2.2014, + "step": 15070 + }, + { + "epoch": 2.8, + "grad_norm": 5.846337795257568, + "learning_rate": 3.386379299542635e-06, + "loss": 2.2732, + "step": 15080 + }, + { + "epoch": 2.8, + "grad_norm": 5.304662227630615, + "learning_rate": 3.3550529415450163e-06, + "loss": 2.3228, + "step": 15090 + }, + { + "epoch": 2.8, + "grad_norm": 4.77760124206543, + "learning_rate": 3.323726583547397e-06, + "loss": 2.2546, + "step": 15100 + }, + { + "epoch": 2.8, + "grad_norm": 4.972886562347412, + "learning_rate": 3.2924002255497776e-06, + "loss": 2.2863, + "step": 15110 + }, + { + "epoch": 2.81, + "grad_norm": 5.197108745574951, + "learning_rate": 3.2610738675521585e-06, + "loss": 2.314, + "step": 15120 + }, + { + "epoch": 2.81, + "grad_norm": 5.3886799812316895, + "learning_rate": 3.229747509554539e-06, + "loss": 2.2824, + "step": 15130 + }, + { + "epoch": 2.81, + "grad_norm": 4.588632583618164, + "learning_rate": 3.19842115155692e-06, + "loss": 2.1507, + "step": 15140 + }, + { + "epoch": 2.81, + "grad_norm": 5.631302356719971, + "learning_rate": 3.167094793559301e-06, + "loss": 2.3639, + "step": 15150 + }, + { + "epoch": 2.81, + "grad_norm": 4.998614311218262, + "learning_rate": 3.135768435561682e-06, + "loss": 2.2462, + "step": 15160 + }, + { + "epoch": 2.82, + "grad_norm": 5.030011177062988, + "learning_rate": 3.1044420775640625e-06, + "loss": 2.3389, + "step": 15170 + }, + { + "epoch": 2.82, + "grad_norm": 4.784780502319336, + "learning_rate": 3.0731157195664433e-06, + "loss": 2.2283, + "step": 15180 + }, + { + "epoch": 2.82, + "grad_norm": 4.889308452606201, + "learning_rate": 3.0417893615688242e-06, + "loss": 2.1219, + "step": 15190 + }, + { + "epoch": 2.82, + "grad_norm": 4.779045581817627, + "learning_rate": 3.010463003571205e-06, + "loss": 2.2276, + "step": 15200 + }, + { + "epoch": 2.82, + "grad_norm": 4.917059898376465, + "learning_rate": 2.9791366455735856e-06, + "loss": 2.1144, + "step": 15210 + }, + { + "epoch": 2.83, + "grad_norm": 5.68520450592041, + "learning_rate": 2.947810287575967e-06, + "loss": 2.4605, + "step": 15220 + }, + { + "epoch": 2.83, + "grad_norm": 4.577776908874512, + "learning_rate": 2.9164839295783473e-06, + "loss": 2.2559, + "step": 15230 + }, + { + "epoch": 2.83, + "grad_norm": 5.267956733703613, + "learning_rate": 2.885157571580728e-06, + "loss": 2.2218, + "step": 15240 + }, + { + "epoch": 2.83, + "grad_norm": 5.275062561035156, + "learning_rate": 2.853831213583109e-06, + "loss": 2.2437, + "step": 15250 + }, + { + "epoch": 2.83, + "grad_norm": 4.182082176208496, + "learning_rate": 2.82250485558549e-06, + "loss": 2.2602, + "step": 15260 + }, + { + "epoch": 2.83, + "grad_norm": 5.157698631286621, + "learning_rate": 2.7911784975878704e-06, + "loss": 2.2095, + "step": 15270 + }, + { + "epoch": 2.84, + "grad_norm": 5.279921054840088, + "learning_rate": 2.7598521395902513e-06, + "loss": 2.1501, + "step": 15280 + }, + { + "epoch": 2.84, + "grad_norm": 5.391173362731934, + "learning_rate": 2.728525781592632e-06, + "loss": 2.1421, + "step": 15290 + }, + { + "epoch": 2.84, + "grad_norm": 5.1854023933410645, + "learning_rate": 2.697199423595013e-06, + "loss": 2.3726, + "step": 15300 + }, + { + "epoch": 2.84, + "grad_norm": 5.282604217529297, + "learning_rate": 2.665873065597394e-06, + "loss": 2.1056, + "step": 15310 + }, + { + "epoch": 2.84, + "grad_norm": 5.295992851257324, + "learning_rate": 2.6345467075997744e-06, + "loss": 2.3241, + "step": 15320 + }, + { + "epoch": 2.85, + "grad_norm": 5.822473049163818, + "learning_rate": 2.6032203496021557e-06, + "loss": 2.2288, + "step": 15330 + }, + { + "epoch": 2.85, + "grad_norm": 6.813709259033203, + "learning_rate": 2.571893991604536e-06, + "loss": 2.2335, + "step": 15340 + }, + { + "epoch": 2.85, + "grad_norm": 4.577134132385254, + "learning_rate": 2.540567633606917e-06, + "loss": 2.2846, + "step": 15350 + }, + { + "epoch": 2.85, + "grad_norm": 4.539644241333008, + "learning_rate": 2.509241275609298e-06, + "loss": 2.1281, + "step": 15360 + }, + { + "epoch": 2.85, + "grad_norm": 4.731322288513184, + "learning_rate": 2.4779149176116787e-06, + "loss": 2.2189, + "step": 15370 + }, + { + "epoch": 2.85, + "grad_norm": 5.15498161315918, + "learning_rate": 2.446588559614059e-06, + "loss": 2.1157, + "step": 15380 + }, + { + "epoch": 2.86, + "grad_norm": 5.277298450469971, + "learning_rate": 2.4152622016164405e-06, + "loss": 2.3078, + "step": 15390 + }, + { + "epoch": 2.86, + "grad_norm": 4.953650951385498, + "learning_rate": 2.383935843618821e-06, + "loss": 2.2351, + "step": 15400 + }, + { + "epoch": 2.86, + "grad_norm": 5.887104034423828, + "learning_rate": 2.352609485621202e-06, + "loss": 2.108, + "step": 15410 + }, + { + "epoch": 2.86, + "grad_norm": 5.2219767570495605, + "learning_rate": 2.3212831276235827e-06, + "loss": 2.3245, + "step": 15420 + }, + { + "epoch": 2.86, + "grad_norm": 5.635054588317871, + "learning_rate": 2.289956769625963e-06, + "loss": 2.2183, + "step": 15430 + }, + { + "epoch": 2.87, + "grad_norm": 4.352700710296631, + "learning_rate": 2.258630411628344e-06, + "loss": 2.2135, + "step": 15440 + }, + { + "epoch": 2.87, + "grad_norm": 5.863667011260986, + "learning_rate": 2.227304053630725e-06, + "loss": 2.1377, + "step": 15450 + }, + { + "epoch": 2.87, + "grad_norm": 5.384456157684326, + "learning_rate": 2.1959776956331058e-06, + "loss": 2.2731, + "step": 15460 + }, + { + "epoch": 2.87, + "grad_norm": 5.075336456298828, + "learning_rate": 2.1646513376354867e-06, + "loss": 2.12, + "step": 15470 + }, + { + "epoch": 2.87, + "grad_norm": 5.152316570281982, + "learning_rate": 2.1333249796378675e-06, + "loss": 2.1699, + "step": 15480 + }, + { + "epoch": 2.88, + "grad_norm": 4.9257917404174805, + "learning_rate": 2.101998621640248e-06, + "loss": 2.1559, + "step": 15490 + }, + { + "epoch": 2.88, + "grad_norm": 4.926279544830322, + "learning_rate": 2.0706722636426293e-06, + "loss": 2.2984, + "step": 15500 + }, + { + "epoch": 2.88, + "grad_norm": 5.468809604644775, + "learning_rate": 2.0393459056450097e-06, + "loss": 2.0854, + "step": 15510 + }, + { + "epoch": 2.88, + "grad_norm": 14.988544464111328, + "learning_rate": 2.0080195476473906e-06, + "loss": 2.2182, + "step": 15520 + }, + { + "epoch": 2.88, + "grad_norm": 5.264973163604736, + "learning_rate": 1.9766931896497715e-06, + "loss": 2.3079, + "step": 15530 + }, + { + "epoch": 2.88, + "grad_norm": 5.474333763122559, + "learning_rate": 1.9453668316521524e-06, + "loss": 2.1352, + "step": 15540 + }, + { + "epoch": 2.89, + "grad_norm": 5.057586193084717, + "learning_rate": 1.914040473654533e-06, + "loss": 2.1288, + "step": 15550 + }, + { + "epoch": 2.89, + "grad_norm": 5.180942535400391, + "learning_rate": 1.882714115656914e-06, + "loss": 2.3115, + "step": 15560 + }, + { + "epoch": 2.89, + "grad_norm": 5.402101516723633, + "learning_rate": 1.8513877576592946e-06, + "loss": 2.1244, + "step": 15570 + }, + { + "epoch": 2.89, + "grad_norm": 4.428703308105469, + "learning_rate": 1.8200613996616752e-06, + "loss": 2.1784, + "step": 15580 + }, + { + "epoch": 2.89, + "grad_norm": 4.959710597991943, + "learning_rate": 1.7887350416640563e-06, + "loss": 2.3097, + "step": 15590 + }, + { + "epoch": 2.9, + "grad_norm": 5.586377143859863, + "learning_rate": 1.757408683666437e-06, + "loss": 2.1855, + "step": 15600 + }, + { + "epoch": 2.9, + "grad_norm": 4.836302280426025, + "learning_rate": 1.7260823256688177e-06, + "loss": 2.2558, + "step": 15610 + }, + { + "epoch": 2.9, + "grad_norm": 5.129261493682861, + "learning_rate": 1.6947559676711988e-06, + "loss": 2.1964, + "step": 15620 + }, + { + "epoch": 2.9, + "grad_norm": 5.32321310043335, + "learning_rate": 1.6634296096735794e-06, + "loss": 2.1325, + "step": 15630 + }, + { + "epoch": 2.9, + "grad_norm": 5.156566619873047, + "learning_rate": 1.6321032516759603e-06, + "loss": 2.25, + "step": 15640 + }, + { + "epoch": 2.91, + "grad_norm": 4.7280755043029785, + "learning_rate": 1.600776893678341e-06, + "loss": 2.2077, + "step": 15650 + }, + { + "epoch": 2.91, + "grad_norm": 5.000637531280518, + "learning_rate": 1.5694505356807218e-06, + "loss": 2.2315, + "step": 15660 + }, + { + "epoch": 2.91, + "grad_norm": 4.943790435791016, + "learning_rate": 1.5381241776831025e-06, + "loss": 2.1645, + "step": 15670 + }, + { + "epoch": 2.91, + "grad_norm": 5.3277668952941895, + "learning_rate": 1.5067978196854834e-06, + "loss": 2.2574, + "step": 15680 + }, + { + "epoch": 2.91, + "grad_norm": 5.879887580871582, + "learning_rate": 1.4754714616878643e-06, + "loss": 2.1678, + "step": 15690 + }, + { + "epoch": 2.91, + "grad_norm": 6.744934558868408, + "learning_rate": 1.444145103690245e-06, + "loss": 2.1535, + "step": 15700 + }, + { + "epoch": 2.92, + "grad_norm": 4.511887550354004, + "learning_rate": 1.4128187456926258e-06, + "loss": 2.1764, + "step": 15710 + }, + { + "epoch": 2.92, + "grad_norm": 4.7104811668396, + "learning_rate": 1.3814923876950067e-06, + "loss": 2.1938, + "step": 15720 + }, + { + "epoch": 2.92, + "grad_norm": 4.853920936584473, + "learning_rate": 1.3501660296973873e-06, + "loss": 2.1526, + "step": 15730 + }, + { + "epoch": 2.92, + "grad_norm": 4.676470756530762, + "learning_rate": 1.3188396716997682e-06, + "loss": 2.1985, + "step": 15740 + }, + { + "epoch": 2.92, + "grad_norm": 4.885119915008545, + "learning_rate": 1.287513313702149e-06, + "loss": 2.2122, + "step": 15750 + }, + { + "epoch": 2.93, + "grad_norm": 5.216524600982666, + "learning_rate": 1.25618695570453e-06, + "loss": 2.3254, + "step": 15760 + }, + { + "epoch": 2.93, + "grad_norm": 5.545653343200684, + "learning_rate": 1.2248605977069106e-06, + "loss": 2.2905, + "step": 15770 + }, + { + "epoch": 2.93, + "grad_norm": 5.003011703491211, + "learning_rate": 1.1935342397092915e-06, + "loss": 2.2472, + "step": 15780 + }, + { + "epoch": 2.93, + "grad_norm": 5.019803524017334, + "learning_rate": 1.1622078817116724e-06, + "loss": 2.3646, + "step": 15790 + }, + { + "epoch": 2.93, + "grad_norm": 5.200807571411133, + "learning_rate": 1.130881523714053e-06, + "loss": 2.4219, + "step": 15800 + }, + { + "epoch": 2.93, + "grad_norm": 4.575010299682617, + "learning_rate": 1.099555165716434e-06, + "loss": 2.3115, + "step": 15810 + }, + { + "epoch": 2.94, + "grad_norm": 4.7481842041015625, + "learning_rate": 1.0682288077188146e-06, + "loss": 2.3336, + "step": 15820 + }, + { + "epoch": 2.94, + "grad_norm": 5.135469436645508, + "learning_rate": 1.0369024497211955e-06, + "loss": 2.1925, + "step": 15830 + }, + { + "epoch": 2.94, + "grad_norm": 5.409132957458496, + "learning_rate": 1.0055760917235761e-06, + "loss": 2.2156, + "step": 15840 + }, + { + "epoch": 2.94, + "grad_norm": 4.531829357147217, + "learning_rate": 9.74249733725957e-07, + "loss": 2.2851, + "step": 15850 + }, + { + "epoch": 2.94, + "grad_norm": 4.880288124084473, + "learning_rate": 9.429233757283379e-07, + "loss": 2.2134, + "step": 15860 + }, + { + "epoch": 2.95, + "grad_norm": 5.735778331756592, + "learning_rate": 9.115970177307186e-07, + "loss": 2.1477, + "step": 15870 + }, + { + "epoch": 2.95, + "grad_norm": 5.089371681213379, + "learning_rate": 8.802706597330994e-07, + "loss": 2.1704, + "step": 15880 + }, + { + "epoch": 2.95, + "grad_norm": 5.147428512573242, + "learning_rate": 8.489443017354803e-07, + "loss": 2.2521, + "step": 15890 + }, + { + "epoch": 2.95, + "grad_norm": 5.981338977813721, + "learning_rate": 8.176179437378612e-07, + "loss": 2.2188, + "step": 15900 + }, + { + "epoch": 2.95, + "grad_norm": 6.076031684875488, + "learning_rate": 7.862915857402419e-07, + "loss": 2.1894, + "step": 15910 + }, + { + "epoch": 2.96, + "grad_norm": 5.003175258636475, + "learning_rate": 7.549652277426227e-07, + "loss": 2.2015, + "step": 15920 + }, + { + "epoch": 2.96, + "grad_norm": 5.125662803649902, + "learning_rate": 7.236388697450035e-07, + "loss": 2.1842, + "step": 15930 + }, + { + "epoch": 2.96, + "grad_norm": 5.404399871826172, + "learning_rate": 6.923125117473843e-07, + "loss": 2.2261, + "step": 15940 + }, + { + "epoch": 2.96, + "grad_norm": 4.939316272735596, + "learning_rate": 6.60986153749765e-07, + "loss": 2.2224, + "step": 15950 + }, + { + "epoch": 2.96, + "grad_norm": 6.108341693878174, + "learning_rate": 6.296597957521459e-07, + "loss": 2.1957, + "step": 15960 + }, + { + "epoch": 2.96, + "grad_norm": 4.7472333908081055, + "learning_rate": 5.983334377545267e-07, + "loss": 2.1746, + "step": 15970 + }, + { + "epoch": 2.97, + "grad_norm": 5.0396599769592285, + "learning_rate": 5.670070797569075e-07, + "loss": 2.2503, + "step": 15980 + }, + { + "epoch": 2.97, + "grad_norm": 6.954887866973877, + "learning_rate": 5.356807217592883e-07, + "loss": 2.1326, + "step": 15990 + }, + { + "epoch": 2.97, + "grad_norm": 5.1434736251831055, + "learning_rate": 5.043543637616691e-07, + "loss": 2.1736, + "step": 16000 + }, + { + "epoch": 2.97, + "grad_norm": 4.424656391143799, + "learning_rate": 4.7302800576404993e-07, + "loss": 2.297, + "step": 16010 + }, + { + "epoch": 2.97, + "grad_norm": 4.53824520111084, + "learning_rate": 4.417016477664307e-07, + "loss": 2.1963, + "step": 16020 + }, + { + "epoch": 2.98, + "grad_norm": 5.093719005584717, + "learning_rate": 4.103752897688115e-07, + "loss": 2.2149, + "step": 16030 + }, + { + "epoch": 2.98, + "grad_norm": 4.9572062492370605, + "learning_rate": 3.790489317711923e-07, + "loss": 2.2097, + "step": 16040 + }, + { + "epoch": 2.98, + "grad_norm": 5.130666255950928, + "learning_rate": 3.477225737735731e-07, + "loss": 2.2865, + "step": 16050 + }, + { + "epoch": 2.98, + "grad_norm": 6.0656633377075195, + "learning_rate": 3.163962157759539e-07, + "loss": 2.2507, + "step": 16060 + }, + { + "epoch": 2.98, + "grad_norm": 5.007623195648193, + "learning_rate": 2.850698577783347e-07, + "loss": 2.196, + "step": 16070 + }, + { + "epoch": 2.98, + "grad_norm": 5.159407138824463, + "learning_rate": 2.537434997807155e-07, + "loss": 2.3414, + "step": 16080 + }, + { + "epoch": 2.99, + "grad_norm": 4.766927719116211, + "learning_rate": 2.2241714178309631e-07, + "loss": 2.252, + "step": 16090 + }, + { + "epoch": 2.99, + "grad_norm": 5.673045635223389, + "learning_rate": 1.910907837854771e-07, + "loss": 2.1263, + "step": 16100 + }, + { + "epoch": 2.99, + "grad_norm": 5.0451507568359375, + "learning_rate": 1.597644257878579e-07, + "loss": 2.1164, + "step": 16110 + }, + { + "epoch": 2.99, + "grad_norm": 4.848662853240967, + "learning_rate": 1.284380677902387e-07, + "loss": 2.1926, + "step": 16120 + }, + { + "epoch": 2.99, + "grad_norm": 5.478034973144531, + "learning_rate": 9.711170979261952e-08, + "loss": 2.2998, + "step": 16130 + }, + { + "epoch": 3.0, + "grad_norm": 5.586289405822754, + "learning_rate": 6.578535179500031e-08, + "loss": 2.2523, + "step": 16140 + }, + { + "epoch": 3.0, + "grad_norm": 5.570580005645752, + "learning_rate": 3.445899379738112e-08, + "loss": 2.2318, + "step": 16150 + }, + { + "epoch": 3.0, + "grad_norm": 5.062375068664551, + "learning_rate": 3.13263579976192e-09, + "loss": 2.234, + "step": 16160 + } + ], + "logging_steps": 10, + "max_steps": 16161, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 6.298475820377702e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}