diff --git "a/checkpoint-23845/trainer_state.json" "b/checkpoint-23845/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-23845/trainer_state.json" @@ -0,0 +1,16721 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 23845, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020968756552736424, + "grad_norm": 16.383289337158203, + "learning_rate": 0.0002, + "loss": 5.0295, + "step": 10 + }, + { + "epoch": 0.004193751310547285, + "grad_norm": 15.346772193908691, + "learning_rate": 0.0002, + "loss": 1.344, + "step": 20 + }, + { + "epoch": 0.006290626965820926, + "grad_norm": 5.9838738441467285, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 30 + }, + { + "epoch": 0.00838750262109457, + "grad_norm": 1.9119038581848145, + "learning_rate": 0.0002, + "loss": 0.4024, + "step": 40 + }, + { + "epoch": 0.010484378276368212, + "grad_norm": 1.0427629947662354, + "learning_rate": 0.0002, + "loss": 0.393, + "step": 50 + }, + { + "epoch": 0.012581253931641853, + "grad_norm": 3.237114429473877, + "learning_rate": 0.0002, + "loss": 0.2799, + "step": 60 + }, + { + "epoch": 0.014678129586915495, + "grad_norm": 2.2672061920166016, + "learning_rate": 0.0002, + "loss": 0.3716, + "step": 70 + }, + { + "epoch": 0.01677500524218914, + "grad_norm": 0.4691790044307709, + "learning_rate": 0.0002, + "loss": 0.3543, + "step": 80 + }, + { + "epoch": 0.018871880897462782, + "grad_norm": 4.190754413604736, + "learning_rate": 0.0002, + "loss": 0.2702, + "step": 90 + }, + { + "epoch": 0.020968756552736424, + "grad_norm": 1.0056904554367065, + "learning_rate": 0.0002, + "loss": 0.347, + "step": 100 + }, + { + "epoch": 0.023065632208010067, + "grad_norm": 1.437057375907898, + "learning_rate": 0.0002, + "loss": 0.2648, + "step": 110 + }, + { + "epoch": 0.025162507863283706, + "grad_norm": 0.36117708683013916, + "learning_rate": 0.0002, + "loss": 0.3003, + "step": 120 + }, + { + "epoch": 0.027259383518557348, + "grad_norm": 8.518256187438965, + "learning_rate": 0.0002, + "loss": 0.3501, + "step": 130 + }, + { + "epoch": 0.02935625917383099, + "grad_norm": 0.6031225323677063, + "learning_rate": 0.0002, + "loss": 0.2877, + "step": 140 + }, + { + "epoch": 0.03145313482910463, + "grad_norm": 0.909845232963562, + "learning_rate": 0.0002, + "loss": 0.2695, + "step": 150 + }, + { + "epoch": 0.03355001048437828, + "grad_norm": 0.46850165724754333, + "learning_rate": 0.0002, + "loss": 0.2755, + "step": 160 + }, + { + "epoch": 0.03564688613965192, + "grad_norm": 1.2919809818267822, + "learning_rate": 0.0002, + "loss": 0.381, + "step": 170 + }, + { + "epoch": 0.037743761794925564, + "grad_norm": 0.36922141909599304, + "learning_rate": 0.0002, + "loss": 0.3034, + "step": 180 + }, + { + "epoch": 0.0398406374501992, + "grad_norm": 0.41144391894340515, + "learning_rate": 0.0002, + "loss": 0.3339, + "step": 190 + }, + { + "epoch": 0.04193751310547285, + "grad_norm": 0.5304984450340271, + "learning_rate": 0.0002, + "loss": 0.2819, + "step": 200 + }, + { + "epoch": 0.04403438876074649, + "grad_norm": 0.36329638957977295, + "learning_rate": 0.0002, + "loss": 0.2943, + "step": 210 + }, + { + "epoch": 0.04613126441602013, + "grad_norm": 5.069764614105225, + "learning_rate": 0.0002, + "loss": 0.326, + "step": 220 + }, + { + "epoch": 0.04822814007129377, + "grad_norm": 5.163594722747803, + "learning_rate": 0.0002, + "loss": 0.2818, + "step": 230 + }, + { + "epoch": 0.05032501572656741, + "grad_norm": 2.1980459690093994, + "learning_rate": 0.0002, + "loss": 0.2522, + "step": 240 + }, + { + "epoch": 0.05242189138184106, + "grad_norm": 2.0721938610076904, + "learning_rate": 0.0002, + "loss": 0.2838, + "step": 250 + }, + { + "epoch": 0.054518767037114696, + "grad_norm": 0.21942710876464844, + "learning_rate": 0.0002, + "loss": 0.2509, + "step": 260 + }, + { + "epoch": 0.05661564269238834, + "grad_norm": 1.9733415842056274, + "learning_rate": 0.0002, + "loss": 0.2499, + "step": 270 + }, + { + "epoch": 0.05871251834766198, + "grad_norm": 0.1733933985233307, + "learning_rate": 0.0002, + "loss": 0.2478, + "step": 280 + }, + { + "epoch": 0.06080939400293563, + "grad_norm": 5.54453706741333, + "learning_rate": 0.0002, + "loss": 0.2746, + "step": 290 + }, + { + "epoch": 0.06290626965820927, + "grad_norm": 0.30464133620262146, + "learning_rate": 0.0002, + "loss": 0.3196, + "step": 300 + }, + { + "epoch": 0.0650031453134829, + "grad_norm": 5.397165298461914, + "learning_rate": 0.0002, + "loss": 0.3544, + "step": 310 + }, + { + "epoch": 0.06710002096875656, + "grad_norm": 0.21483393013477325, + "learning_rate": 0.0002, + "loss": 0.2509, + "step": 320 + }, + { + "epoch": 0.0691968966240302, + "grad_norm": 0.24638743698596954, + "learning_rate": 0.0002, + "loss": 0.2819, + "step": 330 + }, + { + "epoch": 0.07129377227930384, + "grad_norm": 9.33228874206543, + "learning_rate": 0.0002, + "loss": 0.3348, + "step": 340 + }, + { + "epoch": 0.07339064793457747, + "grad_norm": 2.1489148139953613, + "learning_rate": 0.0002, + "loss": 0.2708, + "step": 350 + }, + { + "epoch": 0.07548752358985113, + "grad_norm": 0.23243466019630432, + "learning_rate": 0.0002, + "loss": 0.249, + "step": 360 + }, + { + "epoch": 0.07758439924512477, + "grad_norm": 0.13301965594291687, + "learning_rate": 0.0002, + "loss": 0.2739, + "step": 370 + }, + { + "epoch": 0.0796812749003984, + "grad_norm": 0.2184901237487793, + "learning_rate": 0.0002, + "loss": 0.2718, + "step": 380 + }, + { + "epoch": 0.08177815055567204, + "grad_norm": 5.580859661102295, + "learning_rate": 0.0002, + "loss": 0.2882, + "step": 390 + }, + { + "epoch": 0.0838750262109457, + "grad_norm": 0.2310085892677307, + "learning_rate": 0.0002, + "loss": 0.2837, + "step": 400 + }, + { + "epoch": 0.08597190186621934, + "grad_norm": 0.16177065670490265, + "learning_rate": 0.0002, + "loss": 0.2476, + "step": 410 + }, + { + "epoch": 0.08806877752149297, + "grad_norm": 3.8961215019226074, + "learning_rate": 0.0002, + "loss": 0.3048, + "step": 420 + }, + { + "epoch": 0.09016565317676661, + "grad_norm": 0.09581358730792999, + "learning_rate": 0.0002, + "loss": 0.3008, + "step": 430 + }, + { + "epoch": 0.09226252883204027, + "grad_norm": 0.12218118458986282, + "learning_rate": 0.0002, + "loss": 0.2672, + "step": 440 + }, + { + "epoch": 0.0943594044873139, + "grad_norm": 0.11212728172540665, + "learning_rate": 0.0002, + "loss": 0.2465, + "step": 450 + }, + { + "epoch": 0.09645628014258754, + "grad_norm": 0.16998110711574554, + "learning_rate": 0.0002, + "loss": 0.268, + "step": 460 + }, + { + "epoch": 0.09855315579786118, + "grad_norm": 1.1045947074890137, + "learning_rate": 0.0002, + "loss": 0.249, + "step": 470 + }, + { + "epoch": 0.10065003145313482, + "grad_norm": 0.0940975770354271, + "learning_rate": 0.0002, + "loss": 0.2994, + "step": 480 + }, + { + "epoch": 0.10274690710840848, + "grad_norm": 0.1036498099565506, + "learning_rate": 0.0002, + "loss": 0.2763, + "step": 490 + }, + { + "epoch": 0.10484378276368211, + "grad_norm": 0.42365631461143494, + "learning_rate": 0.0002, + "loss": 0.2984, + "step": 500 + }, + { + "epoch": 0.10694065841895575, + "grad_norm": 0.08997116982936859, + "learning_rate": 0.0002, + "loss": 0.2476, + "step": 510 + }, + { + "epoch": 0.10903753407422939, + "grad_norm": 0.15138301253318787, + "learning_rate": 0.0002, + "loss": 0.2439, + "step": 520 + }, + { + "epoch": 0.11113440972950304, + "grad_norm": 2.1935176849365234, + "learning_rate": 0.0002, + "loss": 0.334, + "step": 530 + }, + { + "epoch": 0.11323128538477668, + "grad_norm": 0.15546467900276184, + "learning_rate": 0.0002, + "loss": 0.2875, + "step": 540 + }, + { + "epoch": 0.11532816104005032, + "grad_norm": 0.15564927458763123, + "learning_rate": 0.0002, + "loss": 0.2735, + "step": 550 + }, + { + "epoch": 0.11742503669532396, + "grad_norm": 0.1129130944609642, + "learning_rate": 0.0002, + "loss": 0.3484, + "step": 560 + }, + { + "epoch": 0.11952191235059761, + "grad_norm": 0.2688151001930237, + "learning_rate": 0.0002, + "loss": 0.28, + "step": 570 + }, + { + "epoch": 0.12161878800587125, + "grad_norm": 0.13618075847625732, + "learning_rate": 0.0002, + "loss": 0.3393, + "step": 580 + }, + { + "epoch": 0.12371566366114489, + "grad_norm": 0.1488252729177475, + "learning_rate": 0.0002, + "loss": 0.3001, + "step": 590 + }, + { + "epoch": 0.12581253931641853, + "grad_norm": 0.10563705116510391, + "learning_rate": 0.0002, + "loss": 0.3055, + "step": 600 + }, + { + "epoch": 0.12790941497169217, + "grad_norm": 0.07457701861858368, + "learning_rate": 0.0002, + "loss": 0.3068, + "step": 610 + }, + { + "epoch": 0.1300062906269658, + "grad_norm": 0.5072939991950989, + "learning_rate": 0.0002, + "loss": 0.2483, + "step": 620 + }, + { + "epoch": 0.13210316628223948, + "grad_norm": 0.20537391304969788, + "learning_rate": 0.0002, + "loss": 0.307, + "step": 630 + }, + { + "epoch": 0.13420004193751311, + "grad_norm": 0.144338920712471, + "learning_rate": 0.0002, + "loss": 0.275, + "step": 640 + }, + { + "epoch": 0.13629691759278675, + "grad_norm": 0.1316433846950531, + "learning_rate": 0.0002, + "loss": 0.2458, + "step": 650 + }, + { + "epoch": 0.1383937932480604, + "grad_norm": 3.318430185317993, + "learning_rate": 0.0002, + "loss": 0.3049, + "step": 660 + }, + { + "epoch": 0.14049066890333403, + "grad_norm": 0.08888725936412811, + "learning_rate": 0.0002, + "loss": 0.2817, + "step": 670 + }, + { + "epoch": 0.14258754455860767, + "grad_norm": 0.13618157804012299, + "learning_rate": 0.0002, + "loss": 0.2855, + "step": 680 + }, + { + "epoch": 0.1446844202138813, + "grad_norm": 0.1266484558582306, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 690 + }, + { + "epoch": 0.14678129586915495, + "grad_norm": 0.11817031353712082, + "learning_rate": 0.0002, + "loss": 0.2691, + "step": 700 + }, + { + "epoch": 0.14887817152442862, + "grad_norm": 0.12560369074344635, + "learning_rate": 0.0002, + "loss": 0.2685, + "step": 710 + }, + { + "epoch": 0.15097504717970225, + "grad_norm": 0.1666841357946396, + "learning_rate": 0.0002, + "loss": 0.2776, + "step": 720 + }, + { + "epoch": 0.1530719228349759, + "grad_norm": 0.16758698225021362, + "learning_rate": 0.0002, + "loss": 0.2887, + "step": 730 + }, + { + "epoch": 0.15516879849024953, + "grad_norm": 0.821657121181488, + "learning_rate": 0.0002, + "loss": 0.3388, + "step": 740 + }, + { + "epoch": 0.15726567414552317, + "grad_norm": 0.11897563934326172, + "learning_rate": 0.0002, + "loss": 0.2833, + "step": 750 + }, + { + "epoch": 0.1593625498007968, + "grad_norm": 2.6172780990600586, + "learning_rate": 0.0002, + "loss": 0.2832, + "step": 760 + }, + { + "epoch": 0.16145942545607045, + "grad_norm": 2.524484157562256, + "learning_rate": 0.0002, + "loss": 0.3135, + "step": 770 + }, + { + "epoch": 0.1635563011113441, + "grad_norm": 1.6356699466705322, + "learning_rate": 0.0002, + "loss": 0.267, + "step": 780 + }, + { + "epoch": 0.16565317676661773, + "grad_norm": 1.7804538011550903, + "learning_rate": 0.0002, + "loss": 0.3106, + "step": 790 + }, + { + "epoch": 0.1677500524218914, + "grad_norm": 0.07052750140428543, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 800 + }, + { + "epoch": 0.16984692807716503, + "grad_norm": 0.11844762414693832, + "learning_rate": 0.0002, + "loss": 0.2458, + "step": 810 + }, + { + "epoch": 0.17194380373243867, + "grad_norm": 0.0921357050538063, + "learning_rate": 0.0002, + "loss": 0.2461, + "step": 820 + }, + { + "epoch": 0.1740406793877123, + "grad_norm": 0.1175508201122284, + "learning_rate": 0.0002, + "loss": 0.2725, + "step": 830 + }, + { + "epoch": 0.17613755504298595, + "grad_norm": 0.11097151786088943, + "learning_rate": 0.0002, + "loss": 0.2694, + "step": 840 + }, + { + "epoch": 0.1782344306982596, + "grad_norm": 2.3500242233276367, + "learning_rate": 0.0002, + "loss": 0.3577, + "step": 850 + }, + { + "epoch": 0.18033130635353323, + "grad_norm": 0.46957358717918396, + "learning_rate": 0.0002, + "loss": 0.3253, + "step": 860 + }, + { + "epoch": 0.18242818200880687, + "grad_norm": 0.24576787650585175, + "learning_rate": 0.0002, + "loss": 0.2529, + "step": 870 + }, + { + "epoch": 0.18452505766408053, + "grad_norm": 0.21132692694664001, + "learning_rate": 0.0002, + "loss": 0.2894, + "step": 880 + }, + { + "epoch": 0.18662193331935417, + "grad_norm": 0.16808678209781647, + "learning_rate": 0.0002, + "loss": 0.2559, + "step": 890 + }, + { + "epoch": 0.1887188089746278, + "grad_norm": 0.1904708743095398, + "learning_rate": 0.0002, + "loss": 0.2703, + "step": 900 + }, + { + "epoch": 0.19081568462990145, + "grad_norm": 0.0702681764960289, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 910 + }, + { + "epoch": 0.1929125602851751, + "grad_norm": 0.16173458099365234, + "learning_rate": 0.0002, + "loss": 0.2731, + "step": 920 + }, + { + "epoch": 0.19500943594044873, + "grad_norm": 0.1556364893913269, + "learning_rate": 0.0002, + "loss": 0.2642, + "step": 930 + }, + { + "epoch": 0.19710631159572237, + "grad_norm": 0.09365347027778625, + "learning_rate": 0.0002, + "loss": 0.2828, + "step": 940 + }, + { + "epoch": 0.199203187250996, + "grad_norm": 0.12134792655706406, + "learning_rate": 0.0002, + "loss": 0.2459, + "step": 950 + }, + { + "epoch": 0.20130006290626964, + "grad_norm": 0.14340221881866455, + "learning_rate": 0.0002, + "loss": 0.3118, + "step": 960 + }, + { + "epoch": 0.2033969385615433, + "grad_norm": 0.15696048736572266, + "learning_rate": 0.0002, + "loss": 0.2679, + "step": 970 + }, + { + "epoch": 0.20549381421681695, + "grad_norm": 0.33497804403305054, + "learning_rate": 0.0002, + "loss": 0.3172, + "step": 980 + }, + { + "epoch": 0.2075906898720906, + "grad_norm": 0.15141892433166504, + "learning_rate": 0.0002, + "loss": 0.2592, + "step": 990 + }, + { + "epoch": 0.20968756552736423, + "grad_norm": 0.10774557292461395, + "learning_rate": 0.0002, + "loss": 0.3013, + "step": 1000 + }, + { + "epoch": 0.21178444118263787, + "grad_norm": 0.09470309317111969, + "learning_rate": 0.0002, + "loss": 0.2705, + "step": 1010 + }, + { + "epoch": 0.2138813168379115, + "grad_norm": 0.12090291827917099, + "learning_rate": 0.0002, + "loss": 0.2576, + "step": 1020 + }, + { + "epoch": 0.21597819249318514, + "grad_norm": 0.12572237849235535, + "learning_rate": 0.0002, + "loss": 0.2671, + "step": 1030 + }, + { + "epoch": 0.21807506814845878, + "grad_norm": 0.19516214728355408, + "learning_rate": 0.0002, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.22017194380373245, + "grad_norm": 0.08513014763593674, + "learning_rate": 0.0002, + "loss": 0.3277, + "step": 1050 + }, + { + "epoch": 0.2222688194590061, + "grad_norm": 0.14394588768482208, + "learning_rate": 0.0002, + "loss": 0.3265, + "step": 1060 + }, + { + "epoch": 0.22436569511427973, + "grad_norm": 3.9625415802001953, + "learning_rate": 0.0002, + "loss": 0.2838, + "step": 1070 + }, + { + "epoch": 0.22646257076955337, + "grad_norm": 1.3177988529205322, + "learning_rate": 0.0002, + "loss": 0.2482, + "step": 1080 + }, + { + "epoch": 0.228559446424827, + "grad_norm": 0.1534273624420166, + "learning_rate": 0.0002, + "loss": 0.2459, + "step": 1090 + }, + { + "epoch": 0.23065632208010065, + "grad_norm": 0.18359704315662384, + "learning_rate": 0.0002, + "loss": 0.2471, + "step": 1100 + }, + { + "epoch": 0.23275319773537428, + "grad_norm": 0.1258287876844406, + "learning_rate": 0.0002, + "loss": 0.2448, + "step": 1110 + }, + { + "epoch": 0.23485007339064792, + "grad_norm": 0.19109375774860382, + "learning_rate": 0.0002, + "loss": 0.3102, + "step": 1120 + }, + { + "epoch": 0.2369469490459216, + "grad_norm": 2.0408198833465576, + "learning_rate": 0.0002, + "loss": 0.2702, + "step": 1130 + }, + { + "epoch": 0.23904382470119523, + "grad_norm": 0.06613194197416306, + "learning_rate": 0.0002, + "loss": 0.2669, + "step": 1140 + }, + { + "epoch": 0.24114070035646887, + "grad_norm": 0.11054085940122604, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 1150 + }, + { + "epoch": 0.2432375760117425, + "grad_norm": 1.8500365018844604, + "learning_rate": 0.0002, + "loss": 0.2886, + "step": 1160 + }, + { + "epoch": 0.24533445166701615, + "grad_norm": 0.12011546641588211, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 1170 + }, + { + "epoch": 0.24743132732228978, + "grad_norm": 0.18363837897777557, + "learning_rate": 0.0002, + "loss": 0.3159, + "step": 1180 + }, + { + "epoch": 0.24952820297756342, + "grad_norm": 0.07881256937980652, + "learning_rate": 0.0002, + "loss": 0.2481, + "step": 1190 + }, + { + "epoch": 0.25162507863283706, + "grad_norm": 0.08021403849124908, + "learning_rate": 0.0002, + "loss": 0.2977, + "step": 1200 + }, + { + "epoch": 0.25372195428811073, + "grad_norm": 1.153460144996643, + "learning_rate": 0.0002, + "loss": 0.2741, + "step": 1210 + }, + { + "epoch": 0.25581882994338434, + "grad_norm": 0.8089906573295593, + "learning_rate": 0.0002, + "loss": 0.3158, + "step": 1220 + }, + { + "epoch": 0.257915705598658, + "grad_norm": 0.533995509147644, + "learning_rate": 0.0002, + "loss": 0.2876, + "step": 1230 + }, + { + "epoch": 0.2600125812539316, + "grad_norm": 2.0046756267547607, + "learning_rate": 0.0002, + "loss": 0.3035, + "step": 1240 + }, + { + "epoch": 0.2621094569092053, + "grad_norm": 0.13666591048240662, + "learning_rate": 0.0002, + "loss": 0.2692, + "step": 1250 + }, + { + "epoch": 0.26420633256447895, + "grad_norm": 3.1030569076538086, + "learning_rate": 0.0002, + "loss": 0.2813, + "step": 1260 + }, + { + "epoch": 0.26630320821975256, + "grad_norm": 0.08713892847299576, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 1270 + }, + { + "epoch": 0.26840008387502623, + "grad_norm": 1.8841831684112549, + "learning_rate": 0.0002, + "loss": 0.3362, + "step": 1280 + }, + { + "epoch": 0.27049695953029984, + "grad_norm": 0.3094707429409027, + "learning_rate": 0.0002, + "loss": 0.293, + "step": 1290 + }, + { + "epoch": 0.2725938351855735, + "grad_norm": 0.10224305093288422, + "learning_rate": 0.0002, + "loss": 0.2461, + "step": 1300 + }, + { + "epoch": 0.2746907108408471, + "grad_norm": 0.14384222030639648, + "learning_rate": 0.0002, + "loss": 0.3165, + "step": 1310 + }, + { + "epoch": 0.2767875864961208, + "grad_norm": 2.114215850830078, + "learning_rate": 0.0002, + "loss": 0.2484, + "step": 1320 + }, + { + "epoch": 0.2788844621513944, + "grad_norm": 0.0842517539858818, + "learning_rate": 0.0002, + "loss": 0.2698, + "step": 1330 + }, + { + "epoch": 0.28098133780666806, + "grad_norm": 0.06867516785860062, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 1340 + }, + { + "epoch": 0.28307821346194173, + "grad_norm": 0.0686967521905899, + "learning_rate": 0.0002, + "loss": 0.247, + "step": 1350 + }, + { + "epoch": 0.28517508911721534, + "grad_norm": 1.7461965084075928, + "learning_rate": 0.0002, + "loss": 0.2686, + "step": 1360 + }, + { + "epoch": 0.287271964772489, + "grad_norm": 0.09826788306236267, + "learning_rate": 0.0002, + "loss": 0.2804, + "step": 1370 + }, + { + "epoch": 0.2893688404277626, + "grad_norm": 0.09541603922843933, + "learning_rate": 0.0002, + "loss": 0.2871, + "step": 1380 + }, + { + "epoch": 0.2914657160830363, + "grad_norm": 0.12282253801822662, + "learning_rate": 0.0002, + "loss": 0.3452, + "step": 1390 + }, + { + "epoch": 0.2935625917383099, + "grad_norm": 0.09135173261165619, + "learning_rate": 0.0002, + "loss": 0.2458, + "step": 1400 + }, + { + "epoch": 0.29565946739358356, + "grad_norm": 1.8899190425872803, + "learning_rate": 0.0002, + "loss": 0.2687, + "step": 1410 + }, + { + "epoch": 0.29775634304885723, + "grad_norm": 0.13118763267993927, + "learning_rate": 0.0002, + "loss": 0.3037, + "step": 1420 + }, + { + "epoch": 0.29985321870413084, + "grad_norm": 1.688429355621338, + "learning_rate": 0.0002, + "loss": 0.299, + "step": 1430 + }, + { + "epoch": 0.3019500943594045, + "grad_norm": 0.11612743139266968, + "learning_rate": 0.0002, + "loss": 0.2699, + "step": 1440 + }, + { + "epoch": 0.3040469700146781, + "grad_norm": 0.14019401371479034, + "learning_rate": 0.0002, + "loss": 0.2503, + "step": 1450 + }, + { + "epoch": 0.3061438456699518, + "grad_norm": 0.8129562735557556, + "learning_rate": 0.0002, + "loss": 0.2853, + "step": 1460 + }, + { + "epoch": 0.3082407213252254, + "grad_norm": 5.135369777679443, + "learning_rate": 0.0002, + "loss": 0.2949, + "step": 1470 + }, + { + "epoch": 0.31033759698049906, + "grad_norm": 0.11974827200174332, + "learning_rate": 0.0002, + "loss": 0.2485, + "step": 1480 + }, + { + "epoch": 0.3124344726357727, + "grad_norm": 0.10319698601961136, + "learning_rate": 0.0002, + "loss": 0.2879, + "step": 1490 + }, + { + "epoch": 0.31453134829104634, + "grad_norm": 0.09822767227888107, + "learning_rate": 0.0002, + "loss": 0.2452, + "step": 1500 + }, + { + "epoch": 0.31662822394632, + "grad_norm": 0.09973836690187454, + "learning_rate": 0.0002, + "loss": 0.2777, + "step": 1510 + }, + { + "epoch": 0.3187250996015936, + "grad_norm": 16.39473533630371, + "learning_rate": 0.0002, + "loss": 0.2979, + "step": 1520 + }, + { + "epoch": 0.3208219752568673, + "grad_norm": 0.07360737025737762, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 1530 + }, + { + "epoch": 0.3229188509121409, + "grad_norm": 0.11402890831232071, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 1540 + }, + { + "epoch": 0.32501572656741456, + "grad_norm": 0.09771336615085602, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 1550 + }, + { + "epoch": 0.3271126022226882, + "grad_norm": 1.3607581853866577, + "learning_rate": 0.0002, + "loss": 0.3091, + "step": 1560 + }, + { + "epoch": 0.32920947787796184, + "grad_norm": 0.09807518124580383, + "learning_rate": 0.0002, + "loss": 0.2491, + "step": 1570 + }, + { + "epoch": 0.33130635353323545, + "grad_norm": 0.098614901304245, + "learning_rate": 0.0002, + "loss": 0.344, + "step": 1580 + }, + { + "epoch": 0.3334032291885091, + "grad_norm": 0.08970670402050018, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 1590 + }, + { + "epoch": 0.3355001048437828, + "grad_norm": 0.11069203168153763, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 1600 + }, + { + "epoch": 0.3375969804990564, + "grad_norm": 0.1457228660583496, + "learning_rate": 0.0002, + "loss": 0.3231, + "step": 1610 + }, + { + "epoch": 0.33969385615433006, + "grad_norm": 0.12434522062540054, + "learning_rate": 0.0002, + "loss": 0.2892, + "step": 1620 + }, + { + "epoch": 0.3417907318096037, + "grad_norm": 0.14981336891651154, + "learning_rate": 0.0002, + "loss": 0.2437, + "step": 1630 + }, + { + "epoch": 0.34388760746487734, + "grad_norm": 1.7012590169906616, + "learning_rate": 0.0002, + "loss": 0.2796, + "step": 1640 + }, + { + "epoch": 0.34598448312015095, + "grad_norm": 0.10405845195055008, + "learning_rate": 0.0002, + "loss": 0.3362, + "step": 1650 + }, + { + "epoch": 0.3480813587754246, + "grad_norm": 0.19532528519630432, + "learning_rate": 0.0002, + "loss": 0.2668, + "step": 1660 + }, + { + "epoch": 0.35017823443069823, + "grad_norm": 0.08479733765125275, + "learning_rate": 0.0002, + "loss": 0.2911, + "step": 1670 + }, + { + "epoch": 0.3522751100859719, + "grad_norm": 1.8583425283432007, + "learning_rate": 0.0002, + "loss": 0.2697, + "step": 1680 + }, + { + "epoch": 0.35437198574124557, + "grad_norm": 0.08247324079275131, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 1690 + }, + { + "epoch": 0.3564688613965192, + "grad_norm": 2.080596923828125, + "learning_rate": 0.0002, + "loss": 0.2691, + "step": 1700 + }, + { + "epoch": 0.35856573705179284, + "grad_norm": 0.07867144048213959, + "learning_rate": 0.0002, + "loss": 0.2624, + "step": 1710 + }, + { + "epoch": 0.36066261270706645, + "grad_norm": 0.15009847283363342, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 1720 + }, + { + "epoch": 0.3627594883623401, + "grad_norm": 0.0880853533744812, + "learning_rate": 0.0002, + "loss": 0.3024, + "step": 1730 + }, + { + "epoch": 0.36485636401761373, + "grad_norm": 0.1102822870016098, + "learning_rate": 0.0002, + "loss": 0.2654, + "step": 1740 + }, + { + "epoch": 0.3669532396728874, + "grad_norm": 0.20041993260383606, + "learning_rate": 0.0002, + "loss": 0.3052, + "step": 1750 + }, + { + "epoch": 0.36905011532816107, + "grad_norm": 0.07399024814367294, + "learning_rate": 0.0002, + "loss": 0.3312, + "step": 1760 + }, + { + "epoch": 0.3711469909834347, + "grad_norm": 0.2129296362400055, + "learning_rate": 0.0002, + "loss": 0.2788, + "step": 1770 + }, + { + "epoch": 0.37324386663870834, + "grad_norm": 0.09153830260038376, + "learning_rate": 0.0002, + "loss": 0.2726, + "step": 1780 + }, + { + "epoch": 0.37534074229398195, + "grad_norm": 0.09638633579015732, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 1790 + }, + { + "epoch": 0.3774376179492556, + "grad_norm": 0.12968820333480835, + "learning_rate": 0.0002, + "loss": 0.2456, + "step": 1800 + }, + { + "epoch": 0.37953449360452923, + "grad_norm": 0.10243703424930573, + "learning_rate": 0.0002, + "loss": 0.287, + "step": 1810 + }, + { + "epoch": 0.3816313692598029, + "grad_norm": 0.09311744570732117, + "learning_rate": 0.0002, + "loss": 0.2971, + "step": 1820 + }, + { + "epoch": 0.3837282449150765, + "grad_norm": 0.09674180299043655, + "learning_rate": 0.0002, + "loss": 0.2549, + "step": 1830 + }, + { + "epoch": 0.3858251205703502, + "grad_norm": 0.13742677867412567, + "learning_rate": 0.0002, + "loss": 0.245, + "step": 1840 + }, + { + "epoch": 0.38792199622562384, + "grad_norm": 1.1147104501724243, + "learning_rate": 0.0002, + "loss": 0.3029, + "step": 1850 + }, + { + "epoch": 0.39001887188089746, + "grad_norm": 0.06606106460094452, + "learning_rate": 0.0002, + "loss": 0.2708, + "step": 1860 + }, + { + "epoch": 0.3921157475361711, + "grad_norm": 0.06749361008405685, + "learning_rate": 0.0002, + "loss": 0.2616, + "step": 1870 + }, + { + "epoch": 0.39421262319144473, + "grad_norm": 0.07357250154018402, + "learning_rate": 0.0002, + "loss": 0.2621, + "step": 1880 + }, + { + "epoch": 0.3963094988467184, + "grad_norm": 0.10349666327238083, + "learning_rate": 0.0002, + "loss": 0.2466, + "step": 1890 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 0.14224931597709656, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 1900 + }, + { + "epoch": 0.4005032501572657, + "grad_norm": 0.10312917083501816, + "learning_rate": 0.0002, + "loss": 0.2435, + "step": 1910 + }, + { + "epoch": 0.4026001258125393, + "grad_norm": 0.10063254833221436, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 1920 + }, + { + "epoch": 0.40469700146781296, + "grad_norm": 0.1020762100815773, + "learning_rate": 0.0002, + "loss": 0.2549, + "step": 1930 + }, + { + "epoch": 0.4067938771230866, + "grad_norm": 0.09656281769275665, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 1940 + }, + { + "epoch": 0.40889075277836023, + "grad_norm": 0.14976176619529724, + "learning_rate": 0.0002, + "loss": 0.27, + "step": 1950 + }, + { + "epoch": 0.4109876284336339, + "grad_norm": 1.410688042640686, + "learning_rate": 0.0002, + "loss": 0.2953, + "step": 1960 + }, + { + "epoch": 0.4130845040889075, + "grad_norm": 0.14357423782348633, + "learning_rate": 0.0002, + "loss": 0.2664, + "step": 1970 + }, + { + "epoch": 0.4151813797441812, + "grad_norm": 0.08045023679733276, + "learning_rate": 0.0002, + "loss": 0.2645, + "step": 1980 + }, + { + "epoch": 0.4172782553994548, + "grad_norm": 1.9619622230529785, + "learning_rate": 0.0002, + "loss": 0.2757, + "step": 1990 + }, + { + "epoch": 0.41937513105472846, + "grad_norm": 0.06940364837646484, + "learning_rate": 0.0002, + "loss": 0.2685, + "step": 2000 + }, + { + "epoch": 0.4214720067100021, + "grad_norm": 0.11344420164823532, + "learning_rate": 0.0002, + "loss": 0.2834, + "step": 2010 + }, + { + "epoch": 0.42356888236527573, + "grad_norm": 0.08228524029254913, + "learning_rate": 0.0002, + "loss": 0.2635, + "step": 2020 + }, + { + "epoch": 0.4256657580205494, + "grad_norm": 0.07158046960830688, + "learning_rate": 0.0002, + "loss": 0.2528, + "step": 2030 + }, + { + "epoch": 0.427762633675823, + "grad_norm": 4.314914703369141, + "learning_rate": 0.0002, + "loss": 0.2649, + "step": 2040 + }, + { + "epoch": 0.4298595093310967, + "grad_norm": 0.16018514335155487, + "learning_rate": 0.0002, + "loss": 0.2586, + "step": 2050 + }, + { + "epoch": 0.4319563849863703, + "grad_norm": 1.0571707487106323, + "learning_rate": 0.0002, + "loss": 0.2592, + "step": 2060 + }, + { + "epoch": 0.43405326064164396, + "grad_norm": 0.48940908908843994, + "learning_rate": 0.0002, + "loss": 0.2692, + "step": 2070 + }, + { + "epoch": 0.43615013629691757, + "grad_norm": 0.07459184527397156, + "learning_rate": 0.0002, + "loss": 0.2498, + "step": 2080 + }, + { + "epoch": 0.43824701195219123, + "grad_norm": 0.07219909131526947, + "learning_rate": 0.0002, + "loss": 0.2599, + "step": 2090 + }, + { + "epoch": 0.4403438876074649, + "grad_norm": 0.06879916042089462, + "learning_rate": 0.0002, + "loss": 0.2565, + "step": 2100 + }, + { + "epoch": 0.4424407632627385, + "grad_norm": 1.8906136751174927, + "learning_rate": 0.0002, + "loss": 0.2874, + "step": 2110 + }, + { + "epoch": 0.4445376389180122, + "grad_norm": 0.10700809955596924, + "learning_rate": 0.0002, + "loss": 0.2437, + "step": 2120 + }, + { + "epoch": 0.4466345145732858, + "grad_norm": 0.06732141971588135, + "learning_rate": 0.0002, + "loss": 0.2432, + "step": 2130 + }, + { + "epoch": 0.44873139022855946, + "grad_norm": 0.11520029604434967, + "learning_rate": 0.0002, + "loss": 0.2463, + "step": 2140 + }, + { + "epoch": 0.45082826588383307, + "grad_norm": 0.08251114189624786, + "learning_rate": 0.0002, + "loss": 0.2658, + "step": 2150 + }, + { + "epoch": 0.45292514153910673, + "grad_norm": 0.10146190226078033, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 2160 + }, + { + "epoch": 0.45502201719438035, + "grad_norm": 0.31207767128944397, + "learning_rate": 0.0002, + "loss": 0.2574, + "step": 2170 + }, + { + "epoch": 0.457118892849654, + "grad_norm": 0.14245358109474182, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 2180 + }, + { + "epoch": 0.4592157685049277, + "grad_norm": 0.08076608926057816, + "learning_rate": 0.0002, + "loss": 0.2912, + "step": 2190 + }, + { + "epoch": 0.4613126441602013, + "grad_norm": 0.09804554283618927, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 2200 + }, + { + "epoch": 0.46340951981547496, + "grad_norm": 0.09204811602830887, + "learning_rate": 0.0002, + "loss": 0.2421, + "step": 2210 + }, + { + "epoch": 0.46550639547074857, + "grad_norm": 0.07955732196569443, + "learning_rate": 0.0002, + "loss": 0.3149, + "step": 2220 + }, + { + "epoch": 0.46760327112602224, + "grad_norm": 0.1081770807504654, + "learning_rate": 0.0002, + "loss": 0.2477, + "step": 2230 + }, + { + "epoch": 0.46970014678129585, + "grad_norm": 0.1168019101023674, + "learning_rate": 0.0002, + "loss": 0.303, + "step": 2240 + }, + { + "epoch": 0.4717970224365695, + "grad_norm": 0.11181426793336868, + "learning_rate": 0.0002, + "loss": 0.2832, + "step": 2250 + }, + { + "epoch": 0.4738938980918432, + "grad_norm": 0.11993461847305298, + "learning_rate": 0.0002, + "loss": 0.2953, + "step": 2260 + }, + { + "epoch": 0.4759907737471168, + "grad_norm": 0.0672910287976265, + "learning_rate": 0.0002, + "loss": 0.244, + "step": 2270 + }, + { + "epoch": 0.47808764940239046, + "grad_norm": 0.5338168144226074, + "learning_rate": 0.0002, + "loss": 0.2712, + "step": 2280 + }, + { + "epoch": 0.48018452505766407, + "grad_norm": 0.08533361554145813, + "learning_rate": 0.0002, + "loss": 0.2579, + "step": 2290 + }, + { + "epoch": 0.48228140071293774, + "grad_norm": 2.0753543376922607, + "learning_rate": 0.0002, + "loss": 0.2879, + "step": 2300 + }, + { + "epoch": 0.48437827636821135, + "grad_norm": 0.12475968152284622, + "learning_rate": 0.0002, + "loss": 0.2439, + "step": 2310 + }, + { + "epoch": 0.486475152023485, + "grad_norm": 0.07769046723842621, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 2320 + }, + { + "epoch": 0.4885720276787586, + "grad_norm": 0.45657530426979065, + "learning_rate": 0.0002, + "loss": 0.2415, + "step": 2330 + }, + { + "epoch": 0.4906689033340323, + "grad_norm": 2.8291752338409424, + "learning_rate": 0.0002, + "loss": 0.2837, + "step": 2340 + }, + { + "epoch": 0.49276577898930596, + "grad_norm": 0.16259396076202393, + "learning_rate": 0.0002, + "loss": 0.2737, + "step": 2350 + }, + { + "epoch": 0.49486265464457957, + "grad_norm": 0.27614033222198486, + "learning_rate": 0.0002, + "loss": 0.245, + "step": 2360 + }, + { + "epoch": 0.49695953029985324, + "grad_norm": 0.20377802848815918, + "learning_rate": 0.0002, + "loss": 0.2555, + "step": 2370 + }, + { + "epoch": 0.49905640595512685, + "grad_norm": 0.1173492893576622, + "learning_rate": 0.0002, + "loss": 0.2459, + "step": 2380 + }, + { + "epoch": 0.5011532816104005, + "grad_norm": 0.2819477617740631, + "learning_rate": 0.0002, + "loss": 0.3102, + "step": 2390 + }, + { + "epoch": 0.5032501572656741, + "grad_norm": 0.11425253748893738, + "learning_rate": 0.0002, + "loss": 0.244, + "step": 2400 + }, + { + "epoch": 0.5053470329209477, + "grad_norm": 1.2937549352645874, + "learning_rate": 0.0002, + "loss": 0.329, + "step": 2410 + }, + { + "epoch": 0.5074439085762215, + "grad_norm": 0.16594870388507843, + "learning_rate": 0.0002, + "loss": 0.262, + "step": 2420 + }, + { + "epoch": 0.5095407842314951, + "grad_norm": 1.6648545265197754, + "learning_rate": 0.0002, + "loss": 0.2893, + "step": 2430 + }, + { + "epoch": 0.5116376598867687, + "grad_norm": 0.06415042281150818, + "learning_rate": 0.0002, + "loss": 0.2481, + "step": 2440 + }, + { + "epoch": 0.5137345355420424, + "grad_norm": 0.0840664952993393, + "learning_rate": 0.0002, + "loss": 0.2687, + "step": 2450 + }, + { + "epoch": 0.515831411197316, + "grad_norm": 1.8631309270858765, + "learning_rate": 0.0002, + "loss": 0.27, + "step": 2460 + }, + { + "epoch": 0.5179282868525896, + "grad_norm": 0.08405490219593048, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 2470 + }, + { + "epoch": 0.5200251625078632, + "grad_norm": 0.10409712791442871, + "learning_rate": 0.0002, + "loss": 0.244, + "step": 2480 + }, + { + "epoch": 0.522122038163137, + "grad_norm": 0.08199995011091232, + "learning_rate": 0.0002, + "loss": 0.2448, + "step": 2490 + }, + { + "epoch": 0.5242189138184106, + "grad_norm": 0.0819941982626915, + "learning_rate": 0.0002, + "loss": 0.2859, + "step": 2500 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.07962234318256378, + "learning_rate": 0.0002, + "loss": 0.2777, + "step": 2510 + }, + { + "epoch": 0.5284126651289579, + "grad_norm": 0.09645619243383408, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 2520 + }, + { + "epoch": 0.5305095407842315, + "grad_norm": 0.10941855609416962, + "learning_rate": 0.0002, + "loss": 0.2643, + "step": 2530 + }, + { + "epoch": 0.5326064164395051, + "grad_norm": 0.11371888220310211, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 2540 + }, + { + "epoch": 0.5347032920947787, + "grad_norm": 1.2199335098266602, + "learning_rate": 0.0002, + "loss": 0.2603, + "step": 2550 + }, + { + "epoch": 0.5368001677500525, + "grad_norm": 0.054858073592185974, + "learning_rate": 0.0002, + "loss": 0.2899, + "step": 2560 + }, + { + "epoch": 0.5388970434053261, + "grad_norm": 0.0723327100276947, + "learning_rate": 0.0002, + "loss": 0.2434, + "step": 2570 + }, + { + "epoch": 0.5409939190605997, + "grad_norm": 0.11272138357162476, + "learning_rate": 0.0002, + "loss": 0.2828, + "step": 2580 + }, + { + "epoch": 0.5430907947158734, + "grad_norm": 0.07356975972652435, + "learning_rate": 0.0002, + "loss": 0.2623, + "step": 2590 + }, + { + "epoch": 0.545187670371147, + "grad_norm": 0.12150203436613083, + "learning_rate": 0.0002, + "loss": 0.2496, + "step": 2600 + }, + { + "epoch": 0.5472845460264206, + "grad_norm": 0.1342150866985321, + "learning_rate": 0.0002, + "loss": 0.2574, + "step": 2610 + }, + { + "epoch": 0.5493814216816942, + "grad_norm": 0.10773837566375732, + "learning_rate": 0.0002, + "loss": 0.2426, + "step": 2620 + }, + { + "epoch": 0.551478297336968, + "grad_norm": 0.12423650175333023, + "learning_rate": 0.0002, + "loss": 0.2621, + "step": 2630 + }, + { + "epoch": 0.5535751729922416, + "grad_norm": 0.28494974970817566, + "learning_rate": 0.0002, + "loss": 0.2926, + "step": 2640 + }, + { + "epoch": 0.5556720486475152, + "grad_norm": 0.08630665391683578, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 2650 + }, + { + "epoch": 0.5577689243027888, + "grad_norm": 0.08209650963544846, + "learning_rate": 0.0002, + "loss": 0.3102, + "step": 2660 + }, + { + "epoch": 0.5598657999580625, + "grad_norm": 0.1205248087644577, + "learning_rate": 0.0002, + "loss": 0.2485, + "step": 2670 + }, + { + "epoch": 0.5619626756133361, + "grad_norm": 1.9188755750656128, + "learning_rate": 0.0002, + "loss": 0.265, + "step": 2680 + }, + { + "epoch": 0.5640595512686097, + "grad_norm": 0.07110995054244995, + "learning_rate": 0.0002, + "loss": 0.2709, + "step": 2690 + }, + { + "epoch": 0.5661564269238835, + "grad_norm": 0.1682436764240265, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 2700 + }, + { + "epoch": 0.5682533025791571, + "grad_norm": 1.8173266649246216, + "learning_rate": 0.0002, + "loss": 0.2608, + "step": 2710 + }, + { + "epoch": 0.5703501782344307, + "grad_norm": 0.07287939637899399, + "learning_rate": 0.0002, + "loss": 0.2699, + "step": 2720 + }, + { + "epoch": 0.5724470538897043, + "grad_norm": 0.08551536500453949, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 2730 + }, + { + "epoch": 0.574543929544978, + "grad_norm": 0.17491847276687622, + "learning_rate": 0.0002, + "loss": 0.2873, + "step": 2740 + }, + { + "epoch": 0.5766408052002516, + "grad_norm": 0.06950732320547104, + "learning_rate": 0.0002, + "loss": 0.3286, + "step": 2750 + }, + { + "epoch": 0.5787376808555252, + "grad_norm": 0.1365327090024948, + "learning_rate": 0.0002, + "loss": 0.255, + "step": 2760 + }, + { + "epoch": 0.580834556510799, + "grad_norm": 1.6514012813568115, + "learning_rate": 0.0002, + "loss": 0.3005, + "step": 2770 + }, + { + "epoch": 0.5829314321660726, + "grad_norm": 0.10234736651182175, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 2780 + }, + { + "epoch": 0.5850283078213462, + "grad_norm": 0.11370868235826492, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 2790 + }, + { + "epoch": 0.5871251834766198, + "grad_norm": 0.3800756633281708, + "learning_rate": 0.0002, + "loss": 0.2567, + "step": 2800 + }, + { + "epoch": 0.5892220591318935, + "grad_norm": 0.0628165528178215, + "learning_rate": 0.0002, + "loss": 0.2713, + "step": 2810 + }, + { + "epoch": 0.5913189347871671, + "grad_norm": 0.11104313284158707, + "learning_rate": 0.0002, + "loss": 0.2699, + "step": 2820 + }, + { + "epoch": 0.5934158104424407, + "grad_norm": 0.10503487288951874, + "learning_rate": 0.0002, + "loss": 0.3043, + "step": 2830 + }, + { + "epoch": 0.5955126860977145, + "grad_norm": 0.09854432940483093, + "learning_rate": 0.0002, + "loss": 0.269, + "step": 2840 + }, + { + "epoch": 0.5976095617529881, + "grad_norm": 0.06774982064962387, + "learning_rate": 0.0002, + "loss": 0.2625, + "step": 2850 + }, + { + "epoch": 0.5997064374082617, + "grad_norm": 0.09751401841640472, + "learning_rate": 0.0002, + "loss": 0.2426, + "step": 2860 + }, + { + "epoch": 0.6018033130635353, + "grad_norm": 0.07544960081577301, + "learning_rate": 0.0002, + "loss": 0.2439, + "step": 2870 + }, + { + "epoch": 0.603900188718809, + "grad_norm": 0.5726602673530579, + "learning_rate": 0.0002, + "loss": 0.2875, + "step": 2880 + }, + { + "epoch": 0.6059970643740826, + "grad_norm": 0.08737173676490784, + "learning_rate": 0.0002, + "loss": 0.2657, + "step": 2890 + }, + { + "epoch": 0.6080939400293562, + "grad_norm": 3.507925033569336, + "learning_rate": 0.0002, + "loss": 0.309, + "step": 2900 + }, + { + "epoch": 0.6101908156846299, + "grad_norm": 2.278453826904297, + "learning_rate": 0.0002, + "loss": 0.2669, + "step": 2910 + }, + { + "epoch": 0.6122876913399036, + "grad_norm": 1.2136021852493286, + "learning_rate": 0.0002, + "loss": 0.269, + "step": 2920 + }, + { + "epoch": 0.6143845669951772, + "grad_norm": 0.09733498841524124, + "learning_rate": 0.0002, + "loss": 0.2508, + "step": 2930 + }, + { + "epoch": 0.6164814426504508, + "grad_norm": 0.0754965990781784, + "learning_rate": 0.0002, + "loss": 0.279, + "step": 2940 + }, + { + "epoch": 0.6185783183057245, + "grad_norm": 0.8958814740180969, + "learning_rate": 0.0002, + "loss": 0.2697, + "step": 2950 + }, + { + "epoch": 0.6206751939609981, + "grad_norm": 0.07990656793117523, + "learning_rate": 0.0002, + "loss": 0.2661, + "step": 2960 + }, + { + "epoch": 0.6227720696162717, + "grad_norm": 2.7301337718963623, + "learning_rate": 0.0002, + "loss": 0.3001, + "step": 2970 + }, + { + "epoch": 0.6248689452715454, + "grad_norm": 0.09214778244495392, + "learning_rate": 0.0002, + "loss": 0.3344, + "step": 2980 + }, + { + "epoch": 0.6269658209268191, + "grad_norm": 3.18005633354187, + "learning_rate": 0.0002, + "loss": 0.328, + "step": 2990 + }, + { + "epoch": 0.6290626965820927, + "grad_norm": 0.11004272103309631, + "learning_rate": 0.0002, + "loss": 0.2492, + "step": 3000 + }, + { + "epoch": 0.6311595722373663, + "grad_norm": 5.838320732116699, + "learning_rate": 0.0002, + "loss": 0.2496, + "step": 3010 + }, + { + "epoch": 0.63325644789264, + "grad_norm": 0.18130512535572052, + "learning_rate": 0.0002, + "loss": 0.2471, + "step": 3020 + }, + { + "epoch": 0.6353533235479136, + "grad_norm": 0.09574416279792786, + "learning_rate": 0.0002, + "loss": 0.2726, + "step": 3030 + }, + { + "epoch": 0.6374501992031872, + "grad_norm": 0.128231942653656, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 3040 + }, + { + "epoch": 0.6395470748584609, + "grad_norm": 0.16388727724552155, + "learning_rate": 0.0002, + "loss": 0.2746, + "step": 3050 + }, + { + "epoch": 0.6416439505137346, + "grad_norm": 0.10822491347789764, + "learning_rate": 0.0002, + "loss": 0.2904, + "step": 3060 + }, + { + "epoch": 0.6437408261690082, + "grad_norm": 0.15372033417224884, + "learning_rate": 0.0002, + "loss": 0.2527, + "step": 3070 + }, + { + "epoch": 0.6458377018242818, + "grad_norm": 0.1321578323841095, + "learning_rate": 0.0002, + "loss": 0.2474, + "step": 3080 + }, + { + "epoch": 0.6479345774795554, + "grad_norm": 0.1597270965576172, + "learning_rate": 0.0002, + "loss": 0.2998, + "step": 3090 + }, + { + "epoch": 0.6500314531348291, + "grad_norm": 0.07458209246397018, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 3100 + }, + { + "epoch": 0.6521283287901027, + "grad_norm": 2.394174575805664, + "learning_rate": 0.0002, + "loss": 0.275, + "step": 3110 + }, + { + "epoch": 0.6542252044453764, + "grad_norm": 0.12575379014015198, + "learning_rate": 0.0002, + "loss": 0.247, + "step": 3120 + }, + { + "epoch": 0.6563220801006501, + "grad_norm": 0.8199005722999573, + "learning_rate": 0.0002, + "loss": 0.2463, + "step": 3130 + }, + { + "epoch": 0.6584189557559237, + "grad_norm": 0.15044209361076355, + "learning_rate": 0.0002, + "loss": 0.2937, + "step": 3140 + }, + { + "epoch": 0.6605158314111973, + "grad_norm": 1.6776069402694702, + "learning_rate": 0.0002, + "loss": 0.2731, + "step": 3150 + }, + { + "epoch": 0.6626127070664709, + "grad_norm": 0.12253274023532867, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 3160 + }, + { + "epoch": 0.6647095827217446, + "grad_norm": 0.050848811864852905, + "learning_rate": 0.0002, + "loss": 0.2688, + "step": 3170 + }, + { + "epoch": 0.6668064583770182, + "grad_norm": 0.09614066779613495, + "learning_rate": 0.0002, + "loss": 0.269, + "step": 3180 + }, + { + "epoch": 0.6689033340322919, + "grad_norm": 0.09169217199087143, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 3190 + }, + { + "epoch": 0.6710002096875656, + "grad_norm": 0.13360008597373962, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 3200 + }, + { + "epoch": 0.6730970853428392, + "grad_norm": 0.11396951973438263, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 3210 + }, + { + "epoch": 0.6751939609981128, + "grad_norm": 0.09393135458230972, + "learning_rate": 0.0002, + "loss": 0.2456, + "step": 3220 + }, + { + "epoch": 0.6772908366533864, + "grad_norm": 0.08728672564029694, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 3230 + }, + { + "epoch": 0.6793877123086601, + "grad_norm": 0.07470987737178802, + "learning_rate": 0.0002, + "loss": 0.3351, + "step": 3240 + }, + { + "epoch": 0.6814845879639337, + "grad_norm": 0.13528290390968323, + "learning_rate": 0.0002, + "loss": 0.2898, + "step": 3250 + }, + { + "epoch": 0.6835814636192074, + "grad_norm": 3.9980695247650146, + "learning_rate": 0.0002, + "loss": 0.2946, + "step": 3260 + }, + { + "epoch": 0.6856783392744811, + "grad_norm": 0.09749490767717361, + "learning_rate": 0.0002, + "loss": 0.304, + "step": 3270 + }, + { + "epoch": 0.6877752149297547, + "grad_norm": 0.36559662222862244, + "learning_rate": 0.0002, + "loss": 0.2947, + "step": 3280 + }, + { + "epoch": 0.6898720905850283, + "grad_norm": 1.9381011724472046, + "learning_rate": 0.0002, + "loss": 0.2889, + "step": 3290 + }, + { + "epoch": 0.6919689662403019, + "grad_norm": 0.16328813135623932, + "learning_rate": 0.0002, + "loss": 0.2509, + "step": 3300 + }, + { + "epoch": 0.6940658418955756, + "grad_norm": 0.09724120050668716, + "learning_rate": 0.0002, + "loss": 0.2452, + "step": 3310 + }, + { + "epoch": 0.6961627175508492, + "grad_norm": 0.08283833414316177, + "learning_rate": 0.0002, + "loss": 0.2574, + "step": 3320 + }, + { + "epoch": 0.6982595932061229, + "grad_norm": 0.12056735903024673, + "learning_rate": 0.0002, + "loss": 0.2604, + "step": 3330 + }, + { + "epoch": 0.7003564688613965, + "grad_norm": 0.10020340979099274, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 3340 + }, + { + "epoch": 0.7024533445166702, + "grad_norm": 0.10376536846160889, + "learning_rate": 0.0002, + "loss": 0.2585, + "step": 3350 + }, + { + "epoch": 0.7045502201719438, + "grad_norm": 0.12203848361968994, + "learning_rate": 0.0002, + "loss": 0.2989, + "step": 3360 + }, + { + "epoch": 0.7066470958272174, + "grad_norm": 1.848594069480896, + "learning_rate": 0.0002, + "loss": 0.2576, + "step": 3370 + }, + { + "epoch": 0.7087439714824911, + "grad_norm": 0.3774644136428833, + "learning_rate": 0.0002, + "loss": 0.308, + "step": 3380 + }, + { + "epoch": 0.7108408471377647, + "grad_norm": 0.09510187804698944, + "learning_rate": 0.0002, + "loss": 0.2577, + "step": 3390 + }, + { + "epoch": 0.7129377227930384, + "grad_norm": 3.617866277694702, + "learning_rate": 0.0002, + "loss": 0.2625, + "step": 3400 + }, + { + "epoch": 0.715034598448312, + "grad_norm": 0.09932883083820343, + "learning_rate": 0.0002, + "loss": 0.2765, + "step": 3410 + }, + { + "epoch": 0.7171314741035857, + "grad_norm": 0.39475202560424805, + "learning_rate": 0.0002, + "loss": 0.3117, + "step": 3420 + }, + { + "epoch": 0.7192283497588593, + "grad_norm": 1.8625463247299194, + "learning_rate": 0.0002, + "loss": 0.2932, + "step": 3430 + }, + { + "epoch": 0.7213252254141329, + "grad_norm": 0.11761441826820374, + "learning_rate": 0.0002, + "loss": 0.2646, + "step": 3440 + }, + { + "epoch": 0.7234221010694066, + "grad_norm": 0.11470779776573181, + "learning_rate": 0.0002, + "loss": 0.2439, + "step": 3450 + }, + { + "epoch": 0.7255189767246802, + "grad_norm": 0.10077525675296783, + "learning_rate": 0.0002, + "loss": 0.2416, + "step": 3460 + }, + { + "epoch": 0.7276158523799539, + "grad_norm": 3.409393072128296, + "learning_rate": 0.0002, + "loss": 0.3278, + "step": 3470 + }, + { + "epoch": 0.7297127280352275, + "grad_norm": 0.0658923089504242, + "learning_rate": 0.0002, + "loss": 0.2462, + "step": 3480 + }, + { + "epoch": 0.7318096036905012, + "grad_norm": 0.10743657499551773, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 3490 + }, + { + "epoch": 0.7339064793457748, + "grad_norm": 0.10818490386009216, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 3500 + }, + { + "epoch": 0.7360033550010484, + "grad_norm": 0.09132494032382965, + "learning_rate": 0.0002, + "loss": 0.2458, + "step": 3510 + }, + { + "epoch": 0.7381002306563221, + "grad_norm": 0.09728691726922989, + "learning_rate": 0.0002, + "loss": 0.2841, + "step": 3520 + }, + { + "epoch": 0.7401971063115957, + "grad_norm": 0.10897944122552872, + "learning_rate": 0.0002, + "loss": 0.2616, + "step": 3530 + }, + { + "epoch": 0.7422939819668694, + "grad_norm": 0.143765389919281, + "learning_rate": 0.0002, + "loss": 0.2645, + "step": 3540 + }, + { + "epoch": 0.744390857622143, + "grad_norm": 0.07788211852312088, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 3550 + }, + { + "epoch": 0.7464877332774167, + "grad_norm": 3.2369561195373535, + "learning_rate": 0.0002, + "loss": 0.2807, + "step": 3560 + }, + { + "epoch": 0.7485846089326903, + "grad_norm": 0.0991063192486763, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 3570 + }, + { + "epoch": 0.7506814845879639, + "grad_norm": 0.07748661190271378, + "learning_rate": 0.0002, + "loss": 0.2694, + "step": 3580 + }, + { + "epoch": 0.7527783602432375, + "grad_norm": 1.6730282306671143, + "learning_rate": 0.0002, + "loss": 0.2765, + "step": 3590 + }, + { + "epoch": 0.7548752358985112, + "grad_norm": 0.10425275564193726, + "learning_rate": 0.0002, + "loss": 0.267, + "step": 3600 + }, + { + "epoch": 0.7569721115537849, + "grad_norm": 0.08449215441942215, + "learning_rate": 0.0002, + "loss": 0.3076, + "step": 3610 + }, + { + "epoch": 0.7590689872090585, + "grad_norm": 0.09143321216106415, + "learning_rate": 0.0002, + "loss": 0.2721, + "step": 3620 + }, + { + "epoch": 0.7611658628643322, + "grad_norm": 0.0790523886680603, + "learning_rate": 0.0002, + "loss": 0.2613, + "step": 3630 + }, + { + "epoch": 0.7632627385196058, + "grad_norm": 1.5371196269989014, + "learning_rate": 0.0002, + "loss": 0.2739, + "step": 3640 + }, + { + "epoch": 0.7653596141748794, + "grad_norm": 0.06781799346208572, + "learning_rate": 0.0002, + "loss": 0.2796, + "step": 3650 + }, + { + "epoch": 0.767456489830153, + "grad_norm": 0.08765111118555069, + "learning_rate": 0.0002, + "loss": 0.2636, + "step": 3660 + }, + { + "epoch": 0.7695533654854267, + "grad_norm": 0.9966158866882324, + "learning_rate": 0.0002, + "loss": 0.2556, + "step": 3670 + }, + { + "epoch": 0.7716502411407004, + "grad_norm": 1.4500865936279297, + "learning_rate": 0.0002, + "loss": 0.2672, + "step": 3680 + }, + { + "epoch": 0.773747116795974, + "grad_norm": 0.1110655665397644, + "learning_rate": 0.0002, + "loss": 0.2743, + "step": 3690 + }, + { + "epoch": 0.7758439924512477, + "grad_norm": 0.0707395151257515, + "learning_rate": 0.0002, + "loss": 0.2632, + "step": 3700 + }, + { + "epoch": 0.7779408681065213, + "grad_norm": 2.7958946228027344, + "learning_rate": 0.0002, + "loss": 0.2711, + "step": 3710 + }, + { + "epoch": 0.7800377437617949, + "grad_norm": 0.07658112794160843, + "learning_rate": 0.0002, + "loss": 0.3528, + "step": 3720 + }, + { + "epoch": 0.7821346194170685, + "grad_norm": 0.05015365406870842, + "learning_rate": 0.0002, + "loss": 0.2567, + "step": 3730 + }, + { + "epoch": 0.7842314950723422, + "grad_norm": 0.0651109591126442, + "learning_rate": 0.0002, + "loss": 0.3063, + "step": 3740 + }, + { + "epoch": 0.7863283707276159, + "grad_norm": 0.06813528388738632, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 3750 + }, + { + "epoch": 0.7884252463828895, + "grad_norm": 0.07487435638904572, + "learning_rate": 0.0002, + "loss": 0.2611, + "step": 3760 + }, + { + "epoch": 0.7905221220381632, + "grad_norm": 0.14289982616901398, + "learning_rate": 0.0002, + "loss": 0.2834, + "step": 3770 + }, + { + "epoch": 0.7926189976934368, + "grad_norm": 3.4994521141052246, + "learning_rate": 0.0002, + "loss": 0.3402, + "step": 3780 + }, + { + "epoch": 0.7947158733487104, + "grad_norm": 0.06324627995491028, + "learning_rate": 0.0002, + "loss": 0.2671, + "step": 3790 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 2.5252187252044678, + "learning_rate": 0.0002, + "loss": 0.294, + "step": 3800 + }, + { + "epoch": 0.7989096246592577, + "grad_norm": 0.07738825678825378, + "learning_rate": 0.0002, + "loss": 0.2859, + "step": 3810 + }, + { + "epoch": 0.8010065003145314, + "grad_norm": 0.13051818311214447, + "learning_rate": 0.0002, + "loss": 0.2495, + "step": 3820 + }, + { + "epoch": 0.803103375969805, + "grad_norm": 0.5131255388259888, + "learning_rate": 0.0002, + "loss": 0.2471, + "step": 3830 + }, + { + "epoch": 0.8052002516250786, + "grad_norm": 0.103961281478405, + "learning_rate": 0.0002, + "loss": 0.2948, + "step": 3840 + }, + { + "epoch": 0.8072971272803523, + "grad_norm": 0.09874695539474487, + "learning_rate": 0.0002, + "loss": 0.2747, + "step": 3850 + }, + { + "epoch": 0.8093940029356259, + "grad_norm": 0.10877745598554611, + "learning_rate": 0.0002, + "loss": 0.2673, + "step": 3860 + }, + { + "epoch": 0.8114908785908995, + "grad_norm": 0.07221799343824387, + "learning_rate": 0.0002, + "loss": 0.2812, + "step": 3870 + }, + { + "epoch": 0.8135877542461732, + "grad_norm": 0.07352789491415024, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 3880 + }, + { + "epoch": 0.8156846299014469, + "grad_norm": 0.09816540032625198, + "learning_rate": 0.0002, + "loss": 0.2962, + "step": 3890 + }, + { + "epoch": 0.8177815055567205, + "grad_norm": 0.12398573011159897, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 3900 + }, + { + "epoch": 0.8198783812119941, + "grad_norm": 0.05769842490553856, + "learning_rate": 0.0002, + "loss": 0.2656, + "step": 3910 + }, + { + "epoch": 0.8219752568672678, + "grad_norm": 2.771360397338867, + "learning_rate": 0.0002, + "loss": 0.2693, + "step": 3920 + }, + { + "epoch": 0.8240721325225414, + "grad_norm": 0.07756297290325165, + "learning_rate": 0.0002, + "loss": 0.2973, + "step": 3930 + }, + { + "epoch": 0.826169008177815, + "grad_norm": 0.9152072072029114, + "learning_rate": 0.0002, + "loss": 0.2702, + "step": 3940 + }, + { + "epoch": 0.8282658838330887, + "grad_norm": 0.10475791990756989, + "learning_rate": 0.0002, + "loss": 0.2577, + "step": 3950 + }, + { + "epoch": 0.8303627594883624, + "grad_norm": 2.7981786727905273, + "learning_rate": 0.0002, + "loss": 0.2676, + "step": 3960 + }, + { + "epoch": 0.832459635143636, + "grad_norm": 0.10307871550321579, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 3970 + }, + { + "epoch": 0.8345565107989096, + "grad_norm": 2.470330238342285, + "learning_rate": 0.0002, + "loss": 0.2901, + "step": 3980 + }, + { + "epoch": 0.8366533864541833, + "grad_norm": 0.0711425393819809, + "learning_rate": 0.0002, + "loss": 0.266, + "step": 3990 + }, + { + "epoch": 0.8387502621094569, + "grad_norm": 0.05242660269141197, + "learning_rate": 0.0002, + "loss": 0.2807, + "step": 4000 + }, + { + "epoch": 0.8408471377647305, + "grad_norm": 0.049969688057899475, + "learning_rate": 0.0002, + "loss": 0.3441, + "step": 4010 + }, + { + "epoch": 0.8429440134200042, + "grad_norm": 1.7452404499053955, + "learning_rate": 0.0002, + "loss": 0.2691, + "step": 4020 + }, + { + "epoch": 0.8450408890752779, + "grad_norm": 0.0709633007645607, + "learning_rate": 0.0002, + "loss": 0.2796, + "step": 4030 + }, + { + "epoch": 0.8471377647305515, + "grad_norm": 1.0136632919311523, + "learning_rate": 0.0002, + "loss": 0.2799, + "step": 4040 + }, + { + "epoch": 0.8492346403858251, + "grad_norm": 0.14160498976707458, + "learning_rate": 0.0002, + "loss": 0.2753, + "step": 4050 + }, + { + "epoch": 0.8513315160410988, + "grad_norm": 0.08907493203878403, + "learning_rate": 0.0002, + "loss": 0.2594, + "step": 4060 + }, + { + "epoch": 0.8534283916963724, + "grad_norm": 0.06804440915584564, + "learning_rate": 0.0002, + "loss": 0.2597, + "step": 4070 + }, + { + "epoch": 0.855525267351646, + "grad_norm": 0.10351166874170303, + "learning_rate": 0.0002, + "loss": 0.2582, + "step": 4080 + }, + { + "epoch": 0.8576221430069196, + "grad_norm": 0.09024465084075928, + "learning_rate": 0.0002, + "loss": 0.2422, + "step": 4090 + }, + { + "epoch": 0.8597190186621934, + "grad_norm": 0.09236204624176025, + "learning_rate": 0.0002, + "loss": 0.2435, + "step": 4100 + }, + { + "epoch": 0.861815894317467, + "grad_norm": 0.08826452493667603, + "learning_rate": 0.0002, + "loss": 0.2725, + "step": 4110 + }, + { + "epoch": 0.8639127699727406, + "grad_norm": 0.08384134620428085, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 4120 + }, + { + "epoch": 0.8660096456280143, + "grad_norm": 0.06862927228212357, + "learning_rate": 0.0002, + "loss": 0.2904, + "step": 4130 + }, + { + "epoch": 0.8681065212832879, + "grad_norm": 0.06906203180551529, + "learning_rate": 0.0002, + "loss": 0.2589, + "step": 4140 + }, + { + "epoch": 0.8702033969385615, + "grad_norm": 0.11079169809818268, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 4150 + }, + { + "epoch": 0.8723002725938351, + "grad_norm": 0.05974194407463074, + "learning_rate": 0.0002, + "loss": 0.2743, + "step": 4160 + }, + { + "epoch": 0.8743971482491089, + "grad_norm": 0.08867297321557999, + "learning_rate": 0.0002, + "loss": 0.2437, + "step": 4170 + }, + { + "epoch": 0.8764940239043825, + "grad_norm": 0.0735638365149498, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 4180 + }, + { + "epoch": 0.8785908995596561, + "grad_norm": 0.051879338920116425, + "learning_rate": 0.0002, + "loss": 0.2431, + "step": 4190 + }, + { + "epoch": 0.8806877752149298, + "grad_norm": 0.0882013738155365, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 4200 + }, + { + "epoch": 0.8827846508702034, + "grad_norm": 0.09889431297779083, + "learning_rate": 0.0002, + "loss": 0.2955, + "step": 4210 + }, + { + "epoch": 0.884881526525477, + "grad_norm": 0.09547612071037292, + "learning_rate": 0.0002, + "loss": 0.2664, + "step": 4220 + }, + { + "epoch": 0.8869784021807506, + "grad_norm": 0.1306789666414261, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 4230 + }, + { + "epoch": 0.8890752778360244, + "grad_norm": 0.07582208514213562, + "learning_rate": 0.0002, + "loss": 0.2995, + "step": 4240 + }, + { + "epoch": 0.891172153491298, + "grad_norm": 0.30311834812164307, + "learning_rate": 0.0002, + "loss": 0.2771, + "step": 4250 + }, + { + "epoch": 0.8932690291465716, + "grad_norm": 0.07074534893035889, + "learning_rate": 0.0002, + "loss": 0.272, + "step": 4260 + }, + { + "epoch": 0.8953659048018453, + "grad_norm": 0.12505321204662323, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 4270 + }, + { + "epoch": 0.8974627804571189, + "grad_norm": 1.3733842372894287, + "learning_rate": 0.0002, + "loss": 0.3084, + "step": 4280 + }, + { + "epoch": 0.8995596561123925, + "grad_norm": 2.174771785736084, + "learning_rate": 0.0002, + "loss": 0.2643, + "step": 4290 + }, + { + "epoch": 0.9016565317676661, + "grad_norm": 0.05621308833360672, + "learning_rate": 0.0002, + "loss": 0.3032, + "step": 4300 + }, + { + "epoch": 0.9037534074229399, + "grad_norm": 0.1187891885638237, + "learning_rate": 0.0002, + "loss": 0.2606, + "step": 4310 + }, + { + "epoch": 0.9058502830782135, + "grad_norm": 7.58337926864624, + "learning_rate": 0.0002, + "loss": 0.3478, + "step": 4320 + }, + { + "epoch": 0.9079471587334871, + "grad_norm": 0.07003562152385712, + "learning_rate": 0.0002, + "loss": 0.2734, + "step": 4330 + }, + { + "epoch": 0.9100440343887607, + "grad_norm": 0.08889756351709366, + "learning_rate": 0.0002, + "loss": 0.2636, + "step": 4340 + }, + { + "epoch": 0.9121409100440344, + "grad_norm": 0.07013625651597977, + "learning_rate": 0.0002, + "loss": 0.2724, + "step": 4350 + }, + { + "epoch": 0.914237785699308, + "grad_norm": 0.06386245042085648, + "learning_rate": 0.0002, + "loss": 0.2429, + "step": 4360 + }, + { + "epoch": 0.9163346613545816, + "grad_norm": 0.09225624054670334, + "learning_rate": 0.0002, + "loss": 0.2461, + "step": 4370 + }, + { + "epoch": 0.9184315370098554, + "grad_norm": 0.07157222181558609, + "learning_rate": 0.0002, + "loss": 0.2636, + "step": 4380 + }, + { + "epoch": 0.920528412665129, + "grad_norm": 0.05457375571131706, + "learning_rate": 0.0002, + "loss": 0.2875, + "step": 4390 + }, + { + "epoch": 0.9226252883204026, + "grad_norm": 0.06286972016096115, + "learning_rate": 0.0002, + "loss": 0.2682, + "step": 4400 + }, + { + "epoch": 0.9247221639756762, + "grad_norm": 0.12173102796077728, + "learning_rate": 0.0002, + "loss": 0.287, + "step": 4410 + }, + { + "epoch": 0.9268190396309499, + "grad_norm": 0.10016096383333206, + "learning_rate": 0.0002, + "loss": 0.2683, + "step": 4420 + }, + { + "epoch": 0.9289159152862235, + "grad_norm": 0.3565249741077423, + "learning_rate": 0.0002, + "loss": 0.2688, + "step": 4430 + }, + { + "epoch": 0.9310127909414971, + "grad_norm": 1.8416194915771484, + "learning_rate": 0.0002, + "loss": 0.2896, + "step": 4440 + }, + { + "epoch": 0.9331096665967709, + "grad_norm": 0.07465513795614243, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 4450 + }, + { + "epoch": 0.9352065422520445, + "grad_norm": 0.12205296009778976, + "learning_rate": 0.0002, + "loss": 0.2596, + "step": 4460 + }, + { + "epoch": 0.9373034179073181, + "grad_norm": 0.07248952239751816, + "learning_rate": 0.0002, + "loss": 0.2653, + "step": 4470 + }, + { + "epoch": 0.9394002935625917, + "grad_norm": 0.6678045392036438, + "learning_rate": 0.0002, + "loss": 0.3125, + "step": 4480 + }, + { + "epoch": 0.9414971692178654, + "grad_norm": 0.08333967626094818, + "learning_rate": 0.0002, + "loss": 0.2988, + "step": 4490 + }, + { + "epoch": 0.943594044873139, + "grad_norm": 1.3239233493804932, + "learning_rate": 0.0002, + "loss": 0.2533, + "step": 4500 + }, + { + "epoch": 0.9456909205284126, + "grad_norm": 0.10770973563194275, + "learning_rate": 0.0002, + "loss": 0.264, + "step": 4510 + }, + { + "epoch": 0.9477877961836864, + "grad_norm": 0.07700380682945251, + "learning_rate": 0.0002, + "loss": 0.2439, + "step": 4520 + }, + { + "epoch": 0.94988467183896, + "grad_norm": 0.05980914458632469, + "learning_rate": 0.0002, + "loss": 0.291, + "step": 4530 + }, + { + "epoch": 0.9519815474942336, + "grad_norm": 0.05780167877674103, + "learning_rate": 0.0002, + "loss": 0.3009, + "step": 4540 + }, + { + "epoch": 0.9540784231495072, + "grad_norm": 0.0757979080080986, + "learning_rate": 0.0002, + "loss": 0.2625, + "step": 4550 + }, + { + "epoch": 0.9561752988047809, + "grad_norm": 0.06890128552913666, + "learning_rate": 0.0002, + "loss": 0.2426, + "step": 4560 + }, + { + "epoch": 0.9582721744600545, + "grad_norm": 0.07870949804782867, + "learning_rate": 0.0002, + "loss": 0.271, + "step": 4570 + }, + { + "epoch": 0.9603690501153281, + "grad_norm": 0.08925454318523407, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 4580 + }, + { + "epoch": 0.9624659257706017, + "grad_norm": 0.08406192064285278, + "learning_rate": 0.0002, + "loss": 0.2521, + "step": 4590 + }, + { + "epoch": 0.9645628014258755, + "grad_norm": 0.10540168732404709, + "learning_rate": 0.0002, + "loss": 0.2424, + "step": 4600 + }, + { + "epoch": 0.9666596770811491, + "grad_norm": 0.08675917983055115, + "learning_rate": 0.0002, + "loss": 0.2872, + "step": 4610 + }, + { + "epoch": 0.9687565527364227, + "grad_norm": 2.3604719638824463, + "learning_rate": 0.0002, + "loss": 0.2933, + "step": 4620 + }, + { + "epoch": 0.9708534283916964, + "grad_norm": 0.1408311426639557, + "learning_rate": 0.0002, + "loss": 0.2604, + "step": 4630 + }, + { + "epoch": 0.97295030404697, + "grad_norm": 0.1045566201210022, + "learning_rate": 0.0002, + "loss": 0.2426, + "step": 4640 + }, + { + "epoch": 0.9750471797022436, + "grad_norm": 0.08125372976064682, + "learning_rate": 0.0002, + "loss": 0.2542, + "step": 4650 + }, + { + "epoch": 0.9771440553575172, + "grad_norm": 3.7775802612304688, + "learning_rate": 0.0002, + "loss": 0.2831, + "step": 4660 + }, + { + "epoch": 0.979240931012791, + "grad_norm": 0.12283279001712799, + "learning_rate": 0.0002, + "loss": 0.2473, + "step": 4670 + }, + { + "epoch": 0.9813378066680646, + "grad_norm": 0.11770537495613098, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 4680 + }, + { + "epoch": 0.9834346823233382, + "grad_norm": 0.07334215193986893, + "learning_rate": 0.0002, + "loss": 0.2601, + "step": 4690 + }, + { + "epoch": 0.9855315579786119, + "grad_norm": 0.0814523994922638, + "learning_rate": 0.0002, + "loss": 0.2918, + "step": 4700 + }, + { + "epoch": 0.9876284336338855, + "grad_norm": 0.054461974650621414, + "learning_rate": 0.0002, + "loss": 0.2717, + "step": 4710 + }, + { + "epoch": 0.9897253092891591, + "grad_norm": 0.06933179497718811, + "learning_rate": 0.0002, + "loss": 0.2711, + "step": 4720 + }, + { + "epoch": 0.9918221849444327, + "grad_norm": 0.16758371889591217, + "learning_rate": 0.0002, + "loss": 0.2685, + "step": 4730 + }, + { + "epoch": 0.9939190605997065, + "grad_norm": 0.06895585358142853, + "learning_rate": 0.0002, + "loss": 0.2709, + "step": 4740 + }, + { + "epoch": 0.9960159362549801, + "grad_norm": 0.05912035331130028, + "learning_rate": 0.0002, + "loss": 0.2467, + "step": 4750 + }, + { + "epoch": 0.9981128119102537, + "grad_norm": 0.13841500878334045, + "learning_rate": 0.0002, + "loss": 0.2673, + "step": 4760 + }, + { + "epoch": 1.0002096875655273, + "grad_norm": 0.10441073030233383, + "learning_rate": 0.0002, + "loss": 0.27, + "step": 4770 + }, + { + "epoch": 1.002306563220801, + "grad_norm": 0.16520650684833527, + "learning_rate": 0.0002, + "loss": 0.2687, + "step": 4780 + }, + { + "epoch": 1.0044034388760748, + "grad_norm": 1.2790181636810303, + "learning_rate": 0.0002, + "loss": 0.2502, + "step": 4790 + }, + { + "epoch": 1.0065003145313483, + "grad_norm": 0.08104977756738663, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 4800 + }, + { + "epoch": 1.008597190186622, + "grad_norm": 0.07370474934577942, + "learning_rate": 0.0002, + "loss": 0.2611, + "step": 4810 + }, + { + "epoch": 1.0106940658418955, + "grad_norm": 0.07946150749921799, + "learning_rate": 0.0002, + "loss": 0.2761, + "step": 4820 + }, + { + "epoch": 1.0127909414971692, + "grad_norm": 0.1244131550192833, + "learning_rate": 0.0002, + "loss": 0.2435, + "step": 4830 + }, + { + "epoch": 1.014887817152443, + "grad_norm": 0.08558488637208939, + "learning_rate": 0.0002, + "loss": 0.2958, + "step": 4840 + }, + { + "epoch": 1.0169846928077164, + "grad_norm": 2.46805477142334, + "learning_rate": 0.0002, + "loss": 0.265, + "step": 4850 + }, + { + "epoch": 1.0190815684629901, + "grad_norm": 0.06383222341537476, + "learning_rate": 0.0002, + "loss": 0.3791, + "step": 4860 + }, + { + "epoch": 1.0211784441182639, + "grad_norm": 3.0055954456329346, + "learning_rate": 0.0002, + "loss": 0.3129, + "step": 4870 + }, + { + "epoch": 1.0232753197735374, + "grad_norm": 2.8449673652648926, + "learning_rate": 0.0002, + "loss": 0.3186, + "step": 4880 + }, + { + "epoch": 1.025372195428811, + "grad_norm": 0.07504308223724365, + "learning_rate": 0.0002, + "loss": 0.2656, + "step": 4890 + }, + { + "epoch": 1.0274690710840848, + "grad_norm": 0.9263021945953369, + "learning_rate": 0.0002, + "loss": 0.2822, + "step": 4900 + }, + { + "epoch": 1.0295659467393583, + "grad_norm": 0.07075469195842743, + "learning_rate": 0.0002, + "loss": 0.2842, + "step": 4910 + }, + { + "epoch": 1.031662822394632, + "grad_norm": 0.36625659465789795, + "learning_rate": 0.0002, + "loss": 0.2499, + "step": 4920 + }, + { + "epoch": 1.0337596980499058, + "grad_norm": 0.07981095463037491, + "learning_rate": 0.0002, + "loss": 0.2609, + "step": 4930 + }, + { + "epoch": 1.0358565737051793, + "grad_norm": 0.09062483161687851, + "learning_rate": 0.0002, + "loss": 0.2947, + "step": 4940 + }, + { + "epoch": 1.037953449360453, + "grad_norm": 0.5152995586395264, + "learning_rate": 0.0002, + "loss": 0.2673, + "step": 4950 + }, + { + "epoch": 1.0400503250157265, + "grad_norm": 0.09733402729034424, + "learning_rate": 0.0002, + "loss": 0.2574, + "step": 4960 + }, + { + "epoch": 1.0421472006710002, + "grad_norm": 0.107651486992836, + "learning_rate": 0.0002, + "loss": 0.3059, + "step": 4970 + }, + { + "epoch": 1.044244076326274, + "grad_norm": 0.22185586392879486, + "learning_rate": 0.0002, + "loss": 0.3173, + "step": 4980 + }, + { + "epoch": 1.0463409519815474, + "grad_norm": 0.15324990451335907, + "learning_rate": 0.0002, + "loss": 0.2735, + "step": 4990 + }, + { + "epoch": 1.0484378276368211, + "grad_norm": 0.12578018009662628, + "learning_rate": 0.0002, + "loss": 0.2551, + "step": 5000 + }, + { + "epoch": 1.0505347032920949, + "grad_norm": 0.06980187445878983, + "learning_rate": 0.0002, + "loss": 0.2488, + "step": 5010 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.11194098740816116, + "learning_rate": 0.0002, + "loss": 0.2662, + "step": 5020 + }, + { + "epoch": 1.054728454602642, + "grad_norm": 0.0770438089966774, + "learning_rate": 0.0002, + "loss": 0.2616, + "step": 5030 + }, + { + "epoch": 1.0568253302579158, + "grad_norm": 0.10786891728639603, + "learning_rate": 0.0002, + "loss": 0.2821, + "step": 5040 + }, + { + "epoch": 1.0589222059131893, + "grad_norm": 0.09369146078824997, + "learning_rate": 0.0002, + "loss": 0.2591, + "step": 5050 + }, + { + "epoch": 1.061019081568463, + "grad_norm": 0.07498877495527267, + "learning_rate": 0.0002, + "loss": 0.2595, + "step": 5060 + }, + { + "epoch": 1.0631159572237365, + "grad_norm": 0.226748526096344, + "learning_rate": 0.0002, + "loss": 0.2828, + "step": 5070 + }, + { + "epoch": 1.0652128328790103, + "grad_norm": 0.09331674873828888, + "learning_rate": 0.0002, + "loss": 0.2564, + "step": 5080 + }, + { + "epoch": 1.067309708534284, + "grad_norm": 0.10891097784042358, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 5090 + }, + { + "epoch": 1.0694065841895575, + "grad_norm": 0.0820799469947815, + "learning_rate": 0.0002, + "loss": 0.2548, + "step": 5100 + }, + { + "epoch": 1.0715034598448312, + "grad_norm": 0.052631497383117676, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 5110 + }, + { + "epoch": 1.073600335500105, + "grad_norm": 0.12196387350559235, + "learning_rate": 0.0002, + "loss": 0.2594, + "step": 5120 + }, + { + "epoch": 1.0756972111553784, + "grad_norm": 0.07985110580921173, + "learning_rate": 0.0002, + "loss": 0.2743, + "step": 5130 + }, + { + "epoch": 1.0777940868106521, + "grad_norm": 0.05758510157465935, + "learning_rate": 0.0002, + "loss": 0.2919, + "step": 5140 + }, + { + "epoch": 1.0798909624659259, + "grad_norm": 0.10148131102323532, + "learning_rate": 0.0002, + "loss": 0.2434, + "step": 5150 + }, + { + "epoch": 1.0819878381211994, + "grad_norm": 0.07871562242507935, + "learning_rate": 0.0002, + "loss": 0.2798, + "step": 5160 + }, + { + "epoch": 1.084084713776473, + "grad_norm": 0.06986663490533829, + "learning_rate": 0.0002, + "loss": 0.2966, + "step": 5170 + }, + { + "epoch": 1.0861815894317468, + "grad_norm": 0.05995866283774376, + "learning_rate": 0.0002, + "loss": 0.2569, + "step": 5180 + }, + { + "epoch": 1.0882784650870203, + "grad_norm": 0.12248244881629944, + "learning_rate": 0.0002, + "loss": 0.2906, + "step": 5190 + }, + { + "epoch": 1.090375340742294, + "grad_norm": 0.052593328058719635, + "learning_rate": 0.0002, + "loss": 0.2561, + "step": 5200 + }, + { + "epoch": 1.0924722163975675, + "grad_norm": 0.05631803721189499, + "learning_rate": 0.0002, + "loss": 0.2943, + "step": 5210 + }, + { + "epoch": 1.0945690920528413, + "grad_norm": 2.102130889892578, + "learning_rate": 0.0002, + "loss": 0.2991, + "step": 5220 + }, + { + "epoch": 1.096665967708115, + "grad_norm": 2.2242186069488525, + "learning_rate": 0.0002, + "loss": 0.258, + "step": 5230 + }, + { + "epoch": 1.0987628433633885, + "grad_norm": 0.06405676156282425, + "learning_rate": 0.0002, + "loss": 0.2845, + "step": 5240 + }, + { + "epoch": 1.1008597190186622, + "grad_norm": 0.05279644578695297, + "learning_rate": 0.0002, + "loss": 0.262, + "step": 5250 + }, + { + "epoch": 1.102956594673936, + "grad_norm": 0.06458443403244019, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 5260 + }, + { + "epoch": 1.1050534703292094, + "grad_norm": 0.07742719352245331, + "learning_rate": 0.0002, + "loss": 0.2663, + "step": 5270 + }, + { + "epoch": 1.1071503459844831, + "grad_norm": 0.0937500149011612, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 5280 + }, + { + "epoch": 1.1092472216397569, + "grad_norm": 0.05128967761993408, + "learning_rate": 0.0002, + "loss": 0.2623, + "step": 5290 + }, + { + "epoch": 1.1113440972950304, + "grad_norm": 0.053862564265728, + "learning_rate": 0.0002, + "loss": 0.26, + "step": 5300 + }, + { + "epoch": 1.113440972950304, + "grad_norm": 0.07308442145586014, + "learning_rate": 0.0002, + "loss": 0.2653, + "step": 5310 + }, + { + "epoch": 1.1155378486055776, + "grad_norm": 0.08775884658098221, + "learning_rate": 0.0002, + "loss": 0.257, + "step": 5320 + }, + { + "epoch": 1.1176347242608513, + "grad_norm": 0.06515777856111526, + "learning_rate": 0.0002, + "loss": 0.2857, + "step": 5330 + }, + { + "epoch": 1.119731599916125, + "grad_norm": 0.07291428744792938, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 5340 + }, + { + "epoch": 1.1218284755713985, + "grad_norm": 0.10684103518724442, + "learning_rate": 0.0002, + "loss": 0.2425, + "step": 5350 + }, + { + "epoch": 1.1239253512266723, + "grad_norm": 2.470691680908203, + "learning_rate": 0.0002, + "loss": 0.2873, + "step": 5360 + }, + { + "epoch": 1.126022226881946, + "grad_norm": 1.8290586471557617, + "learning_rate": 0.0002, + "loss": 0.2646, + "step": 5370 + }, + { + "epoch": 1.1281191025372195, + "grad_norm": 0.08278166502714157, + "learning_rate": 0.0002, + "loss": 0.2603, + "step": 5380 + }, + { + "epoch": 1.1302159781924932, + "grad_norm": 0.07949317991733551, + "learning_rate": 0.0002, + "loss": 0.2741, + "step": 5390 + }, + { + "epoch": 1.132312853847767, + "grad_norm": 0.11934222280979156, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 5400 + }, + { + "epoch": 1.1344097295030404, + "grad_norm": 0.09001024812459946, + "learning_rate": 0.0002, + "loss": 0.2533, + "step": 5410 + }, + { + "epoch": 1.1365066051583141, + "grad_norm": 0.0852462500333786, + "learning_rate": 0.0002, + "loss": 0.2657, + "step": 5420 + }, + { + "epoch": 1.1386034808135879, + "grad_norm": 0.08970583975315094, + "learning_rate": 0.0002, + "loss": 0.2831, + "step": 5430 + }, + { + "epoch": 1.1407003564688614, + "grad_norm": 0.10680624097585678, + "learning_rate": 0.0002, + "loss": 0.2702, + "step": 5440 + }, + { + "epoch": 1.142797232124135, + "grad_norm": 0.1428230106830597, + "learning_rate": 0.0002, + "loss": 0.2779, + "step": 5450 + }, + { + "epoch": 1.1448941077794086, + "grad_norm": 0.0725833997130394, + "learning_rate": 0.0002, + "loss": 0.2491, + "step": 5460 + }, + { + "epoch": 1.1469909834346823, + "grad_norm": 1.3643620014190674, + "learning_rate": 0.0002, + "loss": 0.2456, + "step": 5470 + }, + { + "epoch": 1.149087859089956, + "grad_norm": 0.07396101206541061, + "learning_rate": 0.0002, + "loss": 0.2459, + "step": 5480 + }, + { + "epoch": 1.1511847347452295, + "grad_norm": 0.07980525493621826, + "learning_rate": 0.0002, + "loss": 0.2576, + "step": 5490 + }, + { + "epoch": 1.1532816104005033, + "grad_norm": 0.056196991354227066, + "learning_rate": 0.0002, + "loss": 0.2878, + "step": 5500 + }, + { + "epoch": 1.155378486055777, + "grad_norm": 0.057810667902231216, + "learning_rate": 0.0002, + "loss": 0.2955, + "step": 5510 + }, + { + "epoch": 1.1574753617110505, + "grad_norm": 0.15831854939460754, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 5520 + }, + { + "epoch": 1.1595722373663242, + "grad_norm": 0.07265540957450867, + "learning_rate": 0.0002, + "loss": 0.2538, + "step": 5530 + }, + { + "epoch": 1.1616691130215977, + "grad_norm": 0.21104565262794495, + "learning_rate": 0.0002, + "loss": 0.2576, + "step": 5540 + }, + { + "epoch": 1.1637659886768714, + "grad_norm": 0.0715293288230896, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 5550 + }, + { + "epoch": 1.1658628643321451, + "grad_norm": 0.10274804383516312, + "learning_rate": 0.0002, + "loss": 0.2583, + "step": 5560 + }, + { + "epoch": 1.1679597399874186, + "grad_norm": 4.549091339111328, + "learning_rate": 0.0002, + "loss": 0.2638, + "step": 5570 + }, + { + "epoch": 1.1700566156426924, + "grad_norm": 0.6761789917945862, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 5580 + }, + { + "epoch": 1.172153491297966, + "grad_norm": 0.10144541412591934, + "learning_rate": 0.0002, + "loss": 0.244, + "step": 5590 + }, + { + "epoch": 1.1742503669532396, + "grad_norm": 1.6135568618774414, + "learning_rate": 0.0002, + "loss": 0.3067, + "step": 5600 + }, + { + "epoch": 1.1763472426085133, + "grad_norm": 0.07532897591590881, + "learning_rate": 0.0002, + "loss": 0.2809, + "step": 5610 + }, + { + "epoch": 1.178444118263787, + "grad_norm": 0.05869752913713455, + "learning_rate": 0.0002, + "loss": 0.2423, + "step": 5620 + }, + { + "epoch": 1.1805409939190605, + "grad_norm": 0.10139182209968567, + "learning_rate": 0.0002, + "loss": 0.2565, + "step": 5630 + }, + { + "epoch": 1.1826378695743343, + "grad_norm": 0.25902649760246277, + "learning_rate": 0.0002, + "loss": 0.2676, + "step": 5640 + }, + { + "epoch": 1.184734745229608, + "grad_norm": 0.12678462266921997, + "learning_rate": 0.0002, + "loss": 0.2711, + "step": 5650 + }, + { + "epoch": 1.1868316208848815, + "grad_norm": 0.06784678995609283, + "learning_rate": 0.0002, + "loss": 0.2678, + "step": 5660 + }, + { + "epoch": 1.1889284965401552, + "grad_norm": 3.252060651779175, + "learning_rate": 0.0002, + "loss": 0.3044, + "step": 5670 + }, + { + "epoch": 1.191025372195429, + "grad_norm": 0.07103057205677032, + "learning_rate": 0.0002, + "loss": 0.2694, + "step": 5680 + }, + { + "epoch": 1.1931222478507024, + "grad_norm": 0.08140352368354797, + "learning_rate": 0.0002, + "loss": 0.2463, + "step": 5690 + }, + { + "epoch": 1.1952191235059761, + "grad_norm": 1.4584933519363403, + "learning_rate": 0.0002, + "loss": 0.2636, + "step": 5700 + }, + { + "epoch": 1.1973159991612496, + "grad_norm": 1.501371145248413, + "learning_rate": 0.0002, + "loss": 0.2687, + "step": 5710 + }, + { + "epoch": 1.1994128748165234, + "grad_norm": 0.0723869800567627, + "learning_rate": 0.0002, + "loss": 0.2591, + "step": 5720 + }, + { + "epoch": 1.201509750471797, + "grad_norm": 1.0655723810195923, + "learning_rate": 0.0002, + "loss": 0.297, + "step": 5730 + }, + { + "epoch": 1.2036066261270706, + "grad_norm": 5.795892715454102, + "learning_rate": 0.0002, + "loss": 0.2696, + "step": 5740 + }, + { + "epoch": 1.2057035017823443, + "grad_norm": 0.08217945694923401, + "learning_rate": 0.0002, + "loss": 0.2435, + "step": 5750 + }, + { + "epoch": 1.207800377437618, + "grad_norm": 0.1614740788936615, + "learning_rate": 0.0002, + "loss": 0.2606, + "step": 5760 + }, + { + "epoch": 1.2098972530928915, + "grad_norm": 0.07378556579351425, + "learning_rate": 0.0002, + "loss": 0.3326, + "step": 5770 + }, + { + "epoch": 1.2119941287481653, + "grad_norm": 0.07546745985746384, + "learning_rate": 0.0002, + "loss": 0.2596, + "step": 5780 + }, + { + "epoch": 1.2140910044034388, + "grad_norm": 2.203601360321045, + "learning_rate": 0.0002, + "loss": 0.2825, + "step": 5790 + }, + { + "epoch": 1.2161878800587125, + "grad_norm": 0.0542774498462677, + "learning_rate": 0.0002, + "loss": 0.2657, + "step": 5800 + }, + { + "epoch": 1.2182847557139862, + "grad_norm": 1.252591848373413, + "learning_rate": 0.0002, + "loss": 0.2573, + "step": 5810 + }, + { + "epoch": 1.2203816313692597, + "grad_norm": 0.1051403135061264, + "learning_rate": 0.0002, + "loss": 0.3223, + "step": 5820 + }, + { + "epoch": 1.2224785070245334, + "grad_norm": 0.13013513386249542, + "learning_rate": 0.0002, + "loss": 0.2583, + "step": 5830 + }, + { + "epoch": 1.2245753826798071, + "grad_norm": 0.11686304211616516, + "learning_rate": 0.0002, + "loss": 0.2663, + "step": 5840 + }, + { + "epoch": 1.2266722583350806, + "grad_norm": 2.8360345363616943, + "learning_rate": 0.0002, + "loss": 0.2861, + "step": 5850 + }, + { + "epoch": 1.2287691339903544, + "grad_norm": 0.06085509806871414, + "learning_rate": 0.0002, + "loss": 0.2602, + "step": 5860 + }, + { + "epoch": 1.230866009645628, + "grad_norm": 0.07168704271316528, + "learning_rate": 0.0002, + "loss": 0.2433, + "step": 5870 + }, + { + "epoch": 1.2329628853009016, + "grad_norm": 0.05347032472491264, + "learning_rate": 0.0002, + "loss": 0.2706, + "step": 5880 + }, + { + "epoch": 1.2350597609561753, + "grad_norm": 0.0719766765832901, + "learning_rate": 0.0002, + "loss": 0.2464, + "step": 5890 + }, + { + "epoch": 1.237156636611449, + "grad_norm": 2.9147186279296875, + "learning_rate": 0.0002, + "loss": 0.3168, + "step": 5900 + }, + { + "epoch": 1.2392535122667225, + "grad_norm": 0.0879439190030098, + "learning_rate": 0.0002, + "loss": 0.2585, + "step": 5910 + }, + { + "epoch": 1.2413503879219963, + "grad_norm": 0.09813595563173294, + "learning_rate": 0.0002, + "loss": 0.2606, + "step": 5920 + }, + { + "epoch": 1.24344726357727, + "grad_norm": 1.3518685102462769, + "learning_rate": 0.0002, + "loss": 0.2705, + "step": 5930 + }, + { + "epoch": 1.2455441392325435, + "grad_norm": 1.7133814096450806, + "learning_rate": 0.0002, + "loss": 0.2633, + "step": 5940 + }, + { + "epoch": 1.2476410148878172, + "grad_norm": 0.09435052424669266, + "learning_rate": 0.0002, + "loss": 0.2657, + "step": 5950 + }, + { + "epoch": 1.2497378905430907, + "grad_norm": 0.9879856705665588, + "learning_rate": 0.0002, + "loss": 0.2667, + "step": 5960 + }, + { + "epoch": 1.2518347661983644, + "grad_norm": 1.1095253229141235, + "learning_rate": 0.0002, + "loss": 0.2978, + "step": 5970 + }, + { + "epoch": 1.2539316418536381, + "grad_norm": 0.12190214544534683, + "learning_rate": 0.0002, + "loss": 0.2556, + "step": 5980 + }, + { + "epoch": 1.2560285175089116, + "grad_norm": 0.05829515680670738, + "learning_rate": 0.0002, + "loss": 0.2434, + "step": 5990 + }, + { + "epoch": 1.2581253931641854, + "grad_norm": 0.0911063402891159, + "learning_rate": 0.0002, + "loss": 0.2427, + "step": 6000 + }, + { + "epoch": 1.2602222688194589, + "grad_norm": 0.08175632357597351, + "learning_rate": 0.0002, + "loss": 0.2714, + "step": 6010 + }, + { + "epoch": 1.2623191444747326, + "grad_norm": 0.0865393579006195, + "learning_rate": 0.0002, + "loss": 0.3005, + "step": 6020 + }, + { + "epoch": 1.2644160201300063, + "grad_norm": 0.08705785870552063, + "learning_rate": 0.0002, + "loss": 0.2793, + "step": 6030 + }, + { + "epoch": 1.2665128957852798, + "grad_norm": 0.06791888922452927, + "learning_rate": 0.0002, + "loss": 0.2426, + "step": 6040 + }, + { + "epoch": 1.2686097714405535, + "grad_norm": 0.10770615190267563, + "learning_rate": 0.0002, + "loss": 0.3023, + "step": 6050 + }, + { + "epoch": 1.2707066470958273, + "grad_norm": 0.06609726697206497, + "learning_rate": 0.0002, + "loss": 0.2593, + "step": 6060 + }, + { + "epoch": 1.2728035227511008, + "grad_norm": 0.06439893692731857, + "learning_rate": 0.0002, + "loss": 0.2482, + "step": 6070 + }, + { + "epoch": 1.2749003984063745, + "grad_norm": 0.07232308387756348, + "learning_rate": 0.0002, + "loss": 0.2696, + "step": 6080 + }, + { + "epoch": 1.2769972740616482, + "grad_norm": 0.09587598592042923, + "learning_rate": 0.0002, + "loss": 0.2738, + "step": 6090 + }, + { + "epoch": 1.2790941497169217, + "grad_norm": 0.06788215786218643, + "learning_rate": 0.0002, + "loss": 0.2758, + "step": 6100 + }, + { + "epoch": 1.2811910253721954, + "grad_norm": 0.09348220378160477, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 6110 + }, + { + "epoch": 1.2832879010274691, + "grad_norm": 0.058364514261484146, + "learning_rate": 0.0002, + "loss": 0.2433, + "step": 6120 + }, + { + "epoch": 1.2853847766827426, + "grad_norm": 0.06461839377880096, + "learning_rate": 0.0002, + "loss": 0.2731, + "step": 6130 + }, + { + "epoch": 1.2874816523380164, + "grad_norm": 2.521012306213379, + "learning_rate": 0.0002, + "loss": 0.2755, + "step": 6140 + }, + { + "epoch": 1.28957852799329, + "grad_norm": 7.787893772125244, + "learning_rate": 0.0002, + "loss": 0.262, + "step": 6150 + }, + { + "epoch": 1.2916754036485636, + "grad_norm": 0.07614123821258545, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 6160 + }, + { + "epoch": 1.2937722793038373, + "grad_norm": 0.06463972479104996, + "learning_rate": 0.0002, + "loss": 0.2627, + "step": 6170 + }, + { + "epoch": 1.295869154959111, + "grad_norm": 0.07085216790437698, + "learning_rate": 0.0002, + "loss": 0.2832, + "step": 6180 + }, + { + "epoch": 1.2979660306143845, + "grad_norm": 0.6012230515480042, + "learning_rate": 0.0002, + "loss": 0.2546, + "step": 6190 + }, + { + "epoch": 1.3000629062696583, + "grad_norm": 1.896254062652588, + "learning_rate": 0.0002, + "loss": 0.2563, + "step": 6200 + }, + { + "epoch": 1.302159781924932, + "grad_norm": 0.06487879157066345, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 6210 + }, + { + "epoch": 1.3042566575802055, + "grad_norm": 0.0652952492237091, + "learning_rate": 0.0002, + "loss": 0.2583, + "step": 6220 + }, + { + "epoch": 1.3063535332354792, + "grad_norm": 0.0889468789100647, + "learning_rate": 0.0002, + "loss": 0.2433, + "step": 6230 + }, + { + "epoch": 1.3084504088907527, + "grad_norm": 0.07459229975938797, + "learning_rate": 0.0002, + "loss": 0.257, + "step": 6240 + }, + { + "epoch": 1.3105472845460264, + "grad_norm": 0.40443506836891174, + "learning_rate": 0.0002, + "loss": 0.2907, + "step": 6250 + }, + { + "epoch": 1.3126441602013, + "grad_norm": 0.0493333600461483, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 6260 + }, + { + "epoch": 1.3147410358565736, + "grad_norm": 0.06762096285820007, + "learning_rate": 0.0002, + "loss": 0.2752, + "step": 6270 + }, + { + "epoch": 1.3168379115118474, + "grad_norm": 0.08447722345590591, + "learning_rate": 0.0002, + "loss": 0.2925, + "step": 6280 + }, + { + "epoch": 1.3189347871671209, + "grad_norm": 0.1290348619222641, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 6290 + }, + { + "epoch": 1.3210316628223946, + "grad_norm": 1.5261108875274658, + "learning_rate": 0.0002, + "loss": 0.258, + "step": 6300 + }, + { + "epoch": 1.3231285384776683, + "grad_norm": 0.07950276881456375, + "learning_rate": 0.0002, + "loss": 0.2439, + "step": 6310 + }, + { + "epoch": 1.3252254141329418, + "grad_norm": 0.054064471274614334, + "learning_rate": 0.0002, + "loss": 0.2655, + "step": 6320 + }, + { + "epoch": 1.3273222897882155, + "grad_norm": 1.496674656867981, + "learning_rate": 0.0002, + "loss": 0.2555, + "step": 6330 + }, + { + "epoch": 1.3294191654434893, + "grad_norm": 0.0939754843711853, + "learning_rate": 0.0002, + "loss": 0.2475, + "step": 6340 + }, + { + "epoch": 1.3315160410987628, + "grad_norm": 0.060312144458293915, + "learning_rate": 0.0002, + "loss": 0.2626, + "step": 6350 + }, + { + "epoch": 1.3336129167540365, + "grad_norm": 0.05656959488987923, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 6360 + }, + { + "epoch": 1.3357097924093102, + "grad_norm": 0.054611966013908386, + "learning_rate": 0.0002, + "loss": 0.2695, + "step": 6370 + }, + { + "epoch": 1.3378066680645837, + "grad_norm": 0.10933967679738998, + "learning_rate": 0.0002, + "loss": 0.2848, + "step": 6380 + }, + { + "epoch": 1.3399035437198574, + "grad_norm": 0.0762239620089531, + "learning_rate": 0.0002, + "loss": 0.2553, + "step": 6390 + }, + { + "epoch": 1.3420004193751311, + "grad_norm": 0.06087252497673035, + "learning_rate": 0.0002, + "loss": 0.2708, + "step": 6400 + }, + { + "epoch": 1.3440972950304046, + "grad_norm": 1.4033501148223877, + "learning_rate": 0.0002, + "loss": 0.2925, + "step": 6410 + }, + { + "epoch": 1.3461941706856784, + "grad_norm": 3.9681739807128906, + "learning_rate": 0.0002, + "loss": 0.2727, + "step": 6420 + }, + { + "epoch": 1.348291046340952, + "grad_norm": 0.07361237704753876, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 6430 + }, + { + "epoch": 1.3503879219962256, + "grad_norm": 0.07740651816129684, + "learning_rate": 0.0002, + "loss": 0.2779, + "step": 6440 + }, + { + "epoch": 1.3524847976514993, + "grad_norm": 0.09559556841850281, + "learning_rate": 0.0002, + "loss": 0.2537, + "step": 6450 + }, + { + "epoch": 1.354581673306773, + "grad_norm": 3.6172733306884766, + "learning_rate": 0.0002, + "loss": 0.2645, + "step": 6460 + }, + { + "epoch": 1.3566785489620465, + "grad_norm": 0.06684648990631104, + "learning_rate": 0.0002, + "loss": 0.2456, + "step": 6470 + }, + { + "epoch": 1.3587754246173203, + "grad_norm": 0.05803783982992172, + "learning_rate": 0.0002, + "loss": 0.244, + "step": 6480 + }, + { + "epoch": 1.3608723002725938, + "grad_norm": 0.07825397700071335, + "learning_rate": 0.0002, + "loss": 0.2634, + "step": 6490 + }, + { + "epoch": 1.3629691759278675, + "grad_norm": 0.13001962006092072, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 6500 + }, + { + "epoch": 1.365066051583141, + "grad_norm": 0.05548061430454254, + "learning_rate": 0.0002, + "loss": 0.2701, + "step": 6510 + }, + { + "epoch": 1.3671629272384147, + "grad_norm": 0.07405369728803635, + "learning_rate": 0.0002, + "loss": 0.2435, + "step": 6520 + }, + { + "epoch": 1.3692598028936884, + "grad_norm": 0.059938061982393265, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 6530 + }, + { + "epoch": 1.371356678548962, + "grad_norm": 0.09082169085741043, + "learning_rate": 0.0002, + "loss": 0.2602, + "step": 6540 + }, + { + "epoch": 1.3734535542042356, + "grad_norm": 0.06905685365200043, + "learning_rate": 0.0002, + "loss": 0.2459, + "step": 6550 + }, + { + "epoch": 1.3755504298595094, + "grad_norm": 1.4017658233642578, + "learning_rate": 0.0002, + "loss": 0.2773, + "step": 6560 + }, + { + "epoch": 1.3776473055147829, + "grad_norm": 0.0469370074570179, + "learning_rate": 0.0002, + "loss": 0.2582, + "step": 6570 + }, + { + "epoch": 1.3797441811700566, + "grad_norm": 3.311746835708618, + "learning_rate": 0.0002, + "loss": 0.2984, + "step": 6580 + }, + { + "epoch": 1.3818410568253303, + "grad_norm": 0.062438882887363434, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 6590 + }, + { + "epoch": 1.3839379324806038, + "grad_norm": 0.05996204912662506, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 6600 + }, + { + "epoch": 1.3860348081358775, + "grad_norm": 0.0879199281334877, + "learning_rate": 0.0002, + "loss": 0.3161, + "step": 6610 + }, + { + "epoch": 1.3881316837911513, + "grad_norm": 0.09142164140939713, + "learning_rate": 0.0002, + "loss": 0.2952, + "step": 6620 + }, + { + "epoch": 1.3902285594464248, + "grad_norm": 0.10828060656785965, + "learning_rate": 0.0002, + "loss": 0.2692, + "step": 6630 + }, + { + "epoch": 1.3923254351016985, + "grad_norm": 0.0749066174030304, + "learning_rate": 0.0002, + "loss": 0.2895, + "step": 6640 + }, + { + "epoch": 1.3944223107569722, + "grad_norm": 0.05128903314471245, + "learning_rate": 0.0002, + "loss": 0.242, + "step": 6650 + }, + { + "epoch": 1.3965191864122457, + "grad_norm": 0.07985924929380417, + "learning_rate": 0.0002, + "loss": 0.2753, + "step": 6660 + }, + { + "epoch": 1.3986160620675194, + "grad_norm": 0.10848308354616165, + "learning_rate": 0.0002, + "loss": 0.3211, + "step": 6670 + }, + { + "epoch": 1.4007129377227931, + "grad_norm": 0.07220837473869324, + "learning_rate": 0.0002, + "loss": 0.2456, + "step": 6680 + }, + { + "epoch": 1.4028098133780667, + "grad_norm": 8.65963363647461, + "learning_rate": 0.0002, + "loss": 0.2821, + "step": 6690 + }, + { + "epoch": 1.4049066890333404, + "grad_norm": 0.10323558002710342, + "learning_rate": 0.0002, + "loss": 0.2568, + "step": 6700 + }, + { + "epoch": 1.407003564688614, + "grad_norm": 0.08332827687263489, + "learning_rate": 0.0002, + "loss": 0.2456, + "step": 6710 + }, + { + "epoch": 1.4091004403438876, + "grad_norm": 0.07475430518388748, + "learning_rate": 0.0002, + "loss": 0.247, + "step": 6720 + }, + { + "epoch": 1.4111973159991613, + "grad_norm": 0.09305676817893982, + "learning_rate": 0.0002, + "loss": 0.2805, + "step": 6730 + }, + { + "epoch": 1.4132941916544348, + "grad_norm": 1.4678685665130615, + "learning_rate": 0.0002, + "loss": 0.3179, + "step": 6740 + }, + { + "epoch": 1.4153910673097085, + "grad_norm": 0.07024402916431427, + "learning_rate": 0.0002, + "loss": 0.2688, + "step": 6750 + }, + { + "epoch": 1.417487942964982, + "grad_norm": 0.3857756555080414, + "learning_rate": 0.0002, + "loss": 0.2435, + "step": 6760 + }, + { + "epoch": 1.4195848186202558, + "grad_norm": 0.06822605431079865, + "learning_rate": 0.0002, + "loss": 0.2819, + "step": 6770 + }, + { + "epoch": 1.4216816942755295, + "grad_norm": 0.12242255359888077, + "learning_rate": 0.0002, + "loss": 0.2546, + "step": 6780 + }, + { + "epoch": 1.423778569930803, + "grad_norm": 0.6426852345466614, + "learning_rate": 0.0002, + "loss": 0.2707, + "step": 6790 + }, + { + "epoch": 1.4258754455860767, + "grad_norm": 0.084357351064682, + "learning_rate": 0.0002, + "loss": 0.2605, + "step": 6800 + }, + { + "epoch": 1.4279723212413504, + "grad_norm": 0.06686043739318848, + "learning_rate": 0.0002, + "loss": 0.2603, + "step": 6810 + }, + { + "epoch": 1.430069196896624, + "grad_norm": 0.12673796713352203, + "learning_rate": 0.0002, + "loss": 0.2622, + "step": 6820 + }, + { + "epoch": 1.4321660725518977, + "grad_norm": 0.11757438629865646, + "learning_rate": 0.0002, + "loss": 0.3659, + "step": 6830 + }, + { + "epoch": 1.4342629482071714, + "grad_norm": 0.05200193449854851, + "learning_rate": 0.0002, + "loss": 0.3081, + "step": 6840 + }, + { + "epoch": 1.4363598238624449, + "grad_norm": 0.5917229652404785, + "learning_rate": 0.0002, + "loss": 0.2994, + "step": 6850 + }, + { + "epoch": 1.4384566995177186, + "grad_norm": 0.10604757815599442, + "learning_rate": 0.0002, + "loss": 0.2734, + "step": 6860 + }, + { + "epoch": 1.4405535751729923, + "grad_norm": 0.06061761453747749, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 6870 + }, + { + "epoch": 1.4426504508282658, + "grad_norm": 1.4297677278518677, + "learning_rate": 0.0002, + "loss": 0.3055, + "step": 6880 + }, + { + "epoch": 1.4447473264835395, + "grad_norm": 0.5999753475189209, + "learning_rate": 0.0002, + "loss": 0.3373, + "step": 6890 + }, + { + "epoch": 1.4468442021388133, + "grad_norm": 3.725703001022339, + "learning_rate": 0.0002, + "loss": 0.2953, + "step": 6900 + }, + { + "epoch": 1.4489410777940868, + "grad_norm": 0.12710486352443695, + "learning_rate": 0.0002, + "loss": 0.3168, + "step": 6910 + }, + { + "epoch": 1.4510379534493605, + "grad_norm": 0.10325577110052109, + "learning_rate": 0.0002, + "loss": 0.2799, + "step": 6920 + }, + { + "epoch": 1.4531348291046342, + "grad_norm": 0.07545780390501022, + "learning_rate": 0.0002, + "loss": 0.2691, + "step": 6930 + }, + { + "epoch": 1.4552317047599077, + "grad_norm": 0.07704131305217743, + "learning_rate": 0.0002, + "loss": 0.289, + "step": 6940 + }, + { + "epoch": 1.4573285804151814, + "grad_norm": 0.06797541677951813, + "learning_rate": 0.0002, + "loss": 0.2999, + "step": 6950 + }, + { + "epoch": 1.4594254560704552, + "grad_norm": 1.3126064538955688, + "learning_rate": 0.0002, + "loss": 0.2766, + "step": 6960 + }, + { + "epoch": 1.4615223317257287, + "grad_norm": 0.0826839804649353, + "learning_rate": 0.0002, + "loss": 0.2506, + "step": 6970 + }, + { + "epoch": 1.4636192073810024, + "grad_norm": 2.723210334777832, + "learning_rate": 0.0002, + "loss": 0.3051, + "step": 6980 + }, + { + "epoch": 1.4657160830362759, + "grad_norm": 0.30324873328208923, + "learning_rate": 0.0002, + "loss": 0.2619, + "step": 6990 + }, + { + "epoch": 1.4678129586915496, + "grad_norm": 0.0709717720746994, + "learning_rate": 0.0002, + "loss": 0.2588, + "step": 7000 + }, + { + "epoch": 1.469909834346823, + "grad_norm": 0.07733996957540512, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 7010 + }, + { + "epoch": 1.4720067100020968, + "grad_norm": 0.33669042587280273, + "learning_rate": 0.0002, + "loss": 0.2651, + "step": 7020 + }, + { + "epoch": 1.4741035856573705, + "grad_norm": 2.0644760131835938, + "learning_rate": 0.0002, + "loss": 0.2853, + "step": 7030 + }, + { + "epoch": 1.476200461312644, + "grad_norm": 0.09015507996082306, + "learning_rate": 0.0002, + "loss": 0.2632, + "step": 7040 + }, + { + "epoch": 1.4782973369679178, + "grad_norm": 3.7130730152130127, + "learning_rate": 0.0002, + "loss": 0.2879, + "step": 7050 + }, + { + "epoch": 1.4803942126231915, + "grad_norm": 0.05841604992747307, + "learning_rate": 0.0002, + "loss": 0.2734, + "step": 7060 + }, + { + "epoch": 1.482491088278465, + "grad_norm": 0.33909642696380615, + "learning_rate": 0.0002, + "loss": 0.2807, + "step": 7070 + }, + { + "epoch": 1.4845879639337387, + "grad_norm": 0.09259719401597977, + "learning_rate": 0.0002, + "loss": 0.269, + "step": 7080 + }, + { + "epoch": 1.4866848395890124, + "grad_norm": 0.11720839887857437, + "learning_rate": 0.0002, + "loss": 0.2477, + "step": 7090 + }, + { + "epoch": 1.488781715244286, + "grad_norm": 0.1009572297334671, + "learning_rate": 0.0002, + "loss": 0.2686, + "step": 7100 + }, + { + "epoch": 1.4908785908995597, + "grad_norm": 0.33339136838912964, + "learning_rate": 0.0002, + "loss": 0.2724, + "step": 7110 + }, + { + "epoch": 1.4929754665548334, + "grad_norm": 0.10810337960720062, + "learning_rate": 0.0002, + "loss": 0.2732, + "step": 7120 + }, + { + "epoch": 1.4950723422101069, + "grad_norm": 0.08828122913837433, + "learning_rate": 0.0002, + "loss": 0.2717, + "step": 7130 + }, + { + "epoch": 1.4971692178653806, + "grad_norm": 0.1396791934967041, + "learning_rate": 0.0002, + "loss": 0.2538, + "step": 7140 + }, + { + "epoch": 1.4992660935206543, + "grad_norm": 2.327631950378418, + "learning_rate": 0.0002, + "loss": 0.2808, + "step": 7150 + }, + { + "epoch": 1.5013629691759278, + "grad_norm": 0.10392621159553528, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 7160 + }, + { + "epoch": 1.5034598448312015, + "grad_norm": 2.2027974128723145, + "learning_rate": 0.0002, + "loss": 0.2753, + "step": 7170 + }, + { + "epoch": 1.5055567204864753, + "grad_norm": 0.060588303953409195, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 7180 + }, + { + "epoch": 1.5076535961417488, + "grad_norm": 0.08776184171438217, + "learning_rate": 0.0002, + "loss": 0.2631, + "step": 7190 + }, + { + "epoch": 1.5097504717970225, + "grad_norm": 0.10150139033794403, + "learning_rate": 0.0002, + "loss": 0.2703, + "step": 7200 + }, + { + "epoch": 1.5118473474522962, + "grad_norm": 0.12122219055891037, + "learning_rate": 0.0002, + "loss": 0.2416, + "step": 7210 + }, + { + "epoch": 1.5139442231075697, + "grad_norm": 0.05999595671892166, + "learning_rate": 0.0002, + "loss": 0.3042, + "step": 7220 + }, + { + "epoch": 1.5160410987628432, + "grad_norm": 0.06926919519901276, + "learning_rate": 0.0002, + "loss": 0.2628, + "step": 7230 + }, + { + "epoch": 1.5181379744181172, + "grad_norm": 3.6650075912475586, + "learning_rate": 0.0002, + "loss": 0.2884, + "step": 7240 + }, + { + "epoch": 1.5202348500733907, + "grad_norm": 0.0633549615740776, + "learning_rate": 0.0002, + "loss": 0.2462, + "step": 7250 + }, + { + "epoch": 1.5223317257286642, + "grad_norm": 0.07859442383050919, + "learning_rate": 0.0002, + "loss": 0.3138, + "step": 7260 + }, + { + "epoch": 1.524428601383938, + "grad_norm": 1.6131170988082886, + "learning_rate": 0.0002, + "loss": 0.2494, + "step": 7270 + }, + { + "epoch": 1.5265254770392116, + "grad_norm": 0.0931943953037262, + "learning_rate": 0.0002, + "loss": 0.2499, + "step": 7280 + }, + { + "epoch": 1.528622352694485, + "grad_norm": 0.09297845512628555, + "learning_rate": 0.0002, + "loss": 0.2433, + "step": 7290 + }, + { + "epoch": 1.5307192283497588, + "grad_norm": 0.06294408440589905, + "learning_rate": 0.0002, + "loss": 0.2568, + "step": 7300 + }, + { + "epoch": 1.5328161040050325, + "grad_norm": 0.08879171311855316, + "learning_rate": 0.0002, + "loss": 0.2588, + "step": 7310 + }, + { + "epoch": 1.534912979660306, + "grad_norm": 0.05376000702381134, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 7320 + }, + { + "epoch": 1.5370098553155798, + "grad_norm": 3.216952323913574, + "learning_rate": 0.0002, + "loss": 0.3338, + "step": 7330 + }, + { + "epoch": 1.5391067309708535, + "grad_norm": 0.07937444746494293, + "learning_rate": 0.0002, + "loss": 0.2437, + "step": 7340 + }, + { + "epoch": 1.541203606626127, + "grad_norm": 1.347196340560913, + "learning_rate": 0.0002, + "loss": 0.3316, + "step": 7350 + }, + { + "epoch": 1.5433004822814007, + "grad_norm": 0.0543079674243927, + "learning_rate": 0.0002, + "loss": 0.2436, + "step": 7360 + }, + { + "epoch": 1.5453973579366744, + "grad_norm": 0.13290783762931824, + "learning_rate": 0.0002, + "loss": 0.2417, + "step": 7370 + }, + { + "epoch": 1.547494233591948, + "grad_norm": 2.971339225769043, + "learning_rate": 0.0002, + "loss": 0.2807, + "step": 7380 + }, + { + "epoch": 1.5495911092472217, + "grad_norm": 0.08533356338739395, + "learning_rate": 0.0002, + "loss": 0.2965, + "step": 7390 + }, + { + "epoch": 1.5516879849024954, + "grad_norm": 0.06842143833637238, + "learning_rate": 0.0002, + "loss": 0.2577, + "step": 7400 + }, + { + "epoch": 1.5537848605577689, + "grad_norm": 0.08902810513973236, + "learning_rate": 0.0002, + "loss": 0.2698, + "step": 7410 + }, + { + "epoch": 1.5558817362130426, + "grad_norm": 0.07102424651384354, + "learning_rate": 0.0002, + "loss": 0.319, + "step": 7420 + }, + { + "epoch": 1.5579786118683163, + "grad_norm": 0.06909465044736862, + "learning_rate": 0.0002, + "loss": 0.2628, + "step": 7430 + }, + { + "epoch": 1.5600754875235898, + "grad_norm": 0.06648514419794083, + "learning_rate": 0.0002, + "loss": 0.2435, + "step": 7440 + }, + { + "epoch": 1.5621723631788635, + "grad_norm": 0.09439200162887573, + "learning_rate": 0.0002, + "loss": 0.2439, + "step": 7450 + }, + { + "epoch": 1.5642692388341373, + "grad_norm": 0.09378870576620102, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 7460 + }, + { + "epoch": 1.5663661144894108, + "grad_norm": 0.07520859688520432, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 7470 + }, + { + "epoch": 1.5684629901446843, + "grad_norm": 0.10274530947208405, + "learning_rate": 0.0002, + "loss": 0.2897, + "step": 7480 + }, + { + "epoch": 1.5705598657999582, + "grad_norm": 0.07290078699588776, + "learning_rate": 0.0002, + "loss": 0.2664, + "step": 7490 + }, + { + "epoch": 1.5726567414552317, + "grad_norm": 0.057594846934080124, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 7500 + }, + { + "epoch": 1.5747536171105052, + "grad_norm": 0.080964095890522, + "learning_rate": 0.0002, + "loss": 0.2876, + "step": 7510 + }, + { + "epoch": 1.5768504927657792, + "grad_norm": 0.08604875952005386, + "learning_rate": 0.0002, + "loss": 0.2595, + "step": 7520 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.06626357138156891, + "learning_rate": 0.0002, + "loss": 0.2807, + "step": 7530 + }, + { + "epoch": 1.5810442440763262, + "grad_norm": 0.06062173843383789, + "learning_rate": 0.0002, + "loss": 0.3133, + "step": 7540 + }, + { + "epoch": 1.5831411197315999, + "grad_norm": 0.0724383071064949, + "learning_rate": 0.0002, + "loss": 0.2705, + "step": 7550 + }, + { + "epoch": 1.5852379953868736, + "grad_norm": 0.09347286075353622, + "learning_rate": 0.0002, + "loss": 0.2544, + "step": 7560 + }, + { + "epoch": 1.587334871042147, + "grad_norm": 0.0953749343752861, + "learning_rate": 0.0002, + "loss": 0.2632, + "step": 7570 + }, + { + "epoch": 1.5894317466974208, + "grad_norm": 0.10970430821180344, + "learning_rate": 0.0002, + "loss": 0.2791, + "step": 7580 + }, + { + "epoch": 1.5915286223526945, + "grad_norm": 2.3808388710021973, + "learning_rate": 0.0002, + "loss": 0.2915, + "step": 7590 + }, + { + "epoch": 1.593625498007968, + "grad_norm": 0.05185944214463234, + "learning_rate": 0.0002, + "loss": 0.2729, + "step": 7600 + }, + { + "epoch": 1.5957223736632418, + "grad_norm": 0.07438002526760101, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 7610 + }, + { + "epoch": 1.5978192493185155, + "grad_norm": 0.08762764185667038, + "learning_rate": 0.0002, + "loss": 0.2747, + "step": 7620 + }, + { + "epoch": 1.599916124973789, + "grad_norm": 0.6587502956390381, + "learning_rate": 0.0002, + "loss": 0.2635, + "step": 7630 + }, + { + "epoch": 1.6020130006290627, + "grad_norm": 0.05637750402092934, + "learning_rate": 0.0002, + "loss": 0.2433, + "step": 7640 + }, + { + "epoch": 1.6041098762843364, + "grad_norm": 0.05925126373767853, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 7650 + }, + { + "epoch": 1.60620675193961, + "grad_norm": 2.566734552383423, + "learning_rate": 0.0002, + "loss": 0.2935, + "step": 7660 + }, + { + "epoch": 1.6083036275948837, + "grad_norm": 0.06364738196134567, + "learning_rate": 0.0002, + "loss": 0.32, + "step": 7670 + }, + { + "epoch": 1.6104005032501574, + "grad_norm": 0.11035982519388199, + "learning_rate": 0.0002, + "loss": 0.2459, + "step": 7680 + }, + { + "epoch": 1.6124973789054309, + "grad_norm": 0.050208210945129395, + "learning_rate": 0.0002, + "loss": 0.2894, + "step": 7690 + }, + { + "epoch": 1.6145942545607046, + "grad_norm": 0.1036805585026741, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 7700 + }, + { + "epoch": 1.6166911302159783, + "grad_norm": 0.07067201286554337, + "learning_rate": 0.0002, + "loss": 0.2688, + "step": 7710 + }, + { + "epoch": 1.6187880058712518, + "grad_norm": 0.0834813043475151, + "learning_rate": 0.0002, + "loss": 0.2896, + "step": 7720 + }, + { + "epoch": 1.6208848815265253, + "grad_norm": 0.13986825942993164, + "learning_rate": 0.0002, + "loss": 0.2951, + "step": 7730 + }, + { + "epoch": 1.6229817571817993, + "grad_norm": 3.069458484649658, + "learning_rate": 0.0002, + "loss": 0.3001, + "step": 7740 + }, + { + "epoch": 1.6250786328370728, + "grad_norm": 0.07894471287727356, + "learning_rate": 0.0002, + "loss": 0.2437, + "step": 7750 + }, + { + "epoch": 1.6271755084923463, + "grad_norm": 0.0820389837026596, + "learning_rate": 0.0002, + "loss": 0.2813, + "step": 7760 + }, + { + "epoch": 1.6292723841476202, + "grad_norm": 0.05259540304541588, + "learning_rate": 0.0002, + "loss": 0.2432, + "step": 7770 + }, + { + "epoch": 1.6313692598028937, + "grad_norm": 0.07753746956586838, + "learning_rate": 0.0002, + "loss": 0.242, + "step": 7780 + }, + { + "epoch": 1.6334661354581672, + "grad_norm": 0.0717770904302597, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 7790 + }, + { + "epoch": 1.635563011113441, + "grad_norm": 0.05608302354812622, + "learning_rate": 0.0002, + "loss": 0.2669, + "step": 7800 + }, + { + "epoch": 1.6376598867687147, + "grad_norm": 4.554051399230957, + "learning_rate": 0.0002, + "loss": 0.2656, + "step": 7810 + }, + { + "epoch": 1.6397567624239882, + "grad_norm": 0.07997504621744156, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 7820 + }, + { + "epoch": 1.6418536380792619, + "grad_norm": 0.08014536648988724, + "learning_rate": 0.0002, + "loss": 0.2685, + "step": 7830 + }, + { + "epoch": 1.6439505137345356, + "grad_norm": 0.09674616903066635, + "learning_rate": 0.0002, + "loss": 0.2594, + "step": 7840 + }, + { + "epoch": 1.646047389389809, + "grad_norm": 0.9341862797737122, + "learning_rate": 0.0002, + "loss": 0.3198, + "step": 7850 + }, + { + "epoch": 1.6481442650450828, + "grad_norm": 0.09305543452501297, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 7860 + }, + { + "epoch": 1.6502411407003565, + "grad_norm": 0.07002803683280945, + "learning_rate": 0.0002, + "loss": 0.2493, + "step": 7870 + }, + { + "epoch": 1.65233801635563, + "grad_norm": 1.9571644067764282, + "learning_rate": 0.0002, + "loss": 0.2759, + "step": 7880 + }, + { + "epoch": 1.6544348920109038, + "grad_norm": 0.15317511558532715, + "learning_rate": 0.0002, + "loss": 0.2539, + "step": 7890 + }, + { + "epoch": 1.6565317676661775, + "grad_norm": 0.09483631700277328, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 7900 + }, + { + "epoch": 1.658628643321451, + "grad_norm": 0.1098591759800911, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 7910 + }, + { + "epoch": 1.6607255189767247, + "grad_norm": 0.05958917737007141, + "learning_rate": 0.0002, + "loss": 0.2752, + "step": 7920 + }, + { + "epoch": 1.6628223946319984, + "grad_norm": 0.07363269478082657, + "learning_rate": 0.0002, + "loss": 0.2616, + "step": 7930 + }, + { + "epoch": 1.664919270287272, + "grad_norm": 0.06663299351930618, + "learning_rate": 0.0002, + "loss": 0.2682, + "step": 7940 + }, + { + "epoch": 1.6670161459425457, + "grad_norm": 0.06834384053945541, + "learning_rate": 0.0002, + "loss": 0.2699, + "step": 7950 + }, + { + "epoch": 1.6691130215978194, + "grad_norm": 0.06692808866500854, + "learning_rate": 0.0002, + "loss": 0.2617, + "step": 7960 + }, + { + "epoch": 1.6712098972530929, + "grad_norm": 2.3505663871765137, + "learning_rate": 0.0002, + "loss": 0.271, + "step": 7970 + }, + { + "epoch": 1.6733067729083664, + "grad_norm": 0.08932004123926163, + "learning_rate": 0.0002, + "loss": 0.2759, + "step": 7980 + }, + { + "epoch": 1.6754036485636403, + "grad_norm": 0.058079443871974945, + "learning_rate": 0.0002, + "loss": 0.2748, + "step": 7990 + }, + { + "epoch": 1.6775005242189138, + "grad_norm": 0.08751317113637924, + "learning_rate": 0.0002, + "loss": 0.2944, + "step": 8000 + }, + { + "epoch": 1.6795973998741873, + "grad_norm": 2.1509130001068115, + "learning_rate": 0.0002, + "loss": 0.2603, + "step": 8010 + }, + { + "epoch": 1.6816942755294613, + "grad_norm": 0.045726533979177475, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 8020 + }, + { + "epoch": 1.6837911511847348, + "grad_norm": 0.1127922460436821, + "learning_rate": 0.0002, + "loss": 0.2807, + "step": 8030 + }, + { + "epoch": 1.6858880268400083, + "grad_norm": 8.662659645080566, + "learning_rate": 0.0002, + "loss": 0.2642, + "step": 8040 + }, + { + "epoch": 1.687984902495282, + "grad_norm": 0.6873268485069275, + "learning_rate": 0.0002, + "loss": 0.3018, + "step": 8050 + }, + { + "epoch": 1.6900817781505557, + "grad_norm": 0.07165851444005966, + "learning_rate": 0.0002, + "loss": 0.2429, + "step": 8060 + }, + { + "epoch": 1.6921786538058292, + "grad_norm": 0.1486668735742569, + "learning_rate": 0.0002, + "loss": 0.3148, + "step": 8070 + }, + { + "epoch": 1.694275529461103, + "grad_norm": 1.325584053993225, + "learning_rate": 0.0002, + "loss": 0.2655, + "step": 8080 + }, + { + "epoch": 1.6963724051163767, + "grad_norm": 2.7720537185668945, + "learning_rate": 0.0002, + "loss": 0.2934, + "step": 8090 + }, + { + "epoch": 1.6984692807716502, + "grad_norm": 0.21440936625003815, + "learning_rate": 0.0002, + "loss": 0.2749, + "step": 8100 + }, + { + "epoch": 1.7005661564269239, + "grad_norm": 0.9134746789932251, + "learning_rate": 0.0002, + "loss": 0.2626, + "step": 8110 + }, + { + "epoch": 1.7026630320821976, + "grad_norm": 0.07481775432825089, + "learning_rate": 0.0002, + "loss": 0.249, + "step": 8120 + }, + { + "epoch": 1.704759907737471, + "grad_norm": 0.45117613673210144, + "learning_rate": 0.0002, + "loss": 0.2484, + "step": 8130 + }, + { + "epoch": 1.7068567833927448, + "grad_norm": 2.4827306270599365, + "learning_rate": 0.0002, + "loss": 0.288, + "step": 8140 + }, + { + "epoch": 1.7089536590480185, + "grad_norm": 0.1967839002609253, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 8150 + }, + { + "epoch": 1.711050534703292, + "grad_norm": 1.2610207796096802, + "learning_rate": 0.0002, + "loss": 0.2692, + "step": 8160 + }, + { + "epoch": 1.7131474103585658, + "grad_norm": 0.09038415551185608, + "learning_rate": 0.0002, + "loss": 0.2565, + "step": 8170 + }, + { + "epoch": 1.7152442860138395, + "grad_norm": 0.524497389793396, + "learning_rate": 0.0002, + "loss": 0.2742, + "step": 8180 + }, + { + "epoch": 1.717341161669113, + "grad_norm": 0.12098541110754013, + "learning_rate": 0.0002, + "loss": 0.2724, + "step": 8190 + }, + { + "epoch": 1.7194380373243867, + "grad_norm": 0.109773188829422, + "learning_rate": 0.0002, + "loss": 0.254, + "step": 8200 + }, + { + "epoch": 1.7215349129796604, + "grad_norm": 0.2105139046907425, + "learning_rate": 0.0002, + "loss": 0.3159, + "step": 8210 + }, + { + "epoch": 1.723631788634934, + "grad_norm": 0.05839413404464722, + "learning_rate": 0.0002, + "loss": 0.263, + "step": 8220 + }, + { + "epoch": 1.7257286642902074, + "grad_norm": 1.7193588018417358, + "learning_rate": 0.0002, + "loss": 0.2703, + "step": 8230 + }, + { + "epoch": 1.7278255399454814, + "grad_norm": 0.11846201121807098, + "learning_rate": 0.0002, + "loss": 0.2539, + "step": 8240 + }, + { + "epoch": 1.7299224156007549, + "grad_norm": 0.10171257704496384, + "learning_rate": 0.0002, + "loss": 0.2734, + "step": 8250 + }, + { + "epoch": 1.7320192912560284, + "grad_norm": 2.8296456336975098, + "learning_rate": 0.0002, + "loss": 0.2807, + "step": 8260 + }, + { + "epoch": 1.7341161669113023, + "grad_norm": 0.08424999564886093, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 8270 + }, + { + "epoch": 1.7362130425665758, + "grad_norm": 0.08350451290607452, + "learning_rate": 0.0002, + "loss": 0.2823, + "step": 8280 + }, + { + "epoch": 1.7383099182218493, + "grad_norm": 0.1168193444609642, + "learning_rate": 0.0002, + "loss": 0.2566, + "step": 8290 + }, + { + "epoch": 1.740406793877123, + "grad_norm": 0.09300057590007782, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 8300 + }, + { + "epoch": 1.7425036695323968, + "grad_norm": 0.05919062718749046, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 8310 + }, + { + "epoch": 1.7446005451876703, + "grad_norm": 6.351716995239258, + "learning_rate": 0.0002, + "loss": 0.3258, + "step": 8320 + }, + { + "epoch": 1.746697420842944, + "grad_norm": 0.08938953280448914, + "learning_rate": 0.0002, + "loss": 0.2976, + "step": 8330 + }, + { + "epoch": 1.7487942964982177, + "grad_norm": 0.17226704955101013, + "learning_rate": 0.0002, + "loss": 0.2789, + "step": 8340 + }, + { + "epoch": 1.7508911721534912, + "grad_norm": 0.13070009648799896, + "learning_rate": 0.0002, + "loss": 0.2643, + "step": 8350 + }, + { + "epoch": 1.752988047808765, + "grad_norm": 1.1679989099502563, + "learning_rate": 0.0002, + "loss": 0.3043, + "step": 8360 + }, + { + "epoch": 1.7550849234640387, + "grad_norm": 0.09552992135286331, + "learning_rate": 0.0002, + "loss": 0.3402, + "step": 8370 + }, + { + "epoch": 1.7571817991193122, + "grad_norm": 0.18032927811145782, + "learning_rate": 0.0002, + "loss": 0.2462, + "step": 8380 + }, + { + "epoch": 1.7592786747745859, + "grad_norm": 4.900735378265381, + "learning_rate": 0.0002, + "loss": 0.5324, + "step": 8390 + }, + { + "epoch": 1.7613755504298596, + "grad_norm": 2.462291955947876, + "learning_rate": 0.0002, + "loss": 0.2466, + "step": 8400 + }, + { + "epoch": 1.763472426085133, + "grad_norm": 0.7509610056877136, + "learning_rate": 0.0002, + "loss": 0.2514, + "step": 8410 + }, + { + "epoch": 1.7655693017404068, + "grad_norm": 0.07756595313549042, + "learning_rate": 0.0002, + "loss": 0.2746, + "step": 8420 + }, + { + "epoch": 1.7676661773956805, + "grad_norm": 0.08571748435497284, + "learning_rate": 0.0002, + "loss": 0.2912, + "step": 8430 + }, + { + "epoch": 1.769763053050954, + "grad_norm": 0.07590015232563019, + "learning_rate": 0.0002, + "loss": 0.2677, + "step": 8440 + }, + { + "epoch": 1.7718599287062278, + "grad_norm": 0.11724907904863358, + "learning_rate": 0.0002, + "loss": 0.2674, + "step": 8450 + }, + { + "epoch": 1.7739568043615015, + "grad_norm": 2.972846269607544, + "learning_rate": 0.0002, + "loss": 0.3069, + "step": 8460 + }, + { + "epoch": 1.776053680016775, + "grad_norm": 0.057090677320957184, + "learning_rate": 0.0002, + "loss": 0.2476, + "step": 8470 + }, + { + "epoch": 1.7781505556720485, + "grad_norm": 0.09555861353874207, + "learning_rate": 0.0002, + "loss": 0.295, + "step": 8480 + }, + { + "epoch": 1.7802474313273224, + "grad_norm": 0.08062522113323212, + "learning_rate": 0.0002, + "loss": 0.2774, + "step": 8490 + }, + { + "epoch": 1.782344306982596, + "grad_norm": 0.06056349724531174, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 8500 + }, + { + "epoch": 1.7844411826378694, + "grad_norm": 2.523094415664673, + "learning_rate": 0.0002, + "loss": 0.3411, + "step": 8510 + }, + { + "epoch": 1.7865380582931434, + "grad_norm": 0.061809323728084564, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 8520 + }, + { + "epoch": 1.7886349339484169, + "grad_norm": 0.17487798631191254, + "learning_rate": 0.0002, + "loss": 0.2666, + "step": 8530 + }, + { + "epoch": 1.7907318096036904, + "grad_norm": 0.05645996332168579, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 8540 + }, + { + "epoch": 1.792828685258964, + "grad_norm": 0.0833887979388237, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 8550 + }, + { + "epoch": 1.7949255609142378, + "grad_norm": 0.08759808540344238, + "learning_rate": 0.0002, + "loss": 0.2932, + "step": 8560 + }, + { + "epoch": 1.7970224365695113, + "grad_norm": 0.07473572343587875, + "learning_rate": 0.0002, + "loss": 0.2739, + "step": 8570 + }, + { + "epoch": 1.799119312224785, + "grad_norm": 0.07650966942310333, + "learning_rate": 0.0002, + "loss": 0.244, + "step": 8580 + }, + { + "epoch": 1.8012161878800588, + "grad_norm": 0.06564634293317795, + "learning_rate": 0.0002, + "loss": 0.2775, + "step": 8590 + }, + { + "epoch": 1.8033130635353323, + "grad_norm": 0.12063200771808624, + "learning_rate": 0.0002, + "loss": 0.2608, + "step": 8600 + }, + { + "epoch": 1.805409939190606, + "grad_norm": 0.08069991320371628, + "learning_rate": 0.0002, + "loss": 0.2691, + "step": 8610 + }, + { + "epoch": 1.8075068148458797, + "grad_norm": 0.07421161234378815, + "learning_rate": 0.0002, + "loss": 0.2749, + "step": 8620 + }, + { + "epoch": 1.8096036905011532, + "grad_norm": 0.0516473613679409, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 8630 + }, + { + "epoch": 1.811700566156427, + "grad_norm": 0.07442668080329895, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 8640 + }, + { + "epoch": 1.8137974418117007, + "grad_norm": 0.07932697981595993, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 8650 + }, + { + "epoch": 1.8158943174669742, + "grad_norm": 0.09281004965305328, + "learning_rate": 0.0002, + "loss": 0.2556, + "step": 8660 + }, + { + "epoch": 1.8179911931222479, + "grad_norm": 0.06372177600860596, + "learning_rate": 0.0002, + "loss": 0.3174, + "step": 8670 + }, + { + "epoch": 1.8200880687775216, + "grad_norm": 0.07562694698572159, + "learning_rate": 0.0002, + "loss": 0.2428, + "step": 8680 + }, + { + "epoch": 1.822184944432795, + "grad_norm": 0.07504847645759583, + "learning_rate": 0.0002, + "loss": 0.2452, + "step": 8690 + }, + { + "epoch": 1.8242818200880688, + "grad_norm": 0.10201393067836761, + "learning_rate": 0.0002, + "loss": 0.2464, + "step": 8700 + }, + { + "epoch": 1.8263786957433426, + "grad_norm": 0.07061925530433655, + "learning_rate": 0.0002, + "loss": 0.2587, + "step": 8710 + }, + { + "epoch": 1.828475571398616, + "grad_norm": 2.701824188232422, + "learning_rate": 0.0002, + "loss": 0.2845, + "step": 8720 + }, + { + "epoch": 1.8305724470538896, + "grad_norm": 0.20525291562080383, + "learning_rate": 0.0002, + "loss": 0.2437, + "step": 8730 + }, + { + "epoch": 1.8326693227091635, + "grad_norm": 0.05398187041282654, + "learning_rate": 0.0002, + "loss": 0.2434, + "step": 8740 + }, + { + "epoch": 1.834766198364437, + "grad_norm": 0.06944382935762405, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 8750 + }, + { + "epoch": 1.8368630740197105, + "grad_norm": 5.459568023681641, + "learning_rate": 0.0002, + "loss": 0.2816, + "step": 8760 + }, + { + "epoch": 1.8389599496749844, + "grad_norm": 0.06237971410155296, + "learning_rate": 0.0002, + "loss": 0.2576, + "step": 8770 + }, + { + "epoch": 1.841056825330258, + "grad_norm": 0.06791366636753082, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 8780 + }, + { + "epoch": 1.8431537009855314, + "grad_norm": 0.06800734996795654, + "learning_rate": 0.0002, + "loss": 0.2681, + "step": 8790 + }, + { + "epoch": 1.8452505766408052, + "grad_norm": 4.149272918701172, + "learning_rate": 0.0002, + "loss": 0.2848, + "step": 8800 + }, + { + "epoch": 1.8473474522960789, + "grad_norm": 0.06702340394258499, + "learning_rate": 0.0002, + "loss": 0.2555, + "step": 8810 + }, + { + "epoch": 1.8494443279513524, + "grad_norm": 0.048510730266571045, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 8820 + }, + { + "epoch": 1.851541203606626, + "grad_norm": 0.055737774819135666, + "learning_rate": 0.0002, + "loss": 0.2624, + "step": 8830 + }, + { + "epoch": 1.8536380792618998, + "grad_norm": 0.09368737787008286, + "learning_rate": 0.0002, + "loss": 0.244, + "step": 8840 + }, + { + "epoch": 1.8557349549171733, + "grad_norm": 0.06682492047548294, + "learning_rate": 0.0002, + "loss": 0.3114, + "step": 8850 + }, + { + "epoch": 1.857831830572447, + "grad_norm": 0.1425323784351349, + "learning_rate": 0.0002, + "loss": 0.2614, + "step": 8860 + }, + { + "epoch": 1.8599287062277208, + "grad_norm": 0.08993595838546753, + "learning_rate": 0.0002, + "loss": 0.2435, + "step": 8870 + }, + { + "epoch": 1.8620255818829943, + "grad_norm": 0.06886916607618332, + "learning_rate": 0.0002, + "loss": 0.2788, + "step": 8880 + }, + { + "epoch": 1.864122457538268, + "grad_norm": 0.21538451313972473, + "learning_rate": 0.0002, + "loss": 0.2426, + "step": 8890 + }, + { + "epoch": 1.8662193331935417, + "grad_norm": 0.06627918034791946, + "learning_rate": 0.0002, + "loss": 0.2864, + "step": 8900 + }, + { + "epoch": 1.8683162088488152, + "grad_norm": 0.06920845806598663, + "learning_rate": 0.0002, + "loss": 0.26, + "step": 8910 + }, + { + "epoch": 1.870413084504089, + "grad_norm": 0.05559088662266731, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 8920 + }, + { + "epoch": 1.8725099601593627, + "grad_norm": 1.7723051309585571, + "learning_rate": 0.0002, + "loss": 0.2817, + "step": 8930 + }, + { + "epoch": 1.8746068358146362, + "grad_norm": 0.09814611822366714, + "learning_rate": 0.0002, + "loss": 0.2847, + "step": 8940 + }, + { + "epoch": 1.8767037114699099, + "grad_norm": 0.06993977725505829, + "learning_rate": 0.0002, + "loss": 0.2676, + "step": 8950 + }, + { + "epoch": 1.8788005871251836, + "grad_norm": 4.195693016052246, + "learning_rate": 0.0002, + "loss": 0.3096, + "step": 8960 + }, + { + "epoch": 1.880897462780457, + "grad_norm": 0.06579772382974625, + "learning_rate": 0.0002, + "loss": 0.2924, + "step": 8970 + }, + { + "epoch": 1.8829943384357306, + "grad_norm": 0.0795520544052124, + "learning_rate": 0.0002, + "loss": 0.245, + "step": 8980 + }, + { + "epoch": 1.8850912140910046, + "grad_norm": 0.09593985229730606, + "learning_rate": 0.0002, + "loss": 0.2643, + "step": 8990 + }, + { + "epoch": 1.887188089746278, + "grad_norm": 0.06778479367494583, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 9000 + }, + { + "epoch": 1.8892849654015516, + "grad_norm": 0.06771530956029892, + "learning_rate": 0.0002, + "loss": 0.2669, + "step": 9010 + }, + { + "epoch": 1.8913818410568253, + "grad_norm": 0.0755813866853714, + "learning_rate": 0.0002, + "loss": 0.2909, + "step": 9020 + }, + { + "epoch": 1.893478716712099, + "grad_norm": 0.07563014328479767, + "learning_rate": 0.0002, + "loss": 0.2468, + "step": 9030 + }, + { + "epoch": 1.8955755923673725, + "grad_norm": 0.08678235858678818, + "learning_rate": 0.0002, + "loss": 0.2764, + "step": 9040 + }, + { + "epoch": 1.8976724680226462, + "grad_norm": 0.14399957656860352, + "learning_rate": 0.0002, + "loss": 0.2894, + "step": 9050 + }, + { + "epoch": 1.89976934367792, + "grad_norm": 0.07211213558912277, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 9060 + }, + { + "epoch": 1.9018662193331934, + "grad_norm": 0.0553593710064888, + "learning_rate": 0.0002, + "loss": 0.2857, + "step": 9070 + }, + { + "epoch": 1.9039630949884672, + "grad_norm": 0.057425178587436676, + "learning_rate": 0.0002, + "loss": 0.2807, + "step": 9080 + }, + { + "epoch": 1.9060599706437409, + "grad_norm": 0.06174889951944351, + "learning_rate": 0.0002, + "loss": 0.2811, + "step": 9090 + }, + { + "epoch": 1.9081568462990144, + "grad_norm": 0.045786961913108826, + "learning_rate": 0.0002, + "loss": 0.257, + "step": 9100 + }, + { + "epoch": 1.910253721954288, + "grad_norm": 0.09140277653932571, + "learning_rate": 0.0002, + "loss": 0.269, + "step": 9110 + }, + { + "epoch": 1.9123505976095618, + "grad_norm": 0.05962507426738739, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 9120 + }, + { + "epoch": 1.9144474732648353, + "grad_norm": 0.07254903763532639, + "learning_rate": 0.0002, + "loss": 0.2663, + "step": 9130 + }, + { + "epoch": 1.916544348920109, + "grad_norm": 0.0659898966550827, + "learning_rate": 0.0002, + "loss": 0.2649, + "step": 9140 + }, + { + "epoch": 1.9186412245753828, + "grad_norm": 2.222851514816284, + "learning_rate": 0.0002, + "loss": 0.2714, + "step": 9150 + }, + { + "epoch": 1.9207381002306563, + "grad_norm": 2.2575175762176514, + "learning_rate": 0.0002, + "loss": 0.2615, + "step": 9160 + }, + { + "epoch": 1.92283497588593, + "grad_norm": 0.1300947368144989, + "learning_rate": 0.0002, + "loss": 0.3135, + "step": 9170 + }, + { + "epoch": 1.9249318515412037, + "grad_norm": 2.992399215698242, + "learning_rate": 0.0002, + "loss": 0.2685, + "step": 9180 + }, + { + "epoch": 1.9270287271964772, + "grad_norm": 4.321889400482178, + "learning_rate": 0.0002, + "loss": 0.2793, + "step": 9190 + }, + { + "epoch": 1.929125602851751, + "grad_norm": 0.09589577466249466, + "learning_rate": 0.0002, + "loss": 0.2665, + "step": 9200 + }, + { + "epoch": 1.9312224785070247, + "grad_norm": 0.06917759776115417, + "learning_rate": 0.0002, + "loss": 0.2601, + "step": 9210 + }, + { + "epoch": 1.9333193541622982, + "grad_norm": 0.06501258164644241, + "learning_rate": 0.0002, + "loss": 0.2616, + "step": 9220 + }, + { + "epoch": 1.9354162298175717, + "grad_norm": 3.5356762409210205, + "learning_rate": 0.0002, + "loss": 0.2917, + "step": 9230 + }, + { + "epoch": 1.9375131054728456, + "grad_norm": 0.08394140750169754, + "learning_rate": 0.0002, + "loss": 0.2707, + "step": 9240 + }, + { + "epoch": 1.939609981128119, + "grad_norm": 0.052391644567251205, + "learning_rate": 0.0002, + "loss": 0.2675, + "step": 9250 + }, + { + "epoch": 1.9417068567833926, + "grad_norm": 0.09241612255573273, + "learning_rate": 0.0002, + "loss": 0.2575, + "step": 9260 + }, + { + "epoch": 1.9438037324386663, + "grad_norm": 0.0741746574640274, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 9270 + }, + { + "epoch": 1.94590060809394, + "grad_norm": 0.08500709384679794, + "learning_rate": 0.0002, + "loss": 0.267, + "step": 9280 + }, + { + "epoch": 1.9479974837492136, + "grad_norm": 0.10579940676689148, + "learning_rate": 0.0002, + "loss": 0.2436, + "step": 9290 + }, + { + "epoch": 1.9500943594044873, + "grad_norm": 0.07684406638145447, + "learning_rate": 0.0002, + "loss": 0.2426, + "step": 9300 + }, + { + "epoch": 1.952191235059761, + "grad_norm": 0.06794850528240204, + "learning_rate": 0.0002, + "loss": 0.245, + "step": 9310 + }, + { + "epoch": 1.9542881107150345, + "grad_norm": 0.08876547962427139, + "learning_rate": 0.0002, + "loss": 0.243, + "step": 9320 + }, + { + "epoch": 1.9563849863703082, + "grad_norm": 0.051869798451662064, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 9330 + }, + { + "epoch": 1.958481862025582, + "grad_norm": 0.10372921824455261, + "learning_rate": 0.0002, + "loss": 0.2917, + "step": 9340 + }, + { + "epoch": 1.9605787376808554, + "grad_norm": 0.0588776133954525, + "learning_rate": 0.0002, + "loss": 0.2619, + "step": 9350 + }, + { + "epoch": 1.9626756133361292, + "grad_norm": 0.08060908317565918, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 9360 + }, + { + "epoch": 1.9647724889914029, + "grad_norm": 0.06950705498456955, + "learning_rate": 0.0002, + "loss": 0.2563, + "step": 9370 + }, + { + "epoch": 1.9668693646466764, + "grad_norm": 0.07383579760789871, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 9380 + }, + { + "epoch": 1.96896624030195, + "grad_norm": 0.06754910200834274, + "learning_rate": 0.0002, + "loss": 0.2596, + "step": 9390 + }, + { + "epoch": 1.9710631159572238, + "grad_norm": 0.0853751078248024, + "learning_rate": 0.0002, + "loss": 0.309, + "step": 9400 + }, + { + "epoch": 1.9731599916124973, + "grad_norm": 0.0764593631029129, + "learning_rate": 0.0002, + "loss": 0.2483, + "step": 9410 + }, + { + "epoch": 1.975256867267771, + "grad_norm": 5.169427871704102, + "learning_rate": 0.0002, + "loss": 0.3071, + "step": 9420 + }, + { + "epoch": 1.9773537429230448, + "grad_norm": 0.19787247478961945, + "learning_rate": 0.0002, + "loss": 0.3134, + "step": 9430 + }, + { + "epoch": 1.9794506185783183, + "grad_norm": 0.10594961792230606, + "learning_rate": 0.0002, + "loss": 0.3063, + "step": 9440 + }, + { + "epoch": 1.981547494233592, + "grad_norm": 0.060998719185590744, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 9450 + }, + { + "epoch": 1.9836443698888657, + "grad_norm": 0.10632716119289398, + "learning_rate": 0.0002, + "loss": 0.2706, + "step": 9460 + }, + { + "epoch": 1.9857412455441392, + "grad_norm": 0.09421350806951523, + "learning_rate": 0.0002, + "loss": 0.271, + "step": 9470 + }, + { + "epoch": 1.9878381211994127, + "grad_norm": 0.12134144455194473, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 9480 + }, + { + "epoch": 1.9899349968546867, + "grad_norm": 0.05131775140762329, + "learning_rate": 0.0002, + "loss": 0.2712, + "step": 9490 + }, + { + "epoch": 1.9920318725099602, + "grad_norm": 0.08443418890237808, + "learning_rate": 0.0002, + "loss": 0.2905, + "step": 9500 + }, + { + "epoch": 1.9941287481652337, + "grad_norm": 0.055085159838199615, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 9510 + }, + { + "epoch": 1.9962256238205074, + "grad_norm": 0.07487735152244568, + "learning_rate": 0.0002, + "loss": 0.2739, + "step": 9520 + }, + { + "epoch": 1.9983224994757811, + "grad_norm": 0.06377065926790237, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 9530 + }, + { + "epoch": 2.0004193751310546, + "grad_norm": 2.2102627754211426, + "learning_rate": 0.0002, + "loss": 0.3525, + "step": 9540 + }, + { + "epoch": 2.0025162507863286, + "grad_norm": 0.14670875668525696, + "learning_rate": 0.0002, + "loss": 0.3185, + "step": 9550 + }, + { + "epoch": 2.004613126441602, + "grad_norm": 0.049855392426252365, + "learning_rate": 0.0002, + "loss": 0.2661, + "step": 9560 + }, + { + "epoch": 2.0067100020968756, + "grad_norm": 0.0794902816414833, + "learning_rate": 0.0002, + "loss": 0.2777, + "step": 9570 + }, + { + "epoch": 2.0088068777521495, + "grad_norm": 0.07908865809440613, + "learning_rate": 0.0002, + "loss": 0.2656, + "step": 9580 + }, + { + "epoch": 2.010903753407423, + "grad_norm": 0.07637414336204529, + "learning_rate": 0.0002, + "loss": 0.2459, + "step": 9590 + }, + { + "epoch": 2.0130006290626965, + "grad_norm": 0.07523901760578156, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 9600 + }, + { + "epoch": 2.0150975047179704, + "grad_norm": 0.05918925628066063, + "learning_rate": 0.0002, + "loss": 0.2853, + "step": 9610 + }, + { + "epoch": 2.017194380373244, + "grad_norm": 0.05923960730433464, + "learning_rate": 0.0002, + "loss": 0.2718, + "step": 9620 + }, + { + "epoch": 2.0192912560285174, + "grad_norm": 0.10112644731998444, + "learning_rate": 0.0002, + "loss": 0.2652, + "step": 9630 + }, + { + "epoch": 2.021388131683791, + "grad_norm": 0.07971510291099548, + "learning_rate": 0.0002, + "loss": 0.2928, + "step": 9640 + }, + { + "epoch": 2.023485007339065, + "grad_norm": 0.04727751389145851, + "learning_rate": 0.0002, + "loss": 0.2591, + "step": 9650 + }, + { + "epoch": 2.0255818829943384, + "grad_norm": 0.0745919719338417, + "learning_rate": 0.0002, + "loss": 0.2425, + "step": 9660 + }, + { + "epoch": 2.027678758649612, + "grad_norm": 2.6715872287750244, + "learning_rate": 0.0002, + "loss": 0.261, + "step": 9670 + }, + { + "epoch": 2.029775634304886, + "grad_norm": 0.06958667933940887, + "learning_rate": 0.0002, + "loss": 0.2612, + "step": 9680 + }, + { + "epoch": 2.0318725099601593, + "grad_norm": 0.056604884564876556, + "learning_rate": 0.0002, + "loss": 0.2437, + "step": 9690 + }, + { + "epoch": 2.033969385615433, + "grad_norm": 0.07836975902318954, + "learning_rate": 0.0002, + "loss": 0.2568, + "step": 9700 + }, + { + "epoch": 2.036066261270707, + "grad_norm": 3.000828742980957, + "learning_rate": 0.0002, + "loss": 0.2926, + "step": 9710 + }, + { + "epoch": 2.0381631369259803, + "grad_norm": 0.08067448437213898, + "learning_rate": 0.0002, + "loss": 0.2551, + "step": 9720 + }, + { + "epoch": 2.040260012581254, + "grad_norm": 0.057211246341466904, + "learning_rate": 0.0002, + "loss": 0.2437, + "step": 9730 + }, + { + "epoch": 2.0423568882365277, + "grad_norm": 1.01911461353302, + "learning_rate": 0.0002, + "loss": 0.2426, + "step": 9740 + }, + { + "epoch": 2.0444537638918012, + "grad_norm": 0.10453460365533829, + "learning_rate": 0.0002, + "loss": 0.2917, + "step": 9750 + }, + { + "epoch": 2.0465506395470747, + "grad_norm": 0.17895422875881195, + "learning_rate": 0.0002, + "loss": 0.2462, + "step": 9760 + }, + { + "epoch": 2.0486475152023487, + "grad_norm": 0.10618945211172104, + "learning_rate": 0.0002, + "loss": 0.245, + "step": 9770 + }, + { + "epoch": 2.050744390857622, + "grad_norm": 0.10148117691278458, + "learning_rate": 0.0002, + "loss": 0.2517, + "step": 9780 + }, + { + "epoch": 2.0528412665128957, + "grad_norm": 0.11543615907430649, + "learning_rate": 0.0002, + "loss": 0.2431, + "step": 9790 + }, + { + "epoch": 2.0549381421681696, + "grad_norm": 0.05872245505452156, + "learning_rate": 0.0002, + "loss": 0.2701, + "step": 9800 + }, + { + "epoch": 2.057035017823443, + "grad_norm": 0.08900139480829239, + "learning_rate": 0.0002, + "loss": 0.2461, + "step": 9810 + }, + { + "epoch": 2.0591318934787166, + "grad_norm": 0.08121102303266525, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 9820 + }, + { + "epoch": 2.0612287691339906, + "grad_norm": 0.07904442399740219, + "learning_rate": 0.0002, + "loss": 0.2526, + "step": 9830 + }, + { + "epoch": 2.063325644789264, + "grad_norm": 8.165481567382812, + "learning_rate": 0.0002, + "loss": 0.2719, + "step": 9840 + }, + { + "epoch": 2.0654225204445376, + "grad_norm": 0.07967832684516907, + "learning_rate": 0.0002, + "loss": 0.2536, + "step": 9850 + }, + { + "epoch": 2.0675193960998115, + "grad_norm": 3.6327831745147705, + "learning_rate": 0.0002, + "loss": 0.2661, + "step": 9860 + }, + { + "epoch": 2.069616271755085, + "grad_norm": 0.0879238173365593, + "learning_rate": 0.0002, + "loss": 0.2562, + "step": 9870 + }, + { + "epoch": 2.0717131474103585, + "grad_norm": 0.11624127626419067, + "learning_rate": 0.0002, + "loss": 0.2569, + "step": 9880 + }, + { + "epoch": 2.073810023065632, + "grad_norm": 0.05972060561180115, + "learning_rate": 0.0002, + "loss": 0.2419, + "step": 9890 + }, + { + "epoch": 2.075906898720906, + "grad_norm": 0.09197453409433365, + "learning_rate": 0.0002, + "loss": 0.26, + "step": 9900 + }, + { + "epoch": 2.0780037743761794, + "grad_norm": 2.677229404449463, + "learning_rate": 0.0002, + "loss": 0.2719, + "step": 9910 + }, + { + "epoch": 2.080100650031453, + "grad_norm": 0.07371783256530762, + "learning_rate": 0.0002, + "loss": 0.2428, + "step": 9920 + }, + { + "epoch": 2.082197525686727, + "grad_norm": 0.08582104742527008, + "learning_rate": 0.0002, + "loss": 0.2652, + "step": 9930 + }, + { + "epoch": 2.0842944013420004, + "grad_norm": 0.08578525483608246, + "learning_rate": 0.0002, + "loss": 0.2734, + "step": 9940 + }, + { + "epoch": 2.086391276997274, + "grad_norm": 0.4039681851863861, + "learning_rate": 0.0002, + "loss": 0.3084, + "step": 9950 + }, + { + "epoch": 2.088488152652548, + "grad_norm": 0.33731213212013245, + "learning_rate": 0.0002, + "loss": 0.2818, + "step": 9960 + }, + { + "epoch": 2.0905850283078213, + "grad_norm": 0.13896989822387695, + "learning_rate": 0.0002, + "loss": 0.2617, + "step": 9970 + }, + { + "epoch": 2.092681903963095, + "grad_norm": 0.060974400490522385, + "learning_rate": 0.0002, + "loss": 0.2651, + "step": 9980 + }, + { + "epoch": 2.094778779618369, + "grad_norm": 0.06085608899593353, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 9990 + }, + { + "epoch": 2.0968756552736423, + "grad_norm": 0.07415081560611725, + "learning_rate": 0.0002, + "loss": 0.2908, + "step": 10000 + }, + { + "epoch": 2.098972530928916, + "grad_norm": 1.3254259824752808, + "learning_rate": 0.0002, + "loss": 0.2585, + "step": 10010 + }, + { + "epoch": 2.1010694065841897, + "grad_norm": 0.06385322660207748, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 10020 + }, + { + "epoch": 2.1031662822394632, + "grad_norm": 0.07257821410894394, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 10030 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.07844216376543045, + "learning_rate": 0.0002, + "loss": 0.2934, + "step": 10040 + }, + { + "epoch": 2.1073600335500107, + "grad_norm": 0.07658874243497849, + "learning_rate": 0.0002, + "loss": 0.3154, + "step": 10050 + }, + { + "epoch": 2.109456909205284, + "grad_norm": 0.07644514739513397, + "learning_rate": 0.0002, + "loss": 0.2876, + "step": 10060 + }, + { + "epoch": 2.1115537848605577, + "grad_norm": 0.06457582861185074, + "learning_rate": 0.0002, + "loss": 0.2656, + "step": 10070 + }, + { + "epoch": 2.1136506605158316, + "grad_norm": 3.1369924545288086, + "learning_rate": 0.0002, + "loss": 0.2947, + "step": 10080 + }, + { + "epoch": 2.115747536171105, + "grad_norm": 5.061370849609375, + "learning_rate": 0.0002, + "loss": 0.2607, + "step": 10090 + }, + { + "epoch": 2.1178444118263786, + "grad_norm": 0.0855194702744484, + "learning_rate": 0.0002, + "loss": 0.2476, + "step": 10100 + }, + { + "epoch": 2.1199412874816526, + "grad_norm": 0.8303119540214539, + "learning_rate": 0.0002, + "loss": 0.322, + "step": 10110 + }, + { + "epoch": 2.122038163136926, + "grad_norm": 0.05734449625015259, + "learning_rate": 0.0002, + "loss": 0.245, + "step": 10120 + }, + { + "epoch": 2.1241350387921996, + "grad_norm": 0.08011757582426071, + "learning_rate": 0.0002, + "loss": 0.2963, + "step": 10130 + }, + { + "epoch": 2.126231914447473, + "grad_norm": 0.09854753315448761, + "learning_rate": 0.0002, + "loss": 0.2799, + "step": 10140 + }, + { + "epoch": 2.128328790102747, + "grad_norm": 0.07053971290588379, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 10150 + }, + { + "epoch": 2.1304256657580205, + "grad_norm": 0.07749179005622864, + "learning_rate": 0.0002, + "loss": 0.2464, + "step": 10160 + }, + { + "epoch": 2.132522541413294, + "grad_norm": 0.09702768921852112, + "learning_rate": 0.0002, + "loss": 0.2631, + "step": 10170 + }, + { + "epoch": 2.134619417068568, + "grad_norm": 0.15351717174053192, + "learning_rate": 0.0002, + "loss": 0.2431, + "step": 10180 + }, + { + "epoch": 2.1367162927238414, + "grad_norm": 0.05788223445415497, + "learning_rate": 0.0002, + "loss": 0.2448, + "step": 10190 + }, + { + "epoch": 2.138813168379115, + "grad_norm": 0.052700772881507874, + "learning_rate": 0.0002, + "loss": 0.2615, + "step": 10200 + }, + { + "epoch": 2.140910044034389, + "grad_norm": 0.053113095462322235, + "learning_rate": 0.0002, + "loss": 0.244, + "step": 10210 + }, + { + "epoch": 2.1430069196896624, + "grad_norm": 3.7025339603424072, + "learning_rate": 0.0002, + "loss": 0.2842, + "step": 10220 + }, + { + "epoch": 2.145103795344936, + "grad_norm": 0.28933843970298767, + "learning_rate": 0.0002, + "loss": 0.2887, + "step": 10230 + }, + { + "epoch": 2.14720067100021, + "grad_norm": 3.0457377433776855, + "learning_rate": 0.0002, + "loss": 0.2622, + "step": 10240 + }, + { + "epoch": 2.1492975466554833, + "grad_norm": 0.3718264102935791, + "learning_rate": 0.0002, + "loss": 0.2572, + "step": 10250 + }, + { + "epoch": 2.151394422310757, + "grad_norm": 0.1321519911289215, + "learning_rate": 0.0002, + "loss": 0.2448, + "step": 10260 + }, + { + "epoch": 2.153491297966031, + "grad_norm": 0.1818964034318924, + "learning_rate": 0.0002, + "loss": 0.2604, + "step": 10270 + }, + { + "epoch": 2.1555881736213043, + "grad_norm": 4.632304668426514, + "learning_rate": 0.0002, + "loss": 0.269, + "step": 10280 + }, + { + "epoch": 2.157685049276578, + "grad_norm": 0.26138266921043396, + "learning_rate": 0.0002, + "loss": 0.2507, + "step": 10290 + }, + { + "epoch": 2.1597819249318517, + "grad_norm": 11.915481567382812, + "learning_rate": 0.0002, + "loss": 0.2785, + "step": 10300 + }, + { + "epoch": 2.1618788005871252, + "grad_norm": 0.5993645191192627, + "learning_rate": 0.0002, + "loss": 0.268, + "step": 10310 + }, + { + "epoch": 2.1639756762423987, + "grad_norm": 0.1072174459695816, + "learning_rate": 0.0002, + "loss": 0.2469, + "step": 10320 + }, + { + "epoch": 2.1660725518976727, + "grad_norm": 0.20691901445388794, + "learning_rate": 0.0002, + "loss": 0.2753, + "step": 10330 + }, + { + "epoch": 2.168169427552946, + "grad_norm": 0.5400320887565613, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 10340 + }, + { + "epoch": 2.1702663032082197, + "grad_norm": 4.566557884216309, + "learning_rate": 0.0002, + "loss": 0.3557, + "step": 10350 + }, + { + "epoch": 2.1723631788634936, + "grad_norm": 0.29634520411491394, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 10360 + }, + { + "epoch": 2.174460054518767, + "grad_norm": 0.07772216945886612, + "learning_rate": 0.0002, + "loss": 0.2554, + "step": 10370 + }, + { + "epoch": 2.1765569301740406, + "grad_norm": 0.897849440574646, + "learning_rate": 0.0002, + "loss": 0.2464, + "step": 10380 + }, + { + "epoch": 2.178653805829314, + "grad_norm": 2.097255229949951, + "learning_rate": 0.0002, + "loss": 0.2715, + "step": 10390 + }, + { + "epoch": 2.180750681484588, + "grad_norm": 3.3917410373687744, + "learning_rate": 0.0002, + "loss": 0.3093, + "step": 10400 + }, + { + "epoch": 2.1828475571398616, + "grad_norm": 0.07134232670068741, + "learning_rate": 0.0002, + "loss": 0.2477, + "step": 10410 + }, + { + "epoch": 2.184944432795135, + "grad_norm": 0.11076632142066956, + "learning_rate": 0.0002, + "loss": 0.3185, + "step": 10420 + }, + { + "epoch": 2.187041308450409, + "grad_norm": 0.08823218941688538, + "learning_rate": 0.0002, + "loss": 0.2458, + "step": 10430 + }, + { + "epoch": 2.1891381841056825, + "grad_norm": 0.10004838556051254, + "learning_rate": 0.0002, + "loss": 0.2452, + "step": 10440 + }, + { + "epoch": 2.191235059760956, + "grad_norm": 0.10475442558526993, + "learning_rate": 0.0002, + "loss": 0.3018, + "step": 10450 + }, + { + "epoch": 2.19333193541623, + "grad_norm": 0.14505670964717865, + "learning_rate": 0.0002, + "loss": 0.3324, + "step": 10460 + }, + { + "epoch": 2.1954288110715034, + "grad_norm": 0.11367640644311905, + "learning_rate": 0.0002, + "loss": 0.245, + "step": 10470 + }, + { + "epoch": 2.197525686726777, + "grad_norm": 0.08633550256490707, + "learning_rate": 0.0002, + "loss": 0.2464, + "step": 10480 + }, + { + "epoch": 2.199622562382051, + "grad_norm": 0.05888749659061432, + "learning_rate": 0.0002, + "loss": 0.2436, + "step": 10490 + }, + { + "epoch": 2.2017194380373244, + "grad_norm": 0.0671803280711174, + "learning_rate": 0.0002, + "loss": 0.2423, + "step": 10500 + }, + { + "epoch": 2.203816313692598, + "grad_norm": 0.10740163177251816, + "learning_rate": 0.0002, + "loss": 0.2593, + "step": 10510 + }, + { + "epoch": 2.205913189347872, + "grad_norm": 3.762645959854126, + "learning_rate": 0.0002, + "loss": 0.2459, + "step": 10520 + }, + { + "epoch": 2.2080100650031453, + "grad_norm": 0.05283233895897865, + "learning_rate": 0.0002, + "loss": 0.2686, + "step": 10530 + }, + { + "epoch": 2.210106940658419, + "grad_norm": 4.323790550231934, + "learning_rate": 0.0002, + "loss": 0.2576, + "step": 10540 + }, + { + "epoch": 2.212203816313693, + "grad_norm": 0.412309467792511, + "learning_rate": 0.0002, + "loss": 0.2884, + "step": 10550 + }, + { + "epoch": 2.2143006919689663, + "grad_norm": 0.07542133331298828, + "learning_rate": 0.0002, + "loss": 0.2772, + "step": 10560 + }, + { + "epoch": 2.21639756762424, + "grad_norm": 2.28047776222229, + "learning_rate": 0.0002, + "loss": 0.3163, + "step": 10570 + }, + { + "epoch": 2.2184944432795137, + "grad_norm": 0.09531054645776749, + "learning_rate": 0.0002, + "loss": 0.2428, + "step": 10580 + }, + { + "epoch": 2.2205913189347872, + "grad_norm": 0.08148845285177231, + "learning_rate": 0.0002, + "loss": 0.2787, + "step": 10590 + }, + { + "epoch": 2.2226881945900607, + "grad_norm": 3.0185651779174805, + "learning_rate": 0.0002, + "loss": 0.3192, + "step": 10600 + }, + { + "epoch": 2.2247850702453347, + "grad_norm": 3.6113274097442627, + "learning_rate": 0.0002, + "loss": 0.3345, + "step": 10610 + }, + { + "epoch": 2.226881945900608, + "grad_norm": 2.7463948726654053, + "learning_rate": 0.0002, + "loss": 0.3242, + "step": 10620 + }, + { + "epoch": 2.2289788215558817, + "grad_norm": 0.14245733618736267, + "learning_rate": 0.0002, + "loss": 0.268, + "step": 10630 + }, + { + "epoch": 2.231075697211155, + "grad_norm": 0.07894956320524216, + "learning_rate": 0.0002, + "loss": 0.2615, + "step": 10640 + }, + { + "epoch": 2.233172572866429, + "grad_norm": 4.841274261474609, + "learning_rate": 0.0002, + "loss": 0.271, + "step": 10650 + }, + { + "epoch": 2.2352694485217026, + "grad_norm": 0.27884334325790405, + "learning_rate": 0.0002, + "loss": 0.2677, + "step": 10660 + }, + { + "epoch": 2.237366324176976, + "grad_norm": 4.750665664672852, + "learning_rate": 0.0002, + "loss": 0.3232, + "step": 10670 + }, + { + "epoch": 2.23946319983225, + "grad_norm": 0.05985650420188904, + "learning_rate": 0.0002, + "loss": 0.27, + "step": 10680 + }, + { + "epoch": 2.2415600754875236, + "grad_norm": 5.086830139160156, + "learning_rate": 0.0002, + "loss": 0.3163, + "step": 10690 + }, + { + "epoch": 2.243656951142797, + "grad_norm": 0.7766889929771423, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 10700 + }, + { + "epoch": 2.245753826798071, + "grad_norm": 0.10954524576663971, + "learning_rate": 0.0002, + "loss": 0.2748, + "step": 10710 + }, + { + "epoch": 2.2478507024533445, + "grad_norm": 0.09740043431520462, + "learning_rate": 0.0002, + "loss": 0.2736, + "step": 10720 + }, + { + "epoch": 2.249947578108618, + "grad_norm": 0.09562437236309052, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 10730 + }, + { + "epoch": 2.252044453763892, + "grad_norm": 0.09752257168292999, + "learning_rate": 0.0002, + "loss": 0.2735, + "step": 10740 + }, + { + "epoch": 2.2541413294191655, + "grad_norm": 0.09222041815519333, + "learning_rate": 0.0002, + "loss": 0.2467, + "step": 10750 + }, + { + "epoch": 2.256238205074439, + "grad_norm": 2.323028802871704, + "learning_rate": 0.0002, + "loss": 0.2676, + "step": 10760 + }, + { + "epoch": 2.258335080729713, + "grad_norm": 0.05767745524644852, + "learning_rate": 0.0002, + "loss": 0.2518, + "step": 10770 + }, + { + "epoch": 2.2604319563849864, + "grad_norm": 0.060549937188625336, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 10780 + }, + { + "epoch": 2.26252883204026, + "grad_norm": 0.06989111751317978, + "learning_rate": 0.0002, + "loss": 0.2623, + "step": 10790 + }, + { + "epoch": 2.264625707695534, + "grad_norm": 1.974779725074768, + "learning_rate": 0.0002, + "loss": 0.2831, + "step": 10800 + }, + { + "epoch": 2.2667225833508073, + "grad_norm": 0.13178583979606628, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 10810 + }, + { + "epoch": 2.268819459006081, + "grad_norm": 5.346510410308838, + "learning_rate": 0.0002, + "loss": 0.3102, + "step": 10820 + }, + { + "epoch": 2.2709163346613543, + "grad_norm": 0.09758991748094559, + "learning_rate": 0.0002, + "loss": 0.2635, + "step": 10830 + }, + { + "epoch": 2.2730132103166283, + "grad_norm": 0.11416572332382202, + "learning_rate": 0.0002, + "loss": 0.2448, + "step": 10840 + }, + { + "epoch": 2.275110085971902, + "grad_norm": 0.09764792770147324, + "learning_rate": 0.0002, + "loss": 0.3057, + "step": 10850 + }, + { + "epoch": 2.2772069616271757, + "grad_norm": 0.07572513073682785, + "learning_rate": 0.0002, + "loss": 0.2745, + "step": 10860 + }, + { + "epoch": 2.2793038372824492, + "grad_norm": 0.08429969102144241, + "learning_rate": 0.0002, + "loss": 0.3372, + "step": 10870 + }, + { + "epoch": 2.2814007129377227, + "grad_norm": 0.09187424182891846, + "learning_rate": 0.0002, + "loss": 0.2887, + "step": 10880 + }, + { + "epoch": 2.2834975885929962, + "grad_norm": 0.10824614018201828, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 10890 + }, + { + "epoch": 2.28559446424827, + "grad_norm": 0.06268329918384552, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 10900 + }, + { + "epoch": 2.2876913399035437, + "grad_norm": 2.6071887016296387, + "learning_rate": 0.0002, + "loss": 0.2885, + "step": 10910 + }, + { + "epoch": 2.289788215558817, + "grad_norm": 0.33325842022895813, + "learning_rate": 0.0002, + "loss": 0.3228, + "step": 10920 + }, + { + "epoch": 2.291885091214091, + "grad_norm": 0.10186664015054703, + "learning_rate": 0.0002, + "loss": 0.2773, + "step": 10930 + }, + { + "epoch": 2.2939819668693646, + "grad_norm": 0.1469857096672058, + "learning_rate": 0.0002, + "loss": 0.2651, + "step": 10940 + }, + { + "epoch": 2.296078842524638, + "grad_norm": 0.23534300923347473, + "learning_rate": 0.0002, + "loss": 0.2654, + "step": 10950 + }, + { + "epoch": 2.298175718179912, + "grad_norm": 0.09361186623573303, + "learning_rate": 0.0002, + "loss": 0.2612, + "step": 10960 + }, + { + "epoch": 2.3002725938351856, + "grad_norm": 3.4896860122680664, + "learning_rate": 0.0002, + "loss": 0.2827, + "step": 10970 + }, + { + "epoch": 2.302369469490459, + "grad_norm": 0.07228028774261475, + "learning_rate": 0.0002, + "loss": 0.2664, + "step": 10980 + }, + { + "epoch": 2.304466345145733, + "grad_norm": 0.10563217848539352, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 10990 + }, + { + "epoch": 2.3065632208010065, + "grad_norm": 0.10082051157951355, + "learning_rate": 0.0002, + "loss": 0.2989, + "step": 11000 + }, + { + "epoch": 2.30866009645628, + "grad_norm": 0.09514462947845459, + "learning_rate": 0.0002, + "loss": 0.2583, + "step": 11010 + }, + { + "epoch": 2.310756972111554, + "grad_norm": 0.05637963116168976, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 11020 + }, + { + "epoch": 2.3128538477668275, + "grad_norm": 0.0820513516664505, + "learning_rate": 0.0002, + "loss": 0.2448, + "step": 11030 + }, + { + "epoch": 2.314950723422101, + "grad_norm": 0.49234119057655334, + "learning_rate": 0.0002, + "loss": 0.2607, + "step": 11040 + }, + { + "epoch": 2.317047599077375, + "grad_norm": 0.056676942855119705, + "learning_rate": 0.0002, + "loss": 0.2876, + "step": 11050 + }, + { + "epoch": 2.3191444747326484, + "grad_norm": 0.17538276314735413, + "learning_rate": 0.0002, + "loss": 0.256, + "step": 11060 + }, + { + "epoch": 2.321241350387922, + "grad_norm": 1.0409997701644897, + "learning_rate": 0.0002, + "loss": 0.2467, + "step": 11070 + }, + { + "epoch": 2.3233382260431954, + "grad_norm": 3.7242491245269775, + "learning_rate": 0.0002, + "loss": 0.2733, + "step": 11080 + }, + { + "epoch": 2.3254351016984693, + "grad_norm": 0.08921827375888824, + "learning_rate": 0.0002, + "loss": 0.2972, + "step": 11090 + }, + { + "epoch": 2.327531977353743, + "grad_norm": 0.083032988011837, + "learning_rate": 0.0002, + "loss": 0.3251, + "step": 11100 + }, + { + "epoch": 2.329628853009017, + "grad_norm": 0.10303895175457001, + "learning_rate": 0.0002, + "loss": 0.2617, + "step": 11110 + }, + { + "epoch": 2.3317257286642903, + "grad_norm": 0.1020461916923523, + "learning_rate": 0.0002, + "loss": 0.2736, + "step": 11120 + }, + { + "epoch": 2.333822604319564, + "grad_norm": 0.09795411676168442, + "learning_rate": 0.0002, + "loss": 0.2948, + "step": 11130 + }, + { + "epoch": 2.3359194799748373, + "grad_norm": 0.05664985999464989, + "learning_rate": 0.0002, + "loss": 0.2932, + "step": 11140 + }, + { + "epoch": 2.3380163556301112, + "grad_norm": 0.06109614670276642, + "learning_rate": 0.0002, + "loss": 0.3056, + "step": 11150 + }, + { + "epoch": 2.3401132312853847, + "grad_norm": 0.0780511125922203, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 11160 + }, + { + "epoch": 2.3422101069406582, + "grad_norm": 0.06719834357500076, + "learning_rate": 0.0002, + "loss": 0.2926, + "step": 11170 + }, + { + "epoch": 2.344306982595932, + "grad_norm": 0.11557465046644211, + "learning_rate": 0.0002, + "loss": 0.2482, + "step": 11180 + }, + { + "epoch": 2.3464038582512057, + "grad_norm": 0.06901777535676956, + "learning_rate": 0.0002, + "loss": 0.2958, + "step": 11190 + }, + { + "epoch": 2.348500733906479, + "grad_norm": 0.13302722573280334, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 11200 + }, + { + "epoch": 2.350597609561753, + "grad_norm": 0.060133375227451324, + "learning_rate": 0.0002, + "loss": 0.2639, + "step": 11210 + }, + { + "epoch": 2.3526944852170266, + "grad_norm": 0.062260959297418594, + "learning_rate": 0.0002, + "loss": 0.2538, + "step": 11220 + }, + { + "epoch": 2.3547913608723, + "grad_norm": 3.3237318992614746, + "learning_rate": 0.0002, + "loss": 0.2868, + "step": 11230 + }, + { + "epoch": 2.356888236527574, + "grad_norm": 0.08286066353321075, + "learning_rate": 0.0002, + "loss": 0.2731, + "step": 11240 + }, + { + "epoch": 2.3589851121828476, + "grad_norm": 0.07192427664995193, + "learning_rate": 0.0002, + "loss": 0.2618, + "step": 11250 + }, + { + "epoch": 2.361081987838121, + "grad_norm": 0.09143181145191193, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 11260 + }, + { + "epoch": 2.363178863493395, + "grad_norm": 0.09221427142620087, + "learning_rate": 0.0002, + "loss": 0.2411, + "step": 11270 + }, + { + "epoch": 2.3652757391486685, + "grad_norm": 0.06351318210363388, + "learning_rate": 0.0002, + "loss": 0.2427, + "step": 11280 + }, + { + "epoch": 2.367372614803942, + "grad_norm": 0.09457197785377502, + "learning_rate": 0.0002, + "loss": 0.244, + "step": 11290 + }, + { + "epoch": 2.369469490459216, + "grad_norm": 0.05240204185247421, + "learning_rate": 0.0002, + "loss": 0.2893, + "step": 11300 + }, + { + "epoch": 2.3715663661144895, + "grad_norm": 0.08396653831005096, + "learning_rate": 0.0002, + "loss": 0.2583, + "step": 11310 + }, + { + "epoch": 2.373663241769763, + "grad_norm": 0.17449672520160675, + "learning_rate": 0.0002, + "loss": 0.3056, + "step": 11320 + }, + { + "epoch": 2.3757601174250365, + "grad_norm": 0.07481683790683746, + "learning_rate": 0.0002, + "loss": 0.2709, + "step": 11330 + }, + { + "epoch": 2.3778569930803104, + "grad_norm": 0.07266558706760406, + "learning_rate": 0.0002, + "loss": 0.2439, + "step": 11340 + }, + { + "epoch": 2.379953868735584, + "grad_norm": 0.059189122170209885, + "learning_rate": 0.0002, + "loss": 0.2421, + "step": 11350 + }, + { + "epoch": 2.382050744390858, + "grad_norm": 0.08741532266139984, + "learning_rate": 0.0002, + "loss": 0.2607, + "step": 11360 + }, + { + "epoch": 2.3841476200461313, + "grad_norm": 0.14015233516693115, + "learning_rate": 0.0002, + "loss": 0.3175, + "step": 11370 + }, + { + "epoch": 2.386244495701405, + "grad_norm": 10.488336563110352, + "learning_rate": 0.0002, + "loss": 0.2651, + "step": 11380 + }, + { + "epoch": 2.3883413713566783, + "grad_norm": 0.06913798302412033, + "learning_rate": 0.0002, + "loss": 0.2583, + "step": 11390 + }, + { + "epoch": 2.3904382470119523, + "grad_norm": 0.29002469778060913, + "learning_rate": 0.0002, + "loss": 0.256, + "step": 11400 + }, + { + "epoch": 2.392535122667226, + "grad_norm": 0.11059385538101196, + "learning_rate": 0.0002, + "loss": 0.2908, + "step": 11410 + }, + { + "epoch": 2.3946319983224993, + "grad_norm": 0.08097716420888901, + "learning_rate": 0.0002, + "loss": 0.2495, + "step": 11420 + }, + { + "epoch": 2.3967288739777732, + "grad_norm": 0.09082842618227005, + "learning_rate": 0.0002, + "loss": 0.3123, + "step": 11430 + }, + { + "epoch": 2.3988257496330467, + "grad_norm": 4.789621353149414, + "learning_rate": 0.0002, + "loss": 0.3085, + "step": 11440 + }, + { + "epoch": 2.4009226252883202, + "grad_norm": 0.05977439135313034, + "learning_rate": 0.0002, + "loss": 0.2743, + "step": 11450 + }, + { + "epoch": 2.403019500943594, + "grad_norm": 3.4836881160736084, + "learning_rate": 0.0002, + "loss": 0.2806, + "step": 11460 + }, + { + "epoch": 2.4051163765988677, + "grad_norm": 0.07663311809301376, + "learning_rate": 0.0002, + "loss": 0.2564, + "step": 11470 + }, + { + "epoch": 2.407213252254141, + "grad_norm": 0.053984083235263824, + "learning_rate": 0.0002, + "loss": 0.2439, + "step": 11480 + }, + { + "epoch": 2.409310127909415, + "grad_norm": 0.059521205723285675, + "learning_rate": 0.0002, + "loss": 0.2712, + "step": 11490 + }, + { + "epoch": 2.4114070035646886, + "grad_norm": 2.755619764328003, + "learning_rate": 0.0002, + "loss": 0.3518, + "step": 11500 + }, + { + "epoch": 2.413503879219962, + "grad_norm": 0.06272760033607483, + "learning_rate": 0.0002, + "loss": 0.2542, + "step": 11510 + }, + { + "epoch": 2.415600754875236, + "grad_norm": 0.06255768239498138, + "learning_rate": 0.0002, + "loss": 0.2765, + "step": 11520 + }, + { + "epoch": 2.4176976305305096, + "grad_norm": 0.0611007884144783, + "learning_rate": 0.0002, + "loss": 0.2723, + "step": 11530 + }, + { + "epoch": 2.419794506185783, + "grad_norm": 0.0764545425772667, + "learning_rate": 0.0002, + "loss": 0.2426, + "step": 11540 + }, + { + "epoch": 2.421891381841057, + "grad_norm": 0.0603049099445343, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 11550 + }, + { + "epoch": 2.4239882574963305, + "grad_norm": 0.05984706059098244, + "learning_rate": 0.0002, + "loss": 0.2613, + "step": 11560 + }, + { + "epoch": 2.426085133151604, + "grad_norm": 0.09332500398159027, + "learning_rate": 0.0002, + "loss": 0.2595, + "step": 11570 + }, + { + "epoch": 2.4281820088068775, + "grad_norm": 2.8640525341033936, + "learning_rate": 0.0002, + "loss": 0.2837, + "step": 11580 + }, + { + "epoch": 2.4302788844621515, + "grad_norm": 0.06720197945833206, + "learning_rate": 0.0002, + "loss": 0.264, + "step": 11590 + }, + { + "epoch": 2.432375760117425, + "grad_norm": 0.053041424602270126, + "learning_rate": 0.0002, + "loss": 0.2623, + "step": 11600 + }, + { + "epoch": 2.434472635772699, + "grad_norm": 0.06075757369399071, + "learning_rate": 0.0002, + "loss": 0.3068, + "step": 11610 + }, + { + "epoch": 2.4365695114279724, + "grad_norm": 0.09649069607257843, + "learning_rate": 0.0002, + "loss": 0.2651, + "step": 11620 + }, + { + "epoch": 2.438666387083246, + "grad_norm": 0.10784143209457397, + "learning_rate": 0.0002, + "loss": 0.2532, + "step": 11630 + }, + { + "epoch": 2.4407632627385194, + "grad_norm": 2.704793930053711, + "learning_rate": 0.0002, + "loss": 0.2717, + "step": 11640 + }, + { + "epoch": 2.4428601383937933, + "grad_norm": 0.09455972164869308, + "learning_rate": 0.0002, + "loss": 0.2796, + "step": 11650 + }, + { + "epoch": 2.444957014049067, + "grad_norm": 0.05830010399222374, + "learning_rate": 0.0002, + "loss": 0.2727, + "step": 11660 + }, + { + "epoch": 2.4470538897043403, + "grad_norm": 3.8376691341400146, + "learning_rate": 0.0002, + "loss": 0.2645, + "step": 11670 + }, + { + "epoch": 2.4491507653596143, + "grad_norm": 0.07729987800121307, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 11680 + }, + { + "epoch": 2.451247641014888, + "grad_norm": 0.07105562835931778, + "learning_rate": 0.0002, + "loss": 0.2436, + "step": 11690 + }, + { + "epoch": 2.4533445166701613, + "grad_norm": 0.07121610641479492, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 11700 + }, + { + "epoch": 2.4554413923254352, + "grad_norm": 0.07170971482992172, + "learning_rate": 0.0002, + "loss": 0.2578, + "step": 11710 + }, + { + "epoch": 2.4575382679807087, + "grad_norm": 0.047798383980989456, + "learning_rate": 0.0002, + "loss": 0.2844, + "step": 11720 + }, + { + "epoch": 2.4596351436359822, + "grad_norm": 0.07498259842395782, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 11730 + }, + { + "epoch": 2.461732019291256, + "grad_norm": 0.08825545012950897, + "learning_rate": 0.0002, + "loss": 0.2914, + "step": 11740 + }, + { + "epoch": 2.4638288949465297, + "grad_norm": 0.09498555958271027, + "learning_rate": 0.0002, + "loss": 0.2938, + "step": 11750 + }, + { + "epoch": 2.465925770601803, + "grad_norm": 0.13061000406742096, + "learning_rate": 0.0002, + "loss": 0.247, + "step": 11760 + }, + { + "epoch": 2.468022646257077, + "grad_norm": 0.11089792102575302, + "learning_rate": 0.0002, + "loss": 0.2754, + "step": 11770 + }, + { + "epoch": 2.4701195219123506, + "grad_norm": 0.15551145374774933, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 11780 + }, + { + "epoch": 2.472216397567624, + "grad_norm": 0.06919834762811661, + "learning_rate": 0.0002, + "loss": 0.264, + "step": 11790 + }, + { + "epoch": 2.474313273222898, + "grad_norm": 0.09006861597299576, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 11800 + }, + { + "epoch": 2.4764101488781716, + "grad_norm": 0.05835675820708275, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 11810 + }, + { + "epoch": 2.478507024533445, + "grad_norm": 0.061936892569065094, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 11820 + }, + { + "epoch": 2.4806039001887186, + "grad_norm": 0.07322103530168533, + "learning_rate": 0.0002, + "loss": 0.2708, + "step": 11830 + }, + { + "epoch": 2.4827007758439925, + "grad_norm": 0.07008501887321472, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 11840 + }, + { + "epoch": 2.484797651499266, + "grad_norm": 0.042795274406671524, + "learning_rate": 0.0002, + "loss": 0.243, + "step": 11850 + }, + { + "epoch": 2.48689452715454, + "grad_norm": 0.0587536059319973, + "learning_rate": 0.0002, + "loss": 0.2427, + "step": 11860 + }, + { + "epoch": 2.4889914028098135, + "grad_norm": 2.3064823150634766, + "learning_rate": 0.0002, + "loss": 0.2805, + "step": 11870 + }, + { + "epoch": 2.491088278465087, + "grad_norm": 0.07847301661968231, + "learning_rate": 0.0002, + "loss": 0.2439, + "step": 11880 + }, + { + "epoch": 2.4931851541203605, + "grad_norm": 0.05622512847185135, + "learning_rate": 0.0002, + "loss": 0.2738, + "step": 11890 + }, + { + "epoch": 2.4952820297756344, + "grad_norm": 4.064495086669922, + "learning_rate": 0.0002, + "loss": 0.346, + "step": 11900 + }, + { + "epoch": 2.497378905430908, + "grad_norm": 0.09369252622127533, + "learning_rate": 0.0002, + "loss": 0.2805, + "step": 11910 + }, + { + "epoch": 2.4994757810861814, + "grad_norm": 0.313037633895874, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 11920 + }, + { + "epoch": 2.5015726567414553, + "grad_norm": 0.8791367411613464, + "learning_rate": 0.0002, + "loss": 0.2752, + "step": 11930 + }, + { + "epoch": 2.503669532396729, + "grad_norm": 0.10336744785308838, + "learning_rate": 0.0002, + "loss": 0.3267, + "step": 11940 + }, + { + "epoch": 2.5057664080520023, + "grad_norm": 0.31161123514175415, + "learning_rate": 0.0002, + "loss": 0.2941, + "step": 11950 + }, + { + "epoch": 2.5078632837072763, + "grad_norm": 1.464455246925354, + "learning_rate": 0.0002, + "loss": 0.3281, + "step": 11960 + }, + { + "epoch": 2.50996015936255, + "grad_norm": 6.812768936157227, + "learning_rate": 0.0002, + "loss": 0.2481, + "step": 11970 + }, + { + "epoch": 2.5120570350178233, + "grad_norm": 0.17166270315647125, + "learning_rate": 0.0002, + "loss": 0.2806, + "step": 11980 + }, + { + "epoch": 2.5141539106730972, + "grad_norm": 0.09918298572301865, + "learning_rate": 0.0002, + "loss": 0.2464, + "step": 11990 + }, + { + "epoch": 2.5162507863283707, + "grad_norm": 0.21109609305858612, + "learning_rate": 0.0002, + "loss": 0.2637, + "step": 12000 + }, + { + "epoch": 2.5183476619836442, + "grad_norm": 0.16542023420333862, + "learning_rate": 0.0002, + "loss": 0.2467, + "step": 12010 + }, + { + "epoch": 2.5204445376389177, + "grad_norm": 0.10262157768011093, + "learning_rate": 0.0002, + "loss": 0.2483, + "step": 12020 + }, + { + "epoch": 2.5225414132941917, + "grad_norm": 0.08214957267045975, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 12030 + }, + { + "epoch": 2.524638288949465, + "grad_norm": 0.07718580961227417, + "learning_rate": 0.0002, + "loss": 0.2776, + "step": 12040 + }, + { + "epoch": 2.526735164604739, + "grad_norm": 0.09089051187038422, + "learning_rate": 0.0002, + "loss": 0.2864, + "step": 12050 + }, + { + "epoch": 2.5288320402600126, + "grad_norm": 0.08136362582445145, + "learning_rate": 0.0002, + "loss": 0.2663, + "step": 12060 + }, + { + "epoch": 2.530928915915286, + "grad_norm": 0.10573694854974747, + "learning_rate": 0.0002, + "loss": 0.2758, + "step": 12070 + }, + { + "epoch": 2.5330257915705596, + "grad_norm": 0.12686964869499207, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 12080 + }, + { + "epoch": 2.5351226672258336, + "grad_norm": 0.1271832436323166, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 12090 + }, + { + "epoch": 2.537219542881107, + "grad_norm": 0.06386271864175797, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 12100 + }, + { + "epoch": 2.539316418536381, + "grad_norm": 0.07435470819473267, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 12110 + }, + { + "epoch": 2.5414132941916545, + "grad_norm": 0.06934060901403427, + "learning_rate": 0.0002, + "loss": 0.262, + "step": 12120 + }, + { + "epoch": 2.543510169846928, + "grad_norm": 0.08146541565656662, + "learning_rate": 0.0002, + "loss": 0.2833, + "step": 12130 + }, + { + "epoch": 2.5456070455022015, + "grad_norm": 0.06254391372203827, + "learning_rate": 0.0002, + "loss": 0.2428, + "step": 12140 + }, + { + "epoch": 2.5477039211574755, + "grad_norm": 0.057234663516283035, + "learning_rate": 0.0002, + "loss": 0.306, + "step": 12150 + }, + { + "epoch": 2.549800796812749, + "grad_norm": 0.08865554630756378, + "learning_rate": 0.0002, + "loss": 0.2638, + "step": 12160 + }, + { + "epoch": 2.551897672468023, + "grad_norm": 0.076085664331913, + "learning_rate": 0.0002, + "loss": 0.2732, + "step": 12170 + }, + { + "epoch": 2.5539945481232964, + "grad_norm": 0.06641250103712082, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 12180 + }, + { + "epoch": 2.55609142377857, + "grad_norm": 0.5051395297050476, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 12190 + }, + { + "epoch": 2.5581882994338434, + "grad_norm": 0.056382421404123306, + "learning_rate": 0.0002, + "loss": 0.2768, + "step": 12200 + }, + { + "epoch": 2.5602851750891173, + "grad_norm": 3.523155689239502, + "learning_rate": 0.0002, + "loss": 0.3345, + "step": 12210 + }, + { + "epoch": 2.562382050744391, + "grad_norm": 0.06870336085557938, + "learning_rate": 0.0002, + "loss": 0.2456, + "step": 12220 + }, + { + "epoch": 2.5644789263996643, + "grad_norm": 5.934406757354736, + "learning_rate": 0.0002, + "loss": 0.2691, + "step": 12230 + }, + { + "epoch": 2.5665758020549383, + "grad_norm": 0.09467433393001556, + "learning_rate": 0.0002, + "loss": 0.2741, + "step": 12240 + }, + { + "epoch": 2.568672677710212, + "grad_norm": 1.3006917238235474, + "learning_rate": 0.0002, + "loss": 0.2645, + "step": 12250 + }, + { + "epoch": 2.5707695533654853, + "grad_norm": 0.0707492008805275, + "learning_rate": 0.0002, + "loss": 0.2871, + "step": 12260 + }, + { + "epoch": 2.572866429020759, + "grad_norm": 0.05547978729009628, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 12270 + }, + { + "epoch": 2.5749633046760327, + "grad_norm": 0.060534629970788956, + "learning_rate": 0.0002, + "loss": 0.3156, + "step": 12280 + }, + { + "epoch": 2.5770601803313062, + "grad_norm": 0.7947044372558594, + "learning_rate": 0.0002, + "loss": 0.2579, + "step": 12290 + }, + { + "epoch": 2.57915705598658, + "grad_norm": 0.11533192545175552, + "learning_rate": 0.0002, + "loss": 0.2643, + "step": 12300 + }, + { + "epoch": 2.5812539316418537, + "grad_norm": 0.10663612186908722, + "learning_rate": 0.0002, + "loss": 0.2523, + "step": 12310 + }, + { + "epoch": 2.583350807297127, + "grad_norm": 0.0963052287697792, + "learning_rate": 0.0002, + "loss": 0.2856, + "step": 12320 + }, + { + "epoch": 2.5854476829524007, + "grad_norm": 0.08930831402540207, + "learning_rate": 0.0002, + "loss": 0.2777, + "step": 12330 + }, + { + "epoch": 2.5875445586076746, + "grad_norm": 0.10064201056957245, + "learning_rate": 0.0002, + "loss": 0.3087, + "step": 12340 + }, + { + "epoch": 2.589641434262948, + "grad_norm": 0.06038425862789154, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 12350 + }, + { + "epoch": 2.591738309918222, + "grad_norm": 0.07416463643312454, + "learning_rate": 0.0002, + "loss": 0.2586, + "step": 12360 + }, + { + "epoch": 2.5938351855734956, + "grad_norm": 2.1738131046295166, + "learning_rate": 0.0002, + "loss": 0.2933, + "step": 12370 + }, + { + "epoch": 2.595932061228769, + "grad_norm": 0.06500290334224701, + "learning_rate": 0.0002, + "loss": 0.2422, + "step": 12380 + }, + { + "epoch": 2.5980289368840426, + "grad_norm": 0.06393703818321228, + "learning_rate": 0.0002, + "loss": 0.2721, + "step": 12390 + }, + { + "epoch": 2.6001258125393165, + "grad_norm": 0.06458470970392227, + "learning_rate": 0.0002, + "loss": 0.2637, + "step": 12400 + }, + { + "epoch": 2.60222268819459, + "grad_norm": 0.08135922253131866, + "learning_rate": 0.0002, + "loss": 0.263, + "step": 12410 + }, + { + "epoch": 2.604319563849864, + "grad_norm": 0.052150268107652664, + "learning_rate": 0.0002, + "loss": 0.2408, + "step": 12420 + }, + { + "epoch": 2.6064164395051375, + "grad_norm": 0.08048977702856064, + "learning_rate": 0.0002, + "loss": 0.2547, + "step": 12430 + }, + { + "epoch": 2.608513315160411, + "grad_norm": 9.111763000488281, + "learning_rate": 0.0002, + "loss": 0.3519, + "step": 12440 + }, + { + "epoch": 2.6106101908156845, + "grad_norm": 0.1035270020365715, + "learning_rate": 0.0002, + "loss": 0.272, + "step": 12450 + }, + { + "epoch": 2.6127070664709584, + "grad_norm": 3.41133189201355, + "learning_rate": 0.0002, + "loss": 0.3269, + "step": 12460 + }, + { + "epoch": 2.614803942126232, + "grad_norm": 0.12809054553508759, + "learning_rate": 0.0002, + "loss": 0.2616, + "step": 12470 + }, + { + "epoch": 2.6169008177815054, + "grad_norm": 0.17681822180747986, + "learning_rate": 0.0002, + "loss": 0.2686, + "step": 12480 + }, + { + "epoch": 2.6189976934367794, + "grad_norm": 0.10416349768638611, + "learning_rate": 0.0002, + "loss": 0.305, + "step": 12490 + }, + { + "epoch": 2.621094569092053, + "grad_norm": 2.7841596603393555, + "learning_rate": 0.0002, + "loss": 0.2643, + "step": 12500 + }, + { + "epoch": 2.6231914447473264, + "grad_norm": 0.08688453584909439, + "learning_rate": 0.0002, + "loss": 0.2711, + "step": 12510 + }, + { + "epoch": 2.6252883204026, + "grad_norm": 1.3929383754730225, + "learning_rate": 0.0002, + "loss": 0.2929, + "step": 12520 + }, + { + "epoch": 2.627385196057874, + "grad_norm": 0.6145888566970825, + "learning_rate": 0.0002, + "loss": 0.2848, + "step": 12530 + }, + { + "epoch": 2.6294820717131473, + "grad_norm": 0.0767504870891571, + "learning_rate": 0.0002, + "loss": 0.2468, + "step": 12540 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.06284788995981216, + "learning_rate": 0.0002, + "loss": 0.3128, + "step": 12550 + }, + { + "epoch": 2.6336758230236947, + "grad_norm": 1.2423477172851562, + "learning_rate": 0.0002, + "loss": 0.27, + "step": 12560 + }, + { + "epoch": 2.6357726986789682, + "grad_norm": 0.07569570094347, + "learning_rate": 0.0002, + "loss": 0.2664, + "step": 12570 + }, + { + "epoch": 2.6378695743342417, + "grad_norm": 0.07165157049894333, + "learning_rate": 0.0002, + "loss": 0.2519, + "step": 12580 + }, + { + "epoch": 2.6399664499895157, + "grad_norm": 5.022453308105469, + "learning_rate": 0.0002, + "loss": 0.2629, + "step": 12590 + }, + { + "epoch": 2.642063325644789, + "grad_norm": 7.140848159790039, + "learning_rate": 0.0002, + "loss": 0.3051, + "step": 12600 + }, + { + "epoch": 2.644160201300063, + "grad_norm": 2.355073928833008, + "learning_rate": 0.0002, + "loss": 0.2495, + "step": 12610 + }, + { + "epoch": 2.6462570769553366, + "grad_norm": 0.0591590479016304, + "learning_rate": 0.0002, + "loss": 0.3141, + "step": 12620 + }, + { + "epoch": 2.64835395261061, + "grad_norm": 0.10556333512067795, + "learning_rate": 0.0002, + "loss": 0.2489, + "step": 12630 + }, + { + "epoch": 2.6504508282658836, + "grad_norm": 0.09041710197925568, + "learning_rate": 0.0002, + "loss": 0.2462, + "step": 12640 + }, + { + "epoch": 2.6525477039211576, + "grad_norm": 5.497735977172852, + "learning_rate": 0.0002, + "loss": 0.295, + "step": 12650 + }, + { + "epoch": 2.654644579576431, + "grad_norm": 0.15341436862945557, + "learning_rate": 0.0002, + "loss": 0.2849, + "step": 12660 + }, + { + "epoch": 2.656741455231705, + "grad_norm": 0.15221437811851501, + "learning_rate": 0.0002, + "loss": 0.2732, + "step": 12670 + }, + { + "epoch": 2.6588383308869785, + "grad_norm": 0.06831462681293488, + "learning_rate": 0.0002, + "loss": 0.2436, + "step": 12680 + }, + { + "epoch": 2.660935206542252, + "grad_norm": 1.100542426109314, + "learning_rate": 0.0002, + "loss": 0.2926, + "step": 12690 + }, + { + "epoch": 2.6630320821975255, + "grad_norm": 0.08846069127321243, + "learning_rate": 0.0002, + "loss": 0.264, + "step": 12700 + }, + { + "epoch": 2.6651289578527995, + "grad_norm": 0.06772548705339432, + "learning_rate": 0.0002, + "loss": 0.3007, + "step": 12710 + }, + { + "epoch": 2.667225833508073, + "grad_norm": 0.06996563822031021, + "learning_rate": 0.0002, + "loss": 0.2703, + "step": 12720 + }, + { + "epoch": 2.6693227091633465, + "grad_norm": 0.1149798259139061, + "learning_rate": 0.0002, + "loss": 0.2862, + "step": 12730 + }, + { + "epoch": 2.6714195848186204, + "grad_norm": 0.08013114333152771, + "learning_rate": 0.0002, + "loss": 0.2716, + "step": 12740 + }, + { + "epoch": 2.673516460473894, + "grad_norm": 0.0764973983168602, + "learning_rate": 0.0002, + "loss": 0.2576, + "step": 12750 + }, + { + "epoch": 2.6756133361291674, + "grad_norm": 0.05453988537192345, + "learning_rate": 0.0002, + "loss": 0.262, + "step": 12760 + }, + { + "epoch": 2.677710211784441, + "grad_norm": 0.07638183236122131, + "learning_rate": 0.0002, + "loss": 0.245, + "step": 12770 + }, + { + "epoch": 2.679807087439715, + "grad_norm": 0.07708717882633209, + "learning_rate": 0.0002, + "loss": 0.2424, + "step": 12780 + }, + { + "epoch": 2.6819039630949884, + "grad_norm": 0.8045825362205505, + "learning_rate": 0.0002, + "loss": 0.3402, + "step": 12790 + }, + { + "epoch": 2.6840008387502623, + "grad_norm": 0.057965174317359924, + "learning_rate": 0.0002, + "loss": 0.2758, + "step": 12800 + }, + { + "epoch": 2.686097714405536, + "grad_norm": 4.041475772857666, + "learning_rate": 0.0002, + "loss": 0.2563, + "step": 12810 + }, + { + "epoch": 2.6881945900608093, + "grad_norm": 0.2805425822734833, + "learning_rate": 0.0002, + "loss": 0.2633, + "step": 12820 + }, + { + "epoch": 2.690291465716083, + "grad_norm": 1.1750140190124512, + "learning_rate": 0.0002, + "loss": 0.312, + "step": 12830 + }, + { + "epoch": 2.6923883413713567, + "grad_norm": 0.18456393480300903, + "learning_rate": 0.0002, + "loss": 0.2763, + "step": 12840 + }, + { + "epoch": 2.6944852170266302, + "grad_norm": 0.09328345954418182, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 12850 + }, + { + "epoch": 2.696582092681904, + "grad_norm": 0.05224505066871643, + "learning_rate": 0.0002, + "loss": 0.2803, + "step": 12860 + }, + { + "epoch": 2.6986789683371777, + "grad_norm": 0.07151336967945099, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 12870 + }, + { + "epoch": 2.700775843992451, + "grad_norm": 0.07107602059841156, + "learning_rate": 0.0002, + "loss": 0.279, + "step": 12880 + }, + { + "epoch": 2.7028727196477247, + "grad_norm": 0.06088366359472275, + "learning_rate": 0.0002, + "loss": 0.2707, + "step": 12890 + }, + { + "epoch": 2.7049695953029986, + "grad_norm": 0.08219806104898453, + "learning_rate": 0.0002, + "loss": 0.3286, + "step": 12900 + }, + { + "epoch": 2.707066470958272, + "grad_norm": 0.4720647633075714, + "learning_rate": 0.0002, + "loss": 0.2994, + "step": 12910 + }, + { + "epoch": 2.709163346613546, + "grad_norm": 0.078771211206913, + "learning_rate": 0.0002, + "loss": 0.2687, + "step": 12920 + }, + { + "epoch": 2.7112602222688196, + "grad_norm": 3.5635182857513428, + "learning_rate": 0.0002, + "loss": 0.2775, + "step": 12930 + }, + { + "epoch": 2.713357097924093, + "grad_norm": 0.09435427188873291, + "learning_rate": 0.0002, + "loss": 0.2463, + "step": 12940 + }, + { + "epoch": 2.7154539735793666, + "grad_norm": 3.500767946243286, + "learning_rate": 0.0002, + "loss": 0.2477, + "step": 12950 + }, + { + "epoch": 2.7175508492346405, + "grad_norm": 0.08721265196800232, + "learning_rate": 0.0002, + "loss": 0.2891, + "step": 12960 + }, + { + "epoch": 2.719647724889914, + "grad_norm": 2.141937255859375, + "learning_rate": 0.0002, + "loss": 0.2708, + "step": 12970 + }, + { + "epoch": 2.7217446005451875, + "grad_norm": 0.10957151651382446, + "learning_rate": 0.0002, + "loss": 0.3073, + "step": 12980 + }, + { + "epoch": 2.7238414762004615, + "grad_norm": 0.2539743185043335, + "learning_rate": 0.0002, + "loss": 0.2466, + "step": 12990 + }, + { + "epoch": 2.725938351855735, + "grad_norm": 5.60904598236084, + "learning_rate": 0.0002, + "loss": 0.2514, + "step": 13000 + }, + { + "epoch": 2.7280352275110085, + "grad_norm": 0.9625059962272644, + "learning_rate": 0.0002, + "loss": 0.2513, + "step": 13010 + }, + { + "epoch": 2.730132103166282, + "grad_norm": 10.193999290466309, + "learning_rate": 0.0002, + "loss": 0.3065, + "step": 13020 + }, + { + "epoch": 2.732228978821556, + "grad_norm": 0.7805821299552917, + "learning_rate": 0.0002, + "loss": 0.2819, + "step": 13030 + }, + { + "epoch": 2.7343258544768294, + "grad_norm": 0.08317407965660095, + "learning_rate": 0.0002, + "loss": 0.2425, + "step": 13040 + }, + { + "epoch": 2.7364227301321034, + "grad_norm": 0.12909261882305145, + "learning_rate": 0.0002, + "loss": 0.3156, + "step": 13050 + }, + { + "epoch": 2.738519605787377, + "grad_norm": 0.16666528582572937, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 13060 + }, + { + "epoch": 2.7406164814426504, + "grad_norm": 0.3270515501499176, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 13070 + }, + { + "epoch": 2.742713357097924, + "grad_norm": 0.08127190172672272, + "learning_rate": 0.0002, + "loss": 0.2737, + "step": 13080 + }, + { + "epoch": 2.744810232753198, + "grad_norm": 0.09699930250644684, + "learning_rate": 0.0002, + "loss": 0.2668, + "step": 13090 + }, + { + "epoch": 2.7469071084084713, + "grad_norm": 2.5660605430603027, + "learning_rate": 0.0002, + "loss": 0.2529, + "step": 13100 + }, + { + "epoch": 2.7490039840637452, + "grad_norm": 0.35261228680610657, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 13110 + }, + { + "epoch": 2.7511008597190187, + "grad_norm": 0.1015433669090271, + "learning_rate": 0.0002, + "loss": 0.2428, + "step": 13120 + }, + { + "epoch": 2.7531977353742922, + "grad_norm": 1.886006236076355, + "learning_rate": 0.0002, + "loss": 0.2649, + "step": 13130 + }, + { + "epoch": 2.7552946110295657, + "grad_norm": 0.1277870535850525, + "learning_rate": 0.0002, + "loss": 0.2456, + "step": 13140 + }, + { + "epoch": 2.7573914866848397, + "grad_norm": 0.0975937694311142, + "learning_rate": 0.0002, + "loss": 0.3081, + "step": 13150 + }, + { + "epoch": 2.759488362340113, + "grad_norm": 0.08423218876123428, + "learning_rate": 0.0002, + "loss": 0.2688, + "step": 13160 + }, + { + "epoch": 2.761585237995387, + "grad_norm": 4.316064834594727, + "learning_rate": 0.0002, + "loss": 0.303, + "step": 13170 + }, + { + "epoch": 2.7636821136506606, + "grad_norm": 0.2076009064912796, + "learning_rate": 0.0002, + "loss": 0.2469, + "step": 13180 + }, + { + "epoch": 2.765778989305934, + "grad_norm": 0.5169063806533813, + "learning_rate": 0.0002, + "loss": 0.2599, + "step": 13190 + }, + { + "epoch": 2.7678758649612076, + "grad_norm": 0.09089035540819168, + "learning_rate": 0.0002, + "loss": 0.2669, + "step": 13200 + }, + { + "epoch": 2.7699727406164816, + "grad_norm": 0.40360260009765625, + "learning_rate": 0.0002, + "loss": 0.2771, + "step": 13210 + }, + { + "epoch": 2.772069616271755, + "grad_norm": 0.09568203240633011, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 13220 + }, + { + "epoch": 2.7741664919270286, + "grad_norm": 0.08937239646911621, + "learning_rate": 0.0002, + "loss": 0.3118, + "step": 13230 + }, + { + "epoch": 2.7762633675823025, + "grad_norm": 0.1122610941529274, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 13240 + }, + { + "epoch": 2.778360243237576, + "grad_norm": 0.07057496905326843, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 13250 + }, + { + "epoch": 2.7804571188928495, + "grad_norm": 8.3790864944458, + "learning_rate": 0.0002, + "loss": 0.3083, + "step": 13260 + }, + { + "epoch": 2.782553994548123, + "grad_norm": 5.793963432312012, + "learning_rate": 0.0002, + "loss": 0.2763, + "step": 13270 + }, + { + "epoch": 2.784650870203397, + "grad_norm": 0.05257759243249893, + "learning_rate": 0.0002, + "loss": 0.2607, + "step": 13280 + }, + { + "epoch": 2.7867477458586705, + "grad_norm": 0.08371423929929733, + "learning_rate": 0.0002, + "loss": 0.2806, + "step": 13290 + }, + { + "epoch": 2.7888446215139444, + "grad_norm": 3.869866371154785, + "learning_rate": 0.0002, + "loss": 0.3398, + "step": 13300 + }, + { + "epoch": 2.790941497169218, + "grad_norm": 4.551204681396484, + "learning_rate": 0.0002, + "loss": 0.2745, + "step": 13310 + }, + { + "epoch": 2.7930383728244914, + "grad_norm": 0.10620396584272385, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 13320 + }, + { + "epoch": 2.795135248479765, + "grad_norm": 0.09897986054420471, + "learning_rate": 0.0002, + "loss": 0.3427, + "step": 13330 + }, + { + "epoch": 2.797232124135039, + "grad_norm": 0.07497262209653854, + "learning_rate": 0.0002, + "loss": 0.2584, + "step": 13340 + }, + { + "epoch": 2.7993289997903124, + "grad_norm": 0.09139002114534378, + "learning_rate": 0.0002, + "loss": 0.2458, + "step": 13350 + }, + { + "epoch": 2.8014258754455863, + "grad_norm": 1.2047020196914673, + "learning_rate": 0.0002, + "loss": 0.2744, + "step": 13360 + }, + { + "epoch": 2.80352275110086, + "grad_norm": 0.08489953726530075, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 13370 + }, + { + "epoch": 2.8056196267561333, + "grad_norm": 0.1599905788898468, + "learning_rate": 0.0002, + "loss": 0.2751, + "step": 13380 + }, + { + "epoch": 2.807716502411407, + "grad_norm": 0.06880758702754974, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 13390 + }, + { + "epoch": 2.8098133780666807, + "grad_norm": 0.08114618808031082, + "learning_rate": 0.0002, + "loss": 0.274, + "step": 13400 + }, + { + "epoch": 2.8119102537219542, + "grad_norm": 0.060263942927122116, + "learning_rate": 0.0002, + "loss": 0.2719, + "step": 13410 + }, + { + "epoch": 2.814007129377228, + "grad_norm": 0.1302911937236786, + "learning_rate": 0.0002, + "loss": 0.2825, + "step": 13420 + }, + { + "epoch": 2.8161040050325017, + "grad_norm": 0.04744647815823555, + "learning_rate": 0.0002, + "loss": 0.2431, + "step": 13430 + }, + { + "epoch": 2.818200880687775, + "grad_norm": 0.05303800106048584, + "learning_rate": 0.0002, + "loss": 0.2431, + "step": 13440 + }, + { + "epoch": 2.8202977563430487, + "grad_norm": 4.2985687255859375, + "learning_rate": 0.0002, + "loss": 0.2639, + "step": 13450 + }, + { + "epoch": 2.8223946319983226, + "grad_norm": 0.41262736916542053, + "learning_rate": 0.0002, + "loss": 0.2827, + "step": 13460 + }, + { + "epoch": 2.824491507653596, + "grad_norm": 5.396834850311279, + "learning_rate": 0.0002, + "loss": 0.2976, + "step": 13470 + }, + { + "epoch": 2.8265883833088696, + "grad_norm": 0.10336003452539444, + "learning_rate": 0.0002, + "loss": 0.2737, + "step": 13480 + }, + { + "epoch": 2.8286852589641436, + "grad_norm": 0.05884670093655586, + "learning_rate": 0.0002, + "loss": 0.2598, + "step": 13490 + }, + { + "epoch": 2.830782134619417, + "grad_norm": 4.840988636016846, + "learning_rate": 0.0002, + "loss": 0.259, + "step": 13500 + }, + { + "epoch": 2.8328790102746906, + "grad_norm": 0.30140170454978943, + "learning_rate": 0.0002, + "loss": 0.2752, + "step": 13510 + }, + { + "epoch": 2.834975885929964, + "grad_norm": 0.1117088794708252, + "learning_rate": 0.0002, + "loss": 0.2429, + "step": 13520 + }, + { + "epoch": 2.837072761585238, + "grad_norm": 0.054389581084251404, + "learning_rate": 0.0002, + "loss": 0.2434, + "step": 13530 + }, + { + "epoch": 2.8391696372405115, + "grad_norm": 0.07770632952451706, + "learning_rate": 0.0002, + "loss": 0.2841, + "step": 13540 + }, + { + "epoch": 2.8412665128957855, + "grad_norm": 0.10165666043758392, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 13550 + }, + { + "epoch": 2.843363388551059, + "grad_norm": 0.12631669640541077, + "learning_rate": 0.0002, + "loss": 0.2715, + "step": 13560 + }, + { + "epoch": 2.8454602642063325, + "grad_norm": 0.08221318572759628, + "learning_rate": 0.0002, + "loss": 0.2434, + "step": 13570 + }, + { + "epoch": 2.847557139861606, + "grad_norm": 0.07879089564085007, + "learning_rate": 0.0002, + "loss": 0.2644, + "step": 13580 + }, + { + "epoch": 2.84965401551688, + "grad_norm": 0.18776120245456696, + "learning_rate": 0.0002, + "loss": 0.2761, + "step": 13590 + }, + { + "epoch": 2.8517508911721534, + "grad_norm": 0.10581996291875839, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 13600 + }, + { + "epoch": 2.8538477668274274, + "grad_norm": 0.08558280020952225, + "learning_rate": 0.0002, + "loss": 0.2767, + "step": 13610 + }, + { + "epoch": 2.855944642482701, + "grad_norm": 0.06981467455625534, + "learning_rate": 0.0002, + "loss": 0.2767, + "step": 13620 + }, + { + "epoch": 2.8580415181379744, + "grad_norm": 0.08355915546417236, + "learning_rate": 0.0002, + "loss": 0.3006, + "step": 13630 + }, + { + "epoch": 2.860138393793248, + "grad_norm": 0.06184757128357887, + "learning_rate": 0.0002, + "loss": 0.2846, + "step": 13640 + }, + { + "epoch": 2.862235269448522, + "grad_norm": 0.07668596506118774, + "learning_rate": 0.0002, + "loss": 0.2558, + "step": 13650 + }, + { + "epoch": 2.8643321451037953, + "grad_norm": 0.3981834352016449, + "learning_rate": 0.0002, + "loss": 0.275, + "step": 13660 + }, + { + "epoch": 2.8664290207590692, + "grad_norm": 0.9413365125656128, + "learning_rate": 0.0002, + "loss": 0.2796, + "step": 13670 + }, + { + "epoch": 2.8685258964143427, + "grad_norm": 0.19572269916534424, + "learning_rate": 0.0002, + "loss": 0.2703, + "step": 13680 + }, + { + "epoch": 2.8706227720696162, + "grad_norm": 0.15002276003360748, + "learning_rate": 0.0002, + "loss": 0.3071, + "step": 13690 + }, + { + "epoch": 2.8727196477248897, + "grad_norm": 0.06685584038496017, + "learning_rate": 0.0002, + "loss": 0.3044, + "step": 13700 + }, + { + "epoch": 2.8748165233801637, + "grad_norm": 4.113365650177002, + "learning_rate": 0.0002, + "loss": 0.3452, + "step": 13710 + }, + { + "epoch": 2.876913399035437, + "grad_norm": 0.12345223128795624, + "learning_rate": 0.0002, + "loss": 0.3401, + "step": 13720 + }, + { + "epoch": 2.8790102746907107, + "grad_norm": 1.0276050567626953, + "learning_rate": 0.0002, + "loss": 0.2777, + "step": 13730 + }, + { + "epoch": 2.8811071503459846, + "grad_norm": 0.27919915318489075, + "learning_rate": 0.0002, + "loss": 0.3792, + "step": 13740 + }, + { + "epoch": 2.883204026001258, + "grad_norm": 1.4898934364318848, + "learning_rate": 0.0002, + "loss": 0.2625, + "step": 13750 + }, + { + "epoch": 2.8853009016565316, + "grad_norm": 0.1877644956111908, + "learning_rate": 0.0002, + "loss": 0.3044, + "step": 13760 + }, + { + "epoch": 2.887397777311805, + "grad_norm": 0.20642811059951782, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 13770 + }, + { + "epoch": 2.889494652967079, + "grad_norm": 1.3547812700271606, + "learning_rate": 0.0002, + "loss": 0.3656, + "step": 13780 + }, + { + "epoch": 2.8915915286223526, + "grad_norm": 0.8433927297592163, + "learning_rate": 0.0002, + "loss": 0.3086, + "step": 13790 + }, + { + "epoch": 2.8936884042776265, + "grad_norm": 2.5968425273895264, + "learning_rate": 0.0002, + "loss": 0.2485, + "step": 13800 + }, + { + "epoch": 2.8957852799329, + "grad_norm": 9.476542472839355, + "learning_rate": 0.0002, + "loss": 0.3393, + "step": 13810 + }, + { + "epoch": 2.8978821555881735, + "grad_norm": 1.2320115566253662, + "learning_rate": 0.0002, + "loss": 0.2803, + "step": 13820 + }, + { + "epoch": 2.899979031243447, + "grad_norm": 0.13934971392154694, + "learning_rate": 0.0002, + "loss": 0.248, + "step": 13830 + }, + { + "epoch": 2.902075906898721, + "grad_norm": 0.42801135778427124, + "learning_rate": 0.0002, + "loss": 0.2858, + "step": 13840 + }, + { + "epoch": 2.9041727825539945, + "grad_norm": 0.4410918056964874, + "learning_rate": 0.0002, + "loss": 0.3026, + "step": 13850 + }, + { + "epoch": 2.9062696582092684, + "grad_norm": 0.08050935715436935, + "learning_rate": 0.0002, + "loss": 0.2461, + "step": 13860 + }, + { + "epoch": 2.908366533864542, + "grad_norm": 0.38791462779045105, + "learning_rate": 0.0002, + "loss": 0.2902, + "step": 13870 + }, + { + "epoch": 2.9104634095198154, + "grad_norm": 7.857952117919922, + "learning_rate": 0.0002, + "loss": 0.3109, + "step": 13880 + }, + { + "epoch": 2.912560285175089, + "grad_norm": 0.7213678359985352, + "learning_rate": 0.0002, + "loss": 0.2754, + "step": 13890 + }, + { + "epoch": 2.914657160830363, + "grad_norm": 0.2854587435722351, + "learning_rate": 0.0002, + "loss": 0.3071, + "step": 13900 + }, + { + "epoch": 2.9167540364856364, + "grad_norm": 0.7336795330047607, + "learning_rate": 0.0002, + "loss": 0.278, + "step": 13910 + }, + { + "epoch": 2.9188509121409103, + "grad_norm": 0.42611584067344666, + "learning_rate": 0.0002, + "loss": 0.2723, + "step": 13920 + }, + { + "epoch": 2.920947787796184, + "grad_norm": 0.2687622606754303, + "learning_rate": 0.0002, + "loss": 0.2763, + "step": 13930 + }, + { + "epoch": 2.9230446634514573, + "grad_norm": 0.10501446574926376, + "learning_rate": 0.0002, + "loss": 0.2889, + "step": 13940 + }, + { + "epoch": 2.925141539106731, + "grad_norm": 6.071291923522949, + "learning_rate": 0.0002, + "loss": 0.2843, + "step": 13950 + }, + { + "epoch": 2.9272384147620047, + "grad_norm": 5.702052116394043, + "learning_rate": 0.0002, + "loss": 0.2853, + "step": 13960 + }, + { + "epoch": 2.9293352904172782, + "grad_norm": 0.8838995099067688, + "learning_rate": 0.0002, + "loss": 0.2863, + "step": 13970 + }, + { + "epoch": 2.9314321660725517, + "grad_norm": 0.11934137344360352, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 13980 + }, + { + "epoch": 2.9335290417278257, + "grad_norm": 1.4622691869735718, + "learning_rate": 0.0002, + "loss": 0.2466, + "step": 13990 + }, + { + "epoch": 2.935625917383099, + "grad_norm": 0.2274940013885498, + "learning_rate": 0.0002, + "loss": 0.2745, + "step": 14000 + }, + { + "epoch": 2.9377227930383727, + "grad_norm": 0.06870028376579285, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 14010 + }, + { + "epoch": 2.939819668693646, + "grad_norm": 13.201187133789062, + "learning_rate": 0.0002, + "loss": 0.2771, + "step": 14020 + }, + { + "epoch": 2.94191654434892, + "grad_norm": 0.0958949476480484, + "learning_rate": 0.0002, + "loss": 0.2621, + "step": 14030 + }, + { + "epoch": 2.9440134200041936, + "grad_norm": 1.1030700206756592, + "learning_rate": 0.0002, + "loss": 0.2474, + "step": 14040 + }, + { + "epoch": 2.9461102956594676, + "grad_norm": 3.825788736343384, + "learning_rate": 0.0002, + "loss": 0.2477, + "step": 14050 + }, + { + "epoch": 2.948207171314741, + "grad_norm": 0.2587536871433258, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 14060 + }, + { + "epoch": 2.9503040469700146, + "grad_norm": 6.049500465393066, + "learning_rate": 0.0002, + "loss": 0.3027, + "step": 14070 + }, + { + "epoch": 2.952400922625288, + "grad_norm": 0.11885102093219757, + "learning_rate": 0.0002, + "loss": 0.2429, + "step": 14080 + }, + { + "epoch": 2.954497798280562, + "grad_norm": 0.6359782814979553, + "learning_rate": 0.0002, + "loss": 0.2868, + "step": 14090 + }, + { + "epoch": 2.9565946739358355, + "grad_norm": 0.06189330294728279, + "learning_rate": 0.0002, + "loss": 0.3036, + "step": 14100 + }, + { + "epoch": 2.9586915495911095, + "grad_norm": 0.07890813052654266, + "learning_rate": 0.0002, + "loss": 0.2765, + "step": 14110 + }, + { + "epoch": 2.960788425246383, + "grad_norm": 4.272983074188232, + "learning_rate": 0.0002, + "loss": 0.2476, + "step": 14120 + }, + { + "epoch": 2.9628853009016565, + "grad_norm": 0.09933266788721085, + "learning_rate": 0.0002, + "loss": 0.2631, + "step": 14130 + }, + { + "epoch": 2.96498217655693, + "grad_norm": 0.1176319271326065, + "learning_rate": 0.0002, + "loss": 0.2448, + "step": 14140 + }, + { + "epoch": 2.967079052212204, + "grad_norm": 0.09006159752607346, + "learning_rate": 0.0002, + "loss": 0.2641, + "step": 14150 + }, + { + "epoch": 2.9691759278674774, + "grad_norm": 0.11632592976093292, + "learning_rate": 0.0002, + "loss": 0.2481, + "step": 14160 + }, + { + "epoch": 2.9712728035227514, + "grad_norm": 2.242246627807617, + "learning_rate": 0.0002, + "loss": 0.2692, + "step": 14170 + }, + { + "epoch": 2.973369679178025, + "grad_norm": 0.8853824138641357, + "learning_rate": 0.0002, + "loss": 0.2432, + "step": 14180 + }, + { + "epoch": 2.9754665548332984, + "grad_norm": 0.1101587638258934, + "learning_rate": 0.0002, + "loss": 0.242, + "step": 14190 + }, + { + "epoch": 2.977563430488572, + "grad_norm": 0.06838403642177582, + "learning_rate": 0.0002, + "loss": 0.3337, + "step": 14200 + }, + { + "epoch": 2.979660306143846, + "grad_norm": 0.46546003222465515, + "learning_rate": 0.0002, + "loss": 0.3376, + "step": 14210 + }, + { + "epoch": 2.9817571817991193, + "grad_norm": 0.08868476748466492, + "learning_rate": 0.0002, + "loss": 0.2678, + "step": 14220 + }, + { + "epoch": 2.983854057454393, + "grad_norm": 0.08028911799192429, + "learning_rate": 0.0002, + "loss": 0.2631, + "step": 14230 + }, + { + "epoch": 2.9859509331096667, + "grad_norm": 0.06891479343175888, + "learning_rate": 0.0002, + "loss": 0.2433, + "step": 14240 + }, + { + "epoch": 2.9880478087649402, + "grad_norm": 0.09133801609277725, + "learning_rate": 0.0002, + "loss": 0.2846, + "step": 14250 + }, + { + "epoch": 2.9901446844202137, + "grad_norm": 0.058840859681367874, + "learning_rate": 0.0002, + "loss": 0.266, + "step": 14260 + }, + { + "epoch": 2.9922415600754873, + "grad_norm": 0.08534104377031326, + "learning_rate": 0.0002, + "loss": 0.2452, + "step": 14270 + }, + { + "epoch": 2.994338435730761, + "grad_norm": 4.0780720710754395, + "learning_rate": 0.0002, + "loss": 0.4004, + "step": 14280 + }, + { + "epoch": 2.9964353113860347, + "grad_norm": 0.25896960496902466, + "learning_rate": 0.0002, + "loss": 0.3645, + "step": 14290 + }, + { + "epoch": 2.9985321870413086, + "grad_norm": 0.08144643902778625, + "learning_rate": 0.0002, + "loss": 0.2427, + "step": 14300 + }, + { + "epoch": 3.000629062696582, + "grad_norm": 0.06053667888045311, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 14310 + }, + { + "epoch": 3.0027259383518556, + "grad_norm": 0.12006381154060364, + "learning_rate": 0.0002, + "loss": 0.3749, + "step": 14320 + }, + { + "epoch": 3.0048228140071296, + "grad_norm": 0.09352512657642365, + "learning_rate": 0.0002, + "loss": 0.2993, + "step": 14330 + }, + { + "epoch": 3.006919689662403, + "grad_norm": 0.06361488997936249, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 14340 + }, + { + "epoch": 3.0090165653176766, + "grad_norm": 0.11830562353134155, + "learning_rate": 0.0002, + "loss": 0.3046, + "step": 14350 + }, + { + "epoch": 3.01111344097295, + "grad_norm": 0.053154319524765015, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 14360 + }, + { + "epoch": 3.013210316628224, + "grad_norm": 0.05998919904232025, + "learning_rate": 0.0002, + "loss": 0.2995, + "step": 14370 + }, + { + "epoch": 3.0153071922834975, + "grad_norm": 0.10642017424106598, + "learning_rate": 0.0002, + "loss": 0.2877, + "step": 14380 + }, + { + "epoch": 3.017404067938771, + "grad_norm": 0.13320499658584595, + "learning_rate": 0.0002, + "loss": 0.2906, + "step": 14390 + }, + { + "epoch": 3.019500943594045, + "grad_norm": 6.584969997406006, + "learning_rate": 0.0002, + "loss": 0.3778, + "step": 14400 + }, + { + "epoch": 3.0215978192493185, + "grad_norm": 7.320637226104736, + "learning_rate": 0.0002, + "loss": 0.2791, + "step": 14410 + }, + { + "epoch": 3.023694694904592, + "grad_norm": 0.06931141018867493, + "learning_rate": 0.0002, + "loss": 0.2463, + "step": 14420 + }, + { + "epoch": 3.025791570559866, + "grad_norm": 0.1134222224354744, + "learning_rate": 0.0002, + "loss": 0.2603, + "step": 14430 + }, + { + "epoch": 3.0278884462151394, + "grad_norm": 0.057289376854896545, + "learning_rate": 0.0002, + "loss": 0.2887, + "step": 14440 + }, + { + "epoch": 3.029985321870413, + "grad_norm": 0.1048545092344284, + "learning_rate": 0.0002, + "loss": 0.2821, + "step": 14450 + }, + { + "epoch": 3.032082197525687, + "grad_norm": 0.09596660733222961, + "learning_rate": 0.0002, + "loss": 0.2749, + "step": 14460 + }, + { + "epoch": 3.0341790731809604, + "grad_norm": 0.5900943279266357, + "learning_rate": 0.0002, + "loss": 0.2437, + "step": 14470 + }, + { + "epoch": 3.036275948836234, + "grad_norm": 6.8857951164245605, + "learning_rate": 0.0002, + "loss": 0.2466, + "step": 14480 + }, + { + "epoch": 3.038372824491508, + "grad_norm": 0.08555381000041962, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 14490 + }, + { + "epoch": 3.0404697001467813, + "grad_norm": 0.07776056975126266, + "learning_rate": 0.0002, + "loss": 0.3113, + "step": 14500 + }, + { + "epoch": 3.042566575802055, + "grad_norm": 10.093559265136719, + "learning_rate": 0.0002, + "loss": 0.3165, + "step": 14510 + }, + { + "epoch": 3.0446634514573288, + "grad_norm": 0.07289782166481018, + "learning_rate": 0.0002, + "loss": 0.2988, + "step": 14520 + }, + { + "epoch": 3.0467603271126023, + "grad_norm": 0.048002567142248154, + "learning_rate": 0.0002, + "loss": 0.2748, + "step": 14530 + }, + { + "epoch": 3.0488572027678758, + "grad_norm": 5.174429416656494, + "learning_rate": 0.0002, + "loss": 0.2663, + "step": 14540 + }, + { + "epoch": 3.0509540784231497, + "grad_norm": 0.0720122829079628, + "learning_rate": 0.0002, + "loss": 0.2625, + "step": 14550 + }, + { + "epoch": 3.053050954078423, + "grad_norm": 0.08649764955043793, + "learning_rate": 0.0002, + "loss": 0.2573, + "step": 14560 + }, + { + "epoch": 3.0551478297336967, + "grad_norm": 0.045956335961818695, + "learning_rate": 0.0002, + "loss": 0.2563, + "step": 14570 + }, + { + "epoch": 3.0572447053889706, + "grad_norm": 0.07197452336549759, + "learning_rate": 0.0002, + "loss": 0.2651, + "step": 14580 + }, + { + "epoch": 3.059341581044244, + "grad_norm": 0.0696483701467514, + "learning_rate": 0.0002, + "loss": 0.2435, + "step": 14590 + }, + { + "epoch": 3.0614384566995176, + "grad_norm": 0.054920099675655365, + "learning_rate": 0.0002, + "loss": 0.3082, + "step": 14600 + }, + { + "epoch": 3.0635353323547916, + "grad_norm": 0.0675140768289566, + "learning_rate": 0.0002, + "loss": 0.244, + "step": 14610 + }, + { + "epoch": 3.065632208010065, + "grad_norm": 0.11363296955823898, + "learning_rate": 0.0002, + "loss": 0.2874, + "step": 14620 + }, + { + "epoch": 3.0677290836653386, + "grad_norm": 0.051395099610090256, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 14630 + }, + { + "epoch": 3.069825959320612, + "grad_norm": 0.06727784126996994, + "learning_rate": 0.0002, + "loss": 0.2772, + "step": 14640 + }, + { + "epoch": 3.071922834975886, + "grad_norm": 0.08824780583381653, + "learning_rate": 0.0002, + "loss": 0.2637, + "step": 14650 + }, + { + "epoch": 3.0740197106311595, + "grad_norm": 0.06606782227754593, + "learning_rate": 0.0002, + "loss": 0.2681, + "step": 14660 + }, + { + "epoch": 3.076116586286433, + "grad_norm": 0.05145835131406784, + "learning_rate": 0.0002, + "loss": 0.2646, + "step": 14670 + }, + { + "epoch": 3.078213461941707, + "grad_norm": 0.16860048472881317, + "learning_rate": 0.0002, + "loss": 0.2554, + "step": 14680 + }, + { + "epoch": 3.0803103375969805, + "grad_norm": 0.08995959162712097, + "learning_rate": 0.0002, + "loss": 0.2648, + "step": 14690 + }, + { + "epoch": 3.082407213252254, + "grad_norm": 0.08270885050296783, + "learning_rate": 0.0002, + "loss": 0.2884, + "step": 14700 + }, + { + "epoch": 3.084504088907528, + "grad_norm": 5.329971790313721, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 14710 + }, + { + "epoch": 3.0866009645628014, + "grad_norm": 0.37981072068214417, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 14720 + }, + { + "epoch": 3.088697840218075, + "grad_norm": 0.09584033489227295, + "learning_rate": 0.0002, + "loss": 0.2905, + "step": 14730 + }, + { + "epoch": 3.090794715873349, + "grad_norm": 0.0819215402007103, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 14740 + }, + { + "epoch": 3.0928915915286224, + "grad_norm": 0.11679327487945557, + "learning_rate": 0.0002, + "loss": 0.2586, + "step": 14750 + }, + { + "epoch": 3.094988467183896, + "grad_norm": 0.08995963633060455, + "learning_rate": 0.0002, + "loss": 0.2579, + "step": 14760 + }, + { + "epoch": 3.09708534283917, + "grad_norm": 0.04532779008150101, + "learning_rate": 0.0002, + "loss": 0.2458, + "step": 14770 + }, + { + "epoch": 3.0991822184944433, + "grad_norm": 0.09251581877470016, + "learning_rate": 0.0002, + "loss": 0.2437, + "step": 14780 + }, + { + "epoch": 3.101279094149717, + "grad_norm": 0.06317158788442612, + "learning_rate": 0.0002, + "loss": 0.2427, + "step": 14790 + }, + { + "epoch": 3.1033759698049908, + "grad_norm": 0.07175248861312866, + "learning_rate": 0.0002, + "loss": 0.2825, + "step": 14800 + }, + { + "epoch": 3.1054728454602643, + "grad_norm": 0.05058152601122856, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 14810 + }, + { + "epoch": 3.1075697211155378, + "grad_norm": 0.07968831807374954, + "learning_rate": 0.0002, + "loss": 0.2439, + "step": 14820 + }, + { + "epoch": 3.1096665967708117, + "grad_norm": 0.07660433650016785, + "learning_rate": 0.0002, + "loss": 0.2686, + "step": 14830 + }, + { + "epoch": 3.111763472426085, + "grad_norm": 0.055322594940662384, + "learning_rate": 0.0002, + "loss": 0.2937, + "step": 14840 + }, + { + "epoch": 3.1138603480813587, + "grad_norm": 0.15317735075950623, + "learning_rate": 0.0002, + "loss": 0.2693, + "step": 14850 + }, + { + "epoch": 3.1159572237366326, + "grad_norm": 0.0666760727763176, + "learning_rate": 0.0002, + "loss": 0.244, + "step": 14860 + }, + { + "epoch": 3.118054099391906, + "grad_norm": 0.07330338656902313, + "learning_rate": 0.0002, + "loss": 0.2583, + "step": 14870 + }, + { + "epoch": 3.1201509750471796, + "grad_norm": 4.008168697357178, + "learning_rate": 0.0002, + "loss": 0.2798, + "step": 14880 + }, + { + "epoch": 3.122247850702453, + "grad_norm": 11.84869384765625, + "learning_rate": 0.0002, + "loss": 0.3852, + "step": 14890 + }, + { + "epoch": 3.124344726357727, + "grad_norm": 1.1625456809997559, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 14900 + }, + { + "epoch": 3.1264416020130006, + "grad_norm": 3.9291110038757324, + "learning_rate": 0.0002, + "loss": 0.3082, + "step": 14910 + }, + { + "epoch": 3.128538477668274, + "grad_norm": 0.0719500258564949, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 14920 + }, + { + "epoch": 3.130635353323548, + "grad_norm": 8.359219551086426, + "learning_rate": 0.0002, + "loss": 0.2796, + "step": 14930 + }, + { + "epoch": 3.1327322289788215, + "grad_norm": 0.14066196978092194, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 14940 + }, + { + "epoch": 3.134829104634095, + "grad_norm": 0.09063920378684998, + "learning_rate": 0.0002, + "loss": 0.2424, + "step": 14950 + }, + { + "epoch": 3.136925980289369, + "grad_norm": 0.15819169580936432, + "learning_rate": 0.0002, + "loss": 0.2681, + "step": 14960 + }, + { + "epoch": 3.1390228559446425, + "grad_norm": 0.06944765895605087, + "learning_rate": 0.0002, + "loss": 0.2648, + "step": 14970 + }, + { + "epoch": 3.141119731599916, + "grad_norm": 0.1226600632071495, + "learning_rate": 0.0002, + "loss": 0.2826, + "step": 14980 + }, + { + "epoch": 3.14321660725519, + "grad_norm": 0.2014627605676651, + "learning_rate": 0.0002, + "loss": 0.2977, + "step": 14990 + }, + { + "epoch": 3.1453134829104634, + "grad_norm": 4.710205078125, + "learning_rate": 0.0002, + "loss": 0.2617, + "step": 15000 + }, + { + "epoch": 3.147410358565737, + "grad_norm": 0.10994257032871246, + "learning_rate": 0.0002, + "loss": 0.2661, + "step": 15010 + }, + { + "epoch": 3.149507234221011, + "grad_norm": 0.8386690616607666, + "learning_rate": 0.0002, + "loss": 0.248, + "step": 15020 + }, + { + "epoch": 3.1516041098762844, + "grad_norm": 0.20938342809677124, + "learning_rate": 0.0002, + "loss": 0.31, + "step": 15030 + }, + { + "epoch": 3.153700985531558, + "grad_norm": 1.402302622795105, + "learning_rate": 0.0002, + "loss": 0.2876, + "step": 15040 + }, + { + "epoch": 3.155797861186832, + "grad_norm": 1.6246540546417236, + "learning_rate": 0.0002, + "loss": 0.2484, + "step": 15050 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.1873074173927307, + "learning_rate": 0.0002, + "loss": 0.3074, + "step": 15060 + }, + { + "epoch": 3.159991612497379, + "grad_norm": 6.368509769439697, + "learning_rate": 0.0002, + "loss": 0.3141, + "step": 15070 + }, + { + "epoch": 3.1620884881526528, + "grad_norm": 0.2556443512439728, + "learning_rate": 0.0002, + "loss": 0.2615, + "step": 15080 + }, + { + "epoch": 3.1641853638079263, + "grad_norm": 3.358315944671631, + "learning_rate": 0.0002, + "loss": 0.272, + "step": 15090 + }, + { + "epoch": 3.1662822394631998, + "grad_norm": 0.13668504357337952, + "learning_rate": 0.0002, + "loss": 0.2432, + "step": 15100 + }, + { + "epoch": 3.1683791151184737, + "grad_norm": 1.0130856037139893, + "learning_rate": 0.0002, + "loss": 0.2415, + "step": 15110 + }, + { + "epoch": 3.170475990773747, + "grad_norm": 0.08739043772220612, + "learning_rate": 0.0002, + "loss": 0.2918, + "step": 15120 + }, + { + "epoch": 3.1725728664290207, + "grad_norm": 0.7321306467056274, + "learning_rate": 0.0002, + "loss": 0.2831, + "step": 15130 + }, + { + "epoch": 3.174669742084294, + "grad_norm": 0.18466271460056305, + "learning_rate": 0.0002, + "loss": 0.2803, + "step": 15140 + }, + { + "epoch": 3.176766617739568, + "grad_norm": 0.37699639797210693, + "learning_rate": 0.0002, + "loss": 0.2594, + "step": 15150 + }, + { + "epoch": 3.1788634933948416, + "grad_norm": 5.0264763832092285, + "learning_rate": 0.0002, + "loss": 0.3041, + "step": 15160 + }, + { + "epoch": 3.180960369050115, + "grad_norm": 0.1934100240468979, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 15170 + }, + { + "epoch": 3.183057244705389, + "grad_norm": 0.09424150735139847, + "learning_rate": 0.0002, + "loss": 0.3119, + "step": 15180 + }, + { + "epoch": 3.1851541203606626, + "grad_norm": 0.05102267488837242, + "learning_rate": 0.0002, + "loss": 0.243, + "step": 15190 + }, + { + "epoch": 3.187250996015936, + "grad_norm": 0.09043313562870026, + "learning_rate": 0.0002, + "loss": 0.2465, + "step": 15200 + }, + { + "epoch": 3.18934787167121, + "grad_norm": 10.404960632324219, + "learning_rate": 0.0002, + "loss": 0.3076, + "step": 15210 + }, + { + "epoch": 3.1914447473264835, + "grad_norm": 9.10566234588623, + "learning_rate": 0.0002, + "loss": 0.352, + "step": 15220 + }, + { + "epoch": 3.193541622981757, + "grad_norm": 0.09223295748233795, + "learning_rate": 0.0002, + "loss": 0.2711, + "step": 15230 + }, + { + "epoch": 3.195638498637031, + "grad_norm": 0.12465586513280869, + "learning_rate": 0.0002, + "loss": 0.245, + "step": 15240 + }, + { + "epoch": 3.1977353742923045, + "grad_norm": 9.97639274597168, + "learning_rate": 0.0002, + "loss": 0.2801, + "step": 15250 + }, + { + "epoch": 3.199832249947578, + "grad_norm": 0.1685313880443573, + "learning_rate": 0.0002, + "loss": 0.2485, + "step": 15260 + }, + { + "epoch": 3.201929125602852, + "grad_norm": 0.2190469652414322, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 15270 + }, + { + "epoch": 3.2040260012581254, + "grad_norm": 7.2927775382995605, + "learning_rate": 0.0002, + "loss": 0.2772, + "step": 15280 + }, + { + "epoch": 3.206122876913399, + "grad_norm": 5.700809478759766, + "learning_rate": 0.0002, + "loss": 0.3131, + "step": 15290 + }, + { + "epoch": 3.208219752568673, + "grad_norm": 1.4845271110534668, + "learning_rate": 0.0002, + "loss": 0.3096, + "step": 15300 + }, + { + "epoch": 3.2103166282239464, + "grad_norm": 0.08886919170618057, + "learning_rate": 0.0002, + "loss": 0.2476, + "step": 15310 + }, + { + "epoch": 3.21241350387922, + "grad_norm": 0.11333826929330826, + "learning_rate": 0.0002, + "loss": 0.2849, + "step": 15320 + }, + { + "epoch": 3.214510379534494, + "grad_norm": 1.2613693475723267, + "learning_rate": 0.0002, + "loss": 0.2711, + "step": 15330 + }, + { + "epoch": 3.2166072551897673, + "grad_norm": 0.1572788953781128, + "learning_rate": 0.0002, + "loss": 0.28, + "step": 15340 + }, + { + "epoch": 3.218704130845041, + "grad_norm": 0.9039869904518127, + "learning_rate": 0.0002, + "loss": 0.2768, + "step": 15350 + }, + { + "epoch": 3.2208010065003148, + "grad_norm": 0.3126716613769531, + "learning_rate": 0.0002, + "loss": 0.2663, + "step": 15360 + }, + { + "epoch": 3.2228978821555883, + "grad_norm": 0.44652101397514343, + "learning_rate": 0.0002, + "loss": 0.3121, + "step": 15370 + }, + { + "epoch": 3.2249947578108618, + "grad_norm": 0.6988358497619629, + "learning_rate": 0.0002, + "loss": 0.2734, + "step": 15380 + }, + { + "epoch": 3.2270916334661353, + "grad_norm": 7.399543285369873, + "learning_rate": 0.0002, + "loss": 0.2515, + "step": 15390 + }, + { + "epoch": 3.229188509121409, + "grad_norm": 0.9261417984962463, + "learning_rate": 0.0002, + "loss": 0.2469, + "step": 15400 + }, + { + "epoch": 3.2312853847766827, + "grad_norm": 0.5016912817955017, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 15410 + }, + { + "epoch": 3.233382260431956, + "grad_norm": 16.135902404785156, + "learning_rate": 0.0002, + "loss": 0.275, + "step": 15420 + }, + { + "epoch": 3.23547913608723, + "grad_norm": 23.701261520385742, + "learning_rate": 0.0002, + "loss": 0.2501, + "step": 15430 + }, + { + "epoch": 3.2375760117425036, + "grad_norm": 5.819489479064941, + "learning_rate": 0.0002, + "loss": 0.3584, + "step": 15440 + }, + { + "epoch": 3.239672887397777, + "grad_norm": 5.641313076019287, + "learning_rate": 0.0002, + "loss": 0.3476, + "step": 15450 + }, + { + "epoch": 3.241769763053051, + "grad_norm": 0.2730180025100708, + "learning_rate": 0.0002, + "loss": 0.3157, + "step": 15460 + }, + { + "epoch": 3.2438666387083246, + "grad_norm": 2.8253672122955322, + "learning_rate": 0.0002, + "loss": 0.2499, + "step": 15470 + }, + { + "epoch": 3.245963514363598, + "grad_norm": 0.17094270884990692, + "learning_rate": 0.0002, + "loss": 0.2804, + "step": 15480 + }, + { + "epoch": 3.248060390018872, + "grad_norm": 0.3280426263809204, + "learning_rate": 0.0002, + "loss": 0.2534, + "step": 15490 + }, + { + "epoch": 3.2501572656741455, + "grad_norm": 2.55387806892395, + "learning_rate": 0.0002, + "loss": 0.329, + "step": 15500 + }, + { + "epoch": 3.252254141329419, + "grad_norm": 1.651965856552124, + "learning_rate": 0.0002, + "loss": 0.2491, + "step": 15510 + }, + { + "epoch": 3.254351016984693, + "grad_norm": 4.471637725830078, + "learning_rate": 0.0002, + "loss": 0.2505, + "step": 15520 + }, + { + "epoch": 3.2564478926399665, + "grad_norm": 0.9797324538230896, + "learning_rate": 0.0002, + "loss": 0.2771, + "step": 15530 + }, + { + "epoch": 3.25854476829524, + "grad_norm": 1.2143648862838745, + "learning_rate": 0.0002, + "loss": 0.3634, + "step": 15540 + }, + { + "epoch": 3.260641643950514, + "grad_norm": 7.796568393707275, + "learning_rate": 0.0002, + "loss": 0.3131, + "step": 15550 + }, + { + "epoch": 3.2627385196057874, + "grad_norm": 2.383347511291504, + "learning_rate": 0.0002, + "loss": 0.2947, + "step": 15560 + }, + { + "epoch": 3.264835395261061, + "grad_norm": 0.2774340808391571, + "learning_rate": 0.0002, + "loss": 0.2465, + "step": 15570 + }, + { + "epoch": 3.2669322709163344, + "grad_norm": 0.2396259754896164, + "learning_rate": 0.0002, + "loss": 0.2837, + "step": 15580 + }, + { + "epoch": 3.2690291465716084, + "grad_norm": 2.832759141921997, + "learning_rate": 0.0002, + "loss": 0.2479, + "step": 15590 + }, + { + "epoch": 3.271126022226882, + "grad_norm": 2.0031135082244873, + "learning_rate": 0.0002, + "loss": 0.2445, + "step": 15600 + }, + { + "epoch": 3.273222897882156, + "grad_norm": 0.11969097703695297, + "learning_rate": 0.0002, + "loss": 0.2664, + "step": 15610 + }, + { + "epoch": 3.2753197735374293, + "grad_norm": 0.3683199882507324, + "learning_rate": 0.0002, + "loss": 0.248, + "step": 15620 + }, + { + "epoch": 3.277416649192703, + "grad_norm": 0.9202993512153625, + "learning_rate": 0.0002, + "loss": 0.2736, + "step": 15630 + }, + { + "epoch": 3.2795135248479763, + "grad_norm": 0.07227743417024612, + "learning_rate": 0.0002, + "loss": 0.2428, + "step": 15640 + }, + { + "epoch": 3.2816104005032503, + "grad_norm": 0.3796677887439728, + "learning_rate": 0.0002, + "loss": 0.2834, + "step": 15650 + }, + { + "epoch": 3.2837072761585238, + "grad_norm": 0.09739989042282104, + "learning_rate": 0.0002, + "loss": 0.3535, + "step": 15660 + }, + { + "epoch": 3.2858041518137973, + "grad_norm": 3.952669143676758, + "learning_rate": 0.0002, + "loss": 0.2781, + "step": 15670 + }, + { + "epoch": 3.287901027469071, + "grad_norm": 0.35927778482437134, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 15680 + }, + { + "epoch": 3.2899979031243447, + "grad_norm": 0.08492504060268402, + "learning_rate": 0.0002, + "loss": 0.2626, + "step": 15690 + }, + { + "epoch": 3.292094778779618, + "grad_norm": 0.12667356431484222, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 15700 + }, + { + "epoch": 3.294191654434892, + "grad_norm": 1.5572481155395508, + "learning_rate": 0.0002, + "loss": 0.2468, + "step": 15710 + }, + { + "epoch": 3.2962885300901656, + "grad_norm": 0.07433910667896271, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 15720 + }, + { + "epoch": 3.298385405745439, + "grad_norm": 0.09573189914226532, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 15730 + }, + { + "epoch": 3.300482281400713, + "grad_norm": 0.10412845760583878, + "learning_rate": 0.0002, + "loss": 0.2611, + "step": 15740 + }, + { + "epoch": 3.3025791570559866, + "grad_norm": 0.09497421979904175, + "learning_rate": 0.0002, + "loss": 0.2464, + "step": 15750 + }, + { + "epoch": 3.30467603271126, + "grad_norm": 0.09060948342084885, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 15760 + }, + { + "epoch": 3.306772908366534, + "grad_norm": 0.056995928287506104, + "learning_rate": 0.0002, + "loss": 0.2687, + "step": 15770 + }, + { + "epoch": 3.3088697840218075, + "grad_norm": 0.10010065883398056, + "learning_rate": 0.0002, + "loss": 0.2856, + "step": 15780 + }, + { + "epoch": 3.310966659677081, + "grad_norm": 0.09349839389324188, + "learning_rate": 0.0002, + "loss": 0.2436, + "step": 15790 + }, + { + "epoch": 3.313063535332355, + "grad_norm": 0.9555671811103821, + "learning_rate": 0.0002, + "loss": 0.2865, + "step": 15800 + }, + { + "epoch": 3.3151604109876285, + "grad_norm": 0.15668243169784546, + "learning_rate": 0.0002, + "loss": 0.2914, + "step": 15810 + }, + { + "epoch": 3.317257286642902, + "grad_norm": 0.0925045758485794, + "learning_rate": 0.0002, + "loss": 0.2436, + "step": 15820 + }, + { + "epoch": 3.3193541622981755, + "grad_norm": 0.11761250346899033, + "learning_rate": 0.0002, + "loss": 0.2842, + "step": 15830 + }, + { + "epoch": 3.3214510379534494, + "grad_norm": 0.1782289296388626, + "learning_rate": 0.0002, + "loss": 0.2631, + "step": 15840 + }, + { + "epoch": 3.323547913608723, + "grad_norm": 26.115440368652344, + "learning_rate": 0.0002, + "loss": 0.3119, + "step": 15850 + }, + { + "epoch": 3.325644789263997, + "grad_norm": 8.95678424835205, + "learning_rate": 0.0002, + "loss": 0.2799, + "step": 15860 + }, + { + "epoch": 3.3277416649192704, + "grad_norm": 0.21364077925682068, + "learning_rate": 0.0002, + "loss": 0.2725, + "step": 15870 + }, + { + "epoch": 3.329838540574544, + "grad_norm": 0.23057472705841064, + "learning_rate": 0.0002, + "loss": 0.2458, + "step": 15880 + }, + { + "epoch": 3.3319354162298174, + "grad_norm": 6.40324592590332, + "learning_rate": 0.0002, + "loss": 0.3034, + "step": 15890 + }, + { + "epoch": 3.3340322918850913, + "grad_norm": 4.420671463012695, + "learning_rate": 0.0002, + "loss": 0.3232, + "step": 15900 + }, + { + "epoch": 3.336129167540365, + "grad_norm": 0.1368934065103531, + "learning_rate": 0.0002, + "loss": 0.3034, + "step": 15910 + }, + { + "epoch": 3.3382260431956383, + "grad_norm": 0.18694345653057098, + "learning_rate": 0.0002, + "loss": 0.3256, + "step": 15920 + }, + { + "epoch": 3.3403229188509123, + "grad_norm": 7.2182297706604, + "learning_rate": 0.0002, + "loss": 0.3175, + "step": 15930 + }, + { + "epoch": 3.3424197945061858, + "grad_norm": 0.5887989401817322, + "learning_rate": 0.0002, + "loss": 0.2771, + "step": 15940 + }, + { + "epoch": 3.3445166701614593, + "grad_norm": 0.1084941029548645, + "learning_rate": 0.0002, + "loss": 0.2706, + "step": 15950 + }, + { + "epoch": 3.346613545816733, + "grad_norm": 0.13692770898342133, + "learning_rate": 0.0002, + "loss": 0.2443, + "step": 15960 + }, + { + "epoch": 3.3487104214720067, + "grad_norm": 0.1195460706949234, + "learning_rate": 0.0002, + "loss": 0.2975, + "step": 15970 + }, + { + "epoch": 3.35080729712728, + "grad_norm": 0.12102661281824112, + "learning_rate": 0.0002, + "loss": 0.2478, + "step": 15980 + }, + { + "epoch": 3.352904172782554, + "grad_norm": 0.11354684084653854, + "learning_rate": 0.0002, + "loss": 0.2766, + "step": 15990 + }, + { + "epoch": 3.3550010484378276, + "grad_norm": 0.300896018743515, + "learning_rate": 0.0002, + "loss": 0.2848, + "step": 16000 + }, + { + "epoch": 3.357097924093101, + "grad_norm": 0.09630642086267471, + "learning_rate": 0.0002, + "loss": 0.2432, + "step": 16010 + }, + { + "epoch": 3.359194799748375, + "grad_norm": 0.09073801338672638, + "learning_rate": 0.0002, + "loss": 0.2474, + "step": 16020 + }, + { + "epoch": 3.3612916754036486, + "grad_norm": 0.2423640638589859, + "learning_rate": 0.0002, + "loss": 0.3038, + "step": 16030 + }, + { + "epoch": 3.363388551058922, + "grad_norm": 0.34864917397499084, + "learning_rate": 0.0002, + "loss": 0.2697, + "step": 16040 + }, + { + "epoch": 3.365485426714196, + "grad_norm": 1.5239778757095337, + "learning_rate": 0.0002, + "loss": 0.2771, + "step": 16050 + }, + { + "epoch": 3.3675823023694695, + "grad_norm": 2.1417646408081055, + "learning_rate": 0.0002, + "loss": 0.3491, + "step": 16060 + }, + { + "epoch": 3.369679178024743, + "grad_norm": 0.2309965044260025, + "learning_rate": 0.0002, + "loss": 0.303, + "step": 16070 + }, + { + "epoch": 3.3717760536800165, + "grad_norm": 0.5179856419563293, + "learning_rate": 0.0002, + "loss": 0.2761, + "step": 16080 + }, + { + "epoch": 3.3738729293352905, + "grad_norm": 0.28771182894706726, + "learning_rate": 0.0002, + "loss": 0.2579, + "step": 16090 + }, + { + "epoch": 3.375969804990564, + "grad_norm": 0.08401656895875931, + "learning_rate": 0.0002, + "loss": 0.2695, + "step": 16100 + }, + { + "epoch": 3.378066680645838, + "grad_norm": 0.15731945633888245, + "learning_rate": 0.0002, + "loss": 0.306, + "step": 16110 + }, + { + "epoch": 3.3801635563011114, + "grad_norm": 0.10326915979385376, + "learning_rate": 0.0002, + "loss": 0.2465, + "step": 16120 + }, + { + "epoch": 3.382260431956385, + "grad_norm": 0.46936795115470886, + "learning_rate": 0.0002, + "loss": 0.2676, + "step": 16130 + }, + { + "epoch": 3.3843573076116584, + "grad_norm": 0.10452701896429062, + "learning_rate": 0.0002, + "loss": 0.3082, + "step": 16140 + }, + { + "epoch": 3.3864541832669324, + "grad_norm": 11.49814510345459, + "learning_rate": 0.0002, + "loss": 0.2666, + "step": 16150 + }, + { + "epoch": 3.388551058922206, + "grad_norm": 0.7889800667762756, + "learning_rate": 0.0002, + "loss": 0.2711, + "step": 16160 + }, + { + "epoch": 3.3906479345774794, + "grad_norm": 0.21009521186351776, + "learning_rate": 0.0002, + "loss": 0.2719, + "step": 16170 + }, + { + "epoch": 3.3927448102327533, + "grad_norm": 0.09990337491035461, + "learning_rate": 0.0002, + "loss": 0.295, + "step": 16180 + }, + { + "epoch": 3.394841685888027, + "grad_norm": 1.3079146146774292, + "learning_rate": 0.0002, + "loss": 0.2758, + "step": 16190 + }, + { + "epoch": 3.3969385615433003, + "grad_norm": 2.922199010848999, + "learning_rate": 0.0002, + "loss": 0.2836, + "step": 16200 + }, + { + "epoch": 3.3990354371985743, + "grad_norm": 0.090868279337883, + "learning_rate": 0.0002, + "loss": 0.2932, + "step": 16210 + }, + { + "epoch": 3.4011323128538478, + "grad_norm": 2.9474756717681885, + "learning_rate": 0.0002, + "loss": 0.2778, + "step": 16220 + }, + { + "epoch": 3.4032291885091213, + "grad_norm": 0.11677554249763489, + "learning_rate": 0.0002, + "loss": 0.3101, + "step": 16230 + }, + { + "epoch": 3.405326064164395, + "grad_norm": 1.1826450824737549, + "learning_rate": 0.0002, + "loss": 0.3257, + "step": 16240 + }, + { + "epoch": 3.4074229398196687, + "grad_norm": 0.3995857536792755, + "learning_rate": 0.0002, + "loss": 0.3077, + "step": 16250 + }, + { + "epoch": 3.409519815474942, + "grad_norm": 5.6643266677856445, + "learning_rate": 0.0002, + "loss": 0.2657, + "step": 16260 + }, + { + "epoch": 3.411616691130216, + "grad_norm": 10.34603214263916, + "learning_rate": 0.0002, + "loss": 0.2948, + "step": 16270 + }, + { + "epoch": 3.4137135667854897, + "grad_norm": 0.4758007228374481, + "learning_rate": 0.0002, + "loss": 0.2976, + "step": 16280 + }, + { + "epoch": 3.415810442440763, + "grad_norm": 15.302112579345703, + "learning_rate": 0.0002, + "loss": 0.3524, + "step": 16290 + }, + { + "epoch": 3.417907318096037, + "grad_norm": 0.20163509249687195, + "learning_rate": 0.0002, + "loss": 0.2469, + "step": 16300 + }, + { + "epoch": 3.4200041937513106, + "grad_norm": 0.3203161656856537, + "learning_rate": 0.0002, + "loss": 0.301, + "step": 16310 + }, + { + "epoch": 3.422101069406584, + "grad_norm": 4.417267799377441, + "learning_rate": 0.0002, + "loss": 0.355, + "step": 16320 + }, + { + "epoch": 3.4241979450618576, + "grad_norm": 6.401717662811279, + "learning_rate": 0.0002, + "loss": 0.3086, + "step": 16330 + }, + { + "epoch": 3.4262948207171315, + "grad_norm": 0.4465663731098175, + "learning_rate": 0.0002, + "loss": 0.2818, + "step": 16340 + }, + { + "epoch": 3.428391696372405, + "grad_norm": 0.6320145726203918, + "learning_rate": 0.0002, + "loss": 0.3339, + "step": 16350 + }, + { + "epoch": 3.430488572027679, + "grad_norm": 0.0976993590593338, + "learning_rate": 0.0002, + "loss": 0.2778, + "step": 16360 + }, + { + "epoch": 3.4325854476829525, + "grad_norm": 0.10189173370599747, + "learning_rate": 0.0002, + "loss": 0.2648, + "step": 16370 + }, + { + "epoch": 3.434682323338226, + "grad_norm": 5.137659072875977, + "learning_rate": 0.0002, + "loss": 0.3291, + "step": 16380 + }, + { + "epoch": 3.4367791989934995, + "grad_norm": 61.313507080078125, + "learning_rate": 0.0002, + "loss": 0.291, + "step": 16390 + }, + { + "epoch": 3.4388760746487734, + "grad_norm": 0.3099044859409332, + "learning_rate": 0.0002, + "loss": 0.3097, + "step": 16400 + }, + { + "epoch": 3.440972950304047, + "grad_norm": 0.15905895829200745, + "learning_rate": 0.0002, + "loss": 0.2659, + "step": 16410 + }, + { + "epoch": 3.4430698259593204, + "grad_norm": 0.0829874575138092, + "learning_rate": 0.0002, + "loss": 0.2452, + "step": 16420 + }, + { + "epoch": 3.4451667016145944, + "grad_norm": 0.1382455974817276, + "learning_rate": 0.0002, + "loss": 0.2753, + "step": 16430 + }, + { + "epoch": 3.447263577269868, + "grad_norm": 1.0949729681015015, + "learning_rate": 0.0002, + "loss": 0.2467, + "step": 16440 + }, + { + "epoch": 3.4493604529251414, + "grad_norm": 0.46144360303878784, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 16450 + }, + { + "epoch": 3.4514573285804153, + "grad_norm": 0.08408217877149582, + "learning_rate": 0.0002, + "loss": 0.2811, + "step": 16460 + }, + { + "epoch": 3.453554204235689, + "grad_norm": 13.272506713867188, + "learning_rate": 0.0002, + "loss": 0.2794, + "step": 16470 + }, + { + "epoch": 3.4556510798909623, + "grad_norm": 0.13884180784225464, + "learning_rate": 0.0002, + "loss": 0.3177, + "step": 16480 + }, + { + "epoch": 3.4577479555462363, + "grad_norm": 0.12016461789608002, + "learning_rate": 0.0002, + "loss": 0.2717, + "step": 16490 + }, + { + "epoch": 3.4598448312015098, + "grad_norm": 0.13663773238658905, + "learning_rate": 0.0002, + "loss": 0.2805, + "step": 16500 + }, + { + "epoch": 3.4619417068567833, + "grad_norm": 0.6070636510848999, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 16510 + }, + { + "epoch": 3.464038582512057, + "grad_norm": 28.876049041748047, + "learning_rate": 0.0002, + "loss": 0.427, + "step": 16520 + }, + { + "epoch": 3.4661354581673307, + "grad_norm": 0.09710516780614853, + "learning_rate": 0.0002, + "loss": 0.2669, + "step": 16530 + }, + { + "epoch": 3.468232333822604, + "grad_norm": 0.1740933507680893, + "learning_rate": 0.0002, + "loss": 0.2508, + "step": 16540 + }, + { + "epoch": 3.470329209477878, + "grad_norm": 0.11242100596427917, + "learning_rate": 0.0002, + "loss": 0.2622, + "step": 16550 + }, + { + "epoch": 3.4724260851331517, + "grad_norm": 3.6925201416015625, + "learning_rate": 0.0002, + "loss": 0.3327, + "step": 16560 + }, + { + "epoch": 3.474522960788425, + "grad_norm": 0.13877654075622559, + "learning_rate": 0.0002, + "loss": 0.2746, + "step": 16570 + }, + { + "epoch": 3.4766198364436987, + "grad_norm": 0.12891237437725067, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 16580 + }, + { + "epoch": 3.4787167120989726, + "grad_norm": 1.3704564571380615, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 16590 + }, + { + "epoch": 3.480813587754246, + "grad_norm": 14.254056930541992, + "learning_rate": 0.0002, + "loss": 0.3183, + "step": 16600 + }, + { + "epoch": 3.48291046340952, + "grad_norm": 0.10901372879743576, + "learning_rate": 0.0002, + "loss": 0.2473, + "step": 16610 + }, + { + "epoch": 3.4850073390647935, + "grad_norm": 0.0792684555053711, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 16620 + }, + { + "epoch": 3.487104214720067, + "grad_norm": 0.08512654900550842, + "learning_rate": 0.0002, + "loss": 0.2864, + "step": 16630 + }, + { + "epoch": 3.4892010903753405, + "grad_norm": 0.2178574949502945, + "learning_rate": 0.0002, + "loss": 0.2715, + "step": 16640 + }, + { + "epoch": 3.4912979660306145, + "grad_norm": 0.10082219541072845, + "learning_rate": 0.0002, + "loss": 0.316, + "step": 16650 + }, + { + "epoch": 3.493394841685888, + "grad_norm": 2.8539416790008545, + "learning_rate": 0.0002, + "loss": 0.2639, + "step": 16660 + }, + { + "epoch": 3.4954917173411615, + "grad_norm": 0.11633438616991043, + "learning_rate": 0.0002, + "loss": 0.2763, + "step": 16670 + }, + { + "epoch": 3.4975885929964354, + "grad_norm": 0.08961481600999832, + "learning_rate": 0.0002, + "loss": 0.2778, + "step": 16680 + }, + { + "epoch": 3.499685468651709, + "grad_norm": 0.08329380303621292, + "learning_rate": 0.0002, + "loss": 0.2459, + "step": 16690 + }, + { + "epoch": 3.5017823443069824, + "grad_norm": 0.2124902606010437, + "learning_rate": 0.0002, + "loss": 0.2787, + "step": 16700 + }, + { + "epoch": 3.5038792199622564, + "grad_norm": 0.07479745149612427, + "learning_rate": 0.0002, + "loss": 0.2424, + "step": 16710 + }, + { + "epoch": 3.50597609561753, + "grad_norm": 0.8046960830688477, + "learning_rate": 0.0002, + "loss": 0.3099, + "step": 16720 + }, + { + "epoch": 3.5080729712728034, + "grad_norm": 0.17260153591632843, + "learning_rate": 0.0002, + "loss": 0.2451, + "step": 16730 + }, + { + "epoch": 3.5101698469280773, + "grad_norm": 0.4296531677246094, + "learning_rate": 0.0002, + "loss": 0.3065, + "step": 16740 + }, + { + "epoch": 3.512266722583351, + "grad_norm": 1.1957885026931763, + "learning_rate": 0.0002, + "loss": 0.2784, + "step": 16750 + }, + { + "epoch": 3.5143635982386243, + "grad_norm": 0.08505085110664368, + "learning_rate": 0.0002, + "loss": 0.2827, + "step": 16760 + }, + { + "epoch": 3.516460473893898, + "grad_norm": 1.681307315826416, + "learning_rate": 0.0002, + "loss": 0.2906, + "step": 16770 + }, + { + "epoch": 3.5185573495491718, + "grad_norm": 0.19510099291801453, + "learning_rate": 0.0002, + "loss": 0.247, + "step": 16780 + }, + { + "epoch": 3.5206542252044453, + "grad_norm": 0.40013596415519714, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 16790 + }, + { + "epoch": 3.522751100859719, + "grad_norm": 20.94623565673828, + "learning_rate": 0.0002, + "loss": 0.3363, + "step": 16800 + }, + { + "epoch": 3.5248479765149927, + "grad_norm": 0.11770687252283096, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 16810 + }, + { + "epoch": 3.526944852170266, + "grad_norm": 0.47683051228523254, + "learning_rate": 0.0002, + "loss": 0.2424, + "step": 16820 + }, + { + "epoch": 3.5290417278255397, + "grad_norm": 0.3234413266181946, + "learning_rate": 0.0002, + "loss": 0.2468, + "step": 16830 + }, + { + "epoch": 3.5311386034808137, + "grad_norm": 66.1975326538086, + "learning_rate": 0.0002, + "loss": 0.3623, + "step": 16840 + }, + { + "epoch": 3.533235479136087, + "grad_norm": 10.475239753723145, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 16850 + }, + { + "epoch": 3.535332354791361, + "grad_norm": 0.19146008789539337, + "learning_rate": 0.0002, + "loss": 0.3097, + "step": 16860 + }, + { + "epoch": 3.5374292304466346, + "grad_norm": 0.10526818037033081, + "learning_rate": 0.0002, + "loss": 0.3286, + "step": 16870 + }, + { + "epoch": 3.539526106101908, + "grad_norm": 0.2106499820947647, + "learning_rate": 0.0002, + "loss": 0.326, + "step": 16880 + }, + { + "epoch": 3.5416229817571816, + "grad_norm": 0.16286778450012207, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 16890 + }, + { + "epoch": 3.5437198574124555, + "grad_norm": 0.2097437083721161, + "learning_rate": 0.0002, + "loss": 0.3212, + "step": 16900 + }, + { + "epoch": 3.545816733067729, + "grad_norm": 0.8928031921386719, + "learning_rate": 0.0002, + "loss": 0.2455, + "step": 16910 + }, + { + "epoch": 3.547913608723003, + "grad_norm": 0.20158305764198303, + "learning_rate": 0.0002, + "loss": 0.2946, + "step": 16920 + }, + { + "epoch": 3.5500104843782765, + "grad_norm": 8.960654258728027, + "learning_rate": 0.0002, + "loss": 0.2485, + "step": 16930 + }, + { + "epoch": 3.55210736003355, + "grad_norm": 3.4482638835906982, + "learning_rate": 0.0002, + "loss": 0.2985, + "step": 16940 + }, + { + "epoch": 3.5542042356888235, + "grad_norm": 1.4937551021575928, + "learning_rate": 0.0002, + "loss": 0.2759, + "step": 16950 + }, + { + "epoch": 3.5563011113440974, + "grad_norm": 3.2123825550079346, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 16960 + }, + { + "epoch": 3.558397986999371, + "grad_norm": 0.10020950436592102, + "learning_rate": 0.0002, + "loss": 0.2486, + "step": 16970 + }, + { + "epoch": 3.5604948626546444, + "grad_norm": 0.6148474812507629, + "learning_rate": 0.0002, + "loss": 0.3961, + "step": 16980 + }, + { + "epoch": 3.5625917383099184, + "grad_norm": 0.11368487775325775, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 16990 + }, + { + "epoch": 3.564688613965192, + "grad_norm": 0.4417993724346161, + "learning_rate": 0.0002, + "loss": 0.4003, + "step": 17000 + }, + { + "epoch": 3.5667854896204654, + "grad_norm": 0.10857246071100235, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 17010 + }, + { + "epoch": 3.568882365275739, + "grad_norm": 0.4231739044189453, + "learning_rate": 0.0002, + "loss": 0.2452, + "step": 17020 + }, + { + "epoch": 3.570979240931013, + "grad_norm": 0.21850062906742096, + "learning_rate": 0.0002, + "loss": 0.3663, + "step": 17030 + }, + { + "epoch": 3.5730761165862863, + "grad_norm": 0.10159339010715485, + "learning_rate": 0.0002, + "loss": 0.3287, + "step": 17040 + }, + { + "epoch": 3.5751729922415603, + "grad_norm": 0.2898092269897461, + "learning_rate": 0.0002, + "loss": 0.2447, + "step": 17050 + }, + { + "epoch": 3.5772698678968338, + "grad_norm": 7.484994411468506, + "learning_rate": 0.0002, + "loss": 0.2693, + "step": 17060 + }, + { + "epoch": 3.5793667435521073, + "grad_norm": 27.404441833496094, + "learning_rate": 0.0002, + "loss": 0.3023, + "step": 17070 + }, + { + "epoch": 3.5814636192073808, + "grad_norm": 3.085254430770874, + "learning_rate": 0.0002, + "loss": 0.2913, + "step": 17080 + }, + { + "epoch": 3.5835604948626547, + "grad_norm": 2.5266666412353516, + "learning_rate": 0.0002, + "loss": 0.3504, + "step": 17090 + }, + { + "epoch": 3.585657370517928, + "grad_norm": 0.17346085608005524, + "learning_rate": 0.0002, + "loss": 0.2483, + "step": 17100 + }, + { + "epoch": 3.587754246173202, + "grad_norm": 5.260568618774414, + "learning_rate": 0.0002, + "loss": 0.2502, + "step": 17110 + }, + { + "epoch": 3.5898511218284757, + "grad_norm": 8.863641738891602, + "learning_rate": 0.0002, + "loss": 0.2695, + "step": 17120 + }, + { + "epoch": 3.591947997483749, + "grad_norm": 0.36519744992256165, + "learning_rate": 0.0002, + "loss": 0.2684, + "step": 17130 + }, + { + "epoch": 3.5940448731390227, + "grad_norm": 1.6287387609481812, + "learning_rate": 0.0002, + "loss": 0.2827, + "step": 17140 + }, + { + "epoch": 3.5961417487942966, + "grad_norm": 15.245902061462402, + "learning_rate": 0.0002, + "loss": 0.2643, + "step": 17150 + }, + { + "epoch": 3.59823862444957, + "grad_norm": 26.536678314208984, + "learning_rate": 0.0002, + "loss": 0.3645, + "step": 17160 + }, + { + "epoch": 3.600335500104844, + "grad_norm": 10.872307777404785, + "learning_rate": 0.0002, + "loss": 0.4323, + "step": 17170 + }, + { + "epoch": 3.6024323757601175, + "grad_norm": 14.617993354797363, + "learning_rate": 0.0002, + "loss": 0.2648, + "step": 17180 + }, + { + "epoch": 3.604529251415391, + "grad_norm": 3.0631062984466553, + "learning_rate": 0.0002, + "loss": 0.3932, + "step": 17190 + }, + { + "epoch": 3.6066261270706645, + "grad_norm": 7.422130107879639, + "learning_rate": 0.0002, + "loss": 0.3115, + "step": 17200 + }, + { + "epoch": 3.6087230027259385, + "grad_norm": 2.1507389545440674, + "learning_rate": 0.0002, + "loss": 0.2567, + "step": 17210 + }, + { + "epoch": 3.610819878381212, + "grad_norm": 3.0334174633026123, + "learning_rate": 0.0002, + "loss": 0.251, + "step": 17220 + }, + { + "epoch": 3.6129167540364855, + "grad_norm": 3.9418137073516846, + "learning_rate": 0.0002, + "loss": 0.3277, + "step": 17230 + }, + { + "epoch": 3.6150136296917594, + "grad_norm": 14.20264720916748, + "learning_rate": 0.0002, + "loss": 0.2983, + "step": 17240 + }, + { + "epoch": 3.617110505347033, + "grad_norm": 5.825155735015869, + "learning_rate": 0.0002, + "loss": 0.3208, + "step": 17250 + }, + { + "epoch": 3.6192073810023064, + "grad_norm": 15.464000701904297, + "learning_rate": 0.0002, + "loss": 0.2542, + "step": 17260 + }, + { + "epoch": 3.62130425665758, + "grad_norm": 3.840942621231079, + "learning_rate": 0.0002, + "loss": 0.2642, + "step": 17270 + }, + { + "epoch": 3.623401132312854, + "grad_norm": 1.0902684926986694, + "learning_rate": 0.0002, + "loss": 0.345, + "step": 17280 + }, + { + "epoch": 3.6254980079681274, + "grad_norm": 10.194125175476074, + "learning_rate": 0.0002, + "loss": 0.2463, + "step": 17290 + }, + { + "epoch": 3.6275948836234013, + "grad_norm": 0.5578778982162476, + "learning_rate": 0.0002, + "loss": 0.2488, + "step": 17300 + }, + { + "epoch": 3.629691759278675, + "grad_norm": 0.55025714635849, + "learning_rate": 0.0002, + "loss": 0.2786, + "step": 17310 + }, + { + "epoch": 3.6317886349339483, + "grad_norm": 2.2328052520751953, + "learning_rate": 0.0002, + "loss": 0.301, + "step": 17320 + }, + { + "epoch": 3.633885510589222, + "grad_norm": 21.651777267456055, + "learning_rate": 0.0002, + "loss": 0.3668, + "step": 17330 + }, + { + "epoch": 3.6359823862444958, + "grad_norm": 10.565069198608398, + "learning_rate": 0.0002, + "loss": 0.2922, + "step": 17340 + }, + { + "epoch": 3.6380792618997693, + "grad_norm": 12.717290878295898, + "learning_rate": 0.0002, + "loss": 0.3583, + "step": 17350 + }, + { + "epoch": 3.640176137555043, + "grad_norm": 8.592275619506836, + "learning_rate": 0.0002, + "loss": 0.3302, + "step": 17360 + }, + { + "epoch": 3.6422730132103167, + "grad_norm": 1.2025078535079956, + "learning_rate": 0.0002, + "loss": 0.2525, + "step": 17370 + }, + { + "epoch": 3.64436988886559, + "grad_norm": 0.9852961301803589, + "learning_rate": 0.0002, + "loss": 0.3788, + "step": 17380 + }, + { + "epoch": 3.6464667645208637, + "grad_norm": 11.788804054260254, + "learning_rate": 0.0002, + "loss": 0.2989, + "step": 17390 + }, + { + "epoch": 3.6485636401761377, + "grad_norm": 2.911071300506592, + "learning_rate": 0.0002, + "loss": 0.3112, + "step": 17400 + }, + { + "epoch": 3.650660515831411, + "grad_norm": 1.0645408630371094, + "learning_rate": 0.0002, + "loss": 0.2476, + "step": 17410 + }, + { + "epoch": 3.652757391486685, + "grad_norm": 0.17559315264225006, + "learning_rate": 0.0002, + "loss": 0.2792, + "step": 17420 + }, + { + "epoch": 3.6548542671419586, + "grad_norm": 0.39271262288093567, + "learning_rate": 0.0002, + "loss": 0.2963, + "step": 17430 + }, + { + "epoch": 3.656951142797232, + "grad_norm": 0.6976293921470642, + "learning_rate": 0.0002, + "loss": 0.3541, + "step": 17440 + }, + { + "epoch": 3.6590480184525056, + "grad_norm": 0.7792041897773743, + "learning_rate": 0.0002, + "loss": 0.2937, + "step": 17450 + }, + { + "epoch": 3.6611448941077795, + "grad_norm": 0.5158128142356873, + "learning_rate": 0.0002, + "loss": 0.3441, + "step": 17460 + }, + { + "epoch": 3.663241769763053, + "grad_norm": 2.1796839237213135, + "learning_rate": 0.0002, + "loss": 0.3525, + "step": 17470 + }, + { + "epoch": 3.6653386454183265, + "grad_norm": 7.255102157592773, + "learning_rate": 0.0002, + "loss": 0.4197, + "step": 17480 + }, + { + "epoch": 3.6674355210736005, + "grad_norm": 1.5092166662216187, + "learning_rate": 0.0002, + "loss": 0.3176, + "step": 17490 + }, + { + "epoch": 3.669532396728874, + "grad_norm": 0.32293596863746643, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 17500 + }, + { + "epoch": 3.6716292723841475, + "grad_norm": 1.2418729066848755, + "learning_rate": 0.0002, + "loss": 0.2499, + "step": 17510 + }, + { + "epoch": 3.673726148039421, + "grad_norm": 8.73947811126709, + "learning_rate": 0.0002, + "loss": 0.3851, + "step": 17520 + }, + { + "epoch": 3.675823023694695, + "grad_norm": 0.43778735399246216, + "learning_rate": 0.0002, + "loss": 0.2479, + "step": 17530 + }, + { + "epoch": 3.6779198993499684, + "grad_norm": 5.211206436157227, + "learning_rate": 0.0002, + "loss": 0.3464, + "step": 17540 + }, + { + "epoch": 3.6800167750052424, + "grad_norm": 0.3823220133781433, + "learning_rate": 0.0002, + "loss": 0.3327, + "step": 17550 + }, + { + "epoch": 3.682113650660516, + "grad_norm": 1.5433018207550049, + "learning_rate": 0.0002, + "loss": 0.2942, + "step": 17560 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 0.5892255306243896, + "learning_rate": 0.0002, + "loss": 0.2552, + "step": 17570 + }, + { + "epoch": 3.686307401971063, + "grad_norm": 0.17415273189544678, + "learning_rate": 0.0002, + "loss": 0.2468, + "step": 17580 + }, + { + "epoch": 3.688404277626337, + "grad_norm": 0.13931488990783691, + "learning_rate": 0.0002, + "loss": 0.2599, + "step": 17590 + }, + { + "epoch": 3.6905011532816103, + "grad_norm": 0.3451101779937744, + "learning_rate": 0.0002, + "loss": 0.3056, + "step": 17600 + }, + { + "epoch": 3.6925980289368843, + "grad_norm": 5.913544654846191, + "learning_rate": 0.0002, + "loss": 0.2928, + "step": 17610 + }, + { + "epoch": 3.6946949045921578, + "grad_norm": 1.0163569450378418, + "learning_rate": 0.0002, + "loss": 0.2797, + "step": 17620 + }, + { + "epoch": 3.6967917802474313, + "grad_norm": 0.27711400389671326, + "learning_rate": 0.0002, + "loss": 0.3853, + "step": 17630 + }, + { + "epoch": 3.6988886559027048, + "grad_norm": 0.5875184535980225, + "learning_rate": 0.0002, + "loss": 0.2511, + "step": 17640 + }, + { + "epoch": 3.7009855315579787, + "grad_norm": 0.15635895729064941, + "learning_rate": 0.0002, + "loss": 0.2719, + "step": 17650 + }, + { + "epoch": 3.703082407213252, + "grad_norm": 0.15206418931484222, + "learning_rate": 0.0002, + "loss": 0.325, + "step": 17660 + }, + { + "epoch": 3.705179282868526, + "grad_norm": 0.08338475227355957, + "learning_rate": 0.0002, + "loss": 0.281, + "step": 17670 + }, + { + "epoch": 3.7072761585237997, + "grad_norm": 0.4060516059398651, + "learning_rate": 0.0002, + "loss": 0.3038, + "step": 17680 + }, + { + "epoch": 3.709373034179073, + "grad_norm": 0.23129555583000183, + "learning_rate": 0.0002, + "loss": 0.2948, + "step": 17690 + }, + { + "epoch": 3.7114699098343467, + "grad_norm": 0.19247643649578094, + "learning_rate": 0.0002, + "loss": 0.319, + "step": 17700 + }, + { + "epoch": 3.7135667854896206, + "grad_norm": 1.2437766790390015, + "learning_rate": 0.0002, + "loss": 0.274, + "step": 17710 + }, + { + "epoch": 3.715663661144894, + "grad_norm": 0.10187529772520065, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 17720 + }, + { + "epoch": 3.7177605368001676, + "grad_norm": 0.1905500441789627, + "learning_rate": 0.0002, + "loss": 0.277, + "step": 17730 + }, + { + "epoch": 3.7198574124554415, + "grad_norm": 0.18983417749404907, + "learning_rate": 0.0002, + "loss": 0.3274, + "step": 17740 + }, + { + "epoch": 3.721954288110715, + "grad_norm": 12.874968528747559, + "learning_rate": 0.0002, + "loss": 0.2807, + "step": 17750 + }, + { + "epoch": 3.7240511637659885, + "grad_norm": 0.11451554298400879, + "learning_rate": 0.0002, + "loss": 0.3368, + "step": 17760 + }, + { + "epoch": 3.726148039421262, + "grad_norm": 0.10391887277364731, + "learning_rate": 0.0002, + "loss": 0.2853, + "step": 17770 + }, + { + "epoch": 3.728244915076536, + "grad_norm": 0.11095970869064331, + "learning_rate": 0.0002, + "loss": 0.32, + "step": 17780 + }, + { + "epoch": 3.7303417907318095, + "grad_norm": 0.09512672573328018, + "learning_rate": 0.0002, + "loss": 0.2787, + "step": 17790 + }, + { + "epoch": 3.7324386663870834, + "grad_norm": 0.19171316921710968, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 17800 + }, + { + "epoch": 3.734535542042357, + "grad_norm": 0.12946048378944397, + "learning_rate": 0.0002, + "loss": 0.2843, + "step": 17810 + }, + { + "epoch": 3.7366324176976304, + "grad_norm": 0.06607726216316223, + "learning_rate": 0.0002, + "loss": 0.2844, + "step": 17820 + }, + { + "epoch": 3.738729293352904, + "grad_norm": 0.40193358063697815, + "learning_rate": 0.0002, + "loss": 0.3293, + "step": 17830 + }, + { + "epoch": 3.740826169008178, + "grad_norm": 0.14429262280464172, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 17840 + }, + { + "epoch": 3.7429230446634514, + "grad_norm": 0.11837374418973923, + "learning_rate": 0.0002, + "loss": 0.2532, + "step": 17850 + }, + { + "epoch": 3.7450199203187253, + "grad_norm": 0.182389035820961, + "learning_rate": 0.0002, + "loss": 0.2528, + "step": 17860 + }, + { + "epoch": 3.747116795973999, + "grad_norm": 2.132552146911621, + "learning_rate": 0.0002, + "loss": 0.3393, + "step": 17870 + }, + { + "epoch": 3.7492136716292723, + "grad_norm": 0.14405368268489838, + "learning_rate": 0.0002, + "loss": 0.2848, + "step": 17880 + }, + { + "epoch": 3.751310547284546, + "grad_norm": 20.570039749145508, + "learning_rate": 0.0002, + "loss": 0.277, + "step": 17890 + }, + { + "epoch": 3.7534074229398198, + "grad_norm": 8.199270248413086, + "learning_rate": 0.0002, + "loss": 0.2777, + "step": 17900 + }, + { + "epoch": 3.7555042985950933, + "grad_norm": 1.1704336404800415, + "learning_rate": 0.0002, + "loss": 0.25, + "step": 17910 + }, + { + "epoch": 3.757601174250367, + "grad_norm": 4.142645835876465, + "learning_rate": 0.0002, + "loss": 0.3166, + "step": 17920 + }, + { + "epoch": 3.7596980499056407, + "grad_norm": 1.2840096950531006, + "learning_rate": 0.0002, + "loss": 0.2479, + "step": 17930 + }, + { + "epoch": 3.761794925560914, + "grad_norm": 0.2658231854438782, + "learning_rate": 0.0002, + "loss": 0.295, + "step": 17940 + }, + { + "epoch": 3.7638918012161877, + "grad_norm": 10.568625450134277, + "learning_rate": 0.0002, + "loss": 0.2504, + "step": 17950 + }, + { + "epoch": 3.7659886768714617, + "grad_norm": 0.7367125153541565, + "learning_rate": 0.0002, + "loss": 0.2729, + "step": 17960 + }, + { + "epoch": 3.768085552526735, + "grad_norm": 15.142430305480957, + "learning_rate": 0.0002, + "loss": 0.3903, + "step": 17970 + }, + { + "epoch": 3.7701824281820087, + "grad_norm": 0.8723947405815125, + "learning_rate": 0.0002, + "loss": 0.2495, + "step": 17980 + }, + { + "epoch": 3.7722793038372826, + "grad_norm": 0.542462944984436, + "learning_rate": 0.0002, + "loss": 0.2764, + "step": 17990 + }, + { + "epoch": 3.774376179492556, + "grad_norm": 0.5069801807403564, + "learning_rate": 0.0002, + "loss": 0.2774, + "step": 18000 + }, + { + "epoch": 3.7764730551478296, + "grad_norm": 28.867746353149414, + "learning_rate": 0.0002, + "loss": 0.3051, + "step": 18010 + }, + { + "epoch": 3.778569930803103, + "grad_norm": 19.830551147460938, + "learning_rate": 0.0002, + "loss": 0.3139, + "step": 18020 + }, + { + "epoch": 3.780666806458377, + "grad_norm": 13.429841041564941, + "learning_rate": 0.0002, + "loss": 0.2492, + "step": 18030 + }, + { + "epoch": 3.7827636821136505, + "grad_norm": 0.7679717540740967, + "learning_rate": 0.0002, + "loss": 0.2486, + "step": 18040 + }, + { + "epoch": 3.7848605577689245, + "grad_norm": 0.280867338180542, + "learning_rate": 0.0002, + "loss": 0.2875, + "step": 18050 + }, + { + "epoch": 3.786957433424198, + "grad_norm": 0.576727032661438, + "learning_rate": 0.0002, + "loss": 0.2481, + "step": 18060 + }, + { + "epoch": 3.7890543090794715, + "grad_norm": 12.595909118652344, + "learning_rate": 0.0002, + "loss": 0.3076, + "step": 18070 + }, + { + "epoch": 3.791151184734745, + "grad_norm": 8.194901466369629, + "learning_rate": 0.0002, + "loss": 0.3268, + "step": 18080 + }, + { + "epoch": 3.793248060390019, + "grad_norm": 0.9795236587524414, + "learning_rate": 0.0002, + "loss": 0.2486, + "step": 18090 + }, + { + "epoch": 3.7953449360452924, + "grad_norm": 0.21155227720737457, + "learning_rate": 0.0002, + "loss": 0.2967, + "step": 18100 + }, + { + "epoch": 3.7974418117005664, + "grad_norm": 0.17290417850017548, + "learning_rate": 0.0002, + "loss": 0.2474, + "step": 18110 + }, + { + "epoch": 3.79953868735584, + "grad_norm": 0.1753161996603012, + "learning_rate": 0.0002, + "loss": 0.2472, + "step": 18120 + }, + { + "epoch": 3.8016355630111134, + "grad_norm": 0.16531455516815186, + "learning_rate": 0.0002, + "loss": 0.3316, + "step": 18130 + }, + { + "epoch": 3.803732438666387, + "grad_norm": 0.27308961749076843, + "learning_rate": 0.0002, + "loss": 0.2461, + "step": 18140 + }, + { + "epoch": 3.805829314321661, + "grad_norm": 0.885614812374115, + "learning_rate": 0.0002, + "loss": 0.3144, + "step": 18150 + }, + { + "epoch": 3.8079261899769343, + "grad_norm": 0.23859460651874542, + "learning_rate": 0.0002, + "loss": 0.3965, + "step": 18160 + }, + { + "epoch": 3.8100230656322083, + "grad_norm": 0.8167296051979065, + "learning_rate": 0.0002, + "loss": 0.2485, + "step": 18170 + }, + { + "epoch": 3.8121199412874818, + "grad_norm": 14.719027519226074, + "learning_rate": 0.0002, + "loss": 0.3193, + "step": 18180 + }, + { + "epoch": 3.8142168169427553, + "grad_norm": 18.75009536743164, + "learning_rate": 0.0002, + "loss": 0.3505, + "step": 18190 + }, + { + "epoch": 3.8163136925980288, + "grad_norm": 0.7599632740020752, + "learning_rate": 0.0002, + "loss": 0.2432, + "step": 18200 + }, + { + "epoch": 3.8184105682533027, + "grad_norm": 2.993929862976074, + "learning_rate": 0.0002, + "loss": 0.4737, + "step": 18210 + }, + { + "epoch": 3.820507443908576, + "grad_norm": 0.14531955122947693, + "learning_rate": 0.0002, + "loss": 0.2779, + "step": 18220 + }, + { + "epoch": 3.8226043195638497, + "grad_norm": 0.21003879606723785, + "learning_rate": 0.0002, + "loss": 0.2883, + "step": 18230 + }, + { + "epoch": 3.8247011952191237, + "grad_norm": 15.457707405090332, + "learning_rate": 0.0002, + "loss": 0.3807, + "step": 18240 + }, + { + "epoch": 3.826798070874397, + "grad_norm": 0.12080878019332886, + "learning_rate": 0.0002, + "loss": 0.269, + "step": 18250 + }, + { + "epoch": 3.8288949465296707, + "grad_norm": 0.20637743175029755, + "learning_rate": 0.0002, + "loss": 0.2463, + "step": 18260 + }, + { + "epoch": 3.830991822184944, + "grad_norm": 16.86208724975586, + "learning_rate": 0.0002, + "loss": 0.3, + "step": 18270 + }, + { + "epoch": 3.833088697840218, + "grad_norm": 0.15844117105007172, + "learning_rate": 0.0002, + "loss": 0.2729, + "step": 18280 + }, + { + "epoch": 3.8351855734954916, + "grad_norm": 1.4949979782104492, + "learning_rate": 0.0002, + "loss": 0.2463, + "step": 18290 + }, + { + "epoch": 3.8372824491507656, + "grad_norm": 0.2570766806602478, + "learning_rate": 0.0002, + "loss": 0.2507, + "step": 18300 + }, + { + "epoch": 3.839379324806039, + "grad_norm": 0.27064013481140137, + "learning_rate": 0.0002, + "loss": 0.2641, + "step": 18310 + }, + { + "epoch": 3.8414762004613126, + "grad_norm": 0.10055670887231827, + "learning_rate": 0.0002, + "loss": 0.2928, + "step": 18320 + }, + { + "epoch": 3.843573076116586, + "grad_norm": 2.022714614868164, + "learning_rate": 0.0002, + "loss": 0.2814, + "step": 18330 + }, + { + "epoch": 3.84566995177186, + "grad_norm": 0.17389263212680817, + "learning_rate": 0.0002, + "loss": 0.3323, + "step": 18340 + }, + { + "epoch": 3.8477668274271335, + "grad_norm": 1.4782214164733887, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 18350 + }, + { + "epoch": 3.8498637030824074, + "grad_norm": 1.1519814729690552, + "learning_rate": 0.0002, + "loss": 0.297, + "step": 18360 + }, + { + "epoch": 3.851960578737681, + "grad_norm": 0.8425706028938293, + "learning_rate": 0.0002, + "loss": 0.2739, + "step": 18370 + }, + { + "epoch": 3.8540574543929544, + "grad_norm": 0.17960014939308167, + "learning_rate": 0.0002, + "loss": 0.3024, + "step": 18380 + }, + { + "epoch": 3.856154330048228, + "grad_norm": 0.7193502187728882, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 18390 + }, + { + "epoch": 3.858251205703502, + "grad_norm": 0.5273101925849915, + "learning_rate": 0.0002, + "loss": 0.3255, + "step": 18400 + }, + { + "epoch": 3.8603480813587754, + "grad_norm": 9.503673553466797, + "learning_rate": 0.0002, + "loss": 0.3206, + "step": 18410 + }, + { + "epoch": 3.8624449570140493, + "grad_norm": 0.15235988795757294, + "learning_rate": 0.0002, + "loss": 0.2463, + "step": 18420 + }, + { + "epoch": 3.864541832669323, + "grad_norm": 0.24428336322307587, + "learning_rate": 0.0002, + "loss": 0.2477, + "step": 18430 + }, + { + "epoch": 3.8666387083245963, + "grad_norm": 12.147161483764648, + "learning_rate": 0.0002, + "loss": 0.2859, + "step": 18440 + }, + { + "epoch": 3.86873558397987, + "grad_norm": 0.24032670259475708, + "learning_rate": 0.0002, + "loss": 0.2585, + "step": 18450 + }, + { + "epoch": 3.8708324596351438, + "grad_norm": 0.626478374004364, + "learning_rate": 0.0002, + "loss": 0.2903, + "step": 18460 + }, + { + "epoch": 3.8729293352904173, + "grad_norm": 0.7303033471107483, + "learning_rate": 0.0002, + "loss": 0.3426, + "step": 18470 + }, + { + "epoch": 3.8750262109456908, + "grad_norm": 1.397939920425415, + "learning_rate": 0.0002, + "loss": 0.3911, + "step": 18480 + }, + { + "epoch": 3.8771230866009647, + "grad_norm": 0.2924603223800659, + "learning_rate": 0.0002, + "loss": 0.3147, + "step": 18490 + }, + { + "epoch": 3.879219962256238, + "grad_norm": 11.158350944519043, + "learning_rate": 0.0002, + "loss": 0.2983, + "step": 18500 + }, + { + "epoch": 3.8813168379115117, + "grad_norm": 0.773551344871521, + "learning_rate": 0.0002, + "loss": 0.2881, + "step": 18510 + }, + { + "epoch": 3.883413713566785, + "grad_norm": 0.7801753282546997, + "learning_rate": 0.0002, + "loss": 0.2908, + "step": 18520 + }, + { + "epoch": 3.885510589222059, + "grad_norm": 0.9729545712471008, + "learning_rate": 0.0002, + "loss": 0.3189, + "step": 18530 + }, + { + "epoch": 3.8876074648773327, + "grad_norm": 7.370116233825684, + "learning_rate": 0.0002, + "loss": 0.2981, + "step": 18540 + }, + { + "epoch": 3.8897043405326066, + "grad_norm": 1.4260889291763306, + "learning_rate": 0.0002, + "loss": 0.3241, + "step": 18550 + }, + { + "epoch": 3.89180121618788, + "grad_norm": 1.2304461002349854, + "learning_rate": 0.0002, + "loss": 0.2543, + "step": 18560 + }, + { + "epoch": 3.8938980918431536, + "grad_norm": 10.699807167053223, + "learning_rate": 0.0002, + "loss": 0.3361, + "step": 18570 + }, + { + "epoch": 3.895994967498427, + "grad_norm": 5.950390338897705, + "learning_rate": 0.0002, + "loss": 0.3532, + "step": 18580 + }, + { + "epoch": 3.898091843153701, + "grad_norm": 0.6864115595817566, + "learning_rate": 0.0002, + "loss": 0.2665, + "step": 18590 + }, + { + "epoch": 3.9001887188089746, + "grad_norm": 1.4243322610855103, + "learning_rate": 0.0002, + "loss": 0.2468, + "step": 18600 + }, + { + "epoch": 3.9022855944642485, + "grad_norm": 15.98333740234375, + "learning_rate": 0.0002, + "loss": 0.3061, + "step": 18610 + }, + { + "epoch": 3.904382470119522, + "grad_norm": 0.6285650730133057, + "learning_rate": 0.0002, + "loss": 0.3156, + "step": 18620 + }, + { + "epoch": 3.9064793457747955, + "grad_norm": 10.524168014526367, + "learning_rate": 0.0002, + "loss": 0.3351, + "step": 18630 + }, + { + "epoch": 3.908576221430069, + "grad_norm": 0.1589958518743515, + "learning_rate": 0.0002, + "loss": 0.2456, + "step": 18640 + }, + { + "epoch": 3.910673097085343, + "grad_norm": 0.1514245867729187, + "learning_rate": 0.0002, + "loss": 0.2736, + "step": 18650 + }, + { + "epoch": 3.9127699727406164, + "grad_norm": 0.2135051190853119, + "learning_rate": 0.0002, + "loss": 0.2448, + "step": 18660 + }, + { + "epoch": 3.9148668483958904, + "grad_norm": 0.35259103775024414, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 18670 + }, + { + "epoch": 3.916963724051164, + "grad_norm": 0.27322497963905334, + "learning_rate": 0.0002, + "loss": 0.3172, + "step": 18680 + }, + { + "epoch": 3.9190605997064374, + "grad_norm": 7.8673906326293945, + "learning_rate": 0.0002, + "loss": 0.323, + "step": 18690 + }, + { + "epoch": 3.921157475361711, + "grad_norm": 4.626621723175049, + "learning_rate": 0.0002, + "loss": 0.2822, + "step": 18700 + }, + { + "epoch": 3.923254351016985, + "grad_norm": 0.13362011313438416, + "learning_rate": 0.0002, + "loss": 0.2921, + "step": 18710 + }, + { + "epoch": 3.9253512266722583, + "grad_norm": 0.2543129622936249, + "learning_rate": 0.0002, + "loss": 0.3575, + "step": 18720 + }, + { + "epoch": 3.927448102327532, + "grad_norm": 0.21031315624713898, + "learning_rate": 0.0002, + "loss": 0.3077, + "step": 18730 + }, + { + "epoch": 3.9295449779828058, + "grad_norm": 1.1506986618041992, + "learning_rate": 0.0002, + "loss": 0.2742, + "step": 18740 + }, + { + "epoch": 3.9316418536380793, + "grad_norm": 9.54204273223877, + "learning_rate": 0.0002, + "loss": 0.3883, + "step": 18750 + }, + { + "epoch": 3.9337387292933528, + "grad_norm": 0.9476527571678162, + "learning_rate": 0.0002, + "loss": 0.2547, + "step": 18760 + }, + { + "epoch": 3.9358356049486263, + "grad_norm": 0.33886954188346863, + "learning_rate": 0.0002, + "loss": 0.3091, + "step": 18770 + }, + { + "epoch": 3.9379324806039, + "grad_norm": 0.41044721007347107, + "learning_rate": 0.0002, + "loss": 0.2484, + "step": 18780 + }, + { + "epoch": 3.9400293562591737, + "grad_norm": 13.303047180175781, + "learning_rate": 0.0002, + "loss": 0.3628, + "step": 18790 + }, + { + "epoch": 3.9421262319144477, + "grad_norm": 0.8790237307548523, + "learning_rate": 0.0002, + "loss": 0.2819, + "step": 18800 + }, + { + "epoch": 3.944223107569721, + "grad_norm": 0.29523730278015137, + "learning_rate": 0.0002, + "loss": 0.2491, + "step": 18810 + }, + { + "epoch": 3.9463199832249947, + "grad_norm": 1.0903148651123047, + "learning_rate": 0.0002, + "loss": 0.317, + "step": 18820 + }, + { + "epoch": 3.948416858880268, + "grad_norm": 2.2300570011138916, + "learning_rate": 0.0002, + "loss": 0.3695, + "step": 18830 + }, + { + "epoch": 3.950513734535542, + "grad_norm": 9.108022689819336, + "learning_rate": 0.0002, + "loss": 0.253, + "step": 18840 + }, + { + "epoch": 3.9526106101908156, + "grad_norm": 0.383888304233551, + "learning_rate": 0.0002, + "loss": 0.2647, + "step": 18850 + }, + { + "epoch": 3.9547074858460896, + "grad_norm": 9.491477012634277, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 18860 + }, + { + "epoch": 3.956804361501363, + "grad_norm": 0.9953404068946838, + "learning_rate": 0.0002, + "loss": 0.3413, + "step": 18870 + }, + { + "epoch": 3.9589012371566366, + "grad_norm": 0.2776716649532318, + "learning_rate": 0.0002, + "loss": 0.4267, + "step": 18880 + }, + { + "epoch": 3.96099811281191, + "grad_norm": 2.3032448291778564, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 18890 + }, + { + "epoch": 3.963094988467184, + "grad_norm": 0.35753384232521057, + "learning_rate": 0.0002, + "loss": 0.3361, + "step": 18900 + }, + { + "epoch": 3.9651918641224575, + "grad_norm": 2.1207659244537354, + "learning_rate": 0.0002, + "loss": 0.366, + "step": 18910 + }, + { + "epoch": 3.9672887397777314, + "grad_norm": 16.24346923828125, + "learning_rate": 0.0002, + "loss": 0.323, + "step": 18920 + }, + { + "epoch": 3.969385615433005, + "grad_norm": 0.1958165466785431, + "learning_rate": 0.0002, + "loss": 0.2467, + "step": 18930 + }, + { + "epoch": 3.9714824910882784, + "grad_norm": 0.3013644218444824, + "learning_rate": 0.0002, + "loss": 0.315, + "step": 18940 + }, + { + "epoch": 3.973579366743552, + "grad_norm": 9.605280876159668, + "learning_rate": 0.0002, + "loss": 0.2968, + "step": 18950 + }, + { + "epoch": 3.975676242398826, + "grad_norm": 4.225147724151611, + "learning_rate": 0.0002, + "loss": 0.2526, + "step": 18960 + }, + { + "epoch": 3.9777731180540994, + "grad_norm": 3.812077045440674, + "learning_rate": 0.0002, + "loss": 0.2869, + "step": 18970 + }, + { + "epoch": 3.979869993709373, + "grad_norm": 1.702895164489746, + "learning_rate": 0.0002, + "loss": 0.2474, + "step": 18980 + }, + { + "epoch": 3.981966869364647, + "grad_norm": 0.22785484790802002, + "learning_rate": 0.0002, + "loss": 0.335, + "step": 18990 + }, + { + "epoch": 3.9840637450199203, + "grad_norm": 2.7882556915283203, + "learning_rate": 0.0002, + "loss": 0.248, + "step": 19000 + }, + { + "epoch": 3.986160620675194, + "grad_norm": 7.80819034576416, + "learning_rate": 0.0002, + "loss": 0.3786, + "step": 19010 + }, + { + "epoch": 3.9882574963304673, + "grad_norm": 0.829639732837677, + "learning_rate": 0.0002, + "loss": 0.3423, + "step": 19020 + }, + { + "epoch": 3.9903543719857413, + "grad_norm": 4.340249538421631, + "learning_rate": 0.0002, + "loss": 0.2621, + "step": 19030 + }, + { + "epoch": 3.9924512476410148, + "grad_norm": 3.492180824279785, + "learning_rate": 0.0002, + "loss": 0.2488, + "step": 19040 + }, + { + "epoch": 3.9945481232962887, + "grad_norm": 1.187342882156372, + "learning_rate": 0.0002, + "loss": 0.2897, + "step": 19050 + }, + { + "epoch": 3.9966449989515622, + "grad_norm": 0.2959229052066803, + "learning_rate": 0.0002, + "loss": 0.3287, + "step": 19060 + }, + { + "epoch": 3.9987418746068357, + "grad_norm": 2.1524133682250977, + "learning_rate": 0.0002, + "loss": 0.2873, + "step": 19070 + }, + { + "epoch": 4.000838750262109, + "grad_norm": 0.7680649757385254, + "learning_rate": 0.0002, + "loss": 0.2441, + "step": 19080 + }, + { + "epoch": 4.002935625917383, + "grad_norm": 3.3124094009399414, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 19090 + }, + { + "epoch": 4.005032501572657, + "grad_norm": 8.343110084533691, + "learning_rate": 0.0002, + "loss": 0.2892, + "step": 19100 + }, + { + "epoch": 4.007129377227931, + "grad_norm": 0.46529117226600647, + "learning_rate": 0.0002, + "loss": 0.3646, + "step": 19110 + }, + { + "epoch": 4.009226252883204, + "grad_norm": 1.364998698234558, + "learning_rate": 0.0002, + "loss": 0.2461, + "step": 19120 + }, + { + "epoch": 4.011323128538478, + "grad_norm": 1.5017606019973755, + "learning_rate": 0.0002, + "loss": 0.4226, + "step": 19130 + }, + { + "epoch": 4.013420004193751, + "grad_norm": 4.142571926116943, + "learning_rate": 0.0002, + "loss": 0.3087, + "step": 19140 + }, + { + "epoch": 4.015516879849025, + "grad_norm": 8.241575241088867, + "learning_rate": 0.0002, + "loss": 0.2812, + "step": 19150 + }, + { + "epoch": 4.017613755504299, + "grad_norm": 3.2250149250030518, + "learning_rate": 0.0002, + "loss": 0.3269, + "step": 19160 + }, + { + "epoch": 4.0197106311595725, + "grad_norm": 0.6055905222892761, + "learning_rate": 0.0002, + "loss": 0.2535, + "step": 19170 + }, + { + "epoch": 4.021807506814846, + "grad_norm": 0.2831921875476837, + "learning_rate": 0.0002, + "loss": 0.2572, + "step": 19180 + }, + { + "epoch": 4.0239043824701195, + "grad_norm": 0.3150438964366913, + "learning_rate": 0.0002, + "loss": 0.3007, + "step": 19190 + }, + { + "epoch": 4.026001258125393, + "grad_norm": 12.150964736938477, + "learning_rate": 0.0002, + "loss": 0.281, + "step": 19200 + }, + { + "epoch": 4.0280981337806665, + "grad_norm": 13.298822402954102, + "learning_rate": 0.0002, + "loss": 0.3141, + "step": 19210 + }, + { + "epoch": 4.030195009435941, + "grad_norm": 0.38828134536743164, + "learning_rate": 0.0002, + "loss": 0.254, + "step": 19220 + }, + { + "epoch": 4.032291885091214, + "grad_norm": 0.5457937717437744, + "learning_rate": 0.0002, + "loss": 0.2458, + "step": 19230 + }, + { + "epoch": 4.034388760746488, + "grad_norm": 0.26707398891448975, + "learning_rate": 0.0002, + "loss": 0.249, + "step": 19240 + }, + { + "epoch": 4.036485636401761, + "grad_norm": 26.078208923339844, + "learning_rate": 0.0002, + "loss": 0.3605, + "step": 19250 + }, + { + "epoch": 4.038582512057035, + "grad_norm": 0.1831461787223816, + "learning_rate": 0.0002, + "loss": 0.2754, + "step": 19260 + }, + { + "epoch": 4.040679387712308, + "grad_norm": 3.2851014137268066, + "learning_rate": 0.0002, + "loss": 0.3133, + "step": 19270 + }, + { + "epoch": 4.042776263367582, + "grad_norm": 6.421793460845947, + "learning_rate": 0.0002, + "loss": 0.3104, + "step": 19280 + }, + { + "epoch": 4.044873139022856, + "grad_norm": 22.904735565185547, + "learning_rate": 0.0002, + "loss": 0.3006, + "step": 19290 + }, + { + "epoch": 4.04697001467813, + "grad_norm": 9.35460090637207, + "learning_rate": 0.0002, + "loss": 0.3001, + "step": 19300 + }, + { + "epoch": 4.049066890333403, + "grad_norm": 2.4891185760498047, + "learning_rate": 0.0002, + "loss": 0.3267, + "step": 19310 + }, + { + "epoch": 4.051163765988677, + "grad_norm": 3.316619396209717, + "learning_rate": 0.0002, + "loss": 0.248, + "step": 19320 + }, + { + "epoch": 4.05326064164395, + "grad_norm": 8.631692886352539, + "learning_rate": 0.0002, + "loss": 0.3136, + "step": 19330 + }, + { + "epoch": 4.055357517299224, + "grad_norm": 1.2316213846206665, + "learning_rate": 0.0002, + "loss": 0.4906, + "step": 19340 + }, + { + "epoch": 4.057454392954498, + "grad_norm": 0.4751741588115692, + "learning_rate": 0.0002, + "loss": 0.2471, + "step": 19350 + }, + { + "epoch": 4.059551268609772, + "grad_norm": 2.4287467002868652, + "learning_rate": 0.0002, + "loss": 0.3005, + "step": 19360 + }, + { + "epoch": 4.061648144265045, + "grad_norm": 5.299201488494873, + "learning_rate": 0.0002, + "loss": 0.2505, + "step": 19370 + }, + { + "epoch": 4.063745019920319, + "grad_norm": 0.45759066939353943, + "learning_rate": 0.0002, + "loss": 0.2545, + "step": 19380 + }, + { + "epoch": 4.065841895575592, + "grad_norm": 8.147871017456055, + "learning_rate": 0.0002, + "loss": 0.3042, + "step": 19390 + }, + { + "epoch": 4.067938771230866, + "grad_norm": 3.946077823638916, + "learning_rate": 0.0002, + "loss": 0.3312, + "step": 19400 + }, + { + "epoch": 4.07003564688614, + "grad_norm": 24.572731018066406, + "learning_rate": 0.0002, + "loss": 0.4547, + "step": 19410 + }, + { + "epoch": 4.072132522541414, + "grad_norm": 3.766494035720825, + "learning_rate": 0.0002, + "loss": 0.2459, + "step": 19420 + }, + { + "epoch": 4.074229398196687, + "grad_norm": 7.838687419891357, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 19430 + }, + { + "epoch": 4.076326273851961, + "grad_norm": 0.370241641998291, + "learning_rate": 0.0002, + "loss": 0.2907, + "step": 19440 + }, + { + "epoch": 4.078423149507234, + "grad_norm": 0.26800379157066345, + "learning_rate": 0.0002, + "loss": 0.2482, + "step": 19450 + }, + { + "epoch": 4.080520025162508, + "grad_norm": 0.5113723874092102, + "learning_rate": 0.0002, + "loss": 0.2461, + "step": 19460 + }, + { + "epoch": 4.082616900817781, + "grad_norm": 0.381949245929718, + "learning_rate": 0.0002, + "loss": 0.3141, + "step": 19470 + }, + { + "epoch": 4.0847137764730554, + "grad_norm": 0.7983457446098328, + "learning_rate": 0.0002, + "loss": 0.2473, + "step": 19480 + }, + { + "epoch": 4.086810652128329, + "grad_norm": 0.41288188099861145, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 19490 + }, + { + "epoch": 4.0889075277836024, + "grad_norm": 0.24723954498767853, + "learning_rate": 0.0002, + "loss": 0.2868, + "step": 19500 + }, + { + "epoch": 4.091004403438876, + "grad_norm": 0.1693434864282608, + "learning_rate": 0.0002, + "loss": 0.2461, + "step": 19510 + }, + { + "epoch": 4.0931012790941494, + "grad_norm": 0.557501494884491, + "learning_rate": 0.0002, + "loss": 0.3037, + "step": 19520 + }, + { + "epoch": 4.095198154749423, + "grad_norm": 0.6941953897476196, + "learning_rate": 0.0002, + "loss": 0.2497, + "step": 19530 + }, + { + "epoch": 4.097295030404697, + "grad_norm": 0.5740838050842285, + "learning_rate": 0.0002, + "loss": 0.2511, + "step": 19540 + }, + { + "epoch": 4.099391906059971, + "grad_norm": 1.7676140069961548, + "learning_rate": 0.0002, + "loss": 0.2954, + "step": 19550 + }, + { + "epoch": 4.101488781715244, + "grad_norm": 5.540597915649414, + "learning_rate": 0.0002, + "loss": 0.3015, + "step": 19560 + }, + { + "epoch": 4.103585657370518, + "grad_norm": 5.951892375946045, + "learning_rate": 0.0002, + "loss": 0.3055, + "step": 19570 + }, + { + "epoch": 4.105682533025791, + "grad_norm": 11.446410179138184, + "learning_rate": 0.0002, + "loss": 0.2508, + "step": 19580 + }, + { + "epoch": 4.107779408681065, + "grad_norm": 0.43041107058525085, + "learning_rate": 0.0002, + "loss": 0.3027, + "step": 19590 + }, + { + "epoch": 4.109876284336339, + "grad_norm": 0.48323339223861694, + "learning_rate": 0.0002, + "loss": 0.2867, + "step": 19600 + }, + { + "epoch": 4.111973159991613, + "grad_norm": 0.5927427411079407, + "learning_rate": 0.0002, + "loss": 0.3368, + "step": 19610 + }, + { + "epoch": 4.114070035646886, + "grad_norm": 2.952925205230713, + "learning_rate": 0.0002, + "loss": 0.3301, + "step": 19620 + }, + { + "epoch": 4.11616691130216, + "grad_norm": 4.207739353179932, + "learning_rate": 0.0002, + "loss": 0.3181, + "step": 19630 + }, + { + "epoch": 4.118263786957433, + "grad_norm": 2.4143640995025635, + "learning_rate": 0.0002, + "loss": 0.3088, + "step": 19640 + }, + { + "epoch": 4.120360662612707, + "grad_norm": 0.6868274211883545, + "learning_rate": 0.0002, + "loss": 0.2837, + "step": 19650 + }, + { + "epoch": 4.122457538267981, + "grad_norm": 1.6815471649169922, + "learning_rate": 0.0002, + "loss": 0.2477, + "step": 19660 + }, + { + "epoch": 4.124554413923255, + "grad_norm": 0.845425009727478, + "learning_rate": 0.0002, + "loss": 0.3732, + "step": 19670 + }, + { + "epoch": 4.126651289578528, + "grad_norm": 2.966221332550049, + "learning_rate": 0.0002, + "loss": 0.25, + "step": 19680 + }, + { + "epoch": 4.128748165233802, + "grad_norm": 0.26147690415382385, + "learning_rate": 0.0002, + "loss": 0.2464, + "step": 19690 + }, + { + "epoch": 4.130845040889075, + "grad_norm": 1.215938925743103, + "learning_rate": 0.0002, + "loss": 0.3892, + "step": 19700 + }, + { + "epoch": 4.132941916544349, + "grad_norm": 0.2903278172016144, + "learning_rate": 0.0002, + "loss": 0.308, + "step": 19710 + }, + { + "epoch": 4.135038792199623, + "grad_norm": 0.33962321281433105, + "learning_rate": 0.0002, + "loss": 0.3393, + "step": 19720 + }, + { + "epoch": 4.1371356678548965, + "grad_norm": 2.0429956912994385, + "learning_rate": 0.0002, + "loss": 0.2933, + "step": 19730 + }, + { + "epoch": 4.13923254351017, + "grad_norm": 3.423198938369751, + "learning_rate": 0.0002, + "loss": 0.3338, + "step": 19740 + }, + { + "epoch": 4.1413294191654435, + "grad_norm": 0.21673530340194702, + "learning_rate": 0.0002, + "loss": 0.3316, + "step": 19750 + }, + { + "epoch": 4.143426294820717, + "grad_norm": 23.13585090637207, + "learning_rate": 0.0002, + "loss": 0.2715, + "step": 19760 + }, + { + "epoch": 4.1455231704759905, + "grad_norm": 18.557872772216797, + "learning_rate": 0.0002, + "loss": 0.3716, + "step": 19770 + }, + { + "epoch": 4.147620046131264, + "grad_norm": 32.08924865722656, + "learning_rate": 0.0002, + "loss": 0.37, + "step": 19780 + }, + { + "epoch": 4.149716921786538, + "grad_norm": 0.26837587356567383, + "learning_rate": 0.0002, + "loss": 0.2617, + "step": 19790 + }, + { + "epoch": 4.151813797441812, + "grad_norm": 0.33404791355133057, + "learning_rate": 0.0002, + "loss": 0.2472, + "step": 19800 + }, + { + "epoch": 4.153910673097085, + "grad_norm": 0.5519550442695618, + "learning_rate": 0.0002, + "loss": 0.4107, + "step": 19810 + }, + { + "epoch": 4.156007548752359, + "grad_norm": 2.7054030895233154, + "learning_rate": 0.0002, + "loss": 0.2485, + "step": 19820 + }, + { + "epoch": 4.158104424407632, + "grad_norm": 0.9935354590415955, + "learning_rate": 0.0002, + "loss": 0.3529, + "step": 19830 + }, + { + "epoch": 4.160201300062906, + "grad_norm": 0.3448370397090912, + "learning_rate": 0.0002, + "loss": 0.3026, + "step": 19840 + }, + { + "epoch": 4.16229817571818, + "grad_norm": 21.877988815307617, + "learning_rate": 0.0002, + "loss": 0.3752, + "step": 19850 + }, + { + "epoch": 4.164395051373454, + "grad_norm": 1.063585638999939, + "learning_rate": 0.0002, + "loss": 0.2481, + "step": 19860 + }, + { + "epoch": 4.166491927028727, + "grad_norm": 0.8071320056915283, + "learning_rate": 0.0002, + "loss": 0.3006, + "step": 19870 + }, + { + "epoch": 4.168588802684001, + "grad_norm": 0.8395910859107971, + "learning_rate": 0.0002, + "loss": 0.2501, + "step": 19880 + }, + { + "epoch": 4.170685678339274, + "grad_norm": 0.6836159825325012, + "learning_rate": 0.0002, + "loss": 0.2925, + "step": 19890 + }, + { + "epoch": 4.172782553994548, + "grad_norm": 0.48117589950561523, + "learning_rate": 0.0002, + "loss": 0.3058, + "step": 19900 + }, + { + "epoch": 4.174879429649822, + "grad_norm": 0.6200854778289795, + "learning_rate": 0.0002, + "loss": 0.2484, + "step": 19910 + }, + { + "epoch": 4.176976305305096, + "grad_norm": 15.760970115661621, + "learning_rate": 0.0002, + "loss": 0.3091, + "step": 19920 + }, + { + "epoch": 4.179073180960369, + "grad_norm": 27.362468719482422, + "learning_rate": 0.0002, + "loss": 0.445, + "step": 19930 + }, + { + "epoch": 4.181170056615643, + "grad_norm": 1.0544220209121704, + "learning_rate": 0.0002, + "loss": 0.3394, + "step": 19940 + }, + { + "epoch": 4.183266932270916, + "grad_norm": 8.582993507385254, + "learning_rate": 0.0002, + "loss": 0.2508, + "step": 19950 + }, + { + "epoch": 4.18536380792619, + "grad_norm": 0.3823242485523224, + "learning_rate": 0.0002, + "loss": 0.5184, + "step": 19960 + }, + { + "epoch": 4.187460683581463, + "grad_norm": 0.1922634392976761, + "learning_rate": 0.0002, + "loss": 0.2462, + "step": 19970 + }, + { + "epoch": 4.189557559236738, + "grad_norm": 0.3688085377216339, + "learning_rate": 0.0002, + "loss": 0.2484, + "step": 19980 + }, + { + "epoch": 4.191654434892011, + "grad_norm": 0.38985419273376465, + "learning_rate": 0.0002, + "loss": 0.247, + "step": 19990 + }, + { + "epoch": 4.193751310547285, + "grad_norm": 14.897371292114258, + "learning_rate": 0.0002, + "loss": 0.2517, + "step": 20000 + }, + { + "epoch": 4.195848186202558, + "grad_norm": 0.27810803055763245, + "learning_rate": 0.0002, + "loss": 0.2945, + "step": 20010 + }, + { + "epoch": 4.197945061857832, + "grad_norm": 2.055335283279419, + "learning_rate": 0.0002, + "loss": 0.3384, + "step": 20020 + }, + { + "epoch": 4.200041937513105, + "grad_norm": 35.41399383544922, + "learning_rate": 0.0002, + "loss": 0.2872, + "step": 20030 + }, + { + "epoch": 4.2021388131683794, + "grad_norm": 0.3352704644203186, + "learning_rate": 0.0002, + "loss": 0.25, + "step": 20040 + }, + { + "epoch": 4.204235688823653, + "grad_norm": 10.434800148010254, + "learning_rate": 0.0002, + "loss": 0.3184, + "step": 20050 + }, + { + "epoch": 4.2063325644789265, + "grad_norm": 0.7769026756286621, + "learning_rate": 0.0002, + "loss": 0.4273, + "step": 20060 + }, + { + "epoch": 4.2084294401342, + "grad_norm": 0.48685234785079956, + "learning_rate": 0.0002, + "loss": 0.3596, + "step": 20070 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 8.412596702575684, + "learning_rate": 0.0002, + "loss": 0.2924, + "step": 20080 + }, + { + "epoch": 4.212623191444747, + "grad_norm": 7.435706615447998, + "learning_rate": 0.0002, + "loss": 0.3786, + "step": 20090 + }, + { + "epoch": 4.214720067100021, + "grad_norm": 0.4827675223350525, + "learning_rate": 0.0002, + "loss": 0.3096, + "step": 20100 + }, + { + "epoch": 4.216816942755295, + "grad_norm": 2.754267692565918, + "learning_rate": 0.0002, + "loss": 0.3012, + "step": 20110 + }, + { + "epoch": 4.218913818410568, + "grad_norm": 0.3863971531391144, + "learning_rate": 0.0002, + "loss": 0.3547, + "step": 20120 + }, + { + "epoch": 4.221010694065842, + "grad_norm": 0.3589477837085724, + "learning_rate": 0.0002, + "loss": 0.2464, + "step": 20130 + }, + { + "epoch": 4.223107569721115, + "grad_norm": 9.28115177154541, + "learning_rate": 0.0002, + "loss": 0.3512, + "step": 20140 + }, + { + "epoch": 4.225204445376389, + "grad_norm": 1.1233867406845093, + "learning_rate": 0.0002, + "loss": 0.2903, + "step": 20150 + }, + { + "epoch": 4.227301321031663, + "grad_norm": 2.778885841369629, + "learning_rate": 0.0002, + "loss": 0.2465, + "step": 20160 + }, + { + "epoch": 4.229398196686937, + "grad_norm": 0.5360897779464722, + "learning_rate": 0.0002, + "loss": 0.2699, + "step": 20170 + }, + { + "epoch": 4.23149507234221, + "grad_norm": 16.425745010375977, + "learning_rate": 0.0002, + "loss": 0.3357, + "step": 20180 + }, + { + "epoch": 4.233591947997484, + "grad_norm": 20.68720054626465, + "learning_rate": 0.0002, + "loss": 0.3336, + "step": 20190 + }, + { + "epoch": 4.235688823652757, + "grad_norm": 8.801888465881348, + "learning_rate": 0.0002, + "loss": 0.2671, + "step": 20200 + }, + { + "epoch": 4.237785699308031, + "grad_norm": 5.202653884887695, + "learning_rate": 0.0002, + "loss": 0.3124, + "step": 20210 + }, + { + "epoch": 4.239882574963305, + "grad_norm": 20.8682804107666, + "learning_rate": 0.0002, + "loss": 0.2606, + "step": 20220 + }, + { + "epoch": 4.241979450618579, + "grad_norm": 2.6714205741882324, + "learning_rate": 0.0002, + "loss": 0.2463, + "step": 20230 + }, + { + "epoch": 4.244076326273852, + "grad_norm": 20.23837661743164, + "learning_rate": 0.0002, + "loss": 0.3427, + "step": 20240 + }, + { + "epoch": 4.246173201929126, + "grad_norm": 2.1679625511169434, + "learning_rate": 0.0002, + "loss": 0.3103, + "step": 20250 + }, + { + "epoch": 4.248270077584399, + "grad_norm": 27.004417419433594, + "learning_rate": 0.0002, + "loss": 0.4334, + "step": 20260 + }, + { + "epoch": 4.250366953239673, + "grad_norm": 0.6717422008514404, + "learning_rate": 0.0002, + "loss": 0.2503, + "step": 20270 + }, + { + "epoch": 4.252463828894946, + "grad_norm": 0.48832905292510986, + "learning_rate": 0.0002, + "loss": 0.3939, + "step": 20280 + }, + { + "epoch": 4.2545607045502205, + "grad_norm": 19.908430099487305, + "learning_rate": 0.0002, + "loss": 0.312, + "step": 20290 + }, + { + "epoch": 4.256657580205494, + "grad_norm": 6.089963912963867, + "learning_rate": 0.0002, + "loss": 0.2523, + "step": 20300 + }, + { + "epoch": 4.2587544558607675, + "grad_norm": 6.796069622039795, + "learning_rate": 0.0002, + "loss": 0.2714, + "step": 20310 + }, + { + "epoch": 4.260851331516041, + "grad_norm": 9.671527862548828, + "learning_rate": 0.0002, + "loss": 0.3746, + "step": 20320 + }, + { + "epoch": 4.2629482071713145, + "grad_norm": 10.420218467712402, + "learning_rate": 0.0002, + "loss": 0.299, + "step": 20330 + }, + { + "epoch": 4.265045082826588, + "grad_norm": 6.379571437835693, + "learning_rate": 0.0002, + "loss": 0.2959, + "step": 20340 + }, + { + "epoch": 4.267141958481862, + "grad_norm": 0.3346550464630127, + "learning_rate": 0.0002, + "loss": 0.3061, + "step": 20350 + }, + { + "epoch": 4.269238834137136, + "grad_norm": 2.433529853820801, + "learning_rate": 0.0002, + "loss": 0.3757, + "step": 20360 + }, + { + "epoch": 4.271335709792409, + "grad_norm": 56.37220764160156, + "learning_rate": 0.0002, + "loss": 0.3016, + "step": 20370 + }, + { + "epoch": 4.273432585447683, + "grad_norm": 43.44303512573242, + "learning_rate": 0.0002, + "loss": 0.2864, + "step": 20380 + }, + { + "epoch": 4.275529461102956, + "grad_norm": 3.851576328277588, + "learning_rate": 0.0002, + "loss": 0.3883, + "step": 20390 + }, + { + "epoch": 4.27762633675823, + "grad_norm": 41.155879974365234, + "learning_rate": 0.0002, + "loss": 0.3486, + "step": 20400 + }, + { + "epoch": 4.279723212413504, + "grad_norm": 7.531223297119141, + "learning_rate": 0.0002, + "loss": 0.2479, + "step": 20410 + }, + { + "epoch": 4.281820088068778, + "grad_norm": 16.209299087524414, + "learning_rate": 0.0002, + "loss": 0.279, + "step": 20420 + }, + { + "epoch": 4.283916963724051, + "grad_norm": 15.871269226074219, + "learning_rate": 0.0002, + "loss": 0.3702, + "step": 20430 + }, + { + "epoch": 4.286013839379325, + "grad_norm": 26.545909881591797, + "learning_rate": 0.0002, + "loss": 0.2684, + "step": 20440 + }, + { + "epoch": 4.288110715034598, + "grad_norm": 17.186182022094727, + "learning_rate": 0.0002, + "loss": 0.2698, + "step": 20450 + }, + { + "epoch": 4.290207590689872, + "grad_norm": 18.780088424682617, + "learning_rate": 0.0002, + "loss": 0.2631, + "step": 20460 + }, + { + "epoch": 4.292304466345145, + "grad_norm": 624.7259521484375, + "learning_rate": 0.0002, + "loss": 0.9436, + "step": 20470 + }, + { + "epoch": 4.29440134200042, + "grad_norm": 16.724163055419922, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 20480 + }, + { + "epoch": 4.296498217655693, + "grad_norm": 4.588830471038818, + "learning_rate": 0.0002, + "loss": 0.249, + "step": 20490 + }, + { + "epoch": 4.298595093310967, + "grad_norm": 83.1984634399414, + "learning_rate": 0.0002, + "loss": 0.3602, + "step": 20500 + }, + { + "epoch": 4.30069196896624, + "grad_norm": 26.37981605529785, + "learning_rate": 0.0002, + "loss": 0.3221, + "step": 20510 + }, + { + "epoch": 4.302788844621514, + "grad_norm": 47.30278396606445, + "learning_rate": 0.0002, + "loss": 0.2829, + "step": 20520 + }, + { + "epoch": 4.304885720276787, + "grad_norm": 33.844642639160156, + "learning_rate": 0.0002, + "loss": 0.4587, + "step": 20530 + }, + { + "epoch": 4.306982595932062, + "grad_norm": 8.160712242126465, + "learning_rate": 0.0002, + "loss": 0.2647, + "step": 20540 + }, + { + "epoch": 4.309079471587335, + "grad_norm": 55.612396240234375, + "learning_rate": 0.0002, + "loss": 0.3565, + "step": 20550 + }, + { + "epoch": 4.311176347242609, + "grad_norm": 10.851349830627441, + "learning_rate": 0.0002, + "loss": 0.3704, + "step": 20560 + }, + { + "epoch": 4.313273222897882, + "grad_norm": 18.54105567932129, + "learning_rate": 0.0002, + "loss": 0.3417, + "step": 20570 + }, + { + "epoch": 4.315370098553156, + "grad_norm": 5.300592422485352, + "learning_rate": 0.0002, + "loss": 0.2883, + "step": 20580 + }, + { + "epoch": 4.317466974208429, + "grad_norm": 22.01258659362793, + "learning_rate": 0.0002, + "loss": 0.3597, + "step": 20590 + }, + { + "epoch": 4.3195638498637035, + "grad_norm": 50.189697265625, + "learning_rate": 0.0002, + "loss": 0.3421, + "step": 20600 + }, + { + "epoch": 4.321660725518977, + "grad_norm": 13.125137329101562, + "learning_rate": 0.0002, + "loss": 0.3829, + "step": 20610 + }, + { + "epoch": 4.3237576011742505, + "grad_norm": 64.3811264038086, + "learning_rate": 0.0002, + "loss": 0.391, + "step": 20620 + }, + { + "epoch": 4.325854476829524, + "grad_norm": 1.811645746231079, + "learning_rate": 0.0002, + "loss": 0.2639, + "step": 20630 + }, + { + "epoch": 4.3279513524847975, + "grad_norm": 12.494913101196289, + "learning_rate": 0.0002, + "loss": 0.2465, + "step": 20640 + }, + { + "epoch": 4.330048228140071, + "grad_norm": 7.9811201095581055, + "learning_rate": 0.0002, + "loss": 0.2627, + "step": 20650 + }, + { + "epoch": 4.332145103795345, + "grad_norm": 5.287240505218506, + "learning_rate": 0.0002, + "loss": 0.2506, + "step": 20660 + }, + { + "epoch": 4.334241979450619, + "grad_norm": 1.0475136041641235, + "learning_rate": 0.0002, + "loss": 0.2512, + "step": 20670 + }, + { + "epoch": 4.336338855105892, + "grad_norm": 10.19491195678711, + "learning_rate": 0.0002, + "loss": 0.3017, + "step": 20680 + }, + { + "epoch": 4.338435730761166, + "grad_norm": 67.91783142089844, + "learning_rate": 0.0002, + "loss": 0.3486, + "step": 20690 + }, + { + "epoch": 4.340532606416439, + "grad_norm": 3.169609785079956, + "learning_rate": 0.0002, + "loss": 0.3006, + "step": 20700 + }, + { + "epoch": 4.342629482071713, + "grad_norm": 8.942724227905273, + "learning_rate": 0.0002, + "loss": 0.2486, + "step": 20710 + }, + { + "epoch": 4.344726357726987, + "grad_norm": 7.17462682723999, + "learning_rate": 0.0002, + "loss": 0.2686, + "step": 20720 + }, + { + "epoch": 4.346823233382261, + "grad_norm": 6.14696741104126, + "learning_rate": 0.0002, + "loss": 0.3343, + "step": 20730 + }, + { + "epoch": 4.348920109037534, + "grad_norm": 16.166717529296875, + "learning_rate": 0.0002, + "loss": 0.3094, + "step": 20740 + }, + { + "epoch": 4.351016984692808, + "grad_norm": 36.18352508544922, + "learning_rate": 0.0002, + "loss": 0.4342, + "step": 20750 + }, + { + "epoch": 4.353113860348081, + "grad_norm": 3.9650731086730957, + "learning_rate": 0.0002, + "loss": 0.2763, + "step": 20760 + }, + { + "epoch": 4.355210736003355, + "grad_norm": 4.956878185272217, + "learning_rate": 0.0002, + "loss": 0.2491, + "step": 20770 + }, + { + "epoch": 4.357307611658628, + "grad_norm": 33.45060729980469, + "learning_rate": 0.0002, + "loss": 0.2603, + "step": 20780 + }, + { + "epoch": 4.359404487313903, + "grad_norm": 6.082530498504639, + "learning_rate": 0.0002, + "loss": 0.3175, + "step": 20790 + }, + { + "epoch": 4.361501362969176, + "grad_norm": 34.4377555847168, + "learning_rate": 0.0002, + "loss": 0.3072, + "step": 20800 + }, + { + "epoch": 4.36359823862445, + "grad_norm": 11.42460823059082, + "learning_rate": 0.0002, + "loss": 0.2476, + "step": 20810 + }, + { + "epoch": 4.365695114279723, + "grad_norm": 2.2365314960479736, + "learning_rate": 0.0002, + "loss": 0.2976, + "step": 20820 + }, + { + "epoch": 4.367791989934997, + "grad_norm": 12.357730865478516, + "learning_rate": 0.0002, + "loss": 0.3729, + "step": 20830 + }, + { + "epoch": 4.36988886559027, + "grad_norm": 1.3517647981643677, + "learning_rate": 0.0002, + "loss": 0.2908, + "step": 20840 + }, + { + "epoch": 4.3719857412455445, + "grad_norm": 2.2728474140167236, + "learning_rate": 0.0002, + "loss": 0.3596, + "step": 20850 + }, + { + "epoch": 4.374082616900818, + "grad_norm": 0.8363596200942993, + "learning_rate": 0.0002, + "loss": 0.2544, + "step": 20860 + }, + { + "epoch": 4.3761794925560915, + "grad_norm": 2.689927816390991, + "learning_rate": 0.0002, + "loss": 0.2475, + "step": 20870 + }, + { + "epoch": 4.378276368211365, + "grad_norm": 2.3501954078674316, + "learning_rate": 0.0002, + "loss": 0.3222, + "step": 20880 + }, + { + "epoch": 4.3803732438666385, + "grad_norm": 0.6565678715705872, + "learning_rate": 0.0002, + "loss": 0.2511, + "step": 20890 + }, + { + "epoch": 4.382470119521912, + "grad_norm": 12.648674011230469, + "learning_rate": 0.0002, + "loss": 0.2525, + "step": 20900 + }, + { + "epoch": 4.384566995177186, + "grad_norm": 60.73137283325195, + "learning_rate": 0.0002, + "loss": 0.435, + "step": 20910 + }, + { + "epoch": 4.38666387083246, + "grad_norm": 34.06174087524414, + "learning_rate": 0.0002, + "loss": 0.3655, + "step": 20920 + }, + { + "epoch": 4.388760746487733, + "grad_norm": 11.02375316619873, + "learning_rate": 0.0002, + "loss": 0.3604, + "step": 20930 + }, + { + "epoch": 4.390857622143007, + "grad_norm": 6.379631042480469, + "learning_rate": 0.0002, + "loss": 0.3405, + "step": 20940 + }, + { + "epoch": 4.39295449779828, + "grad_norm": 2.8621158599853516, + "learning_rate": 0.0002, + "loss": 0.304, + "step": 20950 + }, + { + "epoch": 4.395051373453554, + "grad_norm": 80.70250701904297, + "learning_rate": 0.0002, + "loss": 0.3652, + "step": 20960 + }, + { + "epoch": 4.397148249108827, + "grad_norm": 2.1578187942504883, + "learning_rate": 0.0002, + "loss": 0.3582, + "step": 20970 + }, + { + "epoch": 4.399245124764102, + "grad_norm": 6.415958404541016, + "learning_rate": 0.0002, + "loss": 0.3253, + "step": 20980 + }, + { + "epoch": 4.401342000419375, + "grad_norm": 21.651079177856445, + "learning_rate": 0.0002, + "loss": 0.3046, + "step": 20990 + }, + { + "epoch": 4.403438876074649, + "grad_norm": 9.651481628417969, + "learning_rate": 0.0002, + "loss": 0.2596, + "step": 21000 + }, + { + "epoch": 4.405535751729922, + "grad_norm": 14.337310791015625, + "learning_rate": 0.0002, + "loss": 0.4149, + "step": 21010 + }, + { + "epoch": 4.407632627385196, + "grad_norm": 43.099578857421875, + "learning_rate": 0.0002, + "loss": 0.2976, + "step": 21020 + }, + { + "epoch": 4.409729503040469, + "grad_norm": 417.2384948730469, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 21030 + }, + { + "epoch": 4.411826378695744, + "grad_norm": 7.477783203125, + "learning_rate": 0.0002, + "loss": 0.2786, + "step": 21040 + }, + { + "epoch": 4.413923254351017, + "grad_norm": 17.863725662231445, + "learning_rate": 0.0002, + "loss": 0.3768, + "step": 21050 + }, + { + "epoch": 4.416020130006291, + "grad_norm": 56.08100509643555, + "learning_rate": 0.0002, + "loss": 0.3009, + "step": 21060 + }, + { + "epoch": 4.418117005661564, + "grad_norm": 22.835973739624023, + "learning_rate": 0.0002, + "loss": 0.3914, + "step": 21070 + }, + { + "epoch": 4.420213881316838, + "grad_norm": 39.187904357910156, + "learning_rate": 0.0002, + "loss": 0.3252, + "step": 21080 + }, + { + "epoch": 4.422310756972111, + "grad_norm": 18.0650634765625, + "learning_rate": 0.0002, + "loss": 0.268, + "step": 21090 + }, + { + "epoch": 4.424407632627386, + "grad_norm": 9.460448265075684, + "learning_rate": 0.0002, + "loss": 0.258, + "step": 21100 + }, + { + "epoch": 4.426504508282659, + "grad_norm": 3.0662641525268555, + "learning_rate": 0.0002, + "loss": 0.2507, + "step": 21110 + }, + { + "epoch": 4.428601383937933, + "grad_norm": 12.422566413879395, + "learning_rate": 0.0002, + "loss": 0.3051, + "step": 21120 + }, + { + "epoch": 4.430698259593206, + "grad_norm": 27.117107391357422, + "learning_rate": 0.0002, + "loss": 0.2727, + "step": 21130 + }, + { + "epoch": 4.43279513524848, + "grad_norm": 41.62672805786133, + "learning_rate": 0.0002, + "loss": 0.3167, + "step": 21140 + }, + { + "epoch": 4.434892010903753, + "grad_norm": 191.19381713867188, + "learning_rate": 0.0002, + "loss": 0.2917, + "step": 21150 + }, + { + "epoch": 4.4369888865590275, + "grad_norm": 7.5268120765686035, + "learning_rate": 0.0002, + "loss": 0.3105, + "step": 21160 + }, + { + "epoch": 4.439085762214301, + "grad_norm": 6.784496307373047, + "learning_rate": 0.0002, + "loss": 0.3932, + "step": 21170 + }, + { + "epoch": 4.4411826378695745, + "grad_norm": 3.4503252506256104, + "learning_rate": 0.0002, + "loss": 0.2953, + "step": 21180 + }, + { + "epoch": 4.443279513524848, + "grad_norm": 1.6427088975906372, + "learning_rate": 0.0002, + "loss": 0.3064, + "step": 21190 + }, + { + "epoch": 4.4453763891801215, + "grad_norm": 4.4671807289123535, + "learning_rate": 0.0002, + "loss": 0.3886, + "step": 21200 + }, + { + "epoch": 4.447473264835395, + "grad_norm": 10.051387786865234, + "learning_rate": 0.0002, + "loss": 0.3087, + "step": 21210 + }, + { + "epoch": 4.449570140490669, + "grad_norm": 19.033756256103516, + "learning_rate": 0.0002, + "loss": 0.4619, + "step": 21220 + }, + { + "epoch": 4.451667016145943, + "grad_norm": 15.37178897857666, + "learning_rate": 0.0002, + "loss": 0.4557, + "step": 21230 + }, + { + "epoch": 4.453763891801216, + "grad_norm": 5.2867279052734375, + "learning_rate": 0.0002, + "loss": 0.2676, + "step": 21240 + }, + { + "epoch": 4.45586076745649, + "grad_norm": 2.558028221130371, + "learning_rate": 0.0002, + "loss": 0.2647, + "step": 21250 + }, + { + "epoch": 4.457957643111763, + "grad_norm": 35.25192642211914, + "learning_rate": 0.0002, + "loss": 0.4654, + "step": 21260 + }, + { + "epoch": 4.460054518767037, + "grad_norm": 27.573955535888672, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 21270 + }, + { + "epoch": 4.46215139442231, + "grad_norm": 3.5311503410339355, + "learning_rate": 0.0002, + "loss": 0.4097, + "step": 21280 + }, + { + "epoch": 4.464248270077585, + "grad_norm": 10.815498352050781, + "learning_rate": 0.0002, + "loss": 0.4622, + "step": 21290 + }, + { + "epoch": 4.466345145732858, + "grad_norm": 8.656516075134277, + "learning_rate": 0.0002, + "loss": 0.2548, + "step": 21300 + }, + { + "epoch": 4.468442021388132, + "grad_norm": 47.45362091064453, + "learning_rate": 0.0002, + "loss": 0.3418, + "step": 21310 + }, + { + "epoch": 4.470538897043405, + "grad_norm": 15.242376327514648, + "learning_rate": 0.0002, + "loss": 0.3816, + "step": 21320 + }, + { + "epoch": 4.472635772698679, + "grad_norm": 13.443343162536621, + "learning_rate": 0.0002, + "loss": 0.3601, + "step": 21330 + }, + { + "epoch": 4.474732648353952, + "grad_norm": 94.393798828125, + "learning_rate": 0.0002, + "loss": 0.4557, + "step": 21340 + }, + { + "epoch": 4.476829524009227, + "grad_norm": 207.5814666748047, + "learning_rate": 0.0002, + "loss": 0.9471, + "step": 21350 + }, + { + "epoch": 4.4789263996645, + "grad_norm": 42.534278869628906, + "learning_rate": 0.0002, + "loss": 0.8521, + "step": 21360 + }, + { + "epoch": 4.481023275319774, + "grad_norm": 15.869437217712402, + "learning_rate": 0.0002, + "loss": 0.4873, + "step": 21370 + }, + { + "epoch": 4.483120150975047, + "grad_norm": 2.4193897247314453, + "learning_rate": 0.0002, + "loss": 0.3548, + "step": 21380 + }, + { + "epoch": 4.485217026630321, + "grad_norm": 17.73488426208496, + "learning_rate": 0.0002, + "loss": 0.2896, + "step": 21390 + }, + { + "epoch": 4.487313902285594, + "grad_norm": 48.30131149291992, + "learning_rate": 0.0002, + "loss": 0.3949, + "step": 21400 + }, + { + "epoch": 4.4894107779408685, + "grad_norm": 5.021493911743164, + "learning_rate": 0.0002, + "loss": 0.3309, + "step": 21410 + }, + { + "epoch": 4.491507653596142, + "grad_norm": 28.720203399658203, + "learning_rate": 0.0002, + "loss": 0.3268, + "step": 21420 + }, + { + "epoch": 4.4936045292514155, + "grad_norm": 2.697554111480713, + "learning_rate": 0.0002, + "loss": 0.3162, + "step": 21430 + }, + { + "epoch": 4.495701404906689, + "grad_norm": 57.5301513671875, + "learning_rate": 0.0002, + "loss": 0.3567, + "step": 21440 + }, + { + "epoch": 4.4977982805619625, + "grad_norm": 6.8275299072265625, + "learning_rate": 0.0002, + "loss": 0.3677, + "step": 21450 + }, + { + "epoch": 4.499895156217236, + "grad_norm": 46.234619140625, + "learning_rate": 0.0002, + "loss": 0.4583, + "step": 21460 + }, + { + "epoch": 4.5019920318725095, + "grad_norm": 6.242780685424805, + "learning_rate": 0.0002, + "loss": 0.2637, + "step": 21470 + }, + { + "epoch": 4.504088907527784, + "grad_norm": 2.4962878227233887, + "learning_rate": 0.0002, + "loss": 0.2564, + "step": 21480 + }, + { + "epoch": 4.506185783183057, + "grad_norm": 17.033349990844727, + "learning_rate": 0.0002, + "loss": 0.3303, + "step": 21490 + }, + { + "epoch": 4.508282658838331, + "grad_norm": 8.312446594238281, + "learning_rate": 0.0002, + "loss": 0.3764, + "step": 21500 + }, + { + "epoch": 4.510379534493604, + "grad_norm": 6.448761940002441, + "learning_rate": 0.0002, + "loss": 0.2504, + "step": 21510 + }, + { + "epoch": 4.512476410148878, + "grad_norm": 64.10653686523438, + "learning_rate": 0.0002, + "loss": 0.3032, + "step": 21520 + }, + { + "epoch": 4.514573285804152, + "grad_norm": 21.183940887451172, + "learning_rate": 0.0002, + "loss": 0.3583, + "step": 21530 + }, + { + "epoch": 4.516670161459426, + "grad_norm": 18.349613189697266, + "learning_rate": 0.0002, + "loss": 0.3541, + "step": 21540 + }, + { + "epoch": 4.518767037114699, + "grad_norm": 21.80953025817871, + "learning_rate": 0.0002, + "loss": 0.3975, + "step": 21550 + }, + { + "epoch": 4.520863912769973, + "grad_norm": 38.548728942871094, + "learning_rate": 0.0002, + "loss": 0.4618, + "step": 21560 + }, + { + "epoch": 4.522960788425246, + "grad_norm": 5.460489273071289, + "learning_rate": 0.0002, + "loss": 0.3965, + "step": 21570 + }, + { + "epoch": 4.52505766408052, + "grad_norm": 52.827781677246094, + "learning_rate": 0.0002, + "loss": 0.3507, + "step": 21580 + }, + { + "epoch": 4.527154539735793, + "grad_norm": 3.8467347621917725, + "learning_rate": 0.0002, + "loss": 0.3747, + "step": 21590 + }, + { + "epoch": 4.529251415391068, + "grad_norm": 7.897753715515137, + "learning_rate": 0.0002, + "loss": 0.2542, + "step": 21600 + }, + { + "epoch": 4.531348291046341, + "grad_norm": 11.481263160705566, + "learning_rate": 0.0002, + "loss": 0.3945, + "step": 21610 + }, + { + "epoch": 4.533445166701615, + "grad_norm": 1.1834608316421509, + "learning_rate": 0.0002, + "loss": 0.3417, + "step": 21620 + }, + { + "epoch": 4.535542042356888, + "grad_norm": 3.0790064334869385, + "learning_rate": 0.0002, + "loss": 0.5012, + "step": 21630 + }, + { + "epoch": 4.537638918012162, + "grad_norm": 5.346984386444092, + "learning_rate": 0.0002, + "loss": 0.3821, + "step": 21640 + }, + { + "epoch": 4.539735793667435, + "grad_norm": 12.952540397644043, + "learning_rate": 0.0002, + "loss": 0.4092, + "step": 21650 + }, + { + "epoch": 4.541832669322709, + "grad_norm": 39.88877487182617, + "learning_rate": 0.0002, + "loss": 0.4586, + "step": 21660 + }, + { + "epoch": 4.543929544977983, + "grad_norm": 41.47772216796875, + "learning_rate": 0.0002, + "loss": 0.3667, + "step": 21670 + }, + { + "epoch": 4.546026420633257, + "grad_norm": 15.996908187866211, + "learning_rate": 0.0002, + "loss": 0.308, + "step": 21680 + }, + { + "epoch": 4.54812329628853, + "grad_norm": 4.2574381828308105, + "learning_rate": 0.0002, + "loss": 0.3494, + "step": 21690 + }, + { + "epoch": 4.550220171943804, + "grad_norm": 16.708751678466797, + "learning_rate": 0.0002, + "loss": 0.4887, + "step": 21700 + }, + { + "epoch": 4.552317047599077, + "grad_norm": 2.954166889190674, + "learning_rate": 0.0002, + "loss": 0.3128, + "step": 21710 + }, + { + "epoch": 4.5544139232543515, + "grad_norm": 4.792957305908203, + "learning_rate": 0.0002, + "loss": 0.461, + "step": 21720 + }, + { + "epoch": 4.556510798909625, + "grad_norm": 1.6357908248901367, + "learning_rate": 0.0002, + "loss": 0.279, + "step": 21730 + }, + { + "epoch": 4.5586076745648985, + "grad_norm": 13.55954647064209, + "learning_rate": 0.0002, + "loss": 0.4901, + "step": 21740 + }, + { + "epoch": 4.560704550220172, + "grad_norm": 0.8995484113693237, + "learning_rate": 0.0002, + "loss": 0.3749, + "step": 21750 + }, + { + "epoch": 4.5628014258754455, + "grad_norm": 5.566488265991211, + "learning_rate": 0.0002, + "loss": 0.3651, + "step": 21760 + }, + { + "epoch": 4.564898301530719, + "grad_norm": 17.96256446838379, + "learning_rate": 0.0002, + "loss": 0.2538, + "step": 21770 + }, + { + "epoch": 4.5669951771859925, + "grad_norm": 10.625880241394043, + "learning_rate": 0.0002, + "loss": 0.2498, + "step": 21780 + }, + { + "epoch": 4.569092052841267, + "grad_norm": 3.7621958255767822, + "learning_rate": 0.0002, + "loss": 0.3333, + "step": 21790 + }, + { + "epoch": 4.57118892849654, + "grad_norm": 5.985695838928223, + "learning_rate": 0.0002, + "loss": 0.2501, + "step": 21800 + }, + { + "epoch": 4.573285804151814, + "grad_norm": 18.665172576904297, + "learning_rate": 0.0002, + "loss": 0.4385, + "step": 21810 + }, + { + "epoch": 4.575382679807087, + "grad_norm": 11.269116401672363, + "learning_rate": 0.0002, + "loss": 0.2882, + "step": 21820 + }, + { + "epoch": 4.577479555462361, + "grad_norm": 125.940185546875, + "learning_rate": 0.0002, + "loss": 0.4505, + "step": 21830 + }, + { + "epoch": 4.579576431117634, + "grad_norm": 3.6832833290100098, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 21840 + }, + { + "epoch": 4.581673306772909, + "grad_norm": 26.39314079284668, + "learning_rate": 0.0002, + "loss": 0.4416, + "step": 21850 + }, + { + "epoch": 4.583770182428182, + "grad_norm": 3.046283006668091, + "learning_rate": 0.0002, + "loss": 0.2492, + "step": 21860 + }, + { + "epoch": 4.585867058083456, + "grad_norm": 4.790574073791504, + "learning_rate": 0.0002, + "loss": 0.2998, + "step": 21870 + }, + { + "epoch": 4.587963933738729, + "grad_norm": 21.731502532958984, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 21880 + }, + { + "epoch": 4.590060809394003, + "grad_norm": 1.8135603666305542, + "learning_rate": 0.0002, + "loss": 0.2608, + "step": 21890 + }, + { + "epoch": 4.592157685049276, + "grad_norm": 0.8779616951942444, + "learning_rate": 0.0002, + "loss": 0.2493, + "step": 21900 + }, + { + "epoch": 4.594254560704551, + "grad_norm": 6.280771732330322, + "learning_rate": 0.0002, + "loss": 0.2793, + "step": 21910 + }, + { + "epoch": 4.596351436359824, + "grad_norm": 7.2261152267456055, + "learning_rate": 0.0002, + "loss": 0.3184, + "step": 21920 + }, + { + "epoch": 4.598448312015098, + "grad_norm": 16.766836166381836, + "learning_rate": 0.0002, + "loss": 0.3586, + "step": 21930 + }, + { + "epoch": 4.600545187670371, + "grad_norm": 1.0573318004608154, + "learning_rate": 0.0002, + "loss": 0.2479, + "step": 21940 + }, + { + "epoch": 4.602642063325645, + "grad_norm": 19.868959426879883, + "learning_rate": 0.0002, + "loss": 0.392, + "step": 21950 + }, + { + "epoch": 4.604738938980918, + "grad_norm": 19.53751564025879, + "learning_rate": 0.0002, + "loss": 0.5002, + "step": 21960 + }, + { + "epoch": 4.606835814636192, + "grad_norm": 1.9004204273223877, + "learning_rate": 0.0002, + "loss": 0.376, + "step": 21970 + }, + { + "epoch": 4.608932690291466, + "grad_norm": 13.868071556091309, + "learning_rate": 0.0002, + "loss": 0.2562, + "step": 21980 + }, + { + "epoch": 4.6110295659467395, + "grad_norm": 3.889951467514038, + "learning_rate": 0.0002, + "loss": 0.3608, + "step": 21990 + }, + { + "epoch": 4.613126441602013, + "grad_norm": 6.130751609802246, + "learning_rate": 0.0002, + "loss": 0.3432, + "step": 22000 + }, + { + "epoch": 4.6152233172572865, + "grad_norm": 2.158506155014038, + "learning_rate": 0.0002, + "loss": 0.2507, + "step": 22010 + }, + { + "epoch": 4.61732019291256, + "grad_norm": 5.004857540130615, + "learning_rate": 0.0002, + "loss": 0.3078, + "step": 22020 + }, + { + "epoch": 4.619417068567834, + "grad_norm": 2.55324649810791, + "learning_rate": 0.0002, + "loss": 0.4217, + "step": 22030 + }, + { + "epoch": 4.621513944223108, + "grad_norm": 2.718355655670166, + "learning_rate": 0.0002, + "loss": 0.2676, + "step": 22040 + }, + { + "epoch": 4.623610819878381, + "grad_norm": 11.014877319335938, + "learning_rate": 0.0002, + "loss": 0.2641, + "step": 22050 + }, + { + "epoch": 4.625707695533655, + "grad_norm": 7.405698299407959, + "learning_rate": 0.0002, + "loss": 0.2586, + "step": 22060 + }, + { + "epoch": 4.627804571188928, + "grad_norm": 21.379192352294922, + "learning_rate": 0.0002, + "loss": 0.4185, + "step": 22070 + }, + { + "epoch": 4.629901446844202, + "grad_norm": 8.769160270690918, + "learning_rate": 0.0002, + "loss": 0.2551, + "step": 22080 + }, + { + "epoch": 4.631998322499475, + "grad_norm": 7.6808600425720215, + "learning_rate": 0.0002, + "loss": 0.2705, + "step": 22090 + }, + { + "epoch": 4.63409519815475, + "grad_norm": 5.105208873748779, + "learning_rate": 0.0002, + "loss": 0.2525, + "step": 22100 + }, + { + "epoch": 4.636192073810023, + "grad_norm": 12.7372407913208, + "learning_rate": 0.0002, + "loss": 0.3947, + "step": 22110 + }, + { + "epoch": 4.638288949465297, + "grad_norm": 11.61864185333252, + "learning_rate": 0.0002, + "loss": 0.2563, + "step": 22120 + }, + { + "epoch": 4.64038582512057, + "grad_norm": 16.5443172454834, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 22130 + }, + { + "epoch": 4.642482700775844, + "grad_norm": 18.34696388244629, + "learning_rate": 0.0002, + "loss": 0.4231, + "step": 22140 + }, + { + "epoch": 4.644579576431117, + "grad_norm": 25.033620834350586, + "learning_rate": 0.0002, + "loss": 0.2918, + "step": 22150 + }, + { + "epoch": 4.646676452086391, + "grad_norm": 18.237457275390625, + "learning_rate": 0.0002, + "loss": 0.3403, + "step": 22160 + }, + { + "epoch": 4.648773327741665, + "grad_norm": 4.656020641326904, + "learning_rate": 0.0002, + "loss": 0.446, + "step": 22170 + }, + { + "epoch": 4.650870203396939, + "grad_norm": 27.86395263671875, + "learning_rate": 0.0002, + "loss": 0.4859, + "step": 22180 + }, + { + "epoch": 4.652967079052212, + "grad_norm": 2.5194180011749268, + "learning_rate": 0.0002, + "loss": 0.2589, + "step": 22190 + }, + { + "epoch": 4.655063954707486, + "grad_norm": 64.11674499511719, + "learning_rate": 0.0002, + "loss": 0.2603, + "step": 22200 + }, + { + "epoch": 4.657160830362759, + "grad_norm": 12.19118881225586, + "learning_rate": 0.0002, + "loss": 0.2726, + "step": 22210 + }, + { + "epoch": 4.659257706018034, + "grad_norm": 5.304041862487793, + "learning_rate": 0.0002, + "loss": 0.3687, + "step": 22220 + }, + { + "epoch": 4.661354581673307, + "grad_norm": 19.514848709106445, + "learning_rate": 0.0002, + "loss": 0.2648, + "step": 22230 + }, + { + "epoch": 4.663451457328581, + "grad_norm": 50.096492767333984, + "learning_rate": 0.0002, + "loss": 0.2644, + "step": 22240 + }, + { + "epoch": 4.665548332983854, + "grad_norm": 1.945943832397461, + "learning_rate": 0.0002, + "loss": 0.4599, + "step": 22250 + }, + { + "epoch": 4.667645208639128, + "grad_norm": 15.48399543762207, + "learning_rate": 0.0002, + "loss": 0.3748, + "step": 22260 + }, + { + "epoch": 4.669742084294401, + "grad_norm": 23.655168533325195, + "learning_rate": 0.0002, + "loss": 0.3264, + "step": 22270 + }, + { + "epoch": 4.671838959949675, + "grad_norm": 5.6444478034973145, + "learning_rate": 0.0002, + "loss": 0.3308, + "step": 22280 + }, + { + "epoch": 4.673935835604949, + "grad_norm": 1.5966367721557617, + "learning_rate": 0.0002, + "loss": 0.3293, + "step": 22290 + }, + { + "epoch": 4.6760327112602225, + "grad_norm": 37.680545806884766, + "learning_rate": 0.0002, + "loss": 0.4031, + "step": 22300 + }, + { + "epoch": 4.678129586915496, + "grad_norm": 3.1248462200164795, + "learning_rate": 0.0002, + "loss": 0.4096, + "step": 22310 + }, + { + "epoch": 4.6802264625707695, + "grad_norm": 2.226181745529175, + "learning_rate": 0.0002, + "loss": 0.3701, + "step": 22320 + }, + { + "epoch": 4.682323338226043, + "grad_norm": 4.016665458679199, + "learning_rate": 0.0002, + "loss": 0.401, + "step": 22330 + }, + { + "epoch": 4.6844202138813165, + "grad_norm": 17.831260681152344, + "learning_rate": 0.0002, + "loss": 0.3257, + "step": 22340 + }, + { + "epoch": 4.686517089536591, + "grad_norm": 17.144725799560547, + "learning_rate": 0.0002, + "loss": 0.3335, + "step": 22350 + }, + { + "epoch": 4.688613965191864, + "grad_norm": 7.547942638397217, + "learning_rate": 0.0002, + "loss": 0.4015, + "step": 22360 + }, + { + "epoch": 4.690710840847138, + "grad_norm": 6.964625358581543, + "learning_rate": 0.0002, + "loss": 0.3709, + "step": 22370 + }, + { + "epoch": 4.692807716502411, + "grad_norm": 5.216811656951904, + "learning_rate": 0.0002, + "loss": 0.4521, + "step": 22380 + }, + { + "epoch": 4.694904592157685, + "grad_norm": 47.28529739379883, + "learning_rate": 0.0002, + "loss": 0.3236, + "step": 22390 + }, + { + "epoch": 4.697001467812958, + "grad_norm": 1.253324031829834, + "learning_rate": 0.0002, + "loss": 0.3612, + "step": 22400 + }, + { + "epoch": 4.699098343468233, + "grad_norm": 7.6197733879089355, + "learning_rate": 0.0002, + "loss": 0.2537, + "step": 22410 + }, + { + "epoch": 4.701195219123506, + "grad_norm": 72.7631607055664, + "learning_rate": 0.0002, + "loss": 0.9054, + "step": 22420 + }, + { + "epoch": 4.70329209477878, + "grad_norm": 35.460487365722656, + "learning_rate": 0.0002, + "loss": 0.3916, + "step": 22430 + }, + { + "epoch": 4.705388970434053, + "grad_norm": 31.78118324279785, + "learning_rate": 0.0002, + "loss": 0.4392, + "step": 22440 + }, + { + "epoch": 4.707485846089327, + "grad_norm": 59.87028121948242, + "learning_rate": 0.0002, + "loss": 0.4103, + "step": 22450 + }, + { + "epoch": 4.7095827217446, + "grad_norm": 1.4908517599105835, + "learning_rate": 0.0002, + "loss": 0.2769, + "step": 22460 + }, + { + "epoch": 4.711679597399874, + "grad_norm": 3.2404863834381104, + "learning_rate": 0.0002, + "loss": 0.3169, + "step": 22470 + }, + { + "epoch": 4.713776473055148, + "grad_norm": 5.952626705169678, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 22480 + }, + { + "epoch": 4.715873348710422, + "grad_norm": 9.007226943969727, + "learning_rate": 0.0002, + "loss": 0.4048, + "step": 22490 + }, + { + "epoch": 4.717970224365695, + "grad_norm": 4.316738128662109, + "learning_rate": 0.0002, + "loss": 0.3234, + "step": 22500 + }, + { + "epoch": 4.720067100020969, + "grad_norm": 7.632235050201416, + "learning_rate": 0.0002, + "loss": 0.3282, + "step": 22510 + }, + { + "epoch": 4.722163975676242, + "grad_norm": 4.759393215179443, + "learning_rate": 0.0002, + "loss": 0.3523, + "step": 22520 + }, + { + "epoch": 4.7242608513315165, + "grad_norm": 2.0390775203704834, + "learning_rate": 0.0002, + "loss": 0.3229, + "step": 22530 + }, + { + "epoch": 4.72635772698679, + "grad_norm": 12.438055992126465, + "learning_rate": 0.0002, + "loss": 0.272, + "step": 22540 + }, + { + "epoch": 4.7284546026420635, + "grad_norm": 13.05179500579834, + "learning_rate": 0.0002, + "loss": 0.3151, + "step": 22550 + }, + { + "epoch": 4.730551478297337, + "grad_norm": 4.340366840362549, + "learning_rate": 0.0002, + "loss": 0.4242, + "step": 22560 + }, + { + "epoch": 4.7326483539526105, + "grad_norm": 7.640782833099365, + "learning_rate": 0.0002, + "loss": 0.4112, + "step": 22570 + }, + { + "epoch": 4.734745229607884, + "grad_norm": 1.2433539628982544, + "learning_rate": 0.0002, + "loss": 0.2546, + "step": 22580 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 4.08035945892334, + "learning_rate": 0.0002, + "loss": 0.3997, + "step": 22590 + }, + { + "epoch": 4.738938980918432, + "grad_norm": 7.260676383972168, + "learning_rate": 0.0002, + "loss": 0.4803, + "step": 22600 + }, + { + "epoch": 4.741035856573705, + "grad_norm": 5.27596378326416, + "learning_rate": 0.0002, + "loss": 0.3236, + "step": 22610 + }, + { + "epoch": 4.743132732228979, + "grad_norm": 8.552403450012207, + "learning_rate": 0.0002, + "loss": 0.3111, + "step": 22620 + }, + { + "epoch": 4.745229607884252, + "grad_norm": 41.805084228515625, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 22630 + }, + { + "epoch": 4.747326483539526, + "grad_norm": 45.03173828125, + "learning_rate": 0.0002, + "loss": 0.4797, + "step": 22640 + }, + { + "epoch": 4.749423359194799, + "grad_norm": 11.940930366516113, + "learning_rate": 0.0002, + "loss": 0.4627, + "step": 22650 + }, + { + "epoch": 4.751520234850073, + "grad_norm": 17.953216552734375, + "learning_rate": 0.0002, + "loss": 0.5131, + "step": 22660 + }, + { + "epoch": 4.753617110505347, + "grad_norm": 9.348217010498047, + "learning_rate": 0.0002, + "loss": 0.3437, + "step": 22670 + }, + { + "epoch": 4.755713986160621, + "grad_norm": 2.931450128555298, + "learning_rate": 0.0002, + "loss": 0.2577, + "step": 22680 + }, + { + "epoch": 4.757810861815894, + "grad_norm": 3.602579355239868, + "learning_rate": 0.0002, + "loss": 0.2717, + "step": 22690 + }, + { + "epoch": 4.759907737471168, + "grad_norm": 41.81117630004883, + "learning_rate": 0.0002, + "loss": 0.5134, + "step": 22700 + }, + { + "epoch": 4.762004613126441, + "grad_norm": 5.8748650550842285, + "learning_rate": 0.0002, + "loss": 0.2563, + "step": 22710 + }, + { + "epoch": 4.764101488781716, + "grad_norm": 5.086737155914307, + "learning_rate": 0.0002, + "loss": 0.4383, + "step": 22720 + }, + { + "epoch": 4.766198364436989, + "grad_norm": 53.62387466430664, + "learning_rate": 0.0002, + "loss": 0.4937, + "step": 22730 + }, + { + "epoch": 4.768295240092263, + "grad_norm": 12.729607582092285, + "learning_rate": 0.0002, + "loss": 0.2615, + "step": 22740 + }, + { + "epoch": 4.770392115747536, + "grad_norm": 89.39778137207031, + "learning_rate": 0.0002, + "loss": 0.3657, + "step": 22750 + }, + { + "epoch": 4.77248899140281, + "grad_norm": 32.737648010253906, + "learning_rate": 0.0002, + "loss": 0.337, + "step": 22760 + }, + { + "epoch": 4.774585867058083, + "grad_norm": 10.218364715576172, + "learning_rate": 0.0002, + "loss": 0.2745, + "step": 22770 + }, + { + "epoch": 4.776682742713357, + "grad_norm": 9.050323486328125, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 22780 + }, + { + "epoch": 4.778779618368631, + "grad_norm": 5.656611442565918, + "learning_rate": 0.0002, + "loss": 0.2631, + "step": 22790 + }, + { + "epoch": 4.780876494023905, + "grad_norm": 52.178993225097656, + "learning_rate": 0.0002, + "loss": 0.2844, + "step": 22800 + }, + { + "epoch": 4.782973369679178, + "grad_norm": 47.777408599853516, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 22810 + }, + { + "epoch": 4.785070245334452, + "grad_norm": 4.114084243774414, + "learning_rate": 0.0002, + "loss": 0.2857, + "step": 22820 + }, + { + "epoch": 4.787167120989725, + "grad_norm": 12.685486793518066, + "learning_rate": 0.0002, + "loss": 0.2551, + "step": 22830 + }, + { + "epoch": 4.789263996644999, + "grad_norm": 3.0896663665771484, + "learning_rate": 0.0002, + "loss": 0.3513, + "step": 22840 + }, + { + "epoch": 4.791360872300273, + "grad_norm": 8.13403606414795, + "learning_rate": 0.0002, + "loss": 0.3306, + "step": 22850 + }, + { + "epoch": 4.7934577479555465, + "grad_norm": 138.98486328125, + "learning_rate": 0.0002, + "loss": 0.3493, + "step": 22860 + }, + { + "epoch": 4.79555462361082, + "grad_norm": 3.541754722595215, + "learning_rate": 0.0002, + "loss": 0.262, + "step": 22870 + }, + { + "epoch": 4.7976514992660935, + "grad_norm": 21.595823287963867, + "learning_rate": 0.0002, + "loss": 0.259, + "step": 22880 + }, + { + "epoch": 4.799748374921367, + "grad_norm": 3.3957481384277344, + "learning_rate": 0.0002, + "loss": 0.2567, + "step": 22890 + }, + { + "epoch": 4.8018452505766405, + "grad_norm": 15.020285606384277, + "learning_rate": 0.0002, + "loss": 0.3227, + "step": 22900 + }, + { + "epoch": 4.803942126231915, + "grad_norm": 17.61827278137207, + "learning_rate": 0.0002, + "loss": 0.4037, + "step": 22910 + }, + { + "epoch": 4.806039001887188, + "grad_norm": 2.315112352371216, + "learning_rate": 0.0002, + "loss": 0.3231, + "step": 22920 + }, + { + "epoch": 4.808135877542462, + "grad_norm": 4.932908058166504, + "learning_rate": 0.0002, + "loss": 0.3858, + "step": 22930 + }, + { + "epoch": 4.810232753197735, + "grad_norm": 4.863265514373779, + "learning_rate": 0.0002, + "loss": 0.3371, + "step": 22940 + }, + { + "epoch": 4.812329628853009, + "grad_norm": 53.41450881958008, + "learning_rate": 0.0002, + "loss": 0.3507, + "step": 22950 + }, + { + "epoch": 4.814426504508282, + "grad_norm": 47.964725494384766, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 22960 + }, + { + "epoch": 4.816523380163556, + "grad_norm": 37.04306411743164, + "learning_rate": 0.0002, + "loss": 0.4286, + "step": 22970 + }, + { + "epoch": 4.81862025581883, + "grad_norm": 1.4727418422698975, + "learning_rate": 0.0002, + "loss": 0.312, + "step": 22980 + }, + { + "epoch": 4.820717131474104, + "grad_norm": 54.622379302978516, + "learning_rate": 0.0002, + "loss": 0.3793, + "step": 22990 + }, + { + "epoch": 4.822814007129377, + "grad_norm": 68.0690689086914, + "learning_rate": 0.0002, + "loss": 0.5235, + "step": 23000 + }, + { + "epoch": 4.824910882784651, + "grad_norm": 55.60499954223633, + "learning_rate": 0.0002, + "loss": 0.3695, + "step": 23010 + }, + { + "epoch": 4.827007758439924, + "grad_norm": 2.116057872772217, + "learning_rate": 0.0002, + "loss": 0.2591, + "step": 23020 + }, + { + "epoch": 4.829104634095199, + "grad_norm": 27.25665283203125, + "learning_rate": 0.0002, + "loss": 0.258, + "step": 23030 + }, + { + "epoch": 4.831201509750472, + "grad_norm": 1.7732720375061035, + "learning_rate": 0.0002, + "loss": 0.3434, + "step": 23040 + }, + { + "epoch": 4.833298385405746, + "grad_norm": 1.4641029834747314, + "learning_rate": 0.0002, + "loss": 0.2559, + "step": 23050 + }, + { + "epoch": 4.835395261061019, + "grad_norm": 15.68821907043457, + "learning_rate": 0.0002, + "loss": 0.4408, + "step": 23060 + }, + { + "epoch": 4.837492136716293, + "grad_norm": 50.2274169921875, + "learning_rate": 0.0002, + "loss": 0.2794, + "step": 23070 + }, + { + "epoch": 4.839589012371566, + "grad_norm": 13.899438858032227, + "learning_rate": 0.0002, + "loss": 0.4018, + "step": 23080 + }, + { + "epoch": 4.84168588802684, + "grad_norm": 24.751676559448242, + "learning_rate": 0.0002, + "loss": 0.3897, + "step": 23090 + }, + { + "epoch": 4.843782763682114, + "grad_norm": 70.79833221435547, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 23100 + }, + { + "epoch": 4.8458796393373875, + "grad_norm": 2.478081464767456, + "learning_rate": 0.0002, + "loss": 0.2668, + "step": 23110 + }, + { + "epoch": 4.847976514992661, + "grad_norm": 3.0055885314941406, + "learning_rate": 0.0002, + "loss": 0.2569, + "step": 23120 + }, + { + "epoch": 4.8500733906479345, + "grad_norm": 105.44467163085938, + "learning_rate": 0.0002, + "loss": 0.299, + "step": 23130 + }, + { + "epoch": 4.852170266303208, + "grad_norm": 172.33485412597656, + "learning_rate": 0.0002, + "loss": 0.4352, + "step": 23140 + }, + { + "epoch": 4.8542671419584815, + "grad_norm": 2.8913474082946777, + "learning_rate": 0.0002, + "loss": 0.2581, + "step": 23150 + }, + { + "epoch": 4.856364017613755, + "grad_norm": 63.7279052734375, + "learning_rate": 0.0002, + "loss": 0.328, + "step": 23160 + }, + { + "epoch": 4.858460893269029, + "grad_norm": 16.1322021484375, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 23170 + }, + { + "epoch": 4.860557768924303, + "grad_norm": 4.814330101013184, + "learning_rate": 0.0002, + "loss": 0.4127, + "step": 23180 + }, + { + "epoch": 4.862654644579576, + "grad_norm": 6.94169807434082, + "learning_rate": 0.0002, + "loss": 0.3144, + "step": 23190 + }, + { + "epoch": 4.86475152023485, + "grad_norm": 43.06448745727539, + "learning_rate": 0.0002, + "loss": 0.3599, + "step": 23200 + }, + { + "epoch": 4.866848395890123, + "grad_norm": 100.02835845947266, + "learning_rate": 0.0002, + "loss": 0.3969, + "step": 23210 + }, + { + "epoch": 4.868945271545398, + "grad_norm": 173.16786193847656, + "learning_rate": 0.0002, + "loss": 0.4545, + "step": 23220 + }, + { + "epoch": 4.871042147200671, + "grad_norm": 81.39131927490234, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 23230 + }, + { + "epoch": 4.873139022855945, + "grad_norm": 114.65706634521484, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 23240 + }, + { + "epoch": 4.875235898511218, + "grad_norm": 79.38748931884766, + "learning_rate": 0.0002, + "loss": 0.3835, + "step": 23250 + }, + { + "epoch": 4.877332774166492, + "grad_norm": 21.65077781677246, + "learning_rate": 0.0002, + "loss": 0.3005, + "step": 23260 + }, + { + "epoch": 4.879429649821765, + "grad_norm": 20.43645477294922, + "learning_rate": 0.0002, + "loss": 0.3529, + "step": 23270 + }, + { + "epoch": 4.881526525477039, + "grad_norm": 13.03095531463623, + "learning_rate": 0.0002, + "loss": 0.4077, + "step": 23280 + }, + { + "epoch": 4.883623401132313, + "grad_norm": 0.7234331965446472, + "learning_rate": 0.0002, + "loss": 0.4606, + "step": 23290 + }, + { + "epoch": 4.885720276787587, + "grad_norm": 3.002551555633545, + "learning_rate": 0.0002, + "loss": 0.2611, + "step": 23300 + }, + { + "epoch": 4.88781715244286, + "grad_norm": 48.173095703125, + "learning_rate": 0.0002, + "loss": 0.4378, + "step": 23310 + }, + { + "epoch": 4.889914028098134, + "grad_norm": 30.906227111816406, + "learning_rate": 0.0002, + "loss": 0.4116, + "step": 23320 + }, + { + "epoch": 4.892010903753407, + "grad_norm": 52.0058479309082, + "learning_rate": 0.0002, + "loss": 0.4907, + "step": 23330 + }, + { + "epoch": 4.894107779408681, + "grad_norm": 3.250602960586548, + "learning_rate": 0.0002, + "loss": 0.4432, + "step": 23340 + }, + { + "epoch": 4.896204655063955, + "grad_norm": 7.294270992279053, + "learning_rate": 0.0002, + "loss": 0.268, + "step": 23350 + }, + { + "epoch": 4.898301530719229, + "grad_norm": 16.525890350341797, + "learning_rate": 0.0002, + "loss": 0.3601, + "step": 23360 + }, + { + "epoch": 4.900398406374502, + "grad_norm": 29.520700454711914, + "learning_rate": 0.0002, + "loss": 0.276, + "step": 23370 + }, + { + "epoch": 4.902495282029776, + "grad_norm": 13.231327056884766, + "learning_rate": 0.0002, + "loss": 0.2752, + "step": 23380 + }, + { + "epoch": 4.904592157685049, + "grad_norm": 8.970483779907227, + "learning_rate": 0.0002, + "loss": 0.3489, + "step": 23390 + }, + { + "epoch": 4.906689033340323, + "grad_norm": 14.564881324768066, + "learning_rate": 0.0002, + "loss": 0.3565, + "step": 23400 + }, + { + "epoch": 4.908785908995597, + "grad_norm": 47.08441925048828, + "learning_rate": 0.0002, + "loss": 0.5337, + "step": 23410 + }, + { + "epoch": 4.9108827846508705, + "grad_norm": 10.598335266113281, + "learning_rate": 0.0002, + "loss": 0.3706, + "step": 23420 + }, + { + "epoch": 4.912979660306144, + "grad_norm": 7.135160446166992, + "learning_rate": 0.0002, + "loss": 0.3633, + "step": 23430 + }, + { + "epoch": 4.9150765359614175, + "grad_norm": 4.119414806365967, + "learning_rate": 0.0002, + "loss": 0.3192, + "step": 23440 + }, + { + "epoch": 4.917173411616691, + "grad_norm": 61.618770599365234, + "learning_rate": 0.0002, + "loss": 0.2827, + "step": 23450 + }, + { + "epoch": 4.9192702872719645, + "grad_norm": 2.349839687347412, + "learning_rate": 0.0002, + "loss": 0.4235, + "step": 23460 + }, + { + "epoch": 4.921367162927238, + "grad_norm": 4.726789951324463, + "learning_rate": 0.0002, + "loss": 0.392, + "step": 23470 + }, + { + "epoch": 4.923464038582512, + "grad_norm": 100.8683853149414, + "learning_rate": 0.0002, + "loss": 0.2534, + "step": 23480 + }, + { + "epoch": 4.925560914237786, + "grad_norm": 5.879467964172363, + "learning_rate": 0.0002, + "loss": 0.3332, + "step": 23490 + }, + { + "epoch": 4.927657789893059, + "grad_norm": 11.780795097351074, + "learning_rate": 0.0002, + "loss": 0.2696, + "step": 23500 + }, + { + "epoch": 4.929754665548333, + "grad_norm": 9.69931697845459, + "learning_rate": 0.0002, + "loss": 0.3792, + "step": 23510 + }, + { + "epoch": 4.931851541203606, + "grad_norm": 109.73184204101562, + "learning_rate": 0.0002, + "loss": 6.2043, + "step": 23520 + }, + { + "epoch": 4.933948416858881, + "grad_norm": 46.461143493652344, + "learning_rate": 0.0002, + "loss": 6.4032, + "step": 23530 + }, + { + "epoch": 4.936045292514154, + "grad_norm": 108.99391174316406, + "learning_rate": 0.0002, + "loss": 2.7763, + "step": 23540 + }, + { + "epoch": 4.938142168169428, + "grad_norm": 285.75054931640625, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 23550 + }, + { + "epoch": 4.940239043824701, + "grad_norm": 128.00816345214844, + "learning_rate": 0.0002, + "loss": 0.9253, + "step": 23560 + }, + { + "epoch": 4.942335919479975, + "grad_norm": 126.41734313964844, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 23570 + }, + { + "epoch": 4.944432795135248, + "grad_norm": 62.741641998291016, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 23580 + }, + { + "epoch": 4.946529670790522, + "grad_norm": 33.307369232177734, + "learning_rate": 0.0002, + "loss": 0.4376, + "step": 23590 + }, + { + "epoch": 4.948626546445796, + "grad_norm": 26.6583251953125, + "learning_rate": 0.0002, + "loss": 0.4938, + "step": 23600 + }, + { + "epoch": 4.95072342210107, + "grad_norm": 14.1249361038208, + "learning_rate": 0.0002, + "loss": 0.4565, + "step": 23610 + }, + { + "epoch": 4.952820297756343, + "grad_norm": 45.58442687988281, + "learning_rate": 0.0002, + "loss": 0.5171, + "step": 23620 + }, + { + "epoch": 4.954917173411617, + "grad_norm": 26.00278091430664, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 23630 + }, + { + "epoch": 4.95701404906689, + "grad_norm": 12.711030960083008, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 23640 + }, + { + "epoch": 4.959110924722164, + "grad_norm": 46.92948532104492, + "learning_rate": 0.0002, + "loss": 0.3547, + "step": 23650 + }, + { + "epoch": 4.961207800377437, + "grad_norm": 73.09407043457031, + "learning_rate": 0.0002, + "loss": 0.3132, + "step": 23660 + }, + { + "epoch": 4.9633046760327115, + "grad_norm": 64.90545654296875, + "learning_rate": 0.0002, + "loss": 0.9628, + "step": 23670 + }, + { + "epoch": 4.965401551687985, + "grad_norm": 17.83898162841797, + "learning_rate": 0.0002, + "loss": 0.3811, + "step": 23680 + }, + { + "epoch": 4.9674984273432585, + "grad_norm": 55.39151382446289, + "learning_rate": 0.0002, + "loss": 0.2989, + "step": 23690 + }, + { + "epoch": 4.969595302998532, + "grad_norm": 24.683879852294922, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 23700 + }, + { + "epoch": 4.9716921786538055, + "grad_norm": 106.67837524414062, + "learning_rate": 0.0002, + "loss": 0.3724, + "step": 23710 + }, + { + "epoch": 4.97378905430908, + "grad_norm": 10.091436386108398, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 23720 + }, + { + "epoch": 4.975885929964353, + "grad_norm": 55.912933349609375, + "learning_rate": 0.0002, + "loss": 0.5166, + "step": 23730 + }, + { + "epoch": 4.977982805619627, + "grad_norm": 303.874755859375, + "learning_rate": 0.0002, + "loss": 0.9055, + "step": 23740 + }, + { + "epoch": 4.9800796812749, + "grad_norm": 53.40581512451172, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 23750 + }, + { + "epoch": 4.982176556930174, + "grad_norm": 65.19920349121094, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 23760 + }, + { + "epoch": 4.984273432585447, + "grad_norm": 183.5748291015625, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 23770 + }, + { + "epoch": 4.986370308240721, + "grad_norm": 17.69962501525879, + "learning_rate": 0.0002, + "loss": 0.4914, + "step": 23780 + }, + { + "epoch": 4.988467183895995, + "grad_norm": 20.91483497619629, + "learning_rate": 0.0002, + "loss": 0.4115, + "step": 23790 + }, + { + "epoch": 4.990564059551269, + "grad_norm": 38.18173599243164, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 23800 + }, + { + "epoch": 4.992660935206542, + "grad_norm": 59.649051666259766, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 23810 + }, + { + "epoch": 4.994757810861816, + "grad_norm": 8.023183822631836, + "learning_rate": 0.0002, + "loss": 0.3814, + "step": 23820 + }, + { + "epoch": 4.996854686517089, + "grad_norm": 5.922271728515625, + "learning_rate": 0.0002, + "loss": 0.4391, + "step": 23830 + }, + { + "epoch": 4.998951562172363, + "grad_norm": 52.8021240234375, + "learning_rate": 0.0002, + "loss": 0.461, + "step": 23840 + } + ], + "logging_steps": 10, + "max_steps": 23845, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.8878877907124224e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}