diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8749 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 12450, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0024099289070972406, + "grad_norm": 2.472470998764038, + "learning_rate": 7.228915662650603e-08, + "loss": 0.4414, + "step": 10 + }, + { + "epoch": 0.004819857814194481, + "grad_norm": 2.5485317707061768, + "learning_rate": 1.526104417670683e-07, + "loss": 0.4342, + "step": 20 + }, + { + "epoch": 0.007229786721291722, + "grad_norm": 2.1985578536987305, + "learning_rate": 2.3293172690763053e-07, + "loss": 0.4302, + "step": 30 + }, + { + "epoch": 0.009639715628388962, + "grad_norm": 1.616897463798523, + "learning_rate": 3.1325301204819284e-07, + "loss": 0.4214, + "step": 40 + }, + { + "epoch": 0.012049644535486204, + "grad_norm": 1.18423593044281, + "learning_rate": 3.93574297188755e-07, + "loss": 0.4146, + "step": 50 + }, + { + "epoch": 0.014459573442583444, + "grad_norm": 0.7588810920715332, + "learning_rate": 4.738955823293173e-07, + "loss": 0.3957, + "step": 60 + }, + { + "epoch": 0.016869502349680685, + "grad_norm": 0.5396777987480164, + "learning_rate": 5.542168674698796e-07, + "loss": 0.3805, + "step": 70 + }, + { + "epoch": 0.019279431256777925, + "grad_norm": 0.37596839666366577, + "learning_rate": 6.345381526104419e-07, + "loss": 0.3605, + "step": 80 + }, + { + "epoch": 0.021689360163875165, + "grad_norm": 0.29817959666252136, + "learning_rate": 7.14859437751004e-07, + "loss": 0.3513, + "step": 90 + }, + { + "epoch": 0.024099289070972408, + "grad_norm": 0.25247910618782043, + "learning_rate": 7.951807228915663e-07, + "loss": 0.3382, + "step": 100 + }, + { + "epoch": 0.026509217978069648, + "grad_norm": 0.4067523181438446, + "learning_rate": 8.755020080321286e-07, + "loss": 0.3405, + "step": 110 + }, + { + "epoch": 0.028919146885166887, + "grad_norm": 0.18429981172084808, + "learning_rate": 9.558232931726909e-07, + "loss": 0.3298, + "step": 120 + }, + { + "epoch": 0.03132907579226413, + "grad_norm": 0.24616770446300507, + "learning_rate": 1.0361445783132532e-06, + "loss": 0.3259, + "step": 130 + }, + { + "epoch": 0.03373900469936137, + "grad_norm": 0.17981302738189697, + "learning_rate": 1.1164658634538152e-06, + "loss": 0.3171, + "step": 140 + }, + { + "epoch": 0.03614893360645861, + "grad_norm": 0.17655853927135468, + "learning_rate": 1.1967871485943775e-06, + "loss": 0.3173, + "step": 150 + }, + { + "epoch": 0.03855886251355585, + "grad_norm": 0.15255206823349, + "learning_rate": 1.2771084337349398e-06, + "loss": 0.3153, + "step": 160 + }, + { + "epoch": 0.04096879142065309, + "grad_norm": 0.1549864560365677, + "learning_rate": 1.357429718875502e-06, + "loss": 0.3071, + "step": 170 + }, + { + "epoch": 0.04337872032775033, + "grad_norm": 0.16035476326942444, + "learning_rate": 1.4377510040160644e-06, + "loss": 0.3071, + "step": 180 + }, + { + "epoch": 0.04578864923484757, + "grad_norm": 0.17461898922920227, + "learning_rate": 1.5180722891566266e-06, + "loss": 0.3056, + "step": 190 + }, + { + "epoch": 0.048198578141944816, + "grad_norm": 0.14300084114074707, + "learning_rate": 1.598393574297189e-06, + "loss": 0.3018, + "step": 200 + }, + { + "epoch": 0.05060850704904205, + "grad_norm": 0.1381385326385498, + "learning_rate": 1.6787148594377512e-06, + "loss": 0.2994, + "step": 210 + }, + { + "epoch": 0.053018435956139295, + "grad_norm": 0.1574939638376236, + "learning_rate": 1.7590361445783133e-06, + "loss": 0.3013, + "step": 220 + }, + { + "epoch": 0.05542836486323653, + "grad_norm": 0.15499268472194672, + "learning_rate": 1.8393574297188758e-06, + "loss": 0.2931, + "step": 230 + }, + { + "epoch": 0.057838293770333775, + "grad_norm": 0.1661788523197174, + "learning_rate": 1.919678714859438e-06, + "loss": 0.2936, + "step": 240 + }, + { + "epoch": 0.06024822267743102, + "grad_norm": 0.16183660924434662, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.2913, + "step": 250 + }, + { + "epoch": 0.06265815158452825, + "grad_norm": 0.15030546486377716, + "learning_rate": 2.0803212851405624e-06, + "loss": 0.2914, + "step": 260 + }, + { + "epoch": 0.06506808049162549, + "grad_norm": 0.16651834547519684, + "learning_rate": 2.1606425702811245e-06, + "loss": 0.2896, + "step": 270 + }, + { + "epoch": 0.06747800939872274, + "grad_norm": 0.1691359132528305, + "learning_rate": 2.240963855421687e-06, + "loss": 0.2855, + "step": 280 + }, + { + "epoch": 0.06988793830581998, + "grad_norm": 0.15761148929595947, + "learning_rate": 2.321285140562249e-06, + "loss": 0.2865, + "step": 290 + }, + { + "epoch": 0.07229786721291721, + "grad_norm": 0.17686566710472107, + "learning_rate": 2.4016064257028115e-06, + "loss": 0.2854, + "step": 300 + }, + { + "epoch": 0.07470779612001446, + "grad_norm": 0.15508554875850677, + "learning_rate": 2.4819277108433736e-06, + "loss": 0.2792, + "step": 310 + }, + { + "epoch": 0.0771177250271117, + "grad_norm": 0.15847276151180267, + "learning_rate": 2.5622489959839357e-06, + "loss": 0.2821, + "step": 320 + }, + { + "epoch": 0.07952765393420894, + "grad_norm": 0.18257629871368408, + "learning_rate": 2.642570281124498e-06, + "loss": 0.2771, + "step": 330 + }, + { + "epoch": 0.08193758284130619, + "grad_norm": 0.173823282122612, + "learning_rate": 2.7228915662650607e-06, + "loss": 0.277, + "step": 340 + }, + { + "epoch": 0.08434751174840342, + "grad_norm": 0.16445614397525787, + "learning_rate": 2.8032128514056227e-06, + "loss": 0.2771, + "step": 350 + }, + { + "epoch": 0.08675744065550066, + "grad_norm": 0.1713678389787674, + "learning_rate": 2.883534136546185e-06, + "loss": 0.2758, + "step": 360 + }, + { + "epoch": 0.08916736956259791, + "grad_norm": 0.1728232353925705, + "learning_rate": 2.9638554216867473e-06, + "loss": 0.2744, + "step": 370 + }, + { + "epoch": 0.09157729846969515, + "grad_norm": 0.15372931957244873, + "learning_rate": 3.0441767068273094e-06, + "loss": 0.2742, + "step": 380 + }, + { + "epoch": 0.09398722737679238, + "grad_norm": 0.23547357320785522, + "learning_rate": 3.124497991967872e-06, + "loss": 0.2737, + "step": 390 + }, + { + "epoch": 0.09639715628388963, + "grad_norm": 0.19922538101673126, + "learning_rate": 3.204819277108434e-06, + "loss": 0.2741, + "step": 400 + }, + { + "epoch": 0.09880708519098687, + "grad_norm": 0.1566275954246521, + "learning_rate": 3.2851405622489964e-06, + "loss": 0.267, + "step": 410 + }, + { + "epoch": 0.1012170140980841, + "grad_norm": 0.18829286098480225, + "learning_rate": 3.3654618473895585e-06, + "loss": 0.2697, + "step": 420 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 0.16823740303516388, + "learning_rate": 3.4457831325301206e-06, + "loss": 0.2736, + "step": 430 + }, + { + "epoch": 0.10603687191227859, + "grad_norm": 0.16835574805736542, + "learning_rate": 3.526104417670683e-06, + "loss": 0.2667, + "step": 440 + }, + { + "epoch": 0.10844680081937583, + "grad_norm": 0.16501298546791077, + "learning_rate": 3.6064257028112455e-06, + "loss": 0.2681, + "step": 450 + }, + { + "epoch": 0.11085672972647306, + "grad_norm": 0.16061000525951385, + "learning_rate": 3.6867469879518076e-06, + "loss": 0.2702, + "step": 460 + }, + { + "epoch": 0.11326665863357031, + "grad_norm": 0.16026341915130615, + "learning_rate": 3.7670682730923697e-06, + "loss": 0.2666, + "step": 470 + }, + { + "epoch": 0.11567658754066755, + "grad_norm": 0.16079480946063995, + "learning_rate": 3.847389558232932e-06, + "loss": 0.2667, + "step": 480 + }, + { + "epoch": 0.11808651644776479, + "grad_norm": 0.16454647481441498, + "learning_rate": 3.927710843373494e-06, + "loss": 0.2646, + "step": 490 + }, + { + "epoch": 0.12049644535486204, + "grad_norm": 0.20878982543945312, + "learning_rate": 4.008032128514057e-06, + "loss": 0.2686, + "step": 500 + }, + { + "epoch": 0.12290637426195927, + "grad_norm": 0.22473016381263733, + "learning_rate": 4.088353413654618e-06, + "loss": 0.264, + "step": 510 + }, + { + "epoch": 0.1253163031690565, + "grad_norm": 0.16910991072654724, + "learning_rate": 4.168674698795181e-06, + "loss": 0.2685, + "step": 520 + }, + { + "epoch": 0.12772623207615375, + "grad_norm": 0.16423989832401276, + "learning_rate": 4.248995983935743e-06, + "loss": 0.265, + "step": 530 + }, + { + "epoch": 0.13013616098325098, + "grad_norm": 0.16456137597560883, + "learning_rate": 4.329317269076306e-06, + "loss": 0.264, + "step": 540 + }, + { + "epoch": 0.13254608989034825, + "grad_norm": 0.22010430693626404, + "learning_rate": 4.4096385542168675e-06, + "loss": 0.2629, + "step": 550 + }, + { + "epoch": 0.13495601879744548, + "grad_norm": 0.186540886759758, + "learning_rate": 4.48995983935743e-06, + "loss": 0.2606, + "step": 560 + }, + { + "epoch": 0.13736594770454272, + "grad_norm": 0.1970025897026062, + "learning_rate": 4.5702811244979925e-06, + "loss": 0.2649, + "step": 570 + }, + { + "epoch": 0.13977587661163995, + "grad_norm": 0.18762168288230896, + "learning_rate": 4.650602409638554e-06, + "loss": 0.2628, + "step": 580 + }, + { + "epoch": 0.1421858055187372, + "grad_norm": 0.18925753235816956, + "learning_rate": 4.730923694779117e-06, + "loss": 0.2636, + "step": 590 + }, + { + "epoch": 0.14459573442583443, + "grad_norm": 0.24714338779449463, + "learning_rate": 4.811244979919679e-06, + "loss": 0.2587, + "step": 600 + }, + { + "epoch": 0.1470056633329317, + "grad_norm": 0.18089915812015533, + "learning_rate": 4.891566265060242e-06, + "loss": 0.2586, + "step": 610 + }, + { + "epoch": 0.14941559224002893, + "grad_norm": 0.21872669458389282, + "learning_rate": 4.971887550200803e-06, + "loss": 0.2596, + "step": 620 + }, + { + "epoch": 0.15182552114712616, + "grad_norm": 0.23077884316444397, + "learning_rate": 5.052208835341366e-06, + "loss": 0.2592, + "step": 630 + }, + { + "epoch": 0.1542354500542234, + "grad_norm": 0.20345240831375122, + "learning_rate": 5.132530120481927e-06, + "loss": 0.2582, + "step": 640 + }, + { + "epoch": 0.15664537896132064, + "grad_norm": 0.1861467957496643, + "learning_rate": 5.21285140562249e-06, + "loss": 0.2584, + "step": 650 + }, + { + "epoch": 0.15905530786841787, + "grad_norm": 0.18403823673725128, + "learning_rate": 5.293172690763053e-06, + "loss": 0.2572, + "step": 660 + }, + { + "epoch": 0.1614652367755151, + "grad_norm": 0.19485147297382355, + "learning_rate": 5.373493975903615e-06, + "loss": 0.2596, + "step": 670 + }, + { + "epoch": 0.16387516568261237, + "grad_norm": 0.26093626022338867, + "learning_rate": 5.453815261044177e-06, + "loss": 0.2566, + "step": 680 + }, + { + "epoch": 0.1662850945897096, + "grad_norm": 0.19746626913547516, + "learning_rate": 5.53413654618474e-06, + "loss": 0.2585, + "step": 690 + }, + { + "epoch": 0.16869502349680685, + "grad_norm": 0.19504830241203308, + "learning_rate": 5.6144578313253015e-06, + "loss": 0.2528, + "step": 700 + }, + { + "epoch": 0.17110495240390408, + "grad_norm": 0.25125887989997864, + "learning_rate": 5.694779116465864e-06, + "loss": 0.2533, + "step": 710 + }, + { + "epoch": 0.17351488131100132, + "grad_norm": 0.1774957925081253, + "learning_rate": 5.775100401606426e-06, + "loss": 0.2547, + "step": 720 + }, + { + "epoch": 0.17592481021809855, + "grad_norm": 0.2207845002412796, + "learning_rate": 5.855421686746988e-06, + "loss": 0.259, + "step": 730 + }, + { + "epoch": 0.17833473912519582, + "grad_norm": 0.20615459978580475, + "learning_rate": 5.935742971887551e-06, + "loss": 0.2585, + "step": 740 + }, + { + "epoch": 0.18074466803229305, + "grad_norm": 0.19467566907405853, + "learning_rate": 6.016064257028112e-06, + "loss": 0.2561, + "step": 750 + }, + { + "epoch": 0.1831545969393903, + "grad_norm": 0.18339978158473969, + "learning_rate": 6.096385542168676e-06, + "loss": 0.2502, + "step": 760 + }, + { + "epoch": 0.18556452584648753, + "grad_norm": 0.20017538964748383, + "learning_rate": 6.176706827309238e-06, + "loss": 0.2546, + "step": 770 + }, + { + "epoch": 0.18797445475358476, + "grad_norm": 0.17474955320358276, + "learning_rate": 6.2570281124498e-06, + "loss": 0.2515, + "step": 780 + }, + { + "epoch": 0.190384383660682, + "grad_norm": 0.21440750360488892, + "learning_rate": 6.337349397590362e-06, + "loss": 0.2564, + "step": 790 + }, + { + "epoch": 0.19279431256777926, + "grad_norm": 0.2097756266593933, + "learning_rate": 6.417670682730924e-06, + "loss": 0.254, + "step": 800 + }, + { + "epoch": 0.1952042414748765, + "grad_norm": 0.1950419396162033, + "learning_rate": 6.4979919678714864e-06, + "loss": 0.2523, + "step": 810 + }, + { + "epoch": 0.19761417038197374, + "grad_norm": 0.22887657582759857, + "learning_rate": 6.578313253012049e-06, + "loss": 0.2537, + "step": 820 + }, + { + "epoch": 0.20002409928907097, + "grad_norm": 0.27371925115585327, + "learning_rate": 6.6586345381526106e-06, + "loss": 0.2552, + "step": 830 + }, + { + "epoch": 0.2024340281961682, + "grad_norm": 0.23672780394554138, + "learning_rate": 6.738955823293173e-06, + "loss": 0.2513, + "step": 840 + }, + { + "epoch": 0.20484395710326544, + "grad_norm": 0.19304253160953522, + "learning_rate": 6.819277108433735e-06, + "loss": 0.2533, + "step": 850 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 0.199726864695549, + "learning_rate": 6.899598393574298e-06, + "loss": 0.251, + "step": 860 + }, + { + "epoch": 0.20966381491745995, + "grad_norm": 0.19144225120544434, + "learning_rate": 6.9799196787148605e-06, + "loss": 0.2525, + "step": 870 + }, + { + "epoch": 0.21207374382455718, + "grad_norm": 0.20240966975688934, + "learning_rate": 7.060240963855422e-06, + "loss": 0.2515, + "step": 880 + }, + { + "epoch": 0.21448367273165442, + "grad_norm": 0.22919943928718567, + "learning_rate": 7.140562248995985e-06, + "loss": 0.251, + "step": 890 + }, + { + "epoch": 0.21689360163875165, + "grad_norm": 0.23874454200267792, + "learning_rate": 7.220883534136547e-06, + "loss": 0.2475, + "step": 900 + }, + { + "epoch": 0.2193035305458489, + "grad_norm": 0.19525328278541565, + "learning_rate": 7.301204819277109e-06, + "loss": 0.2493, + "step": 910 + }, + { + "epoch": 0.22171345945294613, + "grad_norm": 0.2014315128326416, + "learning_rate": 7.381526104417671e-06, + "loss": 0.252, + "step": 920 + }, + { + "epoch": 0.2241233883600434, + "grad_norm": 0.19596624374389648, + "learning_rate": 7.461847389558233e-06, + "loss": 0.2501, + "step": 930 + }, + { + "epoch": 0.22653331726714063, + "grad_norm": 0.19004347920417786, + "learning_rate": 7.5421686746987955e-06, + "loss": 0.2498, + "step": 940 + }, + { + "epoch": 0.22894324617423786, + "grad_norm": 0.23779860138893127, + "learning_rate": 7.622489959839358e-06, + "loss": 0.2497, + "step": 950 + }, + { + "epoch": 0.2313531750813351, + "grad_norm": 0.23027803003787994, + "learning_rate": 7.702811244979921e-06, + "loss": 0.25, + "step": 960 + }, + { + "epoch": 0.23376310398843234, + "grad_norm": 0.21898239850997925, + "learning_rate": 7.783132530120484e-06, + "loss": 0.2493, + "step": 970 + }, + { + "epoch": 0.23617303289552957, + "grad_norm": 0.18939456343650818, + "learning_rate": 7.863453815261045e-06, + "loss": 0.2457, + "step": 980 + }, + { + "epoch": 0.23858296180262684, + "grad_norm": 0.19975551962852478, + "learning_rate": 7.943775100401607e-06, + "loss": 0.2479, + "step": 990 + }, + { + "epoch": 0.24099289070972407, + "grad_norm": 0.1990450918674469, + "learning_rate": 8.02409638554217e-06, + "loss": 0.2502, + "step": 1000 + }, + { + "epoch": 0.2434028196168213, + "grad_norm": 0.21345728635787964, + "learning_rate": 8.104417670682732e-06, + "loss": 0.2486, + "step": 1010 + }, + { + "epoch": 0.24581274852391855, + "grad_norm": 0.24549713730812073, + "learning_rate": 8.184738955823293e-06, + "loss": 0.2492, + "step": 1020 + }, + { + "epoch": 0.24822267743101578, + "grad_norm": 0.20228305459022522, + "learning_rate": 8.265060240963855e-06, + "loss": 0.2473, + "step": 1030 + }, + { + "epoch": 0.250632606338113, + "grad_norm": 0.1969956010580063, + "learning_rate": 8.345381526104418e-06, + "loss": 0.2477, + "step": 1040 + }, + { + "epoch": 0.25304253524521025, + "grad_norm": 0.23591509461402893, + "learning_rate": 8.42570281124498e-06, + "loss": 0.2468, + "step": 1050 + }, + { + "epoch": 0.2554524641523075, + "grad_norm": 0.21046389639377594, + "learning_rate": 8.506024096385543e-06, + "loss": 0.2479, + "step": 1060 + }, + { + "epoch": 0.2578623930594047, + "grad_norm": 0.2074204385280609, + "learning_rate": 8.586345381526105e-06, + "loss": 0.2452, + "step": 1070 + }, + { + "epoch": 0.26027232196650196, + "grad_norm": 0.3702162206172943, + "learning_rate": 8.666666666666668e-06, + "loss": 0.2477, + "step": 1080 + }, + { + "epoch": 0.26268225087359925, + "grad_norm": 0.21479126811027527, + "learning_rate": 8.74698795180723e-06, + "loss": 0.2486, + "step": 1090 + }, + { + "epoch": 0.2650921797806965, + "grad_norm": 0.20660068094730377, + "learning_rate": 8.827309236947791e-06, + "loss": 0.2465, + "step": 1100 + }, + { + "epoch": 0.2675021086877937, + "grad_norm": 0.2068599909543991, + "learning_rate": 8.907630522088354e-06, + "loss": 0.2512, + "step": 1110 + }, + { + "epoch": 0.26991203759489096, + "grad_norm": 0.2575608789920807, + "learning_rate": 8.987951807228916e-06, + "loss": 0.2476, + "step": 1120 + }, + { + "epoch": 0.2723219665019882, + "grad_norm": 0.19399215281009674, + "learning_rate": 9.068273092369479e-06, + "loss": 0.2435, + "step": 1130 + }, + { + "epoch": 0.27473189540908544, + "grad_norm": 0.2270069122314453, + "learning_rate": 9.148594377510041e-06, + "loss": 0.248, + "step": 1140 + }, + { + "epoch": 0.27714182431618267, + "grad_norm": 0.22409838438034058, + "learning_rate": 9.228915662650602e-06, + "loss": 0.2431, + "step": 1150 + }, + { + "epoch": 0.2795517532232799, + "grad_norm": 0.24383105337619781, + "learning_rate": 9.309236947791166e-06, + "loss": 0.2457, + "step": 1160 + }, + { + "epoch": 0.28196168213037714, + "grad_norm": 0.2236703783273697, + "learning_rate": 9.389558232931729e-06, + "loss": 0.243, + "step": 1170 + }, + { + "epoch": 0.2843716110374744, + "grad_norm": 0.22612161934375763, + "learning_rate": 9.46987951807229e-06, + "loss": 0.2448, + "step": 1180 + }, + { + "epoch": 0.2867815399445716, + "grad_norm": 0.23215730488300323, + "learning_rate": 9.550200803212852e-06, + "loss": 0.2418, + "step": 1190 + }, + { + "epoch": 0.28919146885166885, + "grad_norm": 0.22533060610294342, + "learning_rate": 9.630522088353414e-06, + "loss": 0.2425, + "step": 1200 + }, + { + "epoch": 0.2916013977587661, + "grad_norm": 0.22571276128292084, + "learning_rate": 9.710843373493977e-06, + "loss": 0.2455, + "step": 1210 + }, + { + "epoch": 0.2940113266658634, + "grad_norm": 0.24537992477416992, + "learning_rate": 9.79116465863454e-06, + "loss": 0.2394, + "step": 1220 + }, + { + "epoch": 0.2964212555729606, + "grad_norm": 0.21820850670337677, + "learning_rate": 9.8714859437751e-06, + "loss": 0.2464, + "step": 1230 + }, + { + "epoch": 0.29883118448005785, + "grad_norm": 0.2275010347366333, + "learning_rate": 9.951807228915663e-06, + "loss": 0.2427, + "step": 1240 + }, + { + "epoch": 0.3012411133871551, + "grad_norm": 0.1905357539653778, + "learning_rate": 9.999996855613166e-06, + "loss": 0.243, + "step": 1250 + }, + { + "epoch": 0.3036510422942523, + "grad_norm": 0.2797210216522217, + "learning_rate": 9.999961481306676e-06, + "loss": 0.2425, + "step": 1260 + }, + { + "epoch": 0.30606097120134956, + "grad_norm": 0.1980619579553604, + "learning_rate": 9.999886802489159e-06, + "loss": 0.2403, + "step": 1270 + }, + { + "epoch": 0.3084709001084468, + "grad_norm": 0.20793509483337402, + "learning_rate": 9.999772819747658e-06, + "loss": 0.2436, + "step": 1280 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 0.20385341346263885, + "learning_rate": 9.99961953397819e-06, + "loss": 0.2465, + "step": 1290 + }, + { + "epoch": 0.31329075792264127, + "grad_norm": 0.20672109723091125, + "learning_rate": 9.999426946385727e-06, + "loss": 0.2467, + "step": 1300 + }, + { + "epoch": 0.3157006868297385, + "grad_norm": 0.1893812119960785, + "learning_rate": 9.999195058484192e-06, + "loss": 0.2435, + "step": 1310 + }, + { + "epoch": 0.31811061573683574, + "grad_norm": 0.19826377928256989, + "learning_rate": 9.998923872096449e-06, + "loss": 0.2444, + "step": 1320 + }, + { + "epoch": 0.320520544643933, + "grad_norm": 0.25412455201148987, + "learning_rate": 9.998613389354283e-06, + "loss": 0.2404, + "step": 1330 + }, + { + "epoch": 0.3229304735510302, + "grad_norm": 0.22612008452415466, + "learning_rate": 9.998263612698386e-06, + "loss": 0.2457, + "step": 1340 + }, + { + "epoch": 0.3253404024581275, + "grad_norm": 0.2398287057876587, + "learning_rate": 9.997874544878343e-06, + "loss": 0.2417, + "step": 1350 + }, + { + "epoch": 0.32775033136522475, + "grad_norm": 0.26140671968460083, + "learning_rate": 9.997446188952599e-06, + "loss": 0.2368, + "step": 1360 + }, + { + "epoch": 0.330160260272322, + "grad_norm": 0.19120003283023834, + "learning_rate": 9.996978548288446e-06, + "loss": 0.245, + "step": 1370 + }, + { + "epoch": 0.3325701891794192, + "grad_norm": 0.21241794526576996, + "learning_rate": 9.996471626561988e-06, + "loss": 0.2424, + "step": 1380 + }, + { + "epoch": 0.33498011808651645, + "grad_norm": 0.20897141098976135, + "learning_rate": 9.995925427758117e-06, + "loss": 0.2424, + "step": 1390 + }, + { + "epoch": 0.3373900469936137, + "grad_norm": 0.2229437381029129, + "learning_rate": 9.995339956170482e-06, + "loss": 0.2404, + "step": 1400 + }, + { + "epoch": 0.3397999759007109, + "grad_norm": 0.23609474301338196, + "learning_rate": 9.994715216401457e-06, + "loss": 0.2392, + "step": 1410 + }, + { + "epoch": 0.34220990480780816, + "grad_norm": 0.20246021449565887, + "learning_rate": 9.994051213362091e-06, + "loss": 0.2401, + "step": 1420 + }, + { + "epoch": 0.3446198337149054, + "grad_norm": 0.18765322864055634, + "learning_rate": 9.993347952272095e-06, + "loss": 0.2351, + "step": 1430 + }, + { + "epoch": 0.34702976262200264, + "grad_norm": 0.23499734699726105, + "learning_rate": 9.992605438659773e-06, + "loss": 0.2392, + "step": 1440 + }, + { + "epoch": 0.34943969152909987, + "grad_norm": 0.2348964661359787, + "learning_rate": 9.991823678361997e-06, + "loss": 0.2405, + "step": 1450 + }, + { + "epoch": 0.3518496204361971, + "grad_norm": 0.21696138381958008, + "learning_rate": 9.991002677524158e-06, + "loss": 0.2385, + "step": 1460 + }, + { + "epoch": 0.3542595493432944, + "grad_norm": 0.2489856481552124, + "learning_rate": 9.990142442600113e-06, + "loss": 0.2426, + "step": 1470 + }, + { + "epoch": 0.35666947825039164, + "grad_norm": 0.20439158380031586, + "learning_rate": 9.989242980352134e-06, + "loss": 0.2422, + "step": 1480 + }, + { + "epoch": 0.3590794071574889, + "grad_norm": 0.23717837035655975, + "learning_rate": 9.988304297850864e-06, + "loss": 0.2405, + "step": 1490 + }, + { + "epoch": 0.3614893360645861, + "grad_norm": 0.22787103056907654, + "learning_rate": 9.987326402475246e-06, + "loss": 0.2417, + "step": 1500 + }, + { + "epoch": 0.36389926497168334, + "grad_norm": 0.2548515796661377, + "learning_rate": 9.986309301912484e-06, + "loss": 0.2365, + "step": 1510 + }, + { + "epoch": 0.3663091938787806, + "grad_norm": 0.2120496928691864, + "learning_rate": 9.985253004157967e-06, + "loss": 0.2382, + "step": 1520 + }, + { + "epoch": 0.3687191227858778, + "grad_norm": 0.22003258764743805, + "learning_rate": 9.984157517515209e-06, + "loss": 0.2358, + "step": 1530 + }, + { + "epoch": 0.37112905169297505, + "grad_norm": 0.24021808803081512, + "learning_rate": 9.983022850595794e-06, + "loss": 0.2379, + "step": 1540 + }, + { + "epoch": 0.3735389806000723, + "grad_norm": 0.17676447331905365, + "learning_rate": 9.981849012319294e-06, + "loss": 0.2374, + "step": 1550 + }, + { + "epoch": 0.3759489095071695, + "grad_norm": 0.21659384667873383, + "learning_rate": 9.980636011913207e-06, + "loss": 0.2378, + "step": 1560 + }, + { + "epoch": 0.37835883841426676, + "grad_norm": 0.21923330426216125, + "learning_rate": 9.979383858912886e-06, + "loss": 0.2383, + "step": 1570 + }, + { + "epoch": 0.380768767321364, + "grad_norm": 0.1923467069864273, + "learning_rate": 9.97809256316146e-06, + "loss": 0.2358, + "step": 1580 + }, + { + "epoch": 0.38317869622846124, + "grad_norm": 0.2098626345396042, + "learning_rate": 9.976762134809752e-06, + "loss": 0.2391, + "step": 1590 + }, + { + "epoch": 0.3855886251355585, + "grad_norm": 0.22998303174972534, + "learning_rate": 9.975392584316215e-06, + "loss": 0.2356, + "step": 1600 + }, + { + "epoch": 0.38799855404265576, + "grad_norm": 0.21946196258068085, + "learning_rate": 9.973983922446832e-06, + "loss": 0.2389, + "step": 1610 + }, + { + "epoch": 0.390408482949753, + "grad_norm": 0.2061765342950821, + "learning_rate": 9.972536160275042e-06, + "loss": 0.2352, + "step": 1620 + }, + { + "epoch": 0.39281841185685024, + "grad_norm": 0.24974577128887177, + "learning_rate": 9.971049309181648e-06, + "loss": 0.237, + "step": 1630 + }, + { + "epoch": 0.39522834076394747, + "grad_norm": 0.2364003211259842, + "learning_rate": 9.969523380854736e-06, + "loss": 0.239, + "step": 1640 + }, + { + "epoch": 0.3976382696710447, + "grad_norm": 0.23062963783740997, + "learning_rate": 9.967958387289564e-06, + "loss": 0.2396, + "step": 1650 + }, + { + "epoch": 0.40004819857814194, + "grad_norm": 0.17892813682556152, + "learning_rate": 9.966354340788496e-06, + "loss": 0.2352, + "step": 1660 + }, + { + "epoch": 0.4024581274852392, + "grad_norm": 0.19819405674934387, + "learning_rate": 9.964711253960877e-06, + "loss": 0.2364, + "step": 1670 + }, + { + "epoch": 0.4048680563923364, + "grad_norm": 0.2314508855342865, + "learning_rate": 9.963029139722952e-06, + "loss": 0.2371, + "step": 1680 + }, + { + "epoch": 0.40727798529943365, + "grad_norm": 0.21803642809391022, + "learning_rate": 9.96130801129776e-06, + "loss": 0.2354, + "step": 1690 + }, + { + "epoch": 0.4096879142065309, + "grad_norm": 0.23534069955348969, + "learning_rate": 9.959547882215025e-06, + "loss": 0.2366, + "step": 1700 + }, + { + "epoch": 0.4120978431136281, + "grad_norm": 0.22902339696884155, + "learning_rate": 9.957748766311059e-06, + "loss": 0.2393, + "step": 1710 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 0.2218971848487854, + "learning_rate": 9.955910677728643e-06, + "loss": 0.2377, + "step": 1720 + }, + { + "epoch": 0.41691770092782265, + "grad_norm": 0.1902938187122345, + "learning_rate": 9.954033630916926e-06, + "loss": 0.2339, + "step": 1730 + }, + { + "epoch": 0.4193276298349199, + "grad_norm": 0.16372597217559814, + "learning_rate": 9.952117640631298e-06, + "loss": 0.2336, + "step": 1740 + }, + { + "epoch": 0.4217375587420171, + "grad_norm": 0.24245886504650116, + "learning_rate": 9.950162721933292e-06, + "loss": 0.2362, + "step": 1750 + }, + { + "epoch": 0.42414748764911436, + "grad_norm": 0.19669395685195923, + "learning_rate": 9.948168890190448e-06, + "loss": 0.2332, + "step": 1760 + }, + { + "epoch": 0.4265574165562116, + "grad_norm": 0.1758386343717575, + "learning_rate": 9.946136161076205e-06, + "loss": 0.235, + "step": 1770 + }, + { + "epoch": 0.42896734546330884, + "grad_norm": 0.1818431317806244, + "learning_rate": 9.94406455056977e-06, + "loss": 0.2368, + "step": 1780 + }, + { + "epoch": 0.43137727437040607, + "grad_norm": 0.1829291433095932, + "learning_rate": 9.941954074955995e-06, + "loss": 0.2307, + "step": 1790 + }, + { + "epoch": 0.4337872032775033, + "grad_norm": 0.18251778185367584, + "learning_rate": 9.939804750825253e-06, + "loss": 0.2329, + "step": 1800 + }, + { + "epoch": 0.43619713218460054, + "grad_norm": 0.19176249206066132, + "learning_rate": 9.937616595073299e-06, + "loss": 0.2348, + "step": 1810 + }, + { + "epoch": 0.4386070610916978, + "grad_norm": 0.19189012050628662, + "learning_rate": 9.935389624901143e-06, + "loss": 0.2368, + "step": 1820 + }, + { + "epoch": 0.441016989998795, + "grad_norm": 0.1790163666009903, + "learning_rate": 9.933123857814917e-06, + "loss": 0.2354, + "step": 1830 + }, + { + "epoch": 0.44342691890589225, + "grad_norm": 0.2602790892124176, + "learning_rate": 9.93081931162573e-06, + "loss": 0.2341, + "step": 1840 + }, + { + "epoch": 0.4458368478129895, + "grad_norm": 0.16906730830669403, + "learning_rate": 9.928476004449534e-06, + "loss": 0.2334, + "step": 1850 + }, + { + "epoch": 0.4482467767200868, + "grad_norm": 0.1677951216697693, + "learning_rate": 9.926093954706982e-06, + "loss": 0.2338, + "step": 1860 + }, + { + "epoch": 0.450656705627184, + "grad_norm": 0.1938229352235794, + "learning_rate": 9.923673181123273e-06, + "loss": 0.2369, + "step": 1870 + }, + { + "epoch": 0.45306663453428125, + "grad_norm": 0.18625344336032867, + "learning_rate": 9.921213702728023e-06, + "loss": 0.2324, + "step": 1880 + }, + { + "epoch": 0.4554765634413785, + "grad_norm": 0.20547117292881012, + "learning_rate": 9.918715538855098e-06, + "loss": 0.2332, + "step": 1890 + }, + { + "epoch": 0.4578864923484757, + "grad_norm": 0.20020659267902374, + "learning_rate": 9.916178709142472e-06, + "loss": 0.233, + "step": 1900 + }, + { + "epoch": 0.46029642125557296, + "grad_norm": 0.24178935587406158, + "learning_rate": 9.913603233532067e-06, + "loss": 0.2341, + "step": 1910 + }, + { + "epoch": 0.4627063501626702, + "grad_norm": 0.17368467152118683, + "learning_rate": 9.910989132269604e-06, + "loss": 0.2339, + "step": 1920 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 0.19401304423809052, + "learning_rate": 9.908336425904432e-06, + "loss": 0.2341, + "step": 1930 + }, + { + "epoch": 0.46752620797686467, + "grad_norm": 0.19002994894981384, + "learning_rate": 9.905645135289378e-06, + "loss": 0.2342, + "step": 1940 + }, + { + "epoch": 0.4699361368839619, + "grad_norm": 0.21070018410682678, + "learning_rate": 9.902915281580581e-06, + "loss": 0.2356, + "step": 1950 + }, + { + "epoch": 0.47234606579105914, + "grad_norm": 0.19492904841899872, + "learning_rate": 9.900146886237316e-06, + "loss": 0.2341, + "step": 1960 + }, + { + "epoch": 0.4747559946981564, + "grad_norm": 0.2291557788848877, + "learning_rate": 9.897339971021836e-06, + "loss": 0.2332, + "step": 1970 + }, + { + "epoch": 0.4771659236052537, + "grad_norm": 0.21163547039031982, + "learning_rate": 9.894494557999195e-06, + "loss": 0.232, + "step": 1980 + }, + { + "epoch": 0.4795758525123509, + "grad_norm": 0.30296486616134644, + "learning_rate": 9.891610669537084e-06, + "loss": 0.2293, + "step": 1990 + }, + { + "epoch": 0.48198578141944814, + "grad_norm": 0.1773892492055893, + "learning_rate": 9.888688328305638e-06, + "loss": 0.2324, + "step": 2000 + }, + { + "epoch": 0.4843957103265454, + "grad_norm": 0.17265821993350983, + "learning_rate": 9.885727557277275e-06, + "loss": 0.2322, + "step": 2010 + }, + { + "epoch": 0.4868056392336426, + "grad_norm": 0.18319979310035706, + "learning_rate": 9.882728379726506e-06, + "loss": 0.2334, + "step": 2020 + }, + { + "epoch": 0.48921556814073985, + "grad_norm": 0.17308953404426575, + "learning_rate": 9.879690819229752e-06, + "loss": 0.2269, + "step": 2030 + }, + { + "epoch": 0.4916254970478371, + "grad_norm": 0.24003373086452484, + "learning_rate": 9.876614899665167e-06, + "loss": 0.2328, + "step": 2040 + }, + { + "epoch": 0.4940354259549343, + "grad_norm": 0.22242967784404755, + "learning_rate": 9.873500645212434e-06, + "loss": 0.2328, + "step": 2050 + }, + { + "epoch": 0.49644535486203156, + "grad_norm": 0.185471773147583, + "learning_rate": 9.870348080352597e-06, + "loss": 0.2324, + "step": 2060 + }, + { + "epoch": 0.4988552837691288, + "grad_norm": 0.17182640731334686, + "learning_rate": 9.867157229867847e-06, + "loss": 0.2301, + "step": 2070 + }, + { + "epoch": 0.501265212676226, + "grad_norm": 0.19547314941883087, + "learning_rate": 9.863928118841344e-06, + "loss": 0.2342, + "step": 2080 + }, + { + "epoch": 0.5036751415833233, + "grad_norm": 0.2110988199710846, + "learning_rate": 9.860660772657008e-06, + "loss": 0.2313, + "step": 2090 + }, + { + "epoch": 0.5060850704904205, + "grad_norm": 0.2130996584892273, + "learning_rate": 9.857355216999324e-06, + "loss": 0.229, + "step": 2100 + }, + { + "epoch": 0.5084949993975177, + "grad_norm": 0.20759201049804688, + "learning_rate": 9.854011477853147e-06, + "loss": 0.2325, + "step": 2110 + }, + { + "epoch": 0.510904928304615, + "grad_norm": 0.19392450153827667, + "learning_rate": 9.850629581503481e-06, + "loss": 0.2315, + "step": 2120 + }, + { + "epoch": 0.5133148572117122, + "grad_norm": 0.17622078955173492, + "learning_rate": 9.847209554535288e-06, + "loss": 0.2314, + "step": 2130 + }, + { + "epoch": 0.5157247861188095, + "grad_norm": 0.1835789829492569, + "learning_rate": 9.843751423833274e-06, + "loss": 0.2297, + "step": 2140 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 0.20619632303714752, + "learning_rate": 9.840255216581676e-06, + "loss": 0.2283, + "step": 2150 + }, + { + "epoch": 0.5205446439330039, + "grad_norm": 0.1713174283504486, + "learning_rate": 9.836720960264049e-06, + "loss": 0.2341, + "step": 2160 + }, + { + "epoch": 0.5229545728401012, + "grad_norm": 0.162379190325737, + "learning_rate": 9.833148682663048e-06, + "loss": 0.2328, + "step": 2170 + }, + { + "epoch": 0.5253645017471985, + "grad_norm": 0.16697606444358826, + "learning_rate": 9.829538411860218e-06, + "loss": 0.231, + "step": 2180 + }, + { + "epoch": 0.5277744306542957, + "grad_norm": 0.17996534705162048, + "learning_rate": 9.82589017623576e-06, + "loss": 0.2326, + "step": 2190 + }, + { + "epoch": 0.530184359561393, + "grad_norm": 0.18227118253707886, + "learning_rate": 9.822204004468319e-06, + "loss": 0.2322, + "step": 2200 + }, + { + "epoch": 0.5325942884684902, + "grad_norm": 0.168010875582695, + "learning_rate": 9.818479925534755e-06, + "loss": 0.2269, + "step": 2210 + }, + { + "epoch": 0.5350042173755875, + "grad_norm": 0.1992473155260086, + "learning_rate": 9.814717968709912e-06, + "loss": 0.2334, + "step": 2220 + }, + { + "epoch": 0.5374141462826847, + "grad_norm": 0.22966599464416504, + "learning_rate": 9.810918163566396e-06, + "loss": 0.2296, + "step": 2230 + }, + { + "epoch": 0.5398240751897819, + "grad_norm": 0.1932167261838913, + "learning_rate": 9.80708053997433e-06, + "loss": 0.23, + "step": 2240 + }, + { + "epoch": 0.5422340040968792, + "grad_norm": 0.20305748283863068, + "learning_rate": 9.803205128101134e-06, + "loss": 0.2325, + "step": 2250 + }, + { + "epoch": 0.5446439330039764, + "grad_norm": 0.1645287424325943, + "learning_rate": 9.799291958411273e-06, + "loss": 0.2311, + "step": 2260 + }, + { + "epoch": 0.5470538619110736, + "grad_norm": 0.1746455430984497, + "learning_rate": 9.795341061666031e-06, + "loss": 0.2302, + "step": 2270 + }, + { + "epoch": 0.5494637908181709, + "grad_norm": 0.2108176350593567, + "learning_rate": 9.791352468923257e-06, + "loss": 0.2325, + "step": 2280 + }, + { + "epoch": 0.5518737197252681, + "grad_norm": 0.1855156570672989, + "learning_rate": 9.787326211537132e-06, + "loss": 0.2269, + "step": 2290 + }, + { + "epoch": 0.5542836486323653, + "grad_norm": 0.1825191229581833, + "learning_rate": 9.783262321157915e-06, + "loss": 0.2314, + "step": 2300 + }, + { + "epoch": 0.5566935775394626, + "grad_norm": 0.18101802468299866, + "learning_rate": 9.779160829731698e-06, + "loss": 0.2311, + "step": 2310 + }, + { + "epoch": 0.5591035064465598, + "grad_norm": 0.1984725147485733, + "learning_rate": 9.77502176950015e-06, + "loss": 0.2296, + "step": 2320 + }, + { + "epoch": 0.561513435353657, + "grad_norm": 0.16921648383140564, + "learning_rate": 9.770845173000272e-06, + "loss": 0.2302, + "step": 2330 + }, + { + "epoch": 0.5639233642607543, + "grad_norm": 0.20288076996803284, + "learning_rate": 9.766631073064132e-06, + "loss": 0.2281, + "step": 2340 + }, + { + "epoch": 0.5663332931678515, + "grad_norm": 0.1872728168964386, + "learning_rate": 9.762379502818613e-06, + "loss": 0.2297, + "step": 2350 + }, + { + "epoch": 0.5687432220749488, + "grad_norm": 0.1912074089050293, + "learning_rate": 9.758090495685151e-06, + "loss": 0.2294, + "step": 2360 + }, + { + "epoch": 0.571153150982046, + "grad_norm": 0.20468741655349731, + "learning_rate": 9.75376408537947e-06, + "loss": 0.2308, + "step": 2370 + }, + { + "epoch": 0.5735630798891432, + "grad_norm": 0.19299040734767914, + "learning_rate": 9.749400305911323e-06, + "loss": 0.2299, + "step": 2380 + }, + { + "epoch": 0.5759730087962405, + "grad_norm": 0.1859942078590393, + "learning_rate": 9.744999191584214e-06, + "loss": 0.2288, + "step": 2390 + }, + { + "epoch": 0.5783829377033377, + "grad_norm": 0.17618857324123383, + "learning_rate": 9.740560776995142e-06, + "loss": 0.232, + "step": 2400 + }, + { + "epoch": 0.5807928666104349, + "grad_norm": 0.20461530983448029, + "learning_rate": 9.736085097034318e-06, + "loss": 0.2292, + "step": 2410 + }, + { + "epoch": 0.5832027955175322, + "grad_norm": 0.19141855835914612, + "learning_rate": 9.731572186884894e-06, + "loss": 0.2281, + "step": 2420 + }, + { + "epoch": 0.5856127244246295, + "grad_norm": 0.19507476687431335, + "learning_rate": 9.727022082022692e-06, + "loss": 0.2249, + "step": 2430 + }, + { + "epoch": 0.5880226533317268, + "grad_norm": 0.1954670399427414, + "learning_rate": 9.722434818215914e-06, + "loss": 0.2315, + "step": 2440 + }, + { + "epoch": 0.590432582238824, + "grad_norm": 0.1806354522705078, + "learning_rate": 9.71781043152487e-06, + "loss": 0.2295, + "step": 2450 + }, + { + "epoch": 0.5928425111459212, + "grad_norm": 0.1709500104188919, + "learning_rate": 9.713148958301692e-06, + "loss": 0.2284, + "step": 2460 + }, + { + "epoch": 0.5952524400530185, + "grad_norm": 0.18373407423496246, + "learning_rate": 9.708450435190048e-06, + "loss": 0.2311, + "step": 2470 + }, + { + "epoch": 0.5976623689601157, + "grad_norm": 0.16200312972068787, + "learning_rate": 9.703714899124853e-06, + "loss": 0.2267, + "step": 2480 + }, + { + "epoch": 0.600072297867213, + "grad_norm": 0.19261179864406586, + "learning_rate": 9.698942387331983e-06, + "loss": 0.2285, + "step": 2490 + }, + { + "epoch": 0.6024822267743102, + "grad_norm": 0.18697507679462433, + "learning_rate": 9.694132937327969e-06, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.6048921556814074, + "grad_norm": 0.19558601081371307, + "learning_rate": 9.689286586919721e-06, + "loss": 0.231, + "step": 2510 + }, + { + "epoch": 0.6073020845885047, + "grad_norm": 0.1981854885816574, + "learning_rate": 9.684403374204223e-06, + "loss": 0.2322, + "step": 2520 + }, + { + "epoch": 0.6097120134956019, + "grad_norm": 0.1691233515739441, + "learning_rate": 9.679483337568223e-06, + "loss": 0.2303, + "step": 2530 + }, + { + "epoch": 0.6121219424026991, + "grad_norm": 0.18886698782444, + "learning_rate": 9.674526515687947e-06, + "loss": 0.2312, + "step": 2540 + }, + { + "epoch": 0.6145318713097964, + "grad_norm": 0.1817430853843689, + "learning_rate": 9.669532947528789e-06, + "loss": 0.2256, + "step": 2550 + }, + { + "epoch": 0.6169418002168936, + "grad_norm": 0.19985781610012054, + "learning_rate": 9.664502672345002e-06, + "loss": 0.2307, + "step": 2560 + }, + { + "epoch": 0.6193517291239908, + "grad_norm": 0.2230524867773056, + "learning_rate": 9.65943572967939e-06, + "loss": 0.2282, + "step": 2570 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 0.17067363858222961, + "learning_rate": 9.654332159363004e-06, + "loss": 0.2321, + "step": 2580 + }, + { + "epoch": 0.6241715869381853, + "grad_norm": 0.17747369408607483, + "learning_rate": 9.649192001514817e-06, + "loss": 0.2258, + "step": 2590 + }, + { + "epoch": 0.6265815158452825, + "grad_norm": 0.21526795625686646, + "learning_rate": 9.64401529654142e-06, + "loss": 0.2263, + "step": 2600 + }, + { + "epoch": 0.6289914447523798, + "grad_norm": 0.1777113825082779, + "learning_rate": 9.638802085136698e-06, + "loss": 0.2265, + "step": 2610 + }, + { + "epoch": 0.631401373659477, + "grad_norm": 0.1751803159713745, + "learning_rate": 9.63355240828151e-06, + "loss": 0.2273, + "step": 2620 + }, + { + "epoch": 0.6338113025665743, + "grad_norm": 0.19355806708335876, + "learning_rate": 9.628266307243373e-06, + "loss": 0.2263, + "step": 2630 + }, + { + "epoch": 0.6362212314736715, + "grad_norm": 0.22096408903598785, + "learning_rate": 9.62294382357613e-06, + "loss": 0.2272, + "step": 2640 + }, + { + "epoch": 0.6386311603807687, + "grad_norm": 0.17918968200683594, + "learning_rate": 9.617584999119624e-06, + "loss": 0.2246, + "step": 2650 + }, + { + "epoch": 0.641041089287866, + "grad_norm": 0.17709515988826752, + "learning_rate": 9.612189875999378e-06, + "loss": 0.2243, + "step": 2660 + }, + { + "epoch": 0.6434510181949632, + "grad_norm": 0.20072241127490997, + "learning_rate": 9.606758496626252e-06, + "loss": 0.2283, + "step": 2670 + }, + { + "epoch": 0.6458609471020604, + "grad_norm": 0.19501695036888123, + "learning_rate": 9.60129090369612e-06, + "loss": 0.2283, + "step": 2680 + }, + { + "epoch": 0.6482708760091578, + "grad_norm": 0.19082598388195038, + "learning_rate": 9.59578714018952e-06, + "loss": 0.2278, + "step": 2690 + }, + { + "epoch": 0.650680804916255, + "grad_norm": 0.1871589571237564, + "learning_rate": 9.590247249371338e-06, + "loss": 0.2301, + "step": 2700 + }, + { + "epoch": 0.6530907338233523, + "grad_norm": 0.20353522896766663, + "learning_rate": 9.584671274790447e-06, + "loss": 0.223, + "step": 2710 + }, + { + "epoch": 0.6555006627304495, + "grad_norm": 0.18655039370059967, + "learning_rate": 9.579059260279376e-06, + "loss": 0.2285, + "step": 2720 + }, + { + "epoch": 0.6579105916375467, + "grad_norm": 0.2111136019229889, + "learning_rate": 9.573411249953963e-06, + "loss": 0.2278, + "step": 2730 + }, + { + "epoch": 0.660320520544644, + "grad_norm": 0.19638030230998993, + "learning_rate": 9.567727288213005e-06, + "loss": 0.2283, + "step": 2740 + }, + { + "epoch": 0.6627304494517412, + "grad_norm": 0.18445482850074768, + "learning_rate": 9.562007419737916e-06, + "loss": 0.2282, + "step": 2750 + }, + { + "epoch": 0.6651403783588384, + "grad_norm": 0.22625213861465454, + "learning_rate": 9.556251689492366e-06, + "loss": 0.2296, + "step": 2760 + }, + { + "epoch": 0.6675503072659357, + "grad_norm": 0.23736052215099335, + "learning_rate": 9.550460142721938e-06, + "loss": 0.2256, + "step": 2770 + }, + { + "epoch": 0.6699602361730329, + "grad_norm": 0.16888026893138885, + "learning_rate": 9.544632824953767e-06, + "loss": 0.2233, + "step": 2780 + }, + { + "epoch": 0.6723701650801301, + "grad_norm": 0.20066499710083008, + "learning_rate": 9.538769781996178e-06, + "loss": 0.2256, + "step": 2790 + }, + { + "epoch": 0.6747800939872274, + "grad_norm": 0.18064813315868378, + "learning_rate": 9.532871059938335e-06, + "loss": 0.2266, + "step": 2800 + }, + { + "epoch": 0.6771900228943246, + "grad_norm": 0.17394624650478363, + "learning_rate": 9.526936705149872e-06, + "loss": 0.2273, + "step": 2810 + }, + { + "epoch": 0.6795999518014219, + "grad_norm": 0.1744656264781952, + "learning_rate": 9.520966764280532e-06, + "loss": 0.2258, + "step": 2820 + }, + { + "epoch": 0.6820098807085191, + "grad_norm": 0.1700519323348999, + "learning_rate": 9.514961284259796e-06, + "loss": 0.229, + "step": 2830 + }, + { + "epoch": 0.6844198096156163, + "grad_norm": 0.20003622770309448, + "learning_rate": 9.50892031229652e-06, + "loss": 0.2251, + "step": 2840 + }, + { + "epoch": 0.6868297385227136, + "grad_norm": 0.19110295176506042, + "learning_rate": 9.50284389587856e-06, + "loss": 0.2283, + "step": 2850 + }, + { + "epoch": 0.6892396674298108, + "grad_norm": 0.18319687247276306, + "learning_rate": 9.4967320827724e-06, + "loss": 0.2251, + "step": 2860 + }, + { + "epoch": 0.691649596336908, + "grad_norm": 0.19341939687728882, + "learning_rate": 9.490584921022773e-06, + "loss": 0.2262, + "step": 2870 + }, + { + "epoch": 0.6940595252440053, + "grad_norm": 0.17186424136161804, + "learning_rate": 9.484402458952289e-06, + "loss": 0.2253, + "step": 2880 + }, + { + "epoch": 0.6964694541511025, + "grad_norm": 0.16283217072486877, + "learning_rate": 9.478184745161052e-06, + "loss": 0.2263, + "step": 2890 + }, + { + "epoch": 0.6988793830581997, + "grad_norm": 0.17181184887886047, + "learning_rate": 9.471931828526282e-06, + "loss": 0.2288, + "step": 2900 + }, + { + "epoch": 0.701289311965297, + "grad_norm": 0.151957705616951, + "learning_rate": 9.46564375820192e-06, + "loss": 0.2237, + "step": 2910 + }, + { + "epoch": 0.7036992408723942, + "grad_norm": 0.21640506386756897, + "learning_rate": 9.459320583618253e-06, + "loss": 0.226, + "step": 2920 + }, + { + "epoch": 0.7061091697794915, + "grad_norm": 0.17349959909915924, + "learning_rate": 9.452962354481523e-06, + "loss": 0.226, + "step": 2930 + }, + { + "epoch": 0.7085190986865888, + "grad_norm": 0.16324348747730255, + "learning_rate": 9.44656912077353e-06, + "loss": 0.2264, + "step": 2940 + }, + { + "epoch": 0.710929027593686, + "grad_norm": 0.17272070050239563, + "learning_rate": 9.440140932751249e-06, + "loss": 0.2278, + "step": 2950 + }, + { + "epoch": 0.7133389565007833, + "grad_norm": 0.17296038568019867, + "learning_rate": 9.433677840946424e-06, + "loss": 0.2248, + "step": 2960 + }, + { + "epoch": 0.7157488854078805, + "grad_norm": 0.22695279121398926, + "learning_rate": 9.427179896165182e-06, + "loss": 0.2252, + "step": 2970 + }, + { + "epoch": 0.7181588143149777, + "grad_norm": 0.1850307136774063, + "learning_rate": 9.420647149487622e-06, + "loss": 0.2272, + "step": 2980 + }, + { + "epoch": 0.720568743222075, + "grad_norm": 0.17052899301052094, + "learning_rate": 9.414079652267422e-06, + "loss": 0.224, + "step": 2990 + }, + { + "epoch": 0.7229786721291722, + "grad_norm": 0.17390076816082, + "learning_rate": 9.407477456131438e-06, + "loss": 0.2263, + "step": 3000 + }, + { + "epoch": 0.7253886010362695, + "grad_norm": 0.19219738245010376, + "learning_rate": 9.400840612979283e-06, + "loss": 0.2289, + "step": 3010 + }, + { + "epoch": 0.7277985299433667, + "grad_norm": 0.1580750048160553, + "learning_rate": 9.394169174982935e-06, + "loss": 0.2254, + "step": 3020 + }, + { + "epoch": 0.7302084588504639, + "grad_norm": 0.17882807552814484, + "learning_rate": 9.387463194586321e-06, + "loss": 0.2239, + "step": 3030 + }, + { + "epoch": 0.7326183877575612, + "grad_norm": 0.19159093499183655, + "learning_rate": 9.380722724504902e-06, + "loss": 0.2246, + "step": 3040 + }, + { + "epoch": 0.7350283166646584, + "grad_norm": 0.17110764980316162, + "learning_rate": 9.373947817725262e-06, + "loss": 0.2267, + "step": 3050 + }, + { + "epoch": 0.7374382455717556, + "grad_norm": 0.17714989185333252, + "learning_rate": 9.367138527504694e-06, + "loss": 0.2248, + "step": 3060 + }, + { + "epoch": 0.7398481744788529, + "grad_norm": 0.19953234493732452, + "learning_rate": 9.36029490737077e-06, + "loss": 0.2249, + "step": 3070 + }, + { + "epoch": 0.7422581033859501, + "grad_norm": 0.24448458850383759, + "learning_rate": 9.353417011120937e-06, + "loss": 0.2237, + "step": 3080 + }, + { + "epoch": 0.7446680322930473, + "grad_norm": 0.20036014914512634, + "learning_rate": 9.34650489282208e-06, + "loss": 0.227, + "step": 3090 + }, + { + "epoch": 0.7470779612001446, + "grad_norm": 0.18248037993907928, + "learning_rate": 9.339558606810102e-06, + "loss": 0.2218, + "step": 3100 + }, + { + "epoch": 0.7494878901072418, + "grad_norm": 0.19500508904457092, + "learning_rate": 9.332578207689501e-06, + "loss": 0.2235, + "step": 3110 + }, + { + "epoch": 0.751897819014339, + "grad_norm": 0.2017701268196106, + "learning_rate": 9.325563750332935e-06, + "loss": 0.2257, + "step": 3120 + }, + { + "epoch": 0.7543077479214363, + "grad_norm": 0.1895110011100769, + "learning_rate": 9.31851528988079e-06, + "loss": 0.2237, + "step": 3130 + }, + { + "epoch": 0.7567176768285335, + "grad_norm": 0.17268790304660797, + "learning_rate": 9.311432881740752e-06, + "loss": 0.2237, + "step": 3140 + }, + { + "epoch": 0.7591276057356308, + "grad_norm": 0.1882033497095108, + "learning_rate": 9.304316581587367e-06, + "loss": 0.2215, + "step": 3150 + }, + { + "epoch": 0.761537534642728, + "grad_norm": 0.1775064915418625, + "learning_rate": 9.297166445361608e-06, + "loss": 0.2209, + "step": 3160 + }, + { + "epoch": 0.7639474635498252, + "grad_norm": 0.21683911979198456, + "learning_rate": 9.289982529270424e-06, + "loss": 0.2279, + "step": 3170 + }, + { + "epoch": 0.7663573924569225, + "grad_norm": 0.19911475479602814, + "learning_rate": 9.28276488978632e-06, + "loss": 0.2211, + "step": 3180 + }, + { + "epoch": 0.7687673213640197, + "grad_norm": 0.18233388662338257, + "learning_rate": 9.275513583646885e-06, + "loss": 0.2245, + "step": 3190 + }, + { + "epoch": 0.771177250271117, + "grad_norm": 0.19964131712913513, + "learning_rate": 9.26822866785437e-06, + "loss": 0.2255, + "step": 3200 + }, + { + "epoch": 0.7735871791782143, + "grad_norm": 0.1638319343328476, + "learning_rate": 9.260910199675224e-06, + "loss": 0.2225, + "step": 3210 + }, + { + "epoch": 0.7759971080853115, + "grad_norm": 0.18763862550258636, + "learning_rate": 9.253558236639654e-06, + "loss": 0.2265, + "step": 3220 + }, + { + "epoch": 0.7784070369924088, + "grad_norm": 0.20708943903446198, + "learning_rate": 9.246172836541167e-06, + "loss": 0.2246, + "step": 3230 + }, + { + "epoch": 0.780816965899506, + "grad_norm": 0.16020740568637848, + "learning_rate": 9.238754057436121e-06, + "loss": 0.2274, + "step": 3240 + }, + { + "epoch": 0.7832268948066032, + "grad_norm": 0.19551053643226624, + "learning_rate": 9.23130195764326e-06, + "loss": 0.223, + "step": 3250 + }, + { + "epoch": 0.7856368237137005, + "grad_norm": 0.18796652555465698, + "learning_rate": 9.223816595743263e-06, + "loss": 0.2206, + "step": 3260 + }, + { + "epoch": 0.7880467526207977, + "grad_norm": 0.18281373381614685, + "learning_rate": 9.21629803057828e-06, + "loss": 0.2282, + "step": 3270 + }, + { + "epoch": 0.7904566815278949, + "grad_norm": 0.19269907474517822, + "learning_rate": 9.208746321251477e-06, + "loss": 0.2258, + "step": 3280 + }, + { + "epoch": 0.7928666104349922, + "grad_norm": 0.18990552425384521, + "learning_rate": 9.201161527126554e-06, + "loss": 0.2207, + "step": 3290 + }, + { + "epoch": 0.7952765393420894, + "grad_norm": 0.2274039387702942, + "learning_rate": 9.193543707827297e-06, + "loss": 0.2239, + "step": 3300 + }, + { + "epoch": 0.7976864682491867, + "grad_norm": 0.23829886317253113, + "learning_rate": 9.185892923237101e-06, + "loss": 0.2208, + "step": 3310 + }, + { + "epoch": 0.8000963971562839, + "grad_norm": 0.18112841248512268, + "learning_rate": 9.178209233498497e-06, + "loss": 0.2232, + "step": 3320 + }, + { + "epoch": 0.8025063260633811, + "grad_norm": 0.1845228374004364, + "learning_rate": 9.170492699012686e-06, + "loss": 0.2258, + "step": 3330 + }, + { + "epoch": 0.8049162549704784, + "grad_norm": 0.16812431812286377, + "learning_rate": 9.162743380439057e-06, + "loss": 0.2256, + "step": 3340 + }, + { + "epoch": 0.8073261838775756, + "grad_norm": 0.19715647399425507, + "learning_rate": 9.154961338694714e-06, + "loss": 0.2188, + "step": 3350 + }, + { + "epoch": 0.8097361127846728, + "grad_norm": 0.16696830093860626, + "learning_rate": 9.147146634954e-06, + "loss": 0.224, + "step": 3360 + }, + { + "epoch": 0.8121460416917701, + "grad_norm": 0.20943832397460938, + "learning_rate": 9.139299330648006e-06, + "loss": 0.2243, + "step": 3370 + }, + { + "epoch": 0.8145559705988673, + "grad_norm": 0.18432581424713135, + "learning_rate": 9.131419487464104e-06, + "loss": 0.2224, + "step": 3380 + }, + { + "epoch": 0.8169658995059645, + "grad_norm": 0.1849818378686905, + "learning_rate": 9.123507167345444e-06, + "loss": 0.2257, + "step": 3390 + }, + { + "epoch": 0.8193758284130618, + "grad_norm": 0.18599388003349304, + "learning_rate": 9.115562432490482e-06, + "loss": 0.2252, + "step": 3400 + }, + { + "epoch": 0.821785757320159, + "grad_norm": 0.17823128402233124, + "learning_rate": 9.107585345352481e-06, + "loss": 0.2207, + "step": 3410 + }, + { + "epoch": 0.8241956862272563, + "grad_norm": 0.1734575480222702, + "learning_rate": 9.099575968639028e-06, + "loss": 0.2235, + "step": 3420 + }, + { + "epoch": 0.8266056151343535, + "grad_norm": 0.1771286129951477, + "learning_rate": 9.091534365311531e-06, + "loss": 0.2234, + "step": 3430 + }, + { + "epoch": 0.8290155440414507, + "grad_norm": 0.17510093748569489, + "learning_rate": 9.08346059858474e-06, + "loss": 0.2208, + "step": 3440 + }, + { + "epoch": 0.8314254729485481, + "grad_norm": 0.16973088681697845, + "learning_rate": 9.075354731926232e-06, + "loss": 0.2217, + "step": 3450 + }, + { + "epoch": 0.8338354018556453, + "grad_norm": 0.1625513732433319, + "learning_rate": 9.067216829055922e-06, + "loss": 0.224, + "step": 3460 + }, + { + "epoch": 0.8362453307627425, + "grad_norm": 0.17390358448028564, + "learning_rate": 9.059046953945563e-06, + "loss": 0.2226, + "step": 3470 + }, + { + "epoch": 0.8386552596698398, + "grad_norm": 0.1795208603143692, + "learning_rate": 9.050845170818239e-06, + "loss": 0.2219, + "step": 3480 + }, + { + "epoch": 0.841065188576937, + "grad_norm": 0.1884290724992752, + "learning_rate": 9.04261154414786e-06, + "loss": 0.2231, + "step": 3490 + }, + { + "epoch": 0.8434751174840343, + "grad_norm": 0.16878260672092438, + "learning_rate": 9.03434613865866e-06, + "loss": 0.2233, + "step": 3500 + }, + { + "epoch": 0.8458850463911315, + "grad_norm": 0.17098242044448853, + "learning_rate": 9.026049019324686e-06, + "loss": 0.2263, + "step": 3510 + }, + { + "epoch": 0.8482949752982287, + "grad_norm": 0.18065747618675232, + "learning_rate": 9.01772025136928e-06, + "loss": 0.218, + "step": 3520 + }, + { + "epoch": 0.850704904205326, + "grad_norm": 0.15557970106601715, + "learning_rate": 9.009359900264579e-06, + "loss": 0.2226, + "step": 3530 + }, + { + "epoch": 0.8531148331124232, + "grad_norm": 0.18261565268039703, + "learning_rate": 9.00096803173099e-06, + "loss": 0.2177, + "step": 3540 + }, + { + "epoch": 0.8555247620195204, + "grad_norm": 0.1887233406305313, + "learning_rate": 8.992544711736682e-06, + "loss": 0.2202, + "step": 3550 + }, + { + "epoch": 0.8579346909266177, + "grad_norm": 0.18528446555137634, + "learning_rate": 8.984090006497056e-06, + "loss": 0.2206, + "step": 3560 + }, + { + "epoch": 0.8603446198337149, + "grad_norm": 0.24964436888694763, + "learning_rate": 8.97560398247424e-06, + "loss": 0.2195, + "step": 3570 + }, + { + "epoch": 0.8627545487408121, + "grad_norm": 0.18638278543949127, + "learning_rate": 8.96708670637655e-06, + "loss": 0.2197, + "step": 3580 + }, + { + "epoch": 0.8651644776479094, + "grad_norm": 0.16028571128845215, + "learning_rate": 8.958538245157975e-06, + "loss": 0.2231, + "step": 3590 + }, + { + "epoch": 0.8675744065550066, + "grad_norm": 0.1577351987361908, + "learning_rate": 8.949958666017652e-06, + "loss": 0.2221, + "step": 3600 + }, + { + "epoch": 0.8699843354621039, + "grad_norm": 0.26832443475723267, + "learning_rate": 8.941348036399333e-06, + "loss": 0.2224, + "step": 3610 + }, + { + "epoch": 0.8723942643692011, + "grad_norm": 0.19743038713932037, + "learning_rate": 8.932706423990856e-06, + "loss": 0.224, + "step": 3620 + }, + { + "epoch": 0.8748041932762983, + "grad_norm": 0.17211681604385376, + "learning_rate": 8.924033896723617e-06, + "loss": 0.2243, + "step": 3630 + }, + { + "epoch": 0.8772141221833956, + "grad_norm": 0.18818245828151703, + "learning_rate": 8.915330522772028e-06, + "loss": 0.2229, + "step": 3640 + }, + { + "epoch": 0.8796240510904928, + "grad_norm": 0.18545369803905487, + "learning_rate": 8.906596370552985e-06, + "loss": 0.2204, + "step": 3650 + }, + { + "epoch": 0.88203397999759, + "grad_norm": 0.18533460795879364, + "learning_rate": 8.897831508725338e-06, + "loss": 0.2204, + "step": 3660 + }, + { + "epoch": 0.8844439089046873, + "grad_norm": 0.1828158050775528, + "learning_rate": 8.889036006189338e-06, + "loss": 0.221, + "step": 3670 + }, + { + "epoch": 0.8868538378117845, + "grad_norm": 0.19318468868732452, + "learning_rate": 8.880209932086102e-06, + "loss": 0.2238, + "step": 3680 + }, + { + "epoch": 0.8892637667188817, + "grad_norm": 0.16859950125217438, + "learning_rate": 8.87135335579707e-06, + "loss": 0.2179, + "step": 3690 + }, + { + "epoch": 0.891673695625979, + "grad_norm": 0.16465112566947937, + "learning_rate": 8.862466346943457e-06, + "loss": 0.2243, + "step": 3700 + }, + { + "epoch": 0.8940836245330763, + "grad_norm": 0.19712437689304352, + "learning_rate": 8.853548975385714e-06, + "loss": 0.2211, + "step": 3710 + }, + { + "epoch": 0.8964935534401736, + "grad_norm": 0.1706128865480423, + "learning_rate": 8.84460131122296e-06, + "loss": 0.2209, + "step": 3720 + }, + { + "epoch": 0.8989034823472708, + "grad_norm": 0.20529857277870178, + "learning_rate": 8.835623424792453e-06, + "loss": 0.2242, + "step": 3730 + }, + { + "epoch": 0.901313411254368, + "grad_norm": 0.16499772667884827, + "learning_rate": 8.826615386669025e-06, + "loss": 0.2209, + "step": 3740 + }, + { + "epoch": 0.9037233401614653, + "grad_norm": 0.16960477828979492, + "learning_rate": 8.817577267664528e-06, + "loss": 0.222, + "step": 3750 + }, + { + "epoch": 0.9061332690685625, + "grad_norm": 0.1796937882900238, + "learning_rate": 8.808509138827275e-06, + "loss": 0.2209, + "step": 3760 + }, + { + "epoch": 0.9085431979756597, + "grad_norm": 0.16342346370220184, + "learning_rate": 8.799411071441496e-06, + "loss": 0.2204, + "step": 3770 + }, + { + "epoch": 0.910953126882757, + "grad_norm": 0.1790073662996292, + "learning_rate": 8.790283137026754e-06, + "loss": 0.2203, + "step": 3780 + }, + { + "epoch": 0.9133630557898542, + "grad_norm": 0.17574553191661835, + "learning_rate": 8.781125407337405e-06, + "loss": 0.2221, + "step": 3790 + }, + { + "epoch": 0.9157729846969515, + "grad_norm": 0.15836398303508759, + "learning_rate": 8.77193795436202e-06, + "loss": 0.219, + "step": 3800 + }, + { + "epoch": 0.9181829136040487, + "grad_norm": 0.17478342354297638, + "learning_rate": 8.762720850322823e-06, + "loss": 0.2203, + "step": 3810 + }, + { + "epoch": 0.9205928425111459, + "grad_norm": 0.15719293057918549, + "learning_rate": 8.753474167675128e-06, + "loss": 0.2227, + "step": 3820 + }, + { + "epoch": 0.9230027714182432, + "grad_norm": 0.1693807989358902, + "learning_rate": 8.744197979106763e-06, + "loss": 0.2236, + "step": 3830 + }, + { + "epoch": 0.9254127003253404, + "grad_norm": 0.21582992374897003, + "learning_rate": 8.7348923575375e-06, + "loss": 0.2221, + "step": 3840 + }, + { + "epoch": 0.9278226292324376, + "grad_norm": 0.21359696984291077, + "learning_rate": 8.725557376118482e-06, + "loss": 0.2222, + "step": 3850 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 0.17620986700057983, + "learning_rate": 8.716193108231655e-06, + "loss": 0.2209, + "step": 3860 + }, + { + "epoch": 0.9326424870466321, + "grad_norm": 0.16482968628406525, + "learning_rate": 8.706799627489175e-06, + "loss": 0.2212, + "step": 3870 + }, + { + "epoch": 0.9350524159537293, + "grad_norm": 0.16502827405929565, + "learning_rate": 8.697377007732848e-06, + "loss": 0.2204, + "step": 3880 + }, + { + "epoch": 0.9374623448608266, + "grad_norm": 0.21359188854694366, + "learning_rate": 8.687925323033536e-06, + "loss": 0.2206, + "step": 3890 + }, + { + "epoch": 0.9398722737679238, + "grad_norm": 0.19615745544433594, + "learning_rate": 8.67844464769058e-06, + "loss": 0.2201, + "step": 3900 + }, + { + "epoch": 0.942282202675021, + "grad_norm": 0.17519152164459229, + "learning_rate": 8.668935056231216e-06, + "loss": 0.2182, + "step": 3910 + }, + { + "epoch": 0.9446921315821183, + "grad_norm": 0.1906772404909134, + "learning_rate": 8.659396623409987e-06, + "loss": 0.2239, + "step": 3920 + }, + { + "epoch": 0.9471020604892155, + "grad_norm": 0.19051863253116608, + "learning_rate": 8.649829424208163e-06, + "loss": 0.2224, + "step": 3930 + }, + { + "epoch": 0.9495119893963128, + "grad_norm": 0.22634129226207733, + "learning_rate": 8.640233533833136e-06, + "loss": 0.2187, + "step": 3940 + }, + { + "epoch": 0.95192191830341, + "grad_norm": 0.1962852030992508, + "learning_rate": 8.630609027717843e-06, + "loss": 0.2196, + "step": 3950 + }, + { + "epoch": 0.9543318472105073, + "grad_norm": 0.185867577791214, + "learning_rate": 8.620955981520171e-06, + "loss": 0.2194, + "step": 3960 + }, + { + "epoch": 0.9567417761176046, + "grad_norm": 0.17657962441444397, + "learning_rate": 8.611274471122355e-06, + "loss": 0.2229, + "step": 3970 + }, + { + "epoch": 0.9591517050247018, + "grad_norm": 0.15567971765995026, + "learning_rate": 8.601564572630387e-06, + "loss": 0.2193, + "step": 3980 + }, + { + "epoch": 0.961561633931799, + "grad_norm": 0.16838258504867554, + "learning_rate": 8.591826362373421e-06, + "loss": 0.2236, + "step": 3990 + }, + { + "epoch": 0.9639715628388963, + "grad_norm": 0.18067292869091034, + "learning_rate": 8.58205991690316e-06, + "loss": 0.2227, + "step": 4000 + }, + { + "epoch": 0.9663814917459935, + "grad_norm": 0.16675949096679688, + "learning_rate": 8.572265312993274e-06, + "loss": 0.2197, + "step": 4010 + }, + { + "epoch": 0.9687914206530908, + "grad_norm": 0.17282336950302124, + "learning_rate": 8.562442627638774e-06, + "loss": 0.2213, + "step": 4020 + }, + { + "epoch": 0.971201349560188, + "grad_norm": 0.192217618227005, + "learning_rate": 8.552591938055425e-06, + "loss": 0.2184, + "step": 4030 + }, + { + "epoch": 0.9736112784672852, + "grad_norm": 0.1581621617078781, + "learning_rate": 8.542713321679137e-06, + "loss": 0.2221, + "step": 4040 + }, + { + "epoch": 0.9760212073743825, + "grad_norm": 0.1765250712633133, + "learning_rate": 8.532806856165337e-06, + "loss": 0.2188, + "step": 4050 + }, + { + "epoch": 0.9784311362814797, + "grad_norm": 0.17979812622070312, + "learning_rate": 8.522872619388387e-06, + "loss": 0.222, + "step": 4060 + }, + { + "epoch": 0.9808410651885769, + "grad_norm": 0.1699352264404297, + "learning_rate": 8.512910689440951e-06, + "loss": 0.2201, + "step": 4070 + }, + { + "epoch": 0.9832509940956742, + "grad_norm": 0.17940454185009003, + "learning_rate": 8.50292114463339e-06, + "loss": 0.2163, + "step": 4080 + }, + { + "epoch": 0.9856609230027714, + "grad_norm": 0.16519592702388763, + "learning_rate": 8.49290406349314e-06, + "loss": 0.2203, + "step": 4090 + }, + { + "epoch": 0.9880708519098687, + "grad_norm": 0.16589729487895966, + "learning_rate": 8.482859524764108e-06, + "loss": 0.2224, + "step": 4100 + }, + { + "epoch": 0.9904807808169659, + "grad_norm": 0.15325430035591125, + "learning_rate": 8.472787607406036e-06, + "loss": 0.2187, + "step": 4110 + }, + { + "epoch": 0.9928907097240631, + "grad_norm": 0.16159406304359436, + "learning_rate": 8.462688390593894e-06, + "loss": 0.2247, + "step": 4120 + }, + { + "epoch": 0.9953006386311604, + "grad_norm": 0.1637933999300003, + "learning_rate": 8.452561953717246e-06, + "loss": 0.2178, + "step": 4130 + }, + { + "epoch": 0.9977105675382576, + "grad_norm": 0.16510990262031555, + "learning_rate": 8.442408376379637e-06, + "loss": 0.2219, + "step": 4140 + }, + { + "epoch": 1.0, + "grad_norm": 0.19598262012004852, + "learning_rate": 8.43222773839796e-06, + "loss": 0.2169, + "step": 4150 + }, + { + "epoch": 1.0024099289070973, + "grad_norm": 0.22410273551940918, + "learning_rate": 8.422020119801831e-06, + "loss": 0.2121, + "step": 4160 + }, + { + "epoch": 1.0048198578141945, + "grad_norm": 0.1659049689769745, + "learning_rate": 8.411785600832959e-06, + "loss": 0.2159, + "step": 4170 + }, + { + "epoch": 1.0072297867212918, + "grad_norm": 0.1600600630044937, + "learning_rate": 8.401524261944519e-06, + "loss": 0.2142, + "step": 4180 + }, + { + "epoch": 1.009639715628389, + "grad_norm": 0.17050403356552124, + "learning_rate": 8.39123618380051e-06, + "loss": 0.2133, + "step": 4190 + }, + { + "epoch": 1.0120496445354863, + "grad_norm": 0.1873362809419632, + "learning_rate": 8.380921447275137e-06, + "loss": 0.2161, + "step": 4200 + }, + { + "epoch": 1.0144595734425834, + "grad_norm": 0.17461583018302917, + "learning_rate": 8.370580133452153e-06, + "loss": 0.2161, + "step": 4210 + }, + { + "epoch": 1.0168695023496808, + "grad_norm": 0.17326101660728455, + "learning_rate": 8.360212323624246e-06, + "loss": 0.2137, + "step": 4220 + }, + { + "epoch": 1.019279431256778, + "grad_norm": 0.16447477042675018, + "learning_rate": 8.349818099292379e-06, + "loss": 0.2164, + "step": 4230 + }, + { + "epoch": 1.0216893601638752, + "grad_norm": 0.16297782957553864, + "learning_rate": 8.339397542165166e-06, + "loss": 0.213, + "step": 4240 + }, + { + "epoch": 1.0240992890709724, + "grad_norm": 0.1830231249332428, + "learning_rate": 8.328950734158219e-06, + "loss": 0.214, + "step": 4250 + }, + { + "epoch": 1.0265092179780697, + "grad_norm": 0.17782536149024963, + "learning_rate": 8.318477757393502e-06, + "loss": 0.2119, + "step": 4260 + }, + { + "epoch": 1.0289191468851668, + "grad_norm": 0.16562552750110626, + "learning_rate": 8.3079786941987e-06, + "loss": 0.2129, + "step": 4270 + }, + { + "epoch": 1.0313290757922642, + "grad_norm": 0.1697404384613037, + "learning_rate": 8.297453627106556e-06, + "loss": 0.2111, + "step": 4280 + }, + { + "epoch": 1.0337390046993613, + "grad_norm": 0.17114974558353424, + "learning_rate": 8.28690263885423e-06, + "loss": 0.2123, + "step": 4290 + }, + { + "epoch": 1.0361489336064587, + "grad_norm": 0.16717548668384552, + "learning_rate": 8.276325812382648e-06, + "loss": 0.2128, + "step": 4300 + }, + { + "epoch": 1.0385588625135558, + "grad_norm": 0.16737718880176544, + "learning_rate": 8.265723230835852e-06, + "loss": 0.2107, + "step": 4310 + }, + { + "epoch": 1.0409687914206531, + "grad_norm": 0.15507322549819946, + "learning_rate": 8.255094977560335e-06, + "loss": 0.2127, + "step": 4320 + }, + { + "epoch": 1.0433787203277503, + "grad_norm": 0.1764843910932541, + "learning_rate": 8.244441136104406e-06, + "loss": 0.212, + "step": 4330 + }, + { + "epoch": 1.0457886492348476, + "grad_norm": 0.17232689261436462, + "learning_rate": 8.233761790217512e-06, + "loss": 0.2136, + "step": 4340 + }, + { + "epoch": 1.0481985781419447, + "grad_norm": 0.1621418595314026, + "learning_rate": 8.223057023849595e-06, + "loss": 0.2128, + "step": 4350 + }, + { + "epoch": 1.050608507049042, + "grad_norm": 0.16862477362155914, + "learning_rate": 8.212326921150426e-06, + "loss": 0.2155, + "step": 4360 + }, + { + "epoch": 1.0530184359561392, + "grad_norm": 0.16127009689807892, + "learning_rate": 8.20157156646894e-06, + "loss": 0.2158, + "step": 4370 + }, + { + "epoch": 1.0554283648632365, + "grad_norm": 0.16644121706485748, + "learning_rate": 8.190791044352581e-06, + "loss": 0.2111, + "step": 4380 + }, + { + "epoch": 1.0578382937703337, + "grad_norm": 0.17563723027706146, + "learning_rate": 8.179985439546633e-06, + "loss": 0.2114, + "step": 4390 + }, + { + "epoch": 1.060248222677431, + "grad_norm": 0.1750829517841339, + "learning_rate": 8.16915483699355e-06, + "loss": 0.2117, + "step": 4400 + }, + { + "epoch": 1.0626581515845284, + "grad_norm": 0.17254221439361572, + "learning_rate": 8.158299321832301e-06, + "loss": 0.2118, + "step": 4410 + }, + { + "epoch": 1.0650680804916255, + "grad_norm": 0.17799583077430725, + "learning_rate": 8.147418979397682e-06, + "loss": 0.2187, + "step": 4420 + }, + { + "epoch": 1.0674780093987228, + "grad_norm": 0.17555776238441467, + "learning_rate": 8.13651389521966e-06, + "loss": 0.2155, + "step": 4430 + }, + { + "epoch": 1.06988793830582, + "grad_norm": 0.1640000343322754, + "learning_rate": 8.125584155022696e-06, + "loss": 0.2134, + "step": 4440 + }, + { + "epoch": 1.0722978672129173, + "grad_norm": 0.17578564584255219, + "learning_rate": 8.114629844725073e-06, + "loss": 0.2119, + "step": 4450 + }, + { + "epoch": 1.0747077961200144, + "grad_norm": 0.1732892245054245, + "learning_rate": 8.103651050438213e-06, + "loss": 0.2124, + "step": 4460 + }, + { + "epoch": 1.0771177250271118, + "grad_norm": 0.15404583513736725, + "learning_rate": 8.09264785846601e-06, + "loss": 0.2149, + "step": 4470 + }, + { + "epoch": 1.079527653934209, + "grad_norm": 0.18235298991203308, + "learning_rate": 8.081620355304147e-06, + "loss": 0.2106, + "step": 4480 + }, + { + "epoch": 1.0819375828413063, + "grad_norm": 0.1770295798778534, + "learning_rate": 8.070568627639418e-06, + "loss": 0.2141, + "step": 4490 + }, + { + "epoch": 1.0843475117484034, + "grad_norm": 0.1517912745475769, + "learning_rate": 8.059492762349037e-06, + "loss": 0.2125, + "step": 4500 + }, + { + "epoch": 1.0867574406555007, + "grad_norm": 0.17619790136814117, + "learning_rate": 8.048392846499974e-06, + "loss": 0.2135, + "step": 4510 + }, + { + "epoch": 1.0891673695625979, + "grad_norm": 0.1587211936712265, + "learning_rate": 8.037268967348252e-06, + "loss": 0.2152, + "step": 4520 + }, + { + "epoch": 1.0915772984696952, + "grad_norm": 0.22725209593772888, + "learning_rate": 8.026121212338271e-06, + "loss": 0.2122, + "step": 4530 + }, + { + "epoch": 1.0939872273767923, + "grad_norm": 0.18444053828716278, + "learning_rate": 8.014949669102117e-06, + "loss": 0.2099, + "step": 4540 + }, + { + "epoch": 1.0963971562838897, + "grad_norm": 0.1482313573360443, + "learning_rate": 8.003754425458878e-06, + "loss": 0.2138, + "step": 4550 + }, + { + "epoch": 1.0988070851909868, + "grad_norm": 0.16575364768505096, + "learning_rate": 7.992535569413944e-06, + "loss": 0.2094, + "step": 4560 + }, + { + "epoch": 1.1012170140980841, + "grad_norm": 0.19779005646705627, + "learning_rate": 7.981293189158327e-06, + "loss": 0.2097, + "step": 4570 + }, + { + "epoch": 1.1036269430051813, + "grad_norm": 0.16325879096984863, + "learning_rate": 7.970027373067961e-06, + "loss": 0.2128, + "step": 4580 + }, + { + "epoch": 1.1060368719122786, + "grad_norm": 0.18840157985687256, + "learning_rate": 7.958738209703004e-06, + "loss": 0.2114, + "step": 4590 + }, + { + "epoch": 1.1084468008193757, + "grad_norm": 0.1780056655406952, + "learning_rate": 7.94742578780715e-06, + "loss": 0.211, + "step": 4600 + }, + { + "epoch": 1.110856729726473, + "grad_norm": 0.15295591950416565, + "learning_rate": 7.936090196306925e-06, + "loss": 0.2125, + "step": 4610 + }, + { + "epoch": 1.1132666586335702, + "grad_norm": 0.17244797945022583, + "learning_rate": 7.924731524310993e-06, + "loss": 0.2121, + "step": 4620 + }, + { + "epoch": 1.1156765875406676, + "grad_norm": 0.16484029591083527, + "learning_rate": 7.91334986110945e-06, + "loss": 0.2161, + "step": 4630 + }, + { + "epoch": 1.1180865164477647, + "grad_norm": 0.16274769604206085, + "learning_rate": 7.90194529617313e-06, + "loss": 0.2127, + "step": 4640 + }, + { + "epoch": 1.120496445354862, + "grad_norm": 0.16935664415359497, + "learning_rate": 7.890517919152892e-06, + "loss": 0.2137, + "step": 4650 + }, + { + "epoch": 1.1229063742619592, + "grad_norm": 0.15090295672416687, + "learning_rate": 7.879067819878918e-06, + "loss": 0.213, + "step": 4660 + }, + { + "epoch": 1.1253163031690565, + "grad_norm": 0.16716746985912323, + "learning_rate": 7.867595088360016e-06, + "loss": 0.2115, + "step": 4670 + }, + { + "epoch": 1.1277262320761539, + "grad_norm": 0.20338410139083862, + "learning_rate": 7.856099814782901e-06, + "loss": 0.2133, + "step": 4680 + }, + { + "epoch": 1.130136160983251, + "grad_norm": 0.1768144816160202, + "learning_rate": 7.844582089511486e-06, + "loss": 0.2112, + "step": 4690 + }, + { + "epoch": 1.1325460898903483, + "grad_norm": 0.16909371316432953, + "learning_rate": 7.833042003086186e-06, + "loss": 0.2113, + "step": 4700 + }, + { + "epoch": 1.1349560187974455, + "grad_norm": 0.1611366719007492, + "learning_rate": 7.82147964622319e-06, + "loss": 0.2126, + "step": 4710 + }, + { + "epoch": 1.1373659477045428, + "grad_norm": 0.19154107570648193, + "learning_rate": 7.809895109813752e-06, + "loss": 0.2109, + "step": 4720 + }, + { + "epoch": 1.13977587661164, + "grad_norm": 0.14273270964622498, + "learning_rate": 7.798288484923482e-06, + "loss": 0.2117, + "step": 4730 + }, + { + "epoch": 1.1421858055187373, + "grad_norm": 0.19387304782867432, + "learning_rate": 7.786659862791628e-06, + "loss": 0.212, + "step": 4740 + }, + { + "epoch": 1.1445957344258344, + "grad_norm": 0.16093966364860535, + "learning_rate": 7.775009334830354e-06, + "loss": 0.2148, + "step": 4750 + }, + { + "epoch": 1.1470056633329317, + "grad_norm": 0.16083253920078278, + "learning_rate": 7.763336992624027e-06, + "loss": 0.2132, + "step": 4760 + }, + { + "epoch": 1.1494155922400289, + "grad_norm": 0.18043479323387146, + "learning_rate": 7.751642927928495e-06, + "loss": 0.2125, + "step": 4770 + }, + { + "epoch": 1.1518255211471262, + "grad_norm": 0.168365940451622, + "learning_rate": 7.739927232670363e-06, + "loss": 0.2138, + "step": 4780 + }, + { + "epoch": 1.1542354500542233, + "grad_norm": 0.1817869246006012, + "learning_rate": 7.728189998946278e-06, + "loss": 0.2123, + "step": 4790 + }, + { + "epoch": 1.1566453789613207, + "grad_norm": 0.18804657459259033, + "learning_rate": 7.716431319022197e-06, + "loss": 0.2142, + "step": 4800 + }, + { + "epoch": 1.1590553078684178, + "grad_norm": 0.18389475345611572, + "learning_rate": 7.704651285332662e-06, + "loss": 0.2123, + "step": 4810 + }, + { + "epoch": 1.1614652367755152, + "grad_norm": 0.20771625638008118, + "learning_rate": 7.692849990480082e-06, + "loss": 0.214, + "step": 4820 + }, + { + "epoch": 1.1638751656826123, + "grad_norm": 0.18531720340251923, + "learning_rate": 7.681027527233995e-06, + "loss": 0.2105, + "step": 4830 + }, + { + "epoch": 1.1662850945897096, + "grad_norm": 0.16333264112472534, + "learning_rate": 7.669183988530346e-06, + "loss": 0.214, + "step": 4840 + }, + { + "epoch": 1.1686950234968068, + "grad_norm": 0.17973899841308594, + "learning_rate": 7.65731946747075e-06, + "loss": 0.2142, + "step": 4850 + }, + { + "epoch": 1.171104952403904, + "grad_norm": 0.16610458493232727, + "learning_rate": 7.645434057321765e-06, + "loss": 0.211, + "step": 4860 + }, + { + "epoch": 1.1735148813110012, + "grad_norm": 0.14925527572631836, + "learning_rate": 7.633527851514163e-06, + "loss": 0.2113, + "step": 4870 + }, + { + "epoch": 1.1759248102180986, + "grad_norm": 0.16800275444984436, + "learning_rate": 7.621600943642175e-06, + "loss": 0.2128, + "step": 4880 + }, + { + "epoch": 1.1783347391251957, + "grad_norm": 0.1554495096206665, + "learning_rate": 7.609653427462789e-06, + "loss": 0.2144, + "step": 4890 + }, + { + "epoch": 1.180744668032293, + "grad_norm": 0.16693717241287231, + "learning_rate": 7.5976853968949785e-06, + "loss": 0.2125, + "step": 4900 + }, + { + "epoch": 1.1831545969393904, + "grad_norm": 0.1670588105916977, + "learning_rate": 7.585696946018988e-06, + "loss": 0.2127, + "step": 4910 + }, + { + "epoch": 1.1855645258464875, + "grad_norm": 0.14736416935920715, + "learning_rate": 7.573688169075584e-06, + "loss": 0.2088, + "step": 4920 + }, + { + "epoch": 1.1879744547535847, + "grad_norm": 0.1936376839876175, + "learning_rate": 7.561659160465314e-06, + "loss": 0.2092, + "step": 4930 + }, + { + "epoch": 1.190384383660682, + "grad_norm": 0.16551412642002106, + "learning_rate": 7.549610014747769e-06, + "loss": 0.2126, + "step": 4940 + }, + { + "epoch": 1.1927943125677793, + "grad_norm": 0.15562906861305237, + "learning_rate": 7.537540826640834e-06, + "loss": 0.2091, + "step": 4950 + }, + { + "epoch": 1.1952042414748765, + "grad_norm": 0.14535550773143768, + "learning_rate": 7.525451691019945e-06, + "loss": 0.21, + "step": 4960 + }, + { + "epoch": 1.1976141703819738, + "grad_norm": 0.157695010304451, + "learning_rate": 7.513342702917349e-06, + "loss": 0.2127, + "step": 4970 + }, + { + "epoch": 1.200024099289071, + "grad_norm": 0.15610983967781067, + "learning_rate": 7.5012139575213505e-06, + "loss": 0.2129, + "step": 4980 + }, + { + "epoch": 1.2024340281961683, + "grad_norm": 0.14861977100372314, + "learning_rate": 7.4890655501755634e-06, + "loss": 0.2127, + "step": 4990 + }, + { + "epoch": 1.2048439571032654, + "grad_norm": 0.1682192087173462, + "learning_rate": 7.476897576378169e-06, + "loss": 0.2116, + "step": 5000 + }, + { + "epoch": 1.2072538860103628, + "grad_norm": 0.1472558081150055, + "learning_rate": 7.464710131781154e-06, + "loss": 0.2124, + "step": 5010 + }, + { + "epoch": 1.20966381491746, + "grad_norm": 0.15624482929706573, + "learning_rate": 7.452503312189567e-06, + "loss": 0.2084, + "step": 5020 + }, + { + "epoch": 1.2120737438245572, + "grad_norm": 0.17334376275539398, + "learning_rate": 7.440277213560763e-06, + "loss": 0.2103, + "step": 5030 + }, + { + "epoch": 1.2144836727316544, + "grad_norm": 0.14388589560985565, + "learning_rate": 7.428031932003647e-06, + "loss": 0.2107, + "step": 5040 + }, + { + "epoch": 1.2168936016387517, + "grad_norm": 0.1735595464706421, + "learning_rate": 7.415767563777922e-06, + "loss": 0.2118, + "step": 5050 + }, + { + "epoch": 1.2193035305458488, + "grad_norm": 0.14402276277542114, + "learning_rate": 7.40348420529333e-06, + "loss": 0.2115, + "step": 5060 + }, + { + "epoch": 1.2217134594529462, + "grad_norm": 0.20903652906417847, + "learning_rate": 7.3911819531088926e-06, + "loss": 0.2125, + "step": 5070 + }, + { + "epoch": 1.2241233883600433, + "grad_norm": 0.16466741263866425, + "learning_rate": 7.378860903932159e-06, + "loss": 0.2103, + "step": 5080 + }, + { + "epoch": 1.2265333172671407, + "grad_norm": 0.15621258318424225, + "learning_rate": 7.366521154618438e-06, + "loss": 0.2116, + "step": 5090 + }, + { + "epoch": 1.2289432461742378, + "grad_norm": 0.14338956773281097, + "learning_rate": 7.354162802170037e-06, + "loss": 0.2119, + "step": 5100 + }, + { + "epoch": 1.2313531750813351, + "grad_norm": 0.14769229292869568, + "learning_rate": 7.341785943735507e-06, + "loss": 0.212, + "step": 5110 + }, + { + "epoch": 1.2337631039884323, + "grad_norm": 0.16111035645008087, + "learning_rate": 7.3293906766088694e-06, + "loss": 0.2108, + "step": 5120 + }, + { + "epoch": 1.2361730328955296, + "grad_norm": 0.16381020843982697, + "learning_rate": 7.316977098228858e-06, + "loss": 0.2147, + "step": 5130 + }, + { + "epoch": 1.238582961802627, + "grad_norm": 0.15808707475662231, + "learning_rate": 7.3045453061781504e-06, + "loss": 0.2143, + "step": 5140 + }, + { + "epoch": 1.240992890709724, + "grad_norm": 0.18159683048725128, + "learning_rate": 7.292095398182601e-06, + "loss": 0.2136, + "step": 5150 + }, + { + "epoch": 1.2434028196168212, + "grad_norm": 0.17652052640914917, + "learning_rate": 7.2796274721104745e-06, + "loss": 0.2104, + "step": 5160 + }, + { + "epoch": 1.2458127485239185, + "grad_norm": 0.21559830009937286, + "learning_rate": 7.267141625971672e-06, + "loss": 0.2114, + "step": 5170 + }, + { + "epoch": 1.248222677431016, + "grad_norm": 0.15372782945632935, + "learning_rate": 7.254637957916964e-06, + "loss": 0.2087, + "step": 5180 + }, + { + "epoch": 1.250632606338113, + "grad_norm": 0.16764594614505768, + "learning_rate": 7.2421165662372216e-06, + "loss": 0.2112, + "step": 5190 + }, + { + "epoch": 1.2530425352452101, + "grad_norm": 0.16227737069129944, + "learning_rate": 7.229577549362638e-06, + "loss": 0.211, + "step": 5200 + }, + { + "epoch": 1.2554524641523075, + "grad_norm": 0.15372371673583984, + "learning_rate": 7.217021005861957e-06, + "loss": 0.2098, + "step": 5210 + }, + { + "epoch": 1.2578623930594048, + "grad_norm": 0.14599712193012238, + "learning_rate": 7.204447034441699e-06, + "loss": 0.2128, + "step": 5220 + }, + { + "epoch": 1.260272321966502, + "grad_norm": 0.1904706507921219, + "learning_rate": 7.191855733945388e-06, + "loss": 0.2123, + "step": 5230 + }, + { + "epoch": 1.2626822508735993, + "grad_norm": 0.16215404868125916, + "learning_rate": 7.179247203352766e-06, + "loss": 0.211, + "step": 5240 + }, + { + "epoch": 1.2650921797806964, + "grad_norm": 0.1625434160232544, + "learning_rate": 7.166621541779023e-06, + "loss": 0.2127, + "step": 5250 + }, + { + "epoch": 1.2675021086877938, + "grad_norm": 0.15905945003032684, + "learning_rate": 7.153978848474015e-06, + "loss": 0.2093, + "step": 5260 + }, + { + "epoch": 1.269912037594891, + "grad_norm": 0.17562928795814514, + "learning_rate": 7.141319222821483e-06, + "loss": 0.2104, + "step": 5270 + }, + { + "epoch": 1.2723219665019883, + "grad_norm": 0.15171091258525848, + "learning_rate": 7.128642764338273e-06, + "loss": 0.213, + "step": 5280 + }, + { + "epoch": 1.2747318954090854, + "grad_norm": 0.17683686316013336, + "learning_rate": 7.115949572673552e-06, + "loss": 0.2084, + "step": 5290 + }, + { + "epoch": 1.2771418243161827, + "grad_norm": 0.16825149953365326, + "learning_rate": 7.1032397476080285e-06, + "loss": 0.21, + "step": 5300 + }, + { + "epoch": 1.2795517532232799, + "grad_norm": 0.16288043558597565, + "learning_rate": 7.090513389053164e-06, + "loss": 0.2093, + "step": 5310 + }, + { + "epoch": 1.2819616821303772, + "grad_norm": 0.1785936951637268, + "learning_rate": 7.0777705970503885e-06, + "loss": 0.2121, + "step": 5320 + }, + { + "epoch": 1.2843716110374743, + "grad_norm": 0.178307443857193, + "learning_rate": 7.065011471770316e-06, + "loss": 0.2094, + "step": 5330 + }, + { + "epoch": 1.2867815399445717, + "grad_norm": 0.176238015294075, + "learning_rate": 7.052236113511955e-06, + "loss": 0.2069, + "step": 5340 + }, + { + "epoch": 1.2891914688516688, + "grad_norm": 0.16577781736850739, + "learning_rate": 7.039444622701922e-06, + "loss": 0.2118, + "step": 5350 + }, + { + "epoch": 1.2916013977587661, + "grad_norm": 0.16853252053260803, + "learning_rate": 7.0266370998936475e-06, + "loss": 0.2095, + "step": 5360 + }, + { + "epoch": 1.2940113266658635, + "grad_norm": 0.17209801077842712, + "learning_rate": 7.013813645766593e-06, + "loss": 0.2083, + "step": 5370 + }, + { + "epoch": 1.2964212555729606, + "grad_norm": 0.15391361713409424, + "learning_rate": 7.000974361125454e-06, + "loss": 0.2097, + "step": 5380 + }, + { + "epoch": 1.2988311844800577, + "grad_norm": 0.15086984634399414, + "learning_rate": 6.98811934689937e-06, + "loss": 0.2119, + "step": 5390 + }, + { + "epoch": 1.301241113387155, + "grad_norm": 0.1588638871908188, + "learning_rate": 6.975248704141128e-06, + "loss": 0.2097, + "step": 5400 + }, + { + "epoch": 1.3036510422942524, + "grad_norm": 0.15231099724769592, + "learning_rate": 6.96236253402637e-06, + "loss": 0.2126, + "step": 5410 + }, + { + "epoch": 1.3060609712013496, + "grad_norm": 0.1609620451927185, + "learning_rate": 6.949460937852803e-06, + "loss": 0.2101, + "step": 5420 + }, + { + "epoch": 1.3084709001084467, + "grad_norm": 0.15570048987865448, + "learning_rate": 6.936544017039391e-06, + "loss": 0.2142, + "step": 5430 + }, + { + "epoch": 1.310880829015544, + "grad_norm": 0.158262237906456, + "learning_rate": 6.923611873125568e-06, + "loss": 0.2099, + "step": 5440 + }, + { + "epoch": 1.3132907579226414, + "grad_norm": 0.17083002626895905, + "learning_rate": 6.910664607770436e-06, + "loss": 0.2107, + "step": 5450 + }, + { + "epoch": 1.3157006868297385, + "grad_norm": 0.14634591341018677, + "learning_rate": 6.897702322751968e-06, + "loss": 0.211, + "step": 5460 + }, + { + "epoch": 1.3181106157368356, + "grad_norm": 0.16394425928592682, + "learning_rate": 6.8847251199662025e-06, + "loss": 0.2124, + "step": 5470 + }, + { + "epoch": 1.320520544643933, + "grad_norm": 0.14441823959350586, + "learning_rate": 6.871733101426446e-06, + "loss": 0.2063, + "step": 5480 + }, + { + "epoch": 1.3229304735510303, + "grad_norm": 0.14914725720882416, + "learning_rate": 6.858726369262474e-06, + "loss": 0.2086, + "step": 5490 + }, + { + "epoch": 1.3253404024581275, + "grad_norm": 0.15426112711429596, + "learning_rate": 6.8457050257197225e-06, + "loss": 0.2104, + "step": 5500 + }, + { + "epoch": 1.3277503313652248, + "grad_norm": 0.15343551337718964, + "learning_rate": 6.832669173158488e-06, + "loss": 0.2109, + "step": 5510 + }, + { + "epoch": 1.330160260272322, + "grad_norm": 0.16048264503479004, + "learning_rate": 6.819618914053126e-06, + "loss": 0.2082, + "step": 5520 + }, + { + "epoch": 1.3325701891794193, + "grad_norm": 0.17848889529705048, + "learning_rate": 6.806554350991233e-06, + "loss": 0.2125, + "step": 5530 + }, + { + "epoch": 1.3349801180865164, + "grad_norm": 0.16506092250347137, + "learning_rate": 6.793475586672853e-06, + "loss": 0.2116, + "step": 5540 + }, + { + "epoch": 1.3373900469936137, + "grad_norm": 0.16847744584083557, + "learning_rate": 6.780382723909669e-06, + "loss": 0.21, + "step": 5550 + }, + { + "epoch": 1.3397999759007109, + "grad_norm": 0.1667633205652237, + "learning_rate": 6.767275865624183e-06, + "loss": 0.212, + "step": 5560 + }, + { + "epoch": 1.3422099048078082, + "grad_norm": 0.1553545743227005, + "learning_rate": 6.754155114848924e-06, + "loss": 0.2121, + "step": 5570 + }, + { + "epoch": 1.3446198337149053, + "grad_norm": 0.19598965346813202, + "learning_rate": 6.741020574725622e-06, + "loss": 0.2088, + "step": 5580 + }, + { + "epoch": 1.3470297626220027, + "grad_norm": 0.16720835864543915, + "learning_rate": 6.72787234850441e-06, + "loss": 0.2128, + "step": 5590 + }, + { + "epoch": 1.3494396915290998, + "grad_norm": 0.181742861866951, + "learning_rate": 6.7147105395430045e-06, + "loss": 0.2104, + "step": 5600 + }, + { + "epoch": 1.3518496204361972, + "grad_norm": 0.17101141810417175, + "learning_rate": 6.701535251305895e-06, + "loss": 0.2082, + "step": 5610 + }, + { + "epoch": 1.3542595493432943, + "grad_norm": 0.15846946835517883, + "learning_rate": 6.688346587363533e-06, + "loss": 0.2097, + "step": 5620 + }, + { + "epoch": 1.3566694782503916, + "grad_norm": 0.16642266511917114, + "learning_rate": 6.675144651391511e-06, + "loss": 0.2115, + "step": 5630 + }, + { + "epoch": 1.359079407157489, + "grad_norm": 0.16766521334648132, + "learning_rate": 6.661929547169761e-06, + "loss": 0.2076, + "step": 5640 + }, + { + "epoch": 1.361489336064586, + "grad_norm": 0.16784071922302246, + "learning_rate": 6.648701378581722e-06, + "loss": 0.2117, + "step": 5650 + }, + { + "epoch": 1.3638992649716832, + "grad_norm": 0.14052754640579224, + "learning_rate": 6.635460249613537e-06, + "loss": 0.2068, + "step": 5660 + }, + { + "epoch": 1.3663091938787806, + "grad_norm": 0.16746129095554352, + "learning_rate": 6.622206264353225e-06, + "loss": 0.2069, + "step": 5670 + }, + { + "epoch": 1.368719122785878, + "grad_norm": 0.1509017050266266, + "learning_rate": 6.6089395269898715e-06, + "loss": 0.2066, + "step": 5680 + }, + { + "epoch": 1.371129051692975, + "grad_norm": 0.15285679697990417, + "learning_rate": 6.595660141812806e-06, + "loss": 0.2102, + "step": 5690 + }, + { + "epoch": 1.3735389806000722, + "grad_norm": 0.15320274233818054, + "learning_rate": 6.582368213210781e-06, + "loss": 0.2088, + "step": 5700 + }, + { + "epoch": 1.3759489095071695, + "grad_norm": 0.16169145703315735, + "learning_rate": 6.569063845671153e-06, + "loss": 0.2102, + "step": 5710 + }, + { + "epoch": 1.3783588384142669, + "grad_norm": 0.16042403876781464, + "learning_rate": 6.555747143779058e-06, + "loss": 0.2108, + "step": 5720 + }, + { + "epoch": 1.380768767321364, + "grad_norm": 0.1627410501241684, + "learning_rate": 6.542418212216592e-06, + "loss": 0.2097, + "step": 5730 + }, + { + "epoch": 1.3831786962284611, + "grad_norm": 0.16954395174980164, + "learning_rate": 6.5290771557619935e-06, + "loss": 0.2102, + "step": 5740 + }, + { + "epoch": 1.3855886251355585, + "grad_norm": 0.16062310338020325, + "learning_rate": 6.51572407928881e-06, + "loss": 0.2099, + "step": 5750 + }, + { + "epoch": 1.3879985540426558, + "grad_norm": 0.1502860188484192, + "learning_rate": 6.502359087765077e-06, + "loss": 0.2131, + "step": 5760 + }, + { + "epoch": 1.390408482949753, + "grad_norm": 0.15315794944763184, + "learning_rate": 6.488982286252495e-06, + "loss": 0.2114, + "step": 5770 + }, + { + "epoch": 1.3928184118568503, + "grad_norm": 0.14878347516059875, + "learning_rate": 6.475593779905604e-06, + "loss": 0.2071, + "step": 5780 + }, + { + "epoch": 1.3952283407639474, + "grad_norm": 0.15133234858512878, + "learning_rate": 6.462193673970954e-06, + "loss": 0.2095, + "step": 5790 + }, + { + "epoch": 1.3976382696710448, + "grad_norm": 0.15413503348827362, + "learning_rate": 6.448782073786276e-06, + "loss": 0.2091, + "step": 5800 + }, + { + "epoch": 1.400048198578142, + "grad_norm": 0.14477072656154633, + "learning_rate": 6.435359084779663e-06, + "loss": 0.2085, + "step": 5810 + }, + { + "epoch": 1.4024581274852392, + "grad_norm": 0.14902052283287048, + "learning_rate": 6.4219248124687295e-06, + "loss": 0.2137, + "step": 5820 + }, + { + "epoch": 1.4048680563923364, + "grad_norm": 0.1455923467874527, + "learning_rate": 6.408479362459791e-06, + "loss": 0.2069, + "step": 5830 + }, + { + "epoch": 1.4072779852994337, + "grad_norm": 0.1630076915025711, + "learning_rate": 6.39502284044703e-06, + "loss": 0.209, + "step": 5840 + }, + { + "epoch": 1.4096879142065308, + "grad_norm": 0.16327600181102753, + "learning_rate": 6.381555352211663e-06, + "loss": 0.2078, + "step": 5850 + }, + { + "epoch": 1.4120978431136282, + "grad_norm": 0.16105572879314423, + "learning_rate": 6.368077003621116e-06, + "loss": 0.2061, + "step": 5860 + }, + { + "epoch": 1.4145077720207253, + "grad_norm": 0.1474512666463852, + "learning_rate": 6.354587900628184e-06, + "loss": 0.2116, + "step": 5870 + }, + { + "epoch": 1.4169177009278227, + "grad_norm": 0.1535140872001648, + "learning_rate": 6.341088149270204e-06, + "loss": 0.2121, + "step": 5880 + }, + { + "epoch": 1.4193276298349198, + "grad_norm": 0.16071055829524994, + "learning_rate": 6.327577855668216e-06, + "loss": 0.2099, + "step": 5890 + }, + { + "epoch": 1.4217375587420171, + "grad_norm": 0.15555599331855774, + "learning_rate": 6.3140571260261385e-06, + "loss": 0.2119, + "step": 5900 + }, + { + "epoch": 1.4241474876491145, + "grad_norm": 0.1541549116373062, + "learning_rate": 6.300526066629923e-06, + "loss": 0.2087, + "step": 5910 + }, + { + "epoch": 1.4265574165562116, + "grad_norm": 0.1483973264694214, + "learning_rate": 6.286984783846723e-06, + "loss": 0.2126, + "step": 5920 + }, + { + "epoch": 1.4289673454633087, + "grad_norm": 0.1526423990726471, + "learning_rate": 6.273433384124058e-06, + "loss": 0.211, + "step": 5930 + }, + { + "epoch": 1.431377274370406, + "grad_norm": 0.18809592723846436, + "learning_rate": 6.259871973988977e-06, + "loss": 0.211, + "step": 5940 + }, + { + "epoch": 1.4337872032775034, + "grad_norm": 0.19463743269443512, + "learning_rate": 6.24630066004722e-06, + "loss": 0.2114, + "step": 5950 + }, + { + "epoch": 1.4361971321846005, + "grad_norm": 0.16021324694156647, + "learning_rate": 6.232719548982381e-06, + "loss": 0.2111, + "step": 5960 + }, + { + "epoch": 1.4386070610916977, + "grad_norm": 0.19545216858386993, + "learning_rate": 6.219128747555066e-06, + "loss": 0.2087, + "step": 5970 + }, + { + "epoch": 1.441016989998795, + "grad_norm": 0.16862092912197113, + "learning_rate": 6.205528362602064e-06, + "loss": 0.2073, + "step": 5980 + }, + { + "epoch": 1.4434269189058924, + "grad_norm": 0.1454867571592331, + "learning_rate": 6.19191850103549e-06, + "loss": 0.2089, + "step": 5990 + }, + { + "epoch": 1.4458368478129895, + "grad_norm": 0.14500007033348083, + "learning_rate": 6.1782992698419605e-06, + "loss": 0.2096, + "step": 6000 + }, + { + "epoch": 1.4482467767200868, + "grad_norm": 0.14368648827075958, + "learning_rate": 6.164670776081746e-06, + "loss": 0.209, + "step": 6010 + }, + { + "epoch": 1.450656705627184, + "grad_norm": 0.15274104475975037, + "learning_rate": 6.151033126887928e-06, + "loss": 0.2098, + "step": 6020 + }, + { + "epoch": 1.4530666345342813, + "grad_norm": 0.15271003544330597, + "learning_rate": 6.137386429465557e-06, + "loss": 0.2093, + "step": 6030 + }, + { + "epoch": 1.4554765634413784, + "grad_norm": 0.15823811292648315, + "learning_rate": 6.123730791090814e-06, + "loss": 0.2079, + "step": 6040 + }, + { + "epoch": 1.4578864923484758, + "grad_norm": 0.15441280603408813, + "learning_rate": 6.1100663191101664e-06, + "loss": 0.2106, + "step": 6050 + }, + { + "epoch": 1.460296421255573, + "grad_norm": 0.15077108144760132, + "learning_rate": 6.0963931209395165e-06, + "loss": 0.2088, + "step": 6060 + }, + { + "epoch": 1.4627063501626703, + "grad_norm": 0.1825905740261078, + "learning_rate": 6.082711304063369e-06, + "loss": 0.209, + "step": 6070 + }, + { + "epoch": 1.4651162790697674, + "grad_norm": 0.18124468624591827, + "learning_rate": 6.069020976033973e-06, + "loss": 0.2088, + "step": 6080 + }, + { + "epoch": 1.4675262079768647, + "grad_norm": 0.1527973711490631, + "learning_rate": 6.055322244470492e-06, + "loss": 0.212, + "step": 6090 + }, + { + "epoch": 1.4699361368839619, + "grad_norm": 0.17023658752441406, + "learning_rate": 6.041615217058141e-06, + "loss": 0.214, + "step": 6100 + }, + { + "epoch": 1.4723460657910592, + "grad_norm": 0.14895488321781158, + "learning_rate": 6.027900001547354e-06, + "loss": 0.2078, + "step": 6110 + }, + { + "epoch": 1.4747559946981563, + "grad_norm": 0.14877694845199585, + "learning_rate": 6.014176705752928e-06, + "loss": 0.2079, + "step": 6120 + }, + { + "epoch": 1.4771659236052537, + "grad_norm": 0.17328816652297974, + "learning_rate": 6.000445437553182e-06, + "loss": 0.2122, + "step": 6130 + }, + { + "epoch": 1.479575852512351, + "grad_norm": 0.14150387048721313, + "learning_rate": 5.986706304889103e-06, + "loss": 0.2093, + "step": 6140 + }, + { + "epoch": 1.4819857814194481, + "grad_norm": 0.14140060544013977, + "learning_rate": 5.9729594157635e-06, + "loss": 0.2076, + "step": 6150 + }, + { + "epoch": 1.4843957103265453, + "grad_norm": 0.145376518368721, + "learning_rate": 5.95920487824016e-06, + "loss": 0.2084, + "step": 6160 + }, + { + "epoch": 1.4868056392336426, + "grad_norm": 0.15817610919475555, + "learning_rate": 5.945442800442989e-06, + "loss": 0.2091, + "step": 6170 + }, + { + "epoch": 1.48921556814074, + "grad_norm": 0.16702431440353394, + "learning_rate": 5.9316732905551655e-06, + "loss": 0.2096, + "step": 6180 + }, + { + "epoch": 1.491625497047837, + "grad_norm": 0.1363145262002945, + "learning_rate": 5.917896456818296e-06, + "loss": 0.2051, + "step": 6190 + }, + { + "epoch": 1.4940354259549342, + "grad_norm": 0.1454513818025589, + "learning_rate": 5.904112407531558e-06, + "loss": 0.2101, + "step": 6200 + }, + { + "epoch": 1.4964453548620316, + "grad_norm": 0.14361591637134552, + "learning_rate": 5.89032125105085e-06, + "loss": 0.2086, + "step": 6210 + }, + { + "epoch": 1.498855283769129, + "grad_norm": 0.15663976967334747, + "learning_rate": 5.876523095787938e-06, + "loss": 0.2079, + "step": 6220 + }, + { + "epoch": 1.501265212676226, + "grad_norm": 0.16403771936893463, + "learning_rate": 5.862718050209608e-06, + "loss": 0.2077, + "step": 6230 + }, + { + "epoch": 1.5036751415833232, + "grad_norm": 0.14956094324588776, + "learning_rate": 5.848906222836808e-06, + "loss": 0.2102, + "step": 6240 + }, + { + "epoch": 1.5060850704904205, + "grad_norm": 0.1433073878288269, + "learning_rate": 5.835087722243801e-06, + "loss": 0.2067, + "step": 6250 + }, + { + "epoch": 1.5084949993975179, + "grad_norm": 0.14256806671619415, + "learning_rate": 5.821262657057303e-06, + "loss": 0.2094, + "step": 6260 + }, + { + "epoch": 1.510904928304615, + "grad_norm": 0.15444211661815643, + "learning_rate": 5.807431135955637e-06, + "loss": 0.2095, + "step": 6270 + }, + { + "epoch": 1.513314857211712, + "grad_norm": 0.15259243547916412, + "learning_rate": 5.793593267667876e-06, + "loss": 0.2086, + "step": 6280 + }, + { + "epoch": 1.5157247861188095, + "grad_norm": 0.1586480736732483, + "learning_rate": 5.779749160972988e-06, + "loss": 0.2105, + "step": 6290 + }, + { + "epoch": 1.5181347150259068, + "grad_norm": 0.140699103474617, + "learning_rate": 5.76589892469898e-06, + "loss": 0.2057, + "step": 6300 + }, + { + "epoch": 1.520544643933004, + "grad_norm": 0.1529569923877716, + "learning_rate": 5.752042667722044e-06, + "loss": 0.2095, + "step": 6310 + }, + { + "epoch": 1.522954572840101, + "grad_norm": 0.15028007328510284, + "learning_rate": 5.7381804989656995e-06, + "loss": 0.2088, + "step": 6320 + }, + { + "epoch": 1.5253645017471986, + "grad_norm": 0.15581537783145905, + "learning_rate": 5.724312527399939e-06, + "loss": 0.2069, + "step": 6330 + }, + { + "epoch": 1.5277744306542957, + "grad_norm": 0.15059341490268707, + "learning_rate": 5.710438862040368e-06, + "loss": 0.205, + "step": 6340 + }, + { + "epoch": 1.5301843595613929, + "grad_norm": 0.1440630406141281, + "learning_rate": 5.696559611947359e-06, + "loss": 0.208, + "step": 6350 + }, + { + "epoch": 1.5325942884684902, + "grad_norm": 0.1505700945854187, + "learning_rate": 5.682674886225175e-06, + "loss": 0.2064, + "step": 6360 + }, + { + "epoch": 1.5350042173755876, + "grad_norm": 0.1707584410905838, + "learning_rate": 5.6687847940211304e-06, + "loss": 0.2074, + "step": 6370 + }, + { + "epoch": 1.5374141462826847, + "grad_norm": 0.16003894805908203, + "learning_rate": 5.654889444524723e-06, + "loss": 0.2063, + "step": 6380 + }, + { + "epoch": 1.5398240751897818, + "grad_norm": 0.14350832998752594, + "learning_rate": 5.6409889469667765e-06, + "loss": 0.2079, + "step": 6390 + }, + { + "epoch": 1.5422340040968792, + "grad_norm": 0.1445057988166809, + "learning_rate": 5.627083410618585e-06, + "loss": 0.2072, + "step": 6400 + }, + { + "epoch": 1.5446439330039765, + "grad_norm": 0.1630876064300537, + "learning_rate": 5.613172944791053e-06, + "loss": 0.2072, + "step": 6410 + }, + { + "epoch": 1.5470538619110736, + "grad_norm": 0.14087872207164764, + "learning_rate": 5.599257658833833e-06, + "loss": 0.2068, + "step": 6420 + }, + { + "epoch": 1.5494637908181708, + "grad_norm": 0.14263446629047394, + "learning_rate": 5.585337662134471e-06, + "loss": 0.2103, + "step": 6430 + }, + { + "epoch": 1.551873719725268, + "grad_norm": 0.14731428027153015, + "learning_rate": 5.571413064117542e-06, + "loss": 0.2094, + "step": 6440 + }, + { + "epoch": 1.5542836486323655, + "grad_norm": 0.15938155353069305, + "learning_rate": 5.5574839742437924e-06, + "loss": 0.2082, + "step": 6450 + }, + { + "epoch": 1.5566935775394626, + "grad_norm": 0.14798109233379364, + "learning_rate": 5.543550502009279e-06, + "loss": 0.2107, + "step": 6460 + }, + { + "epoch": 1.5591035064465597, + "grad_norm": 0.16606839001178741, + "learning_rate": 5.529612756944509e-06, + "loss": 0.2059, + "step": 6470 + }, + { + "epoch": 1.561513435353657, + "grad_norm": 0.15381751954555511, + "learning_rate": 5.515670848613577e-06, + "loss": 0.2078, + "step": 6480 + }, + { + "epoch": 1.5639233642607544, + "grad_norm": 0.14584167301654816, + "learning_rate": 5.501724886613304e-06, + "loss": 0.2104, + "step": 6490 + }, + { + "epoch": 1.5663332931678515, + "grad_norm": 0.14175280928611755, + "learning_rate": 5.4877749805723805e-06, + "loss": 0.2077, + "step": 6500 + }, + { + "epoch": 1.5687432220749487, + "grad_norm": 0.15253135561943054, + "learning_rate": 5.4738212401504966e-06, + "loss": 0.2114, + "step": 6510 + }, + { + "epoch": 1.571153150982046, + "grad_norm": 0.14893187582492828, + "learning_rate": 5.459863775037486e-06, + "loss": 0.2114, + "step": 6520 + }, + { + "epoch": 1.5735630798891433, + "grad_norm": 0.1650102734565735, + "learning_rate": 5.445902694952464e-06, + "loss": 0.208, + "step": 6530 + }, + { + "epoch": 1.5759730087962405, + "grad_norm": 0.15241795778274536, + "learning_rate": 5.43193810964296e-06, + "loss": 0.2086, + "step": 6540 + }, + { + "epoch": 1.5783829377033376, + "grad_norm": 0.1766674816608429, + "learning_rate": 5.417970128884061e-06, + "loss": 0.2091, + "step": 6550 + }, + { + "epoch": 1.580792866610435, + "grad_norm": 0.15203633904457092, + "learning_rate": 5.403998862477538e-06, + "loss": 0.2091, + "step": 6560 + }, + { + "epoch": 1.5832027955175323, + "grad_norm": 0.17118015885353088, + "learning_rate": 5.390024420251003e-06, + "loss": 0.2056, + "step": 6570 + }, + { + "epoch": 1.5856127244246294, + "grad_norm": 0.1646401584148407, + "learning_rate": 5.376046912057022e-06, + "loss": 0.2051, + "step": 6580 + }, + { + "epoch": 1.5880226533317268, + "grad_norm": 0.1695377081632614, + "learning_rate": 5.3620664477722686e-06, + "loss": 0.2105, + "step": 6590 + }, + { + "epoch": 1.590432582238824, + "grad_norm": 0.15378722548484802, + "learning_rate": 5.34808313729665e-06, + "loss": 0.2051, + "step": 6600 + }, + { + "epoch": 1.5928425111459212, + "grad_norm": 0.1414097249507904, + "learning_rate": 5.3340970905524515e-06, + "loss": 0.2099, + "step": 6610 + }, + { + "epoch": 1.5952524400530184, + "grad_norm": 0.14073528349399567, + "learning_rate": 5.3201084174834615e-06, + "loss": 0.2086, + "step": 6620 + }, + { + "epoch": 1.5976623689601157, + "grad_norm": 0.14301954209804535, + "learning_rate": 5.306117228054123e-06, + "loss": 0.2048, + "step": 6630 + }, + { + "epoch": 1.600072297867213, + "grad_norm": 0.14161662757396698, + "learning_rate": 5.292123632248652e-06, + "loss": 0.2085, + "step": 6640 + }, + { + "epoch": 1.6024822267743102, + "grad_norm": 0.15665100514888763, + "learning_rate": 5.278127740070187e-06, + "loss": 0.2081, + "step": 6650 + }, + { + "epoch": 1.6048921556814073, + "grad_norm": 0.17283719778060913, + "learning_rate": 5.2641296615399116e-06, + "loss": 0.2038, + "step": 6660 + }, + { + "epoch": 1.6073020845885047, + "grad_norm": 0.15164655447006226, + "learning_rate": 5.2501295066962035e-06, + "loss": 0.2088, + "step": 6670 + }, + { + "epoch": 1.609712013495602, + "grad_norm": 0.14629319310188293, + "learning_rate": 5.236127385593754e-06, + "loss": 0.2092, + "step": 6680 + }, + { + "epoch": 1.6121219424026991, + "grad_norm": 0.1373308300971985, + "learning_rate": 5.222123408302722e-06, + "loss": 0.2065, + "step": 6690 + }, + { + "epoch": 1.6145318713097963, + "grad_norm": 0.14942149817943573, + "learning_rate": 5.208117684907846e-06, + "loss": 0.2092, + "step": 6700 + }, + { + "epoch": 1.6169418002168936, + "grad_norm": 0.15661108493804932, + "learning_rate": 5.194110325507599e-06, + "loss": 0.2104, + "step": 6710 + }, + { + "epoch": 1.619351729123991, + "grad_norm": 0.1510433405637741, + "learning_rate": 5.180101440213311e-06, + "loss": 0.2076, + "step": 6720 + }, + { + "epoch": 1.621761658031088, + "grad_norm": 0.1418112814426422, + "learning_rate": 5.166091139148307e-06, + "loss": 0.2094, + "step": 6730 + }, + { + "epoch": 1.6241715869381852, + "grad_norm": 0.15742062032222748, + "learning_rate": 5.152079532447042e-06, + "loss": 0.2056, + "step": 6740 + }, + { + "epoch": 1.6265815158452825, + "grad_norm": 0.15039823949337006, + "learning_rate": 5.138066730254236e-06, + "loss": 0.2048, + "step": 6750 + }, + { + "epoch": 1.62899144475238, + "grad_norm": 0.1480005532503128, + "learning_rate": 5.124052842724005e-06, + "loss": 0.2053, + "step": 6760 + }, + { + "epoch": 1.631401373659477, + "grad_norm": 0.156424880027771, + "learning_rate": 5.110037980018996e-06, + "loss": 0.2109, + "step": 6770 + }, + { + "epoch": 1.6338113025665741, + "grad_norm": 0.13942967355251312, + "learning_rate": 5.0960222523095235e-06, + "loss": 0.205, + "step": 6780 + }, + { + "epoch": 1.6362212314736715, + "grad_norm": 0.15093407034873962, + "learning_rate": 5.0820057697727e-06, + "loss": 0.2115, + "step": 6790 + }, + { + "epoch": 1.6386311603807688, + "grad_norm": 0.13788051903247833, + "learning_rate": 5.067988642591575e-06, + "loss": 0.2056, + "step": 6800 + }, + { + "epoch": 1.641041089287866, + "grad_norm": 0.14531250298023224, + "learning_rate": 5.053970980954263e-06, + "loss": 0.2069, + "step": 6810 + }, + { + "epoch": 1.643451018194963, + "grad_norm": 0.14479826390743256, + "learning_rate": 5.0399528950530776e-06, + "loss": 0.2058, + "step": 6820 + }, + { + "epoch": 1.6458609471020604, + "grad_norm": 0.14563125371932983, + "learning_rate": 5.0259344950836715e-06, + "loss": 0.2101, + "step": 6830 + }, + { + "epoch": 1.6482708760091578, + "grad_norm": 0.14034052193164825, + "learning_rate": 5.011915891244167e-06, + "loss": 0.207, + "step": 6840 + }, + { + "epoch": 1.650680804916255, + "grad_norm": 0.14122657477855682, + "learning_rate": 4.997897193734285e-06, + "loss": 0.208, + "step": 6850 + }, + { + "epoch": 1.6530907338233523, + "grad_norm": 0.15636655688285828, + "learning_rate": 4.9838785127544826e-06, + "loss": 0.2093, + "step": 6860 + }, + { + "epoch": 1.6555006627304496, + "grad_norm": 0.15074920654296875, + "learning_rate": 4.969859958505094e-06, + "loss": 0.2052, + "step": 6870 + }, + { + "epoch": 1.6579105916375467, + "grad_norm": 0.15180207788944244, + "learning_rate": 4.955841641185447e-06, + "loss": 0.2109, + "step": 6880 + }, + { + "epoch": 1.6603205205446439, + "grad_norm": 0.15867879986763, + "learning_rate": 4.941823670993016e-06, + "loss": 0.2096, + "step": 6890 + }, + { + "epoch": 1.6627304494517412, + "grad_norm": 0.14111950993537903, + "learning_rate": 4.92780615812254e-06, + "loss": 0.2051, + "step": 6900 + }, + { + "epoch": 1.6651403783588385, + "grad_norm": 0.14959976077079773, + "learning_rate": 4.913789212765166e-06, + "loss": 0.2063, + "step": 6910 + }, + { + "epoch": 1.6675503072659357, + "grad_norm": 0.16154243052005768, + "learning_rate": 4.899772945107583e-06, + "loss": 0.2068, + "step": 6920 + }, + { + "epoch": 1.6699602361730328, + "grad_norm": 0.14693604409694672, + "learning_rate": 4.885757465331144e-06, + "loss": 0.2111, + "step": 6930 + }, + { + "epoch": 1.6723701650801301, + "grad_norm": 0.1400395929813385, + "learning_rate": 4.871742883611018e-06, + "loss": 0.2057, + "step": 6940 + }, + { + "epoch": 1.6747800939872275, + "grad_norm": 0.13714446127414703, + "learning_rate": 4.857729310115307e-06, + "loss": 0.2058, + "step": 6950 + }, + { + "epoch": 1.6771900228943246, + "grad_norm": 0.1699269860982895, + "learning_rate": 4.843716855004194e-06, + "loss": 0.2081, + "step": 6960 + }, + { + "epoch": 1.6795999518014217, + "grad_norm": 0.1475098431110382, + "learning_rate": 4.829705628429061e-06, + "loss": 0.209, + "step": 6970 + }, + { + "epoch": 1.682009880708519, + "grad_norm": 0.14293253421783447, + "learning_rate": 4.815695740531643e-06, + "loss": 0.2068, + "step": 6980 + }, + { + "epoch": 1.6844198096156164, + "grad_norm": 0.143426775932312, + "learning_rate": 4.801687301443149e-06, + "loss": 0.2078, + "step": 6990 + }, + { + "epoch": 1.6868297385227136, + "grad_norm": 0.14896607398986816, + "learning_rate": 4.787680421283391e-06, + "loss": 0.205, + "step": 7000 + }, + { + "epoch": 1.6892396674298107, + "grad_norm": 0.14818888902664185, + "learning_rate": 4.773675210159938e-06, + "loss": 0.2055, + "step": 7010 + }, + { + "epoch": 1.691649596336908, + "grad_norm": 0.15228557586669922, + "learning_rate": 4.759671778167228e-06, + "loss": 0.2057, + "step": 7020 + }, + { + "epoch": 1.6940595252440054, + "grad_norm": 0.1329268515110016, + "learning_rate": 4.745670235385723e-06, + "loss": 0.2097, + "step": 7030 + }, + { + "epoch": 1.6964694541511025, + "grad_norm": 0.14906267821788788, + "learning_rate": 4.7316706918810265e-06, + "loss": 0.2086, + "step": 7040 + }, + { + "epoch": 1.6988793830581996, + "grad_norm": 0.16887860000133514, + "learning_rate": 4.71767325770303e-06, + "loss": 0.2067, + "step": 7050 + }, + { + "epoch": 1.701289311965297, + "grad_norm": 0.13959096372127533, + "learning_rate": 4.703678042885044e-06, + "loss": 0.2075, + "step": 7060 + }, + { + "epoch": 1.7036992408723943, + "grad_norm": 0.150526762008667, + "learning_rate": 4.689685157442927e-06, + "loss": 0.2032, + "step": 7070 + }, + { + "epoch": 1.7061091697794915, + "grad_norm": 0.15431441366672516, + "learning_rate": 4.675694711374234e-06, + "loss": 0.2047, + "step": 7080 + }, + { + "epoch": 1.7085190986865888, + "grad_norm": 0.14908158779144287, + "learning_rate": 4.661706814657338e-06, + "loss": 0.2078, + "step": 7090 + }, + { + "epoch": 1.7109290275936861, + "grad_norm": 0.15429182350635529, + "learning_rate": 4.647721577250578e-06, + "loss": 0.2052, + "step": 7100 + }, + { + "epoch": 1.7133389565007833, + "grad_norm": 0.15195661783218384, + "learning_rate": 4.633739109091379e-06, + "loss": 0.2122, + "step": 7110 + }, + { + "epoch": 1.7157488854078804, + "grad_norm": 0.14676164090633392, + "learning_rate": 4.61975952009541e-06, + "loss": 0.2055, + "step": 7120 + }, + { + "epoch": 1.7181588143149777, + "grad_norm": 0.13878756761550903, + "learning_rate": 4.6057829201556905e-06, + "loss": 0.2073, + "step": 7130 + }, + { + "epoch": 1.720568743222075, + "grad_norm": 0.1423376202583313, + "learning_rate": 4.591809419141758e-06, + "loss": 0.2064, + "step": 7140 + }, + { + "epoch": 1.7229786721291722, + "grad_norm": 0.14275546371936798, + "learning_rate": 4.577839126898784e-06, + "loss": 0.2083, + "step": 7150 + }, + { + "epoch": 1.7253886010362693, + "grad_norm": 0.15022709965705872, + "learning_rate": 4.5638721532467125e-06, + "loss": 0.2068, + "step": 7160 + }, + { + "epoch": 1.7277985299433667, + "grad_norm": 0.14701788127422333, + "learning_rate": 4.549908607979407e-06, + "loss": 0.2068, + "step": 7170 + }, + { + "epoch": 1.730208458850464, + "grad_norm": 0.14002536237239838, + "learning_rate": 4.535948600863774e-06, + "loss": 0.2098, + "step": 7180 + }, + { + "epoch": 1.7326183877575612, + "grad_norm": 0.15771852433681488, + "learning_rate": 4.521992241638912e-06, + "loss": 0.2051, + "step": 7190 + }, + { + "epoch": 1.7350283166646583, + "grad_norm": 0.1449091136455536, + "learning_rate": 4.508039640015237e-06, + "loss": 0.2069, + "step": 7200 + }, + { + "epoch": 1.7374382455717556, + "grad_norm": 0.14416846632957458, + "learning_rate": 4.494090905673634e-06, + "loss": 0.2098, + "step": 7210 + }, + { + "epoch": 1.739848174478853, + "grad_norm": 0.1471363753080368, + "learning_rate": 4.480146148264586e-06, + "loss": 0.207, + "step": 7220 + }, + { + "epoch": 1.74225810338595, + "grad_norm": 0.15887276828289032, + "learning_rate": 4.466205477407308e-06, + "loss": 0.2063, + "step": 7230 + }, + { + "epoch": 1.7446680322930472, + "grad_norm": 0.14145879447460175, + "learning_rate": 4.452269002688897e-06, + "loss": 0.2079, + "step": 7240 + }, + { + "epoch": 1.7470779612001446, + "grad_norm": 0.14188285171985626, + "learning_rate": 4.438336833663459e-06, + "loss": 0.2094, + "step": 7250 + }, + { + "epoch": 1.749487890107242, + "grad_norm": 0.12969772517681122, + "learning_rate": 4.424409079851262e-06, + "loss": 0.2081, + "step": 7260 + }, + { + "epoch": 1.751897819014339, + "grad_norm": 0.13423402607440948, + "learning_rate": 4.410485850737853e-06, + "loss": 0.2081, + "step": 7270 + }, + { + "epoch": 1.7543077479214362, + "grad_norm": 0.14299023151397705, + "learning_rate": 4.39656725577322e-06, + "loss": 0.2064, + "step": 7280 + }, + { + "epoch": 1.7567176768285335, + "grad_norm": 0.14568351209163666, + "learning_rate": 4.382653404370922e-06, + "loss": 0.207, + "step": 7290 + }, + { + "epoch": 1.7591276057356309, + "grad_norm": 0.13449597358703613, + "learning_rate": 4.368744405907224e-06, + "loss": 0.2081, + "step": 7300 + }, + { + "epoch": 1.761537534642728, + "grad_norm": 0.1447085440158844, + "learning_rate": 4.354840369720249e-06, + "loss": 0.2081, + "step": 7310 + }, + { + "epoch": 1.7639474635498251, + "grad_norm": 0.13538062572479248, + "learning_rate": 4.340941405109102e-06, + "loss": 0.2101, + "step": 7320 + }, + { + "epoch": 1.7663573924569225, + "grad_norm": 0.16251257061958313, + "learning_rate": 4.327047621333031e-06, + "loss": 0.2064, + "step": 7330 + }, + { + "epoch": 1.7687673213640198, + "grad_norm": 0.1393039971590042, + "learning_rate": 4.31315912761055e-06, + "loss": 0.2045, + "step": 7340 + }, + { + "epoch": 1.771177250271117, + "grad_norm": 0.14810238778591156, + "learning_rate": 4.299276033118592e-06, + "loss": 0.208, + "step": 7350 + }, + { + "epoch": 1.7735871791782143, + "grad_norm": 0.1339663565158844, + "learning_rate": 4.285398446991648e-06, + "loss": 0.2082, + "step": 7360 + }, + { + "epoch": 1.7759971080853116, + "grad_norm": 0.14078187942504883, + "learning_rate": 4.271526478320901e-06, + "loss": 0.2065, + "step": 7370 + }, + { + "epoch": 1.7784070369924088, + "grad_norm": 0.13188710808753967, + "learning_rate": 4.257660236153387e-06, + "loss": 0.2041, + "step": 7380 + }, + { + "epoch": 1.7808169658995059, + "grad_norm": 0.1408006250858307, + "learning_rate": 4.2437998294911114e-06, + "loss": 0.2082, + "step": 7390 + }, + { + "epoch": 1.7832268948066032, + "grad_norm": 0.14175231754779816, + "learning_rate": 4.229945367290222e-06, + "loss": 0.2076, + "step": 7400 + }, + { + "epoch": 1.7856368237137006, + "grad_norm": 0.13607344031333923, + "learning_rate": 4.216096958460126e-06, + "loss": 0.2059, + "step": 7410 + }, + { + "epoch": 1.7880467526207977, + "grad_norm": 0.15421463549137115, + "learning_rate": 4.2022547118626515e-06, + "loss": 0.208, + "step": 7420 + }, + { + "epoch": 1.7904566815278948, + "grad_norm": 0.14106203615665436, + "learning_rate": 4.18841873631118e-06, + "loss": 0.2079, + "step": 7430 + }, + { + "epoch": 1.7928666104349922, + "grad_norm": 0.14798112213611603, + "learning_rate": 4.174589140569805e-06, + "loss": 0.2073, + "step": 7440 + }, + { + "epoch": 1.7952765393420895, + "grad_norm": 0.13187192380428314, + "learning_rate": 4.160766033352462e-06, + "loss": 0.2066, + "step": 7450 + }, + { + "epoch": 1.7976864682491867, + "grad_norm": 0.1424017697572708, + "learning_rate": 4.14694952332208e-06, + "loss": 0.2046, + "step": 7460 + }, + { + "epoch": 1.8000963971562838, + "grad_norm": 0.13720762729644775, + "learning_rate": 4.133139719089735e-06, + "loss": 0.2032, + "step": 7470 + }, + { + "epoch": 1.8025063260633811, + "grad_norm": 0.14268174767494202, + "learning_rate": 4.119336729213778e-06, + "loss": 0.2097, + "step": 7480 + }, + { + "epoch": 1.8049162549704785, + "grad_norm": 0.15912844240665436, + "learning_rate": 4.105540662199002e-06, + "loss": 0.2082, + "step": 7490 + }, + { + "epoch": 1.8073261838775756, + "grad_norm": 0.1520354002714157, + "learning_rate": 4.0917516264957735e-06, + "loss": 0.2038, + "step": 7500 + }, + { + "epoch": 1.8097361127846727, + "grad_norm": 0.13783125579357147, + "learning_rate": 4.07796973049919e-06, + "loss": 0.2067, + "step": 7510 + }, + { + "epoch": 1.81214604169177, + "grad_norm": 0.14950409531593323, + "learning_rate": 4.0641950825482265e-06, + "loss": 0.2065, + "step": 7520 + }, + { + "epoch": 1.8145559705988674, + "grad_norm": 0.14664943516254425, + "learning_rate": 4.0504277909248715e-06, + "loss": 0.2048, + "step": 7530 + }, + { + "epoch": 1.8169658995059645, + "grad_norm": 0.14661647379398346, + "learning_rate": 4.036667963853296e-06, + "loss": 0.208, + "step": 7540 + }, + { + "epoch": 1.8193758284130617, + "grad_norm": 0.14886929094791412, + "learning_rate": 4.022915709498985e-06, + "loss": 0.2071, + "step": 7550 + }, + { + "epoch": 1.821785757320159, + "grad_norm": 0.16615243256092072, + "learning_rate": 4.009171135967902e-06, + "loss": 0.2066, + "step": 7560 + }, + { + "epoch": 1.8241956862272564, + "grad_norm": 0.13197271525859833, + "learning_rate": 3.9954343513056236e-06, + "loss": 0.207, + "step": 7570 + }, + { + "epoch": 1.8266056151343535, + "grad_norm": 0.13960954546928406, + "learning_rate": 3.981705463496504e-06, + "loss": 0.2045, + "step": 7580 + }, + { + "epoch": 1.8290155440414506, + "grad_norm": 0.13621185719966888, + "learning_rate": 3.967984580462821e-06, + "loss": 0.2079, + "step": 7590 + }, + { + "epoch": 1.8314254729485482, + "grad_norm": 0.14133788645267487, + "learning_rate": 3.954271810063922e-06, + "loss": 0.2087, + "step": 7600 + }, + { + "epoch": 1.8338354018556453, + "grad_norm": 0.15335774421691895, + "learning_rate": 3.940567260095389e-06, + "loss": 0.2075, + "step": 7610 + }, + { + "epoch": 1.8362453307627424, + "grad_norm": 0.15637780725955963, + "learning_rate": 3.926871038288173e-06, + "loss": 0.2054, + "step": 7620 + }, + { + "epoch": 1.8386552596698398, + "grad_norm": 0.13300193846225739, + "learning_rate": 3.9131832523077685e-06, + "loss": 0.207, + "step": 7630 + }, + { + "epoch": 1.8410651885769371, + "grad_norm": 0.14508311450481415, + "learning_rate": 3.8995040097533495e-06, + "loss": 0.2075, + "step": 7640 + }, + { + "epoch": 1.8434751174840343, + "grad_norm": 0.13044893741607666, + "learning_rate": 3.885833418156932e-06, + "loss": 0.2077, + "step": 7650 + }, + { + "epoch": 1.8458850463911314, + "grad_norm": 0.14246976375579834, + "learning_rate": 3.8721715849825305e-06, + "loss": 0.2062, + "step": 7660 + }, + { + "epoch": 1.8482949752982287, + "grad_norm": 0.13763968646526337, + "learning_rate": 3.858518617625301e-06, + "loss": 0.2062, + "step": 7670 + }, + { + "epoch": 1.850704904205326, + "grad_norm": 0.12822872400283813, + "learning_rate": 3.844874623410718e-06, + "loss": 0.2053, + "step": 7680 + }, + { + "epoch": 1.8531148331124232, + "grad_norm": 0.14770273864269257, + "learning_rate": 3.831239709593707e-06, + "loss": 0.2059, + "step": 7690 + }, + { + "epoch": 1.8555247620195203, + "grad_norm": 0.15450125932693481, + "learning_rate": 3.8176139833578215e-06, + "loss": 0.2054, + "step": 7700 + }, + { + "epoch": 1.8579346909266177, + "grad_norm": 0.15053947269916534, + "learning_rate": 3.8039975518143862e-06, + "loss": 0.2059, + "step": 7710 + }, + { + "epoch": 1.860344619833715, + "grad_norm": 0.16280502080917358, + "learning_rate": 3.790390522001662e-06, + "loss": 0.207, + "step": 7720 + }, + { + "epoch": 1.8627545487408121, + "grad_norm": 0.13911543786525726, + "learning_rate": 3.7767930008840055e-06, + "loss": 0.2052, + "step": 7730 + }, + { + "epoch": 1.8651644776479093, + "grad_norm": 0.14479966461658478, + "learning_rate": 3.763205095351021e-06, + "loss": 0.2049, + "step": 7740 + }, + { + "epoch": 1.8675744065550066, + "grad_norm": 0.13834457099437714, + "learning_rate": 3.7496269122167306e-06, + "loss": 0.2071, + "step": 7750 + }, + { + "epoch": 1.869984335462104, + "grad_norm": 0.1692366749048233, + "learning_rate": 3.7360585582187246e-06, + "loss": 0.2093, + "step": 7760 + }, + { + "epoch": 1.872394264369201, + "grad_norm": 0.14670221507549286, + "learning_rate": 3.7225001400173303e-06, + "loss": 0.2057, + "step": 7770 + }, + { + "epoch": 1.8748041932762982, + "grad_norm": 0.14623087644577026, + "learning_rate": 3.708951764194767e-06, + "loss": 0.2052, + "step": 7780 + }, + { + "epoch": 1.8772141221833956, + "grad_norm": 0.13682162761688232, + "learning_rate": 3.6954135372543133e-06, + "loss": 0.2058, + "step": 7790 + }, + { + "epoch": 1.879624051090493, + "grad_norm": 0.13509906828403473, + "learning_rate": 3.681885565619465e-06, + "loss": 0.2061, + "step": 7800 + }, + { + "epoch": 1.88203397999759, + "grad_norm": 0.13584059476852417, + "learning_rate": 3.668367955633107e-06, + "loss": 0.2063, + "step": 7810 + }, + { + "epoch": 1.8844439089046872, + "grad_norm": 0.14202377200126648, + "learning_rate": 3.654860813556666e-06, + "loss": 0.2078, + "step": 7820 + }, + { + "epoch": 1.8868538378117845, + "grad_norm": 0.1395299881696701, + "learning_rate": 3.6413642455692826e-06, + "loss": 0.2022, + "step": 7830 + }, + { + "epoch": 1.8892637667188819, + "grad_norm": 0.14222587645053864, + "learning_rate": 3.6278783577669762e-06, + "loss": 0.2076, + "step": 7840 + }, + { + "epoch": 1.891673695625979, + "grad_norm": 0.14933638274669647, + "learning_rate": 3.614403256161807e-06, + "loss": 0.2047, + "step": 7850 + }, + { + "epoch": 1.8940836245330763, + "grad_norm": 0.1349928379058838, + "learning_rate": 3.60093904668105e-06, + "loss": 0.2057, + "step": 7860 + }, + { + "epoch": 1.8964935534401737, + "grad_norm": 0.13887856900691986, + "learning_rate": 3.5874858351663513e-06, + "loss": 0.2053, + "step": 7870 + }, + { + "epoch": 1.8989034823472708, + "grad_norm": 0.13957622647285461, + "learning_rate": 3.5740437273729074e-06, + "loss": 0.2069, + "step": 7880 + }, + { + "epoch": 1.901313411254368, + "grad_norm": 0.149143248796463, + "learning_rate": 3.560612828968627e-06, + "loss": 0.2056, + "step": 7890 + }, + { + "epoch": 1.9037233401614653, + "grad_norm": 0.14313067495822906, + "learning_rate": 3.5471932455333013e-06, + "loss": 0.2058, + "step": 7900 + }, + { + "epoch": 1.9061332690685626, + "grad_norm": 0.13201439380645752, + "learning_rate": 3.533785082557779e-06, + "loss": 0.2063, + "step": 7910 + }, + { + "epoch": 1.9085431979756597, + "grad_norm": 0.14562028646469116, + "learning_rate": 3.520388445443126e-06, + "loss": 0.207, + "step": 7920 + }, + { + "epoch": 1.9109531268827569, + "grad_norm": 0.1508123278617859, + "learning_rate": 3.5070034394998108e-06, + "loss": 0.2047, + "step": 7930 + }, + { + "epoch": 1.9133630557898542, + "grad_norm": 0.1282895803451538, + "learning_rate": 3.4936301699468646e-06, + "loss": 0.2055, + "step": 7940 + }, + { + "epoch": 1.9157729846969516, + "grad_norm": 0.14230744540691376, + "learning_rate": 3.4802687419110635e-06, + "loss": 0.2032, + "step": 7950 + }, + { + "epoch": 1.9181829136040487, + "grad_norm": 0.135861337184906, + "learning_rate": 3.466919260426095e-06, + "loss": 0.2067, + "step": 7960 + }, + { + "epoch": 1.9205928425111458, + "grad_norm": 0.1485607922077179, + "learning_rate": 3.4535818304317338e-06, + "loss": 0.2041, + "step": 7970 + }, + { + "epoch": 1.9230027714182432, + "grad_norm": 0.14034852385520935, + "learning_rate": 3.440256556773025e-06, + "loss": 0.2034, + "step": 7980 + }, + { + "epoch": 1.9254127003253405, + "grad_norm": 0.1325804740190506, + "learning_rate": 3.426943544199444e-06, + "loss": 0.2061, + "step": 7990 + }, + { + "epoch": 1.9278226292324376, + "grad_norm": 0.14449110627174377, + "learning_rate": 3.413642897364091e-06, + "loss": 0.2063, + "step": 8000 + }, + { + "epoch": 1.9302325581395348, + "grad_norm": 0.14102737605571747, + "learning_rate": 3.400354720822851e-06, + "loss": 0.2072, + "step": 8010 + }, + { + "epoch": 1.932642487046632, + "grad_norm": 0.13754191994667053, + "learning_rate": 3.38707911903359e-06, + "loss": 0.2088, + "step": 8020 + }, + { + "epoch": 1.9350524159537295, + "grad_norm": 0.15111030638217926, + "learning_rate": 3.373816196355315e-06, + "loss": 0.2039, + "step": 8030 + }, + { + "epoch": 1.9374623448608266, + "grad_norm": 0.1356048732995987, + "learning_rate": 3.3605660570473687e-06, + "loss": 0.2055, + "step": 8040 + }, + { + "epoch": 1.9398722737679237, + "grad_norm": 0.1311032921075821, + "learning_rate": 3.3473288052686055e-06, + "loss": 0.2066, + "step": 8050 + }, + { + "epoch": 1.942282202675021, + "grad_norm": 0.13652199506759644, + "learning_rate": 3.334104545076564e-06, + "loss": 0.2046, + "step": 8060 + }, + { + "epoch": 1.9446921315821184, + "grad_norm": 0.1403491497039795, + "learning_rate": 3.320893380426667e-06, + "loss": 0.2069, + "step": 8070 + }, + { + "epoch": 1.9471020604892155, + "grad_norm": 0.14957809448242188, + "learning_rate": 3.3076954151713815e-06, + "loss": 0.2067, + "step": 8080 + }, + { + "epoch": 1.9495119893963127, + "grad_norm": 0.1357545703649521, + "learning_rate": 3.294510753059427e-06, + "loss": 0.2024, + "step": 8090 + }, + { + "epoch": 1.95192191830341, + "grad_norm": 0.13906139135360718, + "learning_rate": 3.2813394977349356e-06, + "loss": 0.2029, + "step": 8100 + }, + { + "epoch": 1.9543318472105073, + "grad_norm": 0.1386336386203766, + "learning_rate": 3.2681817527366575e-06, + "loss": 0.2036, + "step": 8110 + }, + { + "epoch": 1.9567417761176045, + "grad_norm": 0.1412561684846878, + "learning_rate": 3.2550376214971395e-06, + "loss": 0.207, + "step": 8120 + }, + { + "epoch": 1.9591517050247018, + "grad_norm": 0.13507452607154846, + "learning_rate": 3.241907207341902e-06, + "loss": 0.2046, + "step": 8130 + }, + { + "epoch": 1.9615616339317992, + "grad_norm": 0.12901359796524048, + "learning_rate": 3.2287906134886483e-06, + "loss": 0.2033, + "step": 8140 + }, + { + "epoch": 1.9639715628388963, + "grad_norm": 0.1413521021604538, + "learning_rate": 3.215687943046427e-06, + "loss": 0.2042, + "step": 8150 + }, + { + "epoch": 1.9663814917459934, + "grad_norm": 0.15636803209781647, + "learning_rate": 3.202599299014849e-06, + "loss": 0.208, + "step": 8160 + }, + { + "epoch": 1.9687914206530908, + "grad_norm": 0.1508229821920395, + "learning_rate": 3.1895247842832523e-06, + "loss": 0.206, + "step": 8170 + }, + { + "epoch": 1.971201349560188, + "grad_norm": 0.14489571750164032, + "learning_rate": 3.1764645016299133e-06, + "loss": 0.1998, + "step": 8180 + }, + { + "epoch": 1.9736112784672852, + "grad_norm": 0.1390523463487625, + "learning_rate": 3.163418553721229e-06, + "loss": 0.2053, + "step": 8190 + }, + { + "epoch": 1.9760212073743824, + "grad_norm": 0.14797191321849823, + "learning_rate": 3.1503870431109067e-06, + "loss": 0.2079, + "step": 8200 + }, + { + "epoch": 1.9784311362814797, + "grad_norm": 0.12731759250164032, + "learning_rate": 3.1373700722391696e-06, + "loss": 0.203, + "step": 8210 + }, + { + "epoch": 1.980841065188577, + "grad_norm": 0.1378549486398697, + "learning_rate": 3.1243677434319373e-06, + "loss": 0.2066, + "step": 8220 + }, + { + "epoch": 1.9832509940956742, + "grad_norm": 0.14989231526851654, + "learning_rate": 3.111380158900037e-06, + "loss": 0.2032, + "step": 8230 + }, + { + "epoch": 1.9856609230027713, + "grad_norm": 0.15311060845851898, + "learning_rate": 3.098407420738382e-06, + "loss": 0.2049, + "step": 8240 + }, + { + "epoch": 1.9880708519098687, + "grad_norm": 0.1495356559753418, + "learning_rate": 3.0854496309251857e-06, + "loss": 0.2061, + "step": 8250 + }, + { + "epoch": 1.990480780816966, + "grad_norm": 0.13795362412929535, + "learning_rate": 3.0725068913211546e-06, + "loss": 0.2031, + "step": 8260 + }, + { + "epoch": 1.9928907097240631, + "grad_norm": 0.12623263895511627, + "learning_rate": 3.059579303668678e-06, + "loss": 0.207, + "step": 8270 + }, + { + "epoch": 1.9953006386311603, + "grad_norm": 0.13744844496250153, + "learning_rate": 3.046666969591046e-06, + "loss": 0.2053, + "step": 8280 + }, + { + "epoch": 1.9977105675382576, + "grad_norm": 0.13788847625255585, + "learning_rate": 3.0337699905916308e-06, + "loss": 0.2064, + "step": 8290 + }, + { + "epoch": 2.0, + "grad_norm": 0.1785513460636139, + "learning_rate": 3.020888468053109e-06, + "loss": 0.2027, + "step": 8300 + }, + { + "epoch": 2.002409928907097, + "grad_norm": 0.13940368592739105, + "learning_rate": 3.0080225032366443e-06, + "loss": 0.1998, + "step": 8310 + }, + { + "epoch": 2.0048198578141947, + "grad_norm": 0.1354367583990097, + "learning_rate": 2.9951721972811133e-06, + "loss": 0.1983, + "step": 8320 + }, + { + "epoch": 2.007229786721292, + "grad_norm": 0.1270235925912857, + "learning_rate": 2.982337651202286e-06, + "loss": 0.1963, + "step": 8330 + }, + { + "epoch": 2.009639715628389, + "grad_norm": 0.13577590882778168, + "learning_rate": 2.9695189658920555e-06, + "loss": 0.1993, + "step": 8340 + }, + { + "epoch": 2.012049644535486, + "grad_norm": 0.14152930676937103, + "learning_rate": 2.95671624211763e-06, + "loss": 0.2001, + "step": 8350 + }, + { + "epoch": 2.0144595734425836, + "grad_norm": 0.13332296907901764, + "learning_rate": 2.9439295805207415e-06, + "loss": 0.1965, + "step": 8360 + }, + { + "epoch": 2.0168695023496808, + "grad_norm": 0.14344486594200134, + "learning_rate": 2.9311590816168646e-06, + "loss": 0.1979, + "step": 8370 + }, + { + "epoch": 2.019279431256778, + "grad_norm": 0.14192751049995422, + "learning_rate": 2.918404845794411e-06, + "loss": 0.1973, + "step": 8380 + }, + { + "epoch": 2.021689360163875, + "grad_norm": 0.1400498002767563, + "learning_rate": 2.905666973313957e-06, + "loss": 0.1979, + "step": 8390 + }, + { + "epoch": 2.0240992890709726, + "grad_norm": 0.13859783113002777, + "learning_rate": 2.8929455643074433e-06, + "loss": 0.1978, + "step": 8400 + }, + { + "epoch": 2.0265092179780697, + "grad_norm": 0.13500601053237915, + "learning_rate": 2.8802407187773917e-06, + "loss": 0.2007, + "step": 8410 + }, + { + "epoch": 2.028919146885167, + "grad_norm": 0.14077068865299225, + "learning_rate": 2.86755253659612e-06, + "loss": 0.1995, + "step": 8420 + }, + { + "epoch": 2.031329075792264, + "grad_norm": 0.12985944747924805, + "learning_rate": 2.854881117504954e-06, + "loss": 0.1997, + "step": 8430 + }, + { + "epoch": 2.0337390046993615, + "grad_norm": 0.12818634510040283, + "learning_rate": 2.8422265611134535e-06, + "loss": 0.2004, + "step": 8440 + }, + { + "epoch": 2.0361489336064587, + "grad_norm": 0.13415859639644623, + "learning_rate": 2.829588966898607e-06, + "loss": 0.1983, + "step": 8450 + }, + { + "epoch": 2.038558862513556, + "grad_norm": 0.13348065316677094, + "learning_rate": 2.8169684342040802e-06, + "loss": 0.1995, + "step": 8460 + }, + { + "epoch": 2.040968791420653, + "grad_norm": 0.13919700682163239, + "learning_rate": 2.8043650622394023e-06, + "loss": 0.1994, + "step": 8470 + }, + { + "epoch": 2.0433787203277505, + "grad_norm": 0.12803754210472107, + "learning_rate": 2.791778950079217e-06, + "loss": 0.1984, + "step": 8480 + }, + { + "epoch": 2.0457886492348476, + "grad_norm": 0.12549884617328644, + "learning_rate": 2.779210196662482e-06, + "loss": 0.1983, + "step": 8490 + }, + { + "epoch": 2.0481985781419447, + "grad_norm": 0.13034506142139435, + "learning_rate": 2.766658900791699e-06, + "loss": 0.2004, + "step": 8500 + }, + { + "epoch": 2.050608507049042, + "grad_norm": 0.12898895144462585, + "learning_rate": 2.7541251611321385e-06, + "loss": 0.2015, + "step": 8510 + }, + { + "epoch": 2.0530184359561394, + "grad_norm": 0.1386294662952423, + "learning_rate": 2.7416090762110603e-06, + "loss": 0.1988, + "step": 8520 + }, + { + "epoch": 2.0554283648632365, + "grad_norm": 0.15744347870349884, + "learning_rate": 2.729110744416943e-06, + "loss": 0.2002, + "step": 8530 + }, + { + "epoch": 2.0578382937703337, + "grad_norm": 0.1407720148563385, + "learning_rate": 2.716630263998706e-06, + "loss": 0.1982, + "step": 8540 + }, + { + "epoch": 2.060248222677431, + "grad_norm": 0.14954039454460144, + "learning_rate": 2.7041677330649408e-06, + "loss": 0.2026, + "step": 8550 + }, + { + "epoch": 2.0626581515845284, + "grad_norm": 0.1366991400718689, + "learning_rate": 2.6917232495831436e-06, + "loss": 0.1978, + "step": 8560 + }, + { + "epoch": 2.0650680804916255, + "grad_norm": 0.1298081874847412, + "learning_rate": 2.6792969113789285e-06, + "loss": 0.2013, + "step": 8570 + }, + { + "epoch": 2.0674780093987226, + "grad_norm": 0.13624030351638794, + "learning_rate": 2.666888816135285e-06, + "loss": 0.1967, + "step": 8580 + }, + { + "epoch": 2.06988793830582, + "grad_norm": 0.13076886534690857, + "learning_rate": 2.6544990613917803e-06, + "loss": 0.1984, + "step": 8590 + }, + { + "epoch": 2.0722978672129173, + "grad_norm": 0.1781616061925888, + "learning_rate": 2.642127744543823e-06, + "loss": 0.2006, + "step": 8600 + }, + { + "epoch": 2.0747077961200144, + "grad_norm": 0.1324646919965744, + "learning_rate": 2.6297749628418654e-06, + "loss": 0.198, + "step": 8610 + }, + { + "epoch": 2.0771177250271116, + "grad_norm": 0.1453695148229599, + "learning_rate": 2.617440813390674e-06, + "loss": 0.1996, + "step": 8620 + }, + { + "epoch": 2.079527653934209, + "grad_norm": 0.1317797154188156, + "learning_rate": 2.605125393148529e-06, + "loss": 0.1976, + "step": 8630 + }, + { + "epoch": 2.0819375828413063, + "grad_norm": 0.12967994809150696, + "learning_rate": 2.592828798926496e-06, + "loss": 0.1948, + "step": 8640 + }, + { + "epoch": 2.0843475117484034, + "grad_norm": 0.12639065086841583, + "learning_rate": 2.580551127387644e-06, + "loss": 0.1995, + "step": 8650 + }, + { + "epoch": 2.0867574406555005, + "grad_norm": 0.13363341987133026, + "learning_rate": 2.5682924750462907e-06, + "loss": 0.1984, + "step": 8660 + }, + { + "epoch": 2.089167369562598, + "grad_norm": 0.1281256228685379, + "learning_rate": 2.5560529382672462e-06, + "loss": 0.1974, + "step": 8670 + }, + { + "epoch": 2.091577298469695, + "grad_norm": 0.13677582144737244, + "learning_rate": 2.5438326132650524e-06, + "loss": 0.1997, + "step": 8680 + }, + { + "epoch": 2.0939872273767923, + "grad_norm": 0.13215544819831848, + "learning_rate": 2.531631596103231e-06, + "loss": 0.1982, + "step": 8690 + }, + { + "epoch": 2.0963971562838895, + "grad_norm": 0.1397799551486969, + "learning_rate": 2.5194499826935216e-06, + "loss": 0.1972, + "step": 8700 + }, + { + "epoch": 2.098807085190987, + "grad_norm": 0.12224389612674713, + "learning_rate": 2.507287868795133e-06, + "loss": 0.1984, + "step": 8710 + }, + { + "epoch": 2.101217014098084, + "grad_norm": 0.13411986827850342, + "learning_rate": 2.495145350013995e-06, + "loss": 0.1975, + "step": 8720 + }, + { + "epoch": 2.1036269430051813, + "grad_norm": 0.1322869509458542, + "learning_rate": 2.4830225218019878e-06, + "loss": 0.1998, + "step": 8730 + }, + { + "epoch": 2.1060368719122784, + "grad_norm": 0.12509003281593323, + "learning_rate": 2.4709194794562204e-06, + "loss": 0.1984, + "step": 8740 + }, + { + "epoch": 2.108446800819376, + "grad_norm": 0.12176263332366943, + "learning_rate": 2.45883631811825e-06, + "loss": 0.1982, + "step": 8750 + }, + { + "epoch": 2.110856729726473, + "grad_norm": 0.1365647315979004, + "learning_rate": 2.4467731327733665e-06, + "loss": 0.1968, + "step": 8760 + }, + { + "epoch": 2.11326665863357, + "grad_norm": 0.1353791207075119, + "learning_rate": 2.4347300182498116e-06, + "loss": 0.1991, + "step": 8770 + }, + { + "epoch": 2.1156765875406673, + "grad_norm": 0.1300830841064453, + "learning_rate": 2.422707069218068e-06, + "loss": 0.1993, + "step": 8780 + }, + { + "epoch": 2.118086516447765, + "grad_norm": 0.13076601922512054, + "learning_rate": 2.4107043801900863e-06, + "loss": 0.1977, + "step": 8790 + }, + { + "epoch": 2.120496445354862, + "grad_norm": 0.1522458791732788, + "learning_rate": 2.39872204551856e-06, + "loss": 0.1974, + "step": 8800 + }, + { + "epoch": 2.122906374261959, + "grad_norm": 0.13156403601169586, + "learning_rate": 2.3867601593961744e-06, + "loss": 0.2021, + "step": 8810 + }, + { + "epoch": 2.1253163031690567, + "grad_norm": 0.13702353835105896, + "learning_rate": 2.374818815854871e-06, + "loss": 0.1971, + "step": 8820 + }, + { + "epoch": 2.127726232076154, + "grad_norm": 0.13330979645252228, + "learning_rate": 2.3628981087651073e-06, + "loss": 0.1968, + "step": 8830 + }, + { + "epoch": 2.130136160983251, + "grad_norm": 0.14794397354125977, + "learning_rate": 2.350998131835117e-06, + "loss": 0.2004, + "step": 8840 + }, + { + "epoch": 2.132546089890348, + "grad_norm": 0.14478328824043274, + "learning_rate": 2.339118978610175e-06, + "loss": 0.2013, + "step": 8850 + }, + { + "epoch": 2.1349560187974457, + "grad_norm": 0.130048468708992, + "learning_rate": 2.3272607424718675e-06, + "loss": 0.197, + "step": 8860 + }, + { + "epoch": 2.137365947704543, + "grad_norm": 0.129298135638237, + "learning_rate": 2.315423516637339e-06, + "loss": 0.1994, + "step": 8870 + }, + { + "epoch": 2.13977587661164, + "grad_norm": 0.12726633250713348, + "learning_rate": 2.3036073941585898e-06, + "loss": 0.1967, + "step": 8880 + }, + { + "epoch": 2.142185805518737, + "grad_norm": 0.12661658227443695, + "learning_rate": 2.2918124679217106e-06, + "loss": 0.1944, + "step": 8890 + }, + { + "epoch": 2.1445957344258346, + "grad_norm": 0.1268956959247589, + "learning_rate": 2.2800388306461847e-06, + "loss": 0.1976, + "step": 8900 + }, + { + "epoch": 2.1470056633329317, + "grad_norm": 0.12949858605861664, + "learning_rate": 2.2682865748841293e-06, + "loss": 0.1973, + "step": 8910 + }, + { + "epoch": 2.149415592240029, + "grad_norm": 0.1312812715768814, + "learning_rate": 2.2565557930195963e-06, + "loss": 0.2005, + "step": 8920 + }, + { + "epoch": 2.151825521147126, + "grad_norm": 0.13287796080112457, + "learning_rate": 2.244846577267818e-06, + "loss": 0.1984, + "step": 8930 + }, + { + "epoch": 2.1542354500542236, + "grad_norm": 0.13826043903827667, + "learning_rate": 2.2331590196745094e-06, + "loss": 0.2, + "step": 8940 + }, + { + "epoch": 2.1566453789613207, + "grad_norm": 0.12798218429088593, + "learning_rate": 2.221493212115123e-06, + "loss": 0.2005, + "step": 8950 + }, + { + "epoch": 2.159055307868418, + "grad_norm": 0.17131908237934113, + "learning_rate": 2.209849246294138e-06, + "loss": 0.1991, + "step": 8960 + }, + { + "epoch": 2.161465236775515, + "grad_norm": 0.14354480803012848, + "learning_rate": 2.1982272137443356e-06, + "loss": 0.1972, + "step": 8970 + }, + { + "epoch": 2.1638751656826125, + "grad_norm": 0.13229574263095856, + "learning_rate": 2.186627205826082e-06, + "loss": 0.199, + "step": 8980 + }, + { + "epoch": 2.1662850945897096, + "grad_norm": 0.12601736187934875, + "learning_rate": 2.1750493137266064e-06, + "loss": 0.202, + "step": 8990 + }, + { + "epoch": 2.1686950234968068, + "grad_norm": 0.1347973644733429, + "learning_rate": 2.1634936284592882e-06, + "loss": 0.2004, + "step": 9000 + }, + { + "epoch": 2.171104952403904, + "grad_norm": 0.14534367620944977, + "learning_rate": 2.151960240862937e-06, + "loss": 0.2009, + "step": 9010 + }, + { + "epoch": 2.1735148813110015, + "grad_norm": 0.13880938291549683, + "learning_rate": 2.1404492416010885e-06, + "loss": 0.1972, + "step": 9020 + }, + { + "epoch": 2.1759248102180986, + "grad_norm": 0.12886486947536469, + "learning_rate": 2.128960721161273e-06, + "loss": 0.1998, + "step": 9030 + }, + { + "epoch": 2.1783347391251957, + "grad_norm": 0.13548776507377625, + "learning_rate": 2.1174947698543276e-06, + "loss": 0.1975, + "step": 9040 + }, + { + "epoch": 2.1807446680322933, + "grad_norm": 0.1285417228937149, + "learning_rate": 2.106051477813662e-06, + "loss": 0.2003, + "step": 9050 + }, + { + "epoch": 2.1831545969393904, + "grad_norm": 0.12515997886657715, + "learning_rate": 2.0946309349945764e-06, + "loss": 0.1992, + "step": 9060 + }, + { + "epoch": 2.1855645258464875, + "grad_norm": 0.13670197129249573, + "learning_rate": 2.0832332311735255e-06, + "loss": 0.1969, + "step": 9070 + }, + { + "epoch": 2.1879744547535847, + "grad_norm": 0.13152837753295898, + "learning_rate": 2.071858455947439e-06, + "loss": 0.2005, + "step": 9080 + }, + { + "epoch": 2.1903843836606818, + "grad_norm": 0.12892812490463257, + "learning_rate": 2.0605066987330015e-06, + "loss": 0.2029, + "step": 9090 + }, + { + "epoch": 2.1927943125677793, + "grad_norm": 0.125423401594162, + "learning_rate": 2.0491780487659518e-06, + "loss": 0.1995, + "step": 9100 + }, + { + "epoch": 2.1952042414748765, + "grad_norm": 0.13129930198192596, + "learning_rate": 2.0378725951003863e-06, + "loss": 0.201, + "step": 9110 + }, + { + "epoch": 2.1976141703819736, + "grad_norm": 0.12565447390079498, + "learning_rate": 2.0265904266080553e-06, + "loss": 0.1973, + "step": 9120 + }, + { + "epoch": 2.200024099289071, + "grad_norm": 0.11998391151428223, + "learning_rate": 2.0153316319776663e-06, + "loss": 0.1969, + "step": 9130 + }, + { + "epoch": 2.2024340281961683, + "grad_norm": 0.1322380006313324, + "learning_rate": 2.004096299714182e-06, + "loss": 0.1952, + "step": 9140 + }, + { + "epoch": 2.2048439571032654, + "grad_norm": 0.13540686666965485, + "learning_rate": 1.992884518138132e-06, + "loss": 0.1982, + "step": 9150 + }, + { + "epoch": 2.2072538860103625, + "grad_norm": 0.13474301993846893, + "learning_rate": 1.9816963753849173e-06, + "loss": 0.1974, + "step": 9160 + }, + { + "epoch": 2.20966381491746, + "grad_norm": 0.13281075656414032, + "learning_rate": 1.9705319594041055e-06, + "loss": 0.2006, + "step": 9170 + }, + { + "epoch": 2.2120737438245572, + "grad_norm": 0.13555601239204407, + "learning_rate": 1.959391357958761e-06, + "loss": 0.199, + "step": 9180 + }, + { + "epoch": 2.2144836727316544, + "grad_norm": 0.12954938411712646, + "learning_rate": 1.9482746586247307e-06, + "loss": 0.1992, + "step": 9190 + }, + { + "epoch": 2.2168936016387515, + "grad_norm": 0.1328035593032837, + "learning_rate": 1.937181948789979e-06, + "loss": 0.1993, + "step": 9200 + }, + { + "epoch": 2.219303530545849, + "grad_norm": 0.12565462291240692, + "learning_rate": 1.926113315653883e-06, + "loss": 0.1973, + "step": 9210 + }, + { + "epoch": 2.221713459452946, + "grad_norm": 0.123862624168396, + "learning_rate": 1.9150688462265567e-06, + "loss": 0.1967, + "step": 9220 + }, + { + "epoch": 2.2241233883600433, + "grad_norm": 0.13187842071056366, + "learning_rate": 1.9040486273281611e-06, + "loss": 0.1977, + "step": 9230 + }, + { + "epoch": 2.2265333172671404, + "grad_norm": 0.1303049623966217, + "learning_rate": 1.8930527455882285e-06, + "loss": 0.1994, + "step": 9240 + }, + { + "epoch": 2.228943246174238, + "grad_norm": 0.1302567571401596, + "learning_rate": 1.8820812874449745e-06, + "loss": 0.2006, + "step": 9250 + }, + { + "epoch": 2.231353175081335, + "grad_norm": 0.13377898931503296, + "learning_rate": 1.871134339144624e-06, + "loss": 0.1992, + "step": 9260 + }, + { + "epoch": 2.2337631039884323, + "grad_norm": 0.12116660177707672, + "learning_rate": 1.8602119867407293e-06, + "loss": 0.1993, + "step": 9270 + }, + { + "epoch": 2.2361730328955294, + "grad_norm": 0.11837529391050339, + "learning_rate": 1.8493143160934962e-06, + "loss": 0.1975, + "step": 9280 + }, + { + "epoch": 2.238582961802627, + "grad_norm": 0.1215796023607254, + "learning_rate": 1.838441412869108e-06, + "loss": 0.1987, + "step": 9290 + }, + { + "epoch": 2.240992890709724, + "grad_norm": 0.16470345854759216, + "learning_rate": 1.827593362539052e-06, + "loss": 0.1992, + "step": 9300 + }, + { + "epoch": 2.243402819616821, + "grad_norm": 0.12217085063457489, + "learning_rate": 1.816770250379446e-06, + "loss": 0.1968, + "step": 9310 + }, + { + "epoch": 2.2458127485239183, + "grad_norm": 0.1257430613040924, + "learning_rate": 1.8059721614703757e-06, + "loss": 0.1999, + "step": 9320 + }, + { + "epoch": 2.248222677431016, + "grad_norm": 0.12400317937135696, + "learning_rate": 1.7951991806952135e-06, + "loss": 0.1999, + "step": 9330 + }, + { + "epoch": 2.250632606338113, + "grad_norm": 0.1254672110080719, + "learning_rate": 1.7844513927399605e-06, + "loss": 0.1971, + "step": 9340 + }, + { + "epoch": 2.25304253524521, + "grad_norm": 0.12951840460300446, + "learning_rate": 1.7737288820925775e-06, + "loss": 0.1994, + "step": 9350 + }, + { + "epoch": 2.2554524641523077, + "grad_norm": 0.13221119344234467, + "learning_rate": 1.7630317330423213e-06, + "loss": 0.1973, + "step": 9360 + }, + { + "epoch": 2.257862393059405, + "grad_norm": 0.13148674368858337, + "learning_rate": 1.7523600296790827e-06, + "loss": 0.1982, + "step": 9370 + }, + { + "epoch": 2.260272321966502, + "grad_norm": 0.12214197218418121, + "learning_rate": 1.7417138558927244e-06, + "loss": 0.2, + "step": 9380 + }, + { + "epoch": 2.262682250873599, + "grad_norm": 0.13050705194473267, + "learning_rate": 1.731093295372422e-06, + "loss": 0.198, + "step": 9390 + }, + { + "epoch": 2.2650921797806967, + "grad_norm": 0.12322299927473068, + "learning_rate": 1.7204984316060063e-06, + "loss": 0.1964, + "step": 9400 + }, + { + "epoch": 2.267502108687794, + "grad_norm": 0.1294129341840744, + "learning_rate": 1.7099293478793066e-06, + "loss": 0.2003, + "step": 9410 + }, + { + "epoch": 2.269912037594891, + "grad_norm": 0.13078904151916504, + "learning_rate": 1.699386127275497e-06, + "loss": 0.1996, + "step": 9420 + }, + { + "epoch": 2.272321966501988, + "grad_norm": 0.14058412611484528, + "learning_rate": 1.6888688526744419e-06, + "loss": 0.1992, + "step": 9430 + }, + { + "epoch": 2.2747318954090856, + "grad_norm": 0.12963654100894928, + "learning_rate": 1.6783776067520435e-06, + "loss": 0.1979, + "step": 9440 + }, + { + "epoch": 2.2771418243161827, + "grad_norm": 0.13461080193519592, + "learning_rate": 1.667912471979599e-06, + "loss": 0.1988, + "step": 9450 + }, + { + "epoch": 2.27955175322328, + "grad_norm": 0.1404641717672348, + "learning_rate": 1.6574735306231415e-06, + "loss": 0.1971, + "step": 9460 + }, + { + "epoch": 2.281961682130377, + "grad_norm": 0.13926105201244354, + "learning_rate": 1.6470608647427994e-06, + "loss": 0.2002, + "step": 9470 + }, + { + "epoch": 2.2843716110374745, + "grad_norm": 0.1367444097995758, + "learning_rate": 1.6366745561921526e-06, + "loss": 0.2011, + "step": 9480 + }, + { + "epoch": 2.2867815399445717, + "grad_norm": 0.13203634321689606, + "learning_rate": 1.626314686617586e-06, + "loss": 0.2018, + "step": 9490 + }, + { + "epoch": 2.289191468851669, + "grad_norm": 0.12602578103542328, + "learning_rate": 1.6159813374576473e-06, + "loss": 0.1984, + "step": 9500 + }, + { + "epoch": 2.291601397758766, + "grad_norm": 0.12635314464569092, + "learning_rate": 1.605674589942411e-06, + "loss": 0.1975, + "step": 9510 + }, + { + "epoch": 2.2940113266658635, + "grad_norm": 0.12311098724603653, + "learning_rate": 1.5953945250928337e-06, + "loss": 0.1997, + "step": 9520 + }, + { + "epoch": 2.2964212555729606, + "grad_norm": 0.120974600315094, + "learning_rate": 1.5851412237201241e-06, + "loss": 0.2014, + "step": 9530 + }, + { + "epoch": 2.2988311844800577, + "grad_norm": 0.13310423493385315, + "learning_rate": 1.5749147664251008e-06, + "loss": 0.197, + "step": 9540 + }, + { + "epoch": 2.301241113387155, + "grad_norm": 0.12243101745843887, + "learning_rate": 1.5647152335975675e-06, + "loss": 0.2, + "step": 9550 + }, + { + "epoch": 2.3036510422942524, + "grad_norm": 0.1300317943096161, + "learning_rate": 1.5545427054156659e-06, + "loss": 0.1967, + "step": 9560 + }, + { + "epoch": 2.3060609712013496, + "grad_norm": 0.12345019727945328, + "learning_rate": 1.5443972618452685e-06, + "loss": 0.1955, + "step": 9570 + }, + { + "epoch": 2.3084709001084467, + "grad_norm": 0.126836359500885, + "learning_rate": 1.5342789826393223e-06, + "loss": 0.1988, + "step": 9580 + }, + { + "epoch": 2.3108808290155443, + "grad_norm": 0.11867355555295944, + "learning_rate": 1.5241879473372501e-06, + "loss": 0.1973, + "step": 9590 + }, + { + "epoch": 2.3132907579226414, + "grad_norm": 0.1371241956949234, + "learning_rate": 1.5141242352642975e-06, + "loss": 0.1969, + "step": 9600 + }, + { + "epoch": 2.3157006868297385, + "grad_norm": 0.11983615905046463, + "learning_rate": 1.5040879255309366e-06, + "loss": 0.1963, + "step": 9610 + }, + { + "epoch": 2.3181106157368356, + "grad_norm": 0.12212738394737244, + "learning_rate": 1.4940790970322217e-06, + "loss": 0.1976, + "step": 9620 + }, + { + "epoch": 2.3205205446439328, + "grad_norm": 0.12494116276502609, + "learning_rate": 1.4840978284471818e-06, + "loss": 0.1955, + "step": 9630 + }, + { + "epoch": 2.3229304735510303, + "grad_norm": 0.12898215651512146, + "learning_rate": 1.4741441982381965e-06, + "loss": 0.1988, + "step": 9640 + }, + { + "epoch": 2.3253404024581275, + "grad_norm": 0.13270017504692078, + "learning_rate": 1.4642182846503834e-06, + "loss": 0.1974, + "step": 9650 + }, + { + "epoch": 2.3277503313652246, + "grad_norm": 0.1331421136856079, + "learning_rate": 1.454320165710979e-06, + "loss": 0.2005, + "step": 9660 + }, + { + "epoch": 2.330160260272322, + "grad_norm": 0.12074144184589386, + "learning_rate": 1.4444499192287275e-06, + "loss": 0.1974, + "step": 9670 + }, + { + "epoch": 2.3325701891794193, + "grad_norm": 0.11966565996408463, + "learning_rate": 1.434607622793268e-06, + "loss": 0.2004, + "step": 9680 + }, + { + "epoch": 2.3349801180865164, + "grad_norm": 0.12328942865133286, + "learning_rate": 1.4247933537745312e-06, + "loss": 0.1984, + "step": 9690 + }, + { + "epoch": 2.3373900469936135, + "grad_norm": 0.12541703879833221, + "learning_rate": 1.4150071893221134e-06, + "loss": 0.1984, + "step": 9700 + }, + { + "epoch": 2.339799975900711, + "grad_norm": 0.12745068967342377, + "learning_rate": 1.4052492063646954e-06, + "loss": 0.1974, + "step": 9710 + }, + { + "epoch": 2.342209904807808, + "grad_norm": 0.12669983506202698, + "learning_rate": 1.395519481609412e-06, + "loss": 0.1968, + "step": 9720 + }, + { + "epoch": 2.3446198337149053, + "grad_norm": 0.12797443568706512, + "learning_rate": 1.3858180915412733e-06, + "loss": 0.1965, + "step": 9730 + }, + { + "epoch": 2.3470297626220025, + "grad_norm": 0.13788028061389923, + "learning_rate": 1.376145112422539e-06, + "loss": 0.2012, + "step": 9740 + }, + { + "epoch": 2.3494396915291, + "grad_norm": 0.14654968678951263, + "learning_rate": 1.3665006202921422e-06, + "loss": 0.1984, + "step": 9750 + }, + { + "epoch": 2.351849620436197, + "grad_norm": 0.13153453171253204, + "learning_rate": 1.3568846909650757e-06, + "loss": 0.1972, + "step": 9760 + }, + { + "epoch": 2.3542595493432943, + "grad_norm": 0.1279347985982895, + "learning_rate": 1.347297400031801e-06, + "loss": 0.1986, + "step": 9770 + }, + { + "epoch": 2.3566694782503914, + "grad_norm": 0.1343514323234558, + "learning_rate": 1.337738822857656e-06, + "loss": 0.197, + "step": 9780 + }, + { + "epoch": 2.359079407157489, + "grad_norm": 0.1319359540939331, + "learning_rate": 1.3282090345822591e-06, + "loss": 0.1999, + "step": 9790 + }, + { + "epoch": 2.361489336064586, + "grad_norm": 0.12443242222070694, + "learning_rate": 1.3187081101189215e-06, + "loss": 0.1973, + "step": 9800 + }, + { + "epoch": 2.3638992649716832, + "grad_norm": 0.1242903470993042, + "learning_rate": 1.309236124154057e-06, + "loss": 0.202, + "step": 9810 + }, + { + "epoch": 2.366309193878781, + "grad_norm": 0.12554222345352173, + "learning_rate": 1.299793151146594e-06, + "loss": 0.1966, + "step": 9820 + }, + { + "epoch": 2.368719122785878, + "grad_norm": 0.12424005568027496, + "learning_rate": 1.2903792653273916e-06, + "loss": 0.1985, + "step": 9830 + }, + { + "epoch": 2.371129051692975, + "grad_norm": 0.13209636509418488, + "learning_rate": 1.2809945406986546e-06, + "loss": 0.1994, + "step": 9840 + }, + { + "epoch": 2.373538980600072, + "grad_norm": 0.13022255897521973, + "learning_rate": 1.2716390510333586e-06, + "loss": 0.1999, + "step": 9850 + }, + { + "epoch": 2.3759489095071693, + "grad_norm": 0.12892918288707733, + "learning_rate": 1.2623128698746527e-06, + "loss": 0.2001, + "step": 9860 + }, + { + "epoch": 2.378358838414267, + "grad_norm": 0.1219906210899353, + "learning_rate": 1.2530160705353068e-06, + "loss": 0.1954, + "step": 9870 + }, + { + "epoch": 2.380768767321364, + "grad_norm": 0.12953771650791168, + "learning_rate": 1.243748726097107e-06, + "loss": 0.1954, + "step": 9880 + }, + { + "epoch": 2.383178696228461, + "grad_norm": 0.12258866429328918, + "learning_rate": 1.2345109094103102e-06, + "loss": 0.1969, + "step": 9890 + }, + { + "epoch": 2.3855886251355587, + "grad_norm": 0.1489989310503006, + "learning_rate": 1.2253026930930422e-06, + "loss": 0.1998, + "step": 9900 + }, + { + "epoch": 2.387998554042656, + "grad_norm": 0.12289855629205704, + "learning_rate": 1.2161241495307546e-06, + "loss": 0.2003, + "step": 9910 + }, + { + "epoch": 2.390408482949753, + "grad_norm": 0.12116892635822296, + "learning_rate": 1.2069753508756332e-06, + "loss": 0.2007, + "step": 9920 + }, + { + "epoch": 2.39281841185685, + "grad_norm": 0.12422724068164825, + "learning_rate": 1.1978563690460454e-06, + "loss": 0.1979, + "step": 9930 + }, + { + "epoch": 2.3952283407639476, + "grad_norm": 0.12689027190208435, + "learning_rate": 1.188767275725966e-06, + "loss": 0.1976, + "step": 9940 + }, + { + "epoch": 2.3976382696710448, + "grad_norm": 0.13472682237625122, + "learning_rate": 1.1797081423644207e-06, + "loss": 0.1992, + "step": 9950 + }, + { + "epoch": 2.400048198578142, + "grad_norm": 0.12316633760929108, + "learning_rate": 1.1706790401749191e-06, + "loss": 0.1964, + "step": 9960 + }, + { + "epoch": 2.402458127485239, + "grad_norm": 0.12504875659942627, + "learning_rate": 1.161680040134897e-06, + "loss": 0.1996, + "step": 9970 + }, + { + "epoch": 2.4048680563923366, + "grad_norm": 0.137268528342247, + "learning_rate": 1.152711212985157e-06, + "loss": 0.2009, + "step": 9980 + }, + { + "epoch": 2.4072779852994337, + "grad_norm": 0.1279861330986023, + "learning_rate": 1.1437726292293205e-06, + "loss": 0.2005, + "step": 9990 + }, + { + "epoch": 2.409687914206531, + "grad_norm": 0.12599658966064453, + "learning_rate": 1.1348643591332554e-06, + "loss": 0.1983, + "step": 10000 + }, + { + "epoch": 2.412097843113628, + "grad_norm": 0.12882481515407562, + "learning_rate": 1.1259864727245462e-06, + "loss": 0.2012, + "step": 10010 + }, + { + "epoch": 2.4145077720207255, + "grad_norm": 0.12207888811826706, + "learning_rate": 1.117139039791923e-06, + "loss": 0.2005, + "step": 10020 + }, + { + "epoch": 2.4169177009278227, + "grad_norm": 0.1228935495018959, + "learning_rate": 1.1083221298847318e-06, + "loss": 0.1992, + "step": 10030 + }, + { + "epoch": 2.41932762983492, + "grad_norm": 0.12841002643108368, + "learning_rate": 1.0995358123123672e-06, + "loss": 0.1982, + "step": 10040 + }, + { + "epoch": 2.4217375587420173, + "grad_norm": 0.12162084132432938, + "learning_rate": 1.0907801561437487e-06, + "loss": 0.1978, + "step": 10050 + }, + { + "epoch": 2.4241474876491145, + "grad_norm": 0.12312573939561844, + "learning_rate": 1.0820552302067626e-06, + "loss": 0.2002, + "step": 10060 + }, + { + "epoch": 2.4265574165562116, + "grad_norm": 0.1217411682009697, + "learning_rate": 1.0733611030877261e-06, + "loss": 0.1965, + "step": 10070 + }, + { + "epoch": 2.4289673454633087, + "grad_norm": 0.12311185896396637, + "learning_rate": 1.064697843130849e-06, + "loss": 0.2001, + "step": 10080 + }, + { + "epoch": 2.431377274370406, + "grad_norm": 0.12260051816701889, + "learning_rate": 1.0560655184376956e-06, + "loss": 0.1971, + "step": 10090 + }, + { + "epoch": 2.4337872032775034, + "grad_norm": 0.11911850422620773, + "learning_rate": 1.0474641968666482e-06, + "loss": 0.1968, + "step": 10100 + }, + { + "epoch": 2.4361971321846005, + "grad_norm": 0.1274978220462799, + "learning_rate": 1.0388939460323761e-06, + "loss": 0.2007, + "step": 10110 + }, + { + "epoch": 2.4386070610916977, + "grad_norm": 0.12279874086380005, + "learning_rate": 1.030354833305301e-06, + "loss": 0.2007, + "step": 10120 + }, + { + "epoch": 2.4410169899987952, + "grad_norm": 0.11948572844266891, + "learning_rate": 1.0218469258110713e-06, + "loss": 0.2028, + "step": 10130 + }, + { + "epoch": 2.4434269189058924, + "grad_norm": 0.11698044836521149, + "learning_rate": 1.013370290430029e-06, + "loss": 0.1985, + "step": 10140 + }, + { + "epoch": 2.4458368478129895, + "grad_norm": 0.1203853189945221, + "learning_rate": 1.0049249937966938e-06, + "loss": 0.2024, + "step": 10150 + }, + { + "epoch": 2.4482467767200866, + "grad_norm": 0.1191658154129982, + "learning_rate": 9.96511102299222e-07, + "loss": 0.198, + "step": 10160 + }, + { + "epoch": 2.450656705627184, + "grad_norm": 0.13178017735481262, + "learning_rate": 9.881286820789066e-07, + "loss": 0.1965, + "step": 10170 + }, + { + "epoch": 2.4530666345342813, + "grad_norm": 0.12409935891628265, + "learning_rate": 9.797777990296364e-07, + "loss": 0.198, + "step": 10180 + }, + { + "epoch": 2.4554765634413784, + "grad_norm": 0.12384223937988281, + "learning_rate": 9.714585187973962e-07, + "loss": 0.1996, + "step": 10190 + }, + { + "epoch": 2.4578864923484756, + "grad_norm": 0.12161009758710861, + "learning_rate": 9.631709067797346e-07, + "loss": 0.1979, + "step": 10200 + }, + { + "epoch": 2.460296421255573, + "grad_norm": 0.1231224313378334, + "learning_rate": 9.549150281252633e-07, + "loss": 0.196, + "step": 10210 + }, + { + "epoch": 2.4627063501626703, + "grad_norm": 0.12427617609500885, + "learning_rate": 9.466909477331365e-07, + "loss": 0.1953, + "step": 10220 + }, + { + "epoch": 2.4651162790697674, + "grad_norm": 0.12145771086215973, + "learning_rate": 9.384987302525439e-07, + "loss": 0.1972, + "step": 10230 + }, + { + "epoch": 2.4675262079768645, + "grad_norm": 0.15090271830558777, + "learning_rate": 9.303384400822019e-07, + "loss": 0.1972, + "step": 10240 + }, + { + "epoch": 2.469936136883962, + "grad_norm": 0.13022595643997192, + "learning_rate": 9.222101413698475e-07, + "loss": 0.1994, + "step": 10250 + }, + { + "epoch": 2.472346065791059, + "grad_norm": 0.11662918329238892, + "learning_rate": 9.141138980117348e-07, + "loss": 0.1963, + "step": 10260 + }, + { + "epoch": 2.4747559946981563, + "grad_norm": 0.1264001727104187, + "learning_rate": 9.060497736521312e-07, + "loss": 0.1996, + "step": 10270 + }, + { + "epoch": 2.477165923605254, + "grad_norm": 0.12486141920089722, + "learning_rate": 8.980178316828158e-07, + "loss": 0.1961, + "step": 10280 + }, + { + "epoch": 2.479575852512351, + "grad_norm": 0.12094808369874954, + "learning_rate": 8.900181352425907e-07, + "loss": 0.1987, + "step": 10290 + }, + { + "epoch": 2.481985781419448, + "grad_norm": 0.11713796108961105, + "learning_rate": 8.82050747216766e-07, + "loss": 0.1967, + "step": 10300 + }, + { + "epoch": 2.4843957103265453, + "grad_norm": 0.11617062240839005, + "learning_rate": 8.741157302366859e-07, + "loss": 0.1968, + "step": 10310 + }, + { + "epoch": 2.4868056392336424, + "grad_norm": 0.14669108390808105, + "learning_rate": 8.662131466792217e-07, + "loss": 0.1975, + "step": 10320 + }, + { + "epoch": 2.48921556814074, + "grad_norm": 0.11834236234426498, + "learning_rate": 8.5834305866629e-07, + "loss": 0.1996, + "step": 10330 + }, + { + "epoch": 2.491625497047837, + "grad_norm": 0.11807161569595337, + "learning_rate": 8.505055280643582e-07, + "loss": 0.1959, + "step": 10340 + }, + { + "epoch": 2.494035425954934, + "grad_norm": 0.12231793254613876, + "learning_rate": 8.42700616483963e-07, + "loss": 0.1976, + "step": 10350 + }, + { + "epoch": 2.496445354862032, + "grad_norm": 0.12237108498811722, + "learning_rate": 8.34928385279224e-07, + "loss": 0.1982, + "step": 10360 + }, + { + "epoch": 2.498855283769129, + "grad_norm": 0.11896923184394836, + "learning_rate": 8.271888955473606e-07, + "loss": 0.1985, + "step": 10370 + }, + { + "epoch": 2.501265212676226, + "grad_norm": 0.1200416311621666, + "learning_rate": 8.194822081282144e-07, + "loss": 0.1995, + "step": 10380 + }, + { + "epoch": 2.503675141583323, + "grad_norm": 0.1195942834019661, + "learning_rate": 8.118083836037677e-07, + "loss": 0.198, + "step": 10390 + }, + { + "epoch": 2.5060850704904203, + "grad_norm": 0.12644240260124207, + "learning_rate": 8.041674822976686e-07, + "loss": 0.1988, + "step": 10400 + }, + { + "epoch": 2.508494999397518, + "grad_norm": 0.13783343136310577, + "learning_rate": 7.965595642747593e-07, + "loss": 0.2012, + "step": 10410 + }, + { + "epoch": 2.510904928304615, + "grad_norm": 0.11760956048965454, + "learning_rate": 7.889846893405978e-07, + "loss": 0.1968, + "step": 10420 + }, + { + "epoch": 2.513314857211712, + "grad_norm": 0.11873472481966019, + "learning_rate": 7.814429170409965e-07, + "loss": 0.1992, + "step": 10430 + }, + { + "epoch": 2.5157247861188097, + "grad_norm": 0.12854446470737457, + "learning_rate": 7.739343066615457e-07, + "loss": 0.1968, + "step": 10440 + }, + { + "epoch": 2.518134715025907, + "grad_norm": 0.12183453142642975, + "learning_rate": 7.664589172271519e-07, + "loss": 0.1978, + "step": 10450 + }, + { + "epoch": 2.520544643933004, + "grad_norm": 0.12129750847816467, + "learning_rate": 7.590168075015725e-07, + "loss": 0.1967, + "step": 10460 + }, + { + "epoch": 2.522954572840101, + "grad_norm": 0.12526491284370422, + "learning_rate": 7.51608035986956e-07, + "loss": 0.1966, + "step": 10470 + }, + { + "epoch": 2.5253645017471986, + "grad_norm": 0.12419699132442474, + "learning_rate": 7.442326609233786e-07, + "loss": 0.1978, + "step": 10480 + }, + { + "epoch": 2.5277744306542957, + "grad_norm": 0.11712777614593506, + "learning_rate": 7.368907402883896e-07, + "loss": 0.1982, + "step": 10490 + }, + { + "epoch": 2.530184359561393, + "grad_norm": 0.11896039545536041, + "learning_rate": 7.295823317965533e-07, + "loss": 0.2023, + "step": 10500 + }, + { + "epoch": 2.5325942884684904, + "grad_norm": 0.12150632590055466, + "learning_rate": 7.223074928989971e-07, + "loss": 0.1982, + "step": 10510 + }, + { + "epoch": 2.5350042173755876, + "grad_norm": 0.11829578876495361, + "learning_rate": 7.150662807829584e-07, + "loss": 0.1989, + "step": 10520 + }, + { + "epoch": 2.5374141462826847, + "grad_norm": 0.12783962488174438, + "learning_rate": 7.07858752371336e-07, + "loss": 0.1988, + "step": 10530 + }, + { + "epoch": 2.539824075189782, + "grad_norm": 0.11983250826597214, + "learning_rate": 7.006849643222425e-07, + "loss": 0.1954, + "step": 10540 + }, + { + "epoch": 2.542234004096879, + "grad_norm": 0.12253327667713165, + "learning_rate": 6.935449730285576e-07, + "loss": 0.1995, + "step": 10550 + }, + { + "epoch": 2.5446439330039765, + "grad_norm": 0.12103965878486633, + "learning_rate": 6.864388346174899e-07, + "loss": 0.1964, + "step": 10560 + }, + { + "epoch": 2.5470538619110736, + "grad_norm": 0.12630999088287354, + "learning_rate": 6.793666049501252e-07, + "loss": 0.1984, + "step": 10570 + }, + { + "epoch": 2.5494637908181708, + "grad_norm": 0.11937756091356277, + "learning_rate": 6.723283396210006e-07, + "loss": 0.199, + "step": 10580 + }, + { + "epoch": 2.5518737197252683, + "grad_norm": 0.12514546513557434, + "learning_rate": 6.65324093957656e-07, + "loss": 0.1992, + "step": 10590 + }, + { + "epoch": 2.5542836486323655, + "grad_norm": 0.1185721680521965, + "learning_rate": 6.583539230202062e-07, + "loss": 0.1988, + "step": 10600 + }, + { + "epoch": 2.5566935775394626, + "grad_norm": 0.15600068867206573, + "learning_rate": 6.514178816009059e-07, + "loss": 0.1981, + "step": 10610 + }, + { + "epoch": 2.5591035064465597, + "grad_norm": 0.11238270252943039, + "learning_rate": 6.445160242237181e-07, + "loss": 0.1963, + "step": 10620 + }, + { + "epoch": 2.561513435353657, + "grad_norm": 0.1281278282403946, + "learning_rate": 6.376484051438864e-07, + "loss": 0.1964, + "step": 10630 + }, + { + "epoch": 2.5639233642607544, + "grad_norm": 0.12617678940296173, + "learning_rate": 6.308150783475086e-07, + "loss": 0.2001, + "step": 10640 + }, + { + "epoch": 2.5663332931678515, + "grad_norm": 0.13613717257976532, + "learning_rate": 6.240160975511117e-07, + "loss": 0.197, + "step": 10650 + }, + { + "epoch": 2.5687432220749487, + "grad_norm": 0.1211361438035965, + "learning_rate": 6.172515162012332e-07, + "loss": 0.1978, + "step": 10660 + }, + { + "epoch": 2.571153150982046, + "grad_norm": 0.11443440616130829, + "learning_rate": 6.105213874739913e-07, + "loss": 0.201, + "step": 10670 + }, + { + "epoch": 2.5735630798891433, + "grad_norm": 0.1274467259645462, + "learning_rate": 6.038257642746815e-07, + "loss": 0.198, + "step": 10680 + }, + { + "epoch": 2.5759730087962405, + "grad_norm": 0.11638859659433365, + "learning_rate": 5.971646992373442e-07, + "loss": 0.2009, + "step": 10690 + }, + { + "epoch": 2.5783829377033376, + "grad_norm": 0.12000516802072525, + "learning_rate": 5.905382447243679e-07, + "loss": 0.1972, + "step": 10700 + }, + { + "epoch": 2.5807928666104347, + "grad_norm": 0.11631269007921219, + "learning_rate": 5.839464528260602e-07, + "loss": 0.1953, + "step": 10710 + }, + { + "epoch": 2.5832027955175323, + "grad_norm": 0.12172136455774307, + "learning_rate": 5.773893753602556e-07, + "loss": 0.1983, + "step": 10720 + }, + { + "epoch": 2.5856127244246294, + "grad_norm": 0.12186679244041443, + "learning_rate": 5.708670638718944e-07, + "loss": 0.1975, + "step": 10730 + }, + { + "epoch": 2.588022653331727, + "grad_norm": 0.11384265124797821, + "learning_rate": 5.643795696326248e-07, + "loss": 0.1989, + "step": 10740 + }, + { + "epoch": 2.590432582238824, + "grad_norm": 0.12156783789396286, + "learning_rate": 5.579269436403967e-07, + "loss": 0.2025, + "step": 10750 + }, + { + "epoch": 2.5928425111459212, + "grad_norm": 0.11662750691175461, + "learning_rate": 5.515092366190633e-07, + "loss": 0.1983, + "step": 10760 + }, + { + "epoch": 2.5952524400530184, + "grad_norm": 0.1321050077676773, + "learning_rate": 5.451264990179806e-07, + "loss": 0.1995, + "step": 10770 + }, + { + "epoch": 2.5976623689601155, + "grad_norm": 0.11678937822580338, + "learning_rate": 5.387787810116107e-07, + "loss": 0.1971, + "step": 10780 + }, + { + "epoch": 2.600072297867213, + "grad_norm": 0.12364859879016876, + "learning_rate": 5.324661324991287e-07, + "loss": 0.1974, + "step": 10790 + }, + { + "epoch": 2.60248222677431, + "grad_norm": 0.11932826787233353, + "learning_rate": 5.261886031040297e-07, + "loss": 0.1958, + "step": 10800 + }, + { + "epoch": 2.6048921556814073, + "grad_norm": 0.11405885964632034, + "learning_rate": 5.199462421737378e-07, + "loss": 0.1979, + "step": 10810 + }, + { + "epoch": 2.607302084588505, + "grad_norm": 0.11789312213659286, + "learning_rate": 5.137390987792224e-07, + "loss": 0.1979, + "step": 10820 + }, + { + "epoch": 2.609712013495602, + "grad_norm": 0.11750980466604233, + "learning_rate": 5.075672217146021e-07, + "loss": 0.1968, + "step": 10830 + }, + { + "epoch": 2.612121942402699, + "grad_norm": 0.1324010044336319, + "learning_rate": 5.014306594967777e-07, + "loss": 0.1968, + "step": 10840 + }, + { + "epoch": 2.6145318713097963, + "grad_norm": 0.12013759464025497, + "learning_rate": 4.953294603650321e-07, + "loss": 0.1967, + "step": 10850 + }, + { + "epoch": 2.6169418002168934, + "grad_norm": 0.11585763096809387, + "learning_rate": 4.892636722806681e-07, + "loss": 0.199, + "step": 10860 + }, + { + "epoch": 2.619351729123991, + "grad_norm": 0.11899024248123169, + "learning_rate": 4.832333429266162e-07, + "loss": 0.1985, + "step": 10870 + }, + { + "epoch": 2.621761658031088, + "grad_norm": 0.12400858104228973, + "learning_rate": 4.772385197070734e-07, + "loss": 0.2, + "step": 10880 + }, + { + "epoch": 2.624171586938185, + "grad_norm": 0.12234139442443848, + "learning_rate": 4.712792497471219e-07, + "loss": 0.1991, + "step": 10890 + }, + { + "epoch": 2.6265815158452828, + "grad_norm": 0.11881452798843384, + "learning_rate": 4.653555798923598e-07, + "loss": 0.1969, + "step": 10900 + }, + { + "epoch": 2.62899144475238, + "grad_norm": 0.11506690829992294, + "learning_rate": 4.59467556708536e-07, + "loss": 0.1974, + "step": 10910 + }, + { + "epoch": 2.631401373659477, + "grad_norm": 0.12822450697422028, + "learning_rate": 4.5361522648118163e-07, + "loss": 0.1993, + "step": 10920 + }, + { + "epoch": 2.633811302566574, + "grad_norm": 0.12246552854776382, + "learning_rate": 4.477986352152458e-07, + "loss": 0.1993, + "step": 10930 + }, + { + "epoch": 2.6362212314736713, + "grad_norm": 0.12038519978523254, + "learning_rate": 4.420178286347365e-07, + "loss": 0.1966, + "step": 10940 + }, + { + "epoch": 2.638631160380769, + "grad_norm": 0.12032466381788254, + "learning_rate": 4.3627285218235836e-07, + "loss": 0.1977, + "step": 10950 + }, + { + "epoch": 2.641041089287866, + "grad_norm": 0.12104138731956482, + "learning_rate": 4.305637510191596e-07, + "loss": 0.2005, + "step": 10960 + }, + { + "epoch": 2.643451018194963, + "grad_norm": 0.11752533912658691, + "learning_rate": 4.248905700241679e-07, + "loss": 0.1996, + "step": 10970 + }, + { + "epoch": 2.6458609471020607, + "grad_norm": 0.12105824798345566, + "learning_rate": 4.192533537940524e-07, + "loss": 0.1982, + "step": 10980 + }, + { + "epoch": 2.648270876009158, + "grad_norm": 0.11677469313144684, + "learning_rate": 4.1365214664275624e-07, + "loss": 0.1994, + "step": 10990 + }, + { + "epoch": 2.650680804916255, + "grad_norm": 0.1191301941871643, + "learning_rate": 4.0808699260116267e-07, + "loss": 0.1999, + "step": 11000 + }, + { + "epoch": 2.653090733823352, + "grad_norm": 0.11744995415210724, + "learning_rate": 4.025579354167386e-07, + "loss": 0.1979, + "step": 11010 + }, + { + "epoch": 2.6555006627304496, + "grad_norm": 0.11751815676689148, + "learning_rate": 3.9706501855319767e-07, + "loss": 0.1977, + "step": 11020 + }, + { + "epoch": 2.6579105916375467, + "grad_norm": 0.11772134900093079, + "learning_rate": 3.9160828519015537e-07, + "loss": 0.1966, + "step": 11030 + }, + { + "epoch": 2.660320520544644, + "grad_norm": 0.12891718745231628, + "learning_rate": 3.8618777822278854e-07, + "loss": 0.2011, + "step": 11040 + }, + { + "epoch": 2.6627304494517414, + "grad_norm": 0.11987189948558807, + "learning_rate": 3.8080354026150067e-07, + "loss": 0.1969, + "step": 11050 + }, + { + "epoch": 2.6651403783588385, + "grad_norm": 0.12312425673007965, + "learning_rate": 3.754556136315862e-07, + "loss": 0.1974, + "step": 11060 + }, + { + "epoch": 2.6675503072659357, + "grad_norm": 0.11463374644517899, + "learning_rate": 3.701440403728973e-07, + "loss": 0.1966, + "step": 11070 + }, + { + "epoch": 2.669960236173033, + "grad_norm": 0.11283261328935623, + "learning_rate": 3.6486886223951356e-07, + "loss": 0.1964, + "step": 11080 + }, + { + "epoch": 2.67237016508013, + "grad_norm": 0.11768094450235367, + "learning_rate": 3.596301206994135e-07, + "loss": 0.1954, + "step": 11090 + }, + { + "epoch": 2.6747800939872275, + "grad_norm": 0.1149783730506897, + "learning_rate": 3.5442785693414916e-07, + "loss": 0.1971, + "step": 11100 + }, + { + "epoch": 2.6771900228943246, + "grad_norm": 0.12526769936084747, + "learning_rate": 3.4926211183852257e-07, + "loss": 0.1952, + "step": 11110 + }, + { + "epoch": 2.6795999518014217, + "grad_norm": 0.12192758172750473, + "learning_rate": 3.441329260202647e-07, + "loss": 0.1952, + "step": 11120 + }, + { + "epoch": 2.6820098807085193, + "grad_norm": 0.11524799466133118, + "learning_rate": 3.390403397997116e-07, + "loss": 0.1994, + "step": 11130 + }, + { + "epoch": 2.6844198096156164, + "grad_norm": 0.11889517307281494, + "learning_rate": 3.339843932094977e-07, + "loss": 0.1972, + "step": 11140 + }, + { + "epoch": 2.6868297385227136, + "grad_norm": 0.11067784577608109, + "learning_rate": 3.289651259942267e-07, + "loss": 0.1944, + "step": 11150 + }, + { + "epoch": 2.6892396674298107, + "grad_norm": 0.12659992277622223, + "learning_rate": 3.2398257761017516e-07, + "loss": 0.1946, + "step": 11160 + }, + { + "epoch": 2.691649596336908, + "grad_norm": 0.11584974825382233, + "learning_rate": 3.190367872249672e-07, + "loss": 0.1983, + "step": 11170 + }, + { + "epoch": 2.6940595252440054, + "grad_norm": 0.12076602131128311, + "learning_rate": 3.1412779371727873e-07, + "loss": 0.195, + "step": 11180 + }, + { + "epoch": 2.6964694541511025, + "grad_norm": 0.12034130841493607, + "learning_rate": 3.0925563567652474e-07, + "loss": 0.1986, + "step": 11190 + }, + { + "epoch": 2.6988793830581996, + "grad_norm": 0.11660711467266083, + "learning_rate": 3.044203514025579e-07, + "loss": 0.1965, + "step": 11200 + }, + { + "epoch": 2.701289311965297, + "grad_norm": 0.12302997708320618, + "learning_rate": 2.996219789053678e-07, + "loss": 0.1977, + "step": 11210 + }, + { + "epoch": 2.7036992408723943, + "grad_norm": 0.11597088724374771, + "learning_rate": 2.948605559047818e-07, + "loss": 0.1991, + "step": 11220 + }, + { + "epoch": 2.7061091697794915, + "grad_norm": 0.11687976866960526, + "learning_rate": 2.9013611983016887e-07, + "loss": 0.1979, + "step": 11230 + }, + { + "epoch": 2.7085190986865886, + "grad_norm": 0.12074779719114304, + "learning_rate": 2.8544870782014566e-07, + "loss": 0.1994, + "step": 11240 + }, + { + "epoch": 2.710929027593686, + "grad_norm": 0.1221262738108635, + "learning_rate": 2.807983567222822e-07, + "loss": 0.1972, + "step": 11250 + }, + { + "epoch": 2.7133389565007833, + "grad_norm": 0.11421184986829758, + "learning_rate": 2.7618510309281756e-07, + "loss": 0.1979, + "step": 11260 + }, + { + "epoch": 2.7157488854078804, + "grad_norm": 0.12446928024291992, + "learning_rate": 2.716089831963636e-07, + "loss": 0.1964, + "step": 11270 + }, + { + "epoch": 2.718158814314978, + "grad_norm": 0.11847683787345886, + "learning_rate": 2.6707003300563196e-07, + "loss": 0.2021, + "step": 11280 + }, + { + "epoch": 2.720568743222075, + "grad_norm": 0.11553952842950821, + "learning_rate": 2.6256828820113765e-07, + "loss": 0.1975, + "step": 11290 + }, + { + "epoch": 2.722978672129172, + "grad_norm": 0.12082428485155106, + "learning_rate": 2.581037841709322e-07, + "loss": 0.1974, + "step": 11300 + }, + { + "epoch": 2.7253886010362693, + "grad_norm": 0.11691971868276596, + "learning_rate": 2.536765560103122e-07, + "loss": 0.1984, + "step": 11310 + }, + { + "epoch": 2.7277985299433665, + "grad_norm": 0.11397454142570496, + "learning_rate": 2.492866385215559e-07, + "loss": 0.198, + "step": 11320 + }, + { + "epoch": 2.730208458850464, + "grad_norm": 0.11686151474714279, + "learning_rate": 2.449340662136407e-07, + "loss": 0.195, + "step": 11330 + }, + { + "epoch": 2.732618387757561, + "grad_norm": 0.11599799245595932, + "learning_rate": 2.4061887330197485e-07, + "loss": 0.1971, + "step": 11340 + }, + { + "epoch": 2.7350283166646583, + "grad_norm": 0.12791751325130463, + "learning_rate": 2.3634109370813008e-07, + "loss": 0.1985, + "step": 11350 + }, + { + "epoch": 2.737438245571756, + "grad_norm": 0.1324101835489273, + "learning_rate": 2.3210076105957103e-07, + "loss": 0.1998, + "step": 11360 + }, + { + "epoch": 2.739848174478853, + "grad_norm": 0.11545724421739578, + "learning_rate": 2.278979086893962e-07, + "loss": 0.2001, + "step": 11370 + }, + { + "epoch": 2.74225810338595, + "grad_norm": 0.11950286477804184, + "learning_rate": 2.2373256963607093e-07, + "loss": 0.1986, + "step": 11380 + }, + { + "epoch": 2.7446680322930472, + "grad_norm": 0.11261560767889023, + "learning_rate": 2.1960477664317027e-07, + "loss": 0.1991, + "step": 11390 + }, + { + "epoch": 2.7470779612001444, + "grad_norm": 0.11667031794786453, + "learning_rate": 2.1551456215912147e-07, + "loss": 0.1969, + "step": 11400 + }, + { + "epoch": 2.749487890107242, + "grad_norm": 0.11386009305715561, + "learning_rate": 2.114619583369476e-07, + "loss": 0.1995, + "step": 11410 + }, + { + "epoch": 2.751897819014339, + "grad_norm": 0.11694635450839996, + "learning_rate": 2.0744699703401817e-07, + "loss": 0.1954, + "step": 11420 + }, + { + "epoch": 2.754307747921436, + "grad_norm": 0.12221164256334305, + "learning_rate": 2.034697098117927e-07, + "loss": 0.1968, + "step": 11430 + }, + { + "epoch": 2.7567176768285337, + "grad_norm": 0.12051833420991898, + "learning_rate": 1.995301279355788e-07, + "loss": 0.1967, + "step": 11440 + }, + { + "epoch": 2.759127605735631, + "grad_norm": 0.1132727712392807, + "learning_rate": 1.9562828237428332e-07, + "loss": 0.1992, + "step": 11450 + }, + { + "epoch": 2.761537534642728, + "grad_norm": 0.11581676453351974, + "learning_rate": 1.917642038001677e-07, + "loss": 0.201, + "step": 11460 + }, + { + "epoch": 2.763947463549825, + "grad_norm": 0.11355219036340714, + "learning_rate": 1.8793792258861077e-07, + "loss": 0.1972, + "step": 11470 + }, + { + "epoch": 2.7663573924569222, + "grad_norm": 0.11676806956529617, + "learning_rate": 1.8414946881786634e-07, + "loss": 0.1995, + "step": 11480 + }, + { + "epoch": 2.76876732136402, + "grad_norm": 0.11301768571138382, + "learning_rate": 1.8039887226882823e-07, + "loss": 0.1964, + "step": 11490 + }, + { + "epoch": 2.771177250271117, + "grad_norm": 0.11706644296646118, + "learning_rate": 1.7668616242479618e-07, + "loss": 0.1995, + "step": 11500 + }, + { + "epoch": 2.7735871791782145, + "grad_norm": 0.1184009313583374, + "learning_rate": 1.7301136847124477e-07, + "loss": 0.1967, + "step": 11510 + }, + { + "epoch": 2.7759971080853116, + "grad_norm": 0.1254088580608368, + "learning_rate": 1.6937451929559147e-07, + "loss": 0.1985, + "step": 11520 + }, + { + "epoch": 2.7784070369924088, + "grad_norm": 0.1315934807062149, + "learning_rate": 1.6577564348697284e-07, + "loss": 0.1989, + "step": 11530 + }, + { + "epoch": 2.780816965899506, + "grad_norm": 0.11553444713354111, + "learning_rate": 1.622147693360171e-07, + "loss": 0.1979, + "step": 11540 + }, + { + "epoch": 2.783226894806603, + "grad_norm": 0.11310292780399323, + "learning_rate": 1.586919248346236e-07, + "loss": 0.1986, + "step": 11550 + }, + { + "epoch": 2.7856368237137006, + "grad_norm": 0.11396036297082901, + "learning_rate": 1.5520713767574247e-07, + "loss": 0.1987, + "step": 11560 + }, + { + "epoch": 2.7880467526207977, + "grad_norm": 0.11298102885484695, + "learning_rate": 1.5176043525315543e-07, + "loss": 0.1986, + "step": 11570 + }, + { + "epoch": 2.790456681527895, + "grad_norm": 0.12029560655355453, + "learning_rate": 1.483518446612614e-07, + "loss": 0.1967, + "step": 11580 + }, + { + "epoch": 2.7928666104349924, + "grad_norm": 0.11798765510320663, + "learning_rate": 1.4498139269486455e-07, + "loss": 0.1996, + "step": 11590 + }, + { + "epoch": 2.7952765393420895, + "grad_norm": 0.1185227632522583, + "learning_rate": 1.4164910584896163e-07, + "loss": 0.1979, + "step": 11600 + }, + { + "epoch": 2.7976864682491867, + "grad_norm": 0.11194013059139252, + "learning_rate": 1.383550103185366e-07, + "loss": 0.1962, + "step": 11610 + }, + { + "epoch": 2.800096397156284, + "grad_norm": 0.11631465703248978, + "learning_rate": 1.350991319983508e-07, + "loss": 0.1984, + "step": 11620 + }, + { + "epoch": 2.802506326063381, + "grad_norm": 0.1182403415441513, + "learning_rate": 1.3188149648274307e-07, + "loss": 0.1993, + "step": 11630 + }, + { + "epoch": 2.8049162549704785, + "grad_norm": 0.15543074905872345, + "learning_rate": 1.2870212906542612e-07, + "loss": 0.1971, + "step": 11640 + }, + { + "epoch": 2.8073261838775756, + "grad_norm": 0.12442866712808609, + "learning_rate": 1.2556105473928824e-07, + "loss": 0.1983, + "step": 11650 + }, + { + "epoch": 2.8097361127846727, + "grad_norm": 0.11325249820947647, + "learning_rate": 1.2245829819619858e-07, + "loss": 0.1985, + "step": 11660 + }, + { + "epoch": 2.8121460416917703, + "grad_norm": 0.11899420619010925, + "learning_rate": 1.1939388382681106e-07, + "loss": 0.1988, + "step": 11670 + }, + { + "epoch": 2.8145559705988674, + "grad_norm": 0.12117994576692581, + "learning_rate": 1.163678357203718e-07, + "loss": 0.1989, + "step": 11680 + }, + { + "epoch": 2.8169658995059645, + "grad_norm": 0.13569891452789307, + "learning_rate": 1.133801776645338e-07, + "loss": 0.1995, + "step": 11690 + }, + { + "epoch": 2.8193758284130617, + "grad_norm": 0.11266092211008072, + "learning_rate": 1.1043093314516418e-07, + "loss": 0.197, + "step": 11700 + }, + { + "epoch": 2.821785757320159, + "grad_norm": 0.11539432406425476, + "learning_rate": 1.0752012534616496e-07, + "loss": 0.1973, + "step": 11710 + }, + { + "epoch": 2.8241956862272564, + "grad_norm": 0.11695801466703415, + "learning_rate": 1.046477771492882e-07, + "loss": 0.1965, + "step": 11720 + }, + { + "epoch": 2.8266056151343535, + "grad_norm": 0.1182837262749672, + "learning_rate": 1.0181391113395611e-07, + "loss": 0.1992, + "step": 11730 + }, + { + "epoch": 2.8290155440414506, + "grad_norm": 0.11519190669059753, + "learning_rate": 9.901854957708345e-08, + "loss": 0.1979, + "step": 11740 + }, + { + "epoch": 2.831425472948548, + "grad_norm": 0.11443960666656494, + "learning_rate": 9.626171445290378e-08, + "loss": 0.1957, + "step": 11750 + }, + { + "epoch": 2.8338354018556453, + "grad_norm": 0.11359517276287079, + "learning_rate": 9.354342743279455e-08, + "loss": 0.2002, + "step": 11760 + }, + { + "epoch": 2.8362453307627424, + "grad_norm": 0.11192893981933594, + "learning_rate": 9.086370988511006e-08, + "loss": 0.1969, + "step": 11770 + }, + { + "epoch": 2.8386552596698396, + "grad_norm": 0.11451409757137299, + "learning_rate": 8.822258287500829e-08, + "loss": 0.1983, + "step": 11780 + }, + { + "epoch": 2.841065188576937, + "grad_norm": 0.11336790770292282, + "learning_rate": 8.562006716429316e-08, + "loss": 0.1984, + "step": 11790 + }, + { + "epoch": 2.8434751174840343, + "grad_norm": 0.11402434855699539, + "learning_rate": 8.305618321124087e-08, + "loss": 0.1973, + "step": 11800 + }, + { + "epoch": 2.8458850463911314, + "grad_norm": 0.11499971151351929, + "learning_rate": 8.053095117044995e-08, + "loss": 0.1989, + "step": 11810 + }, + { + "epoch": 2.848294975298229, + "grad_norm": 0.11966579407453537, + "learning_rate": 7.804439089267368e-08, + "loss": 0.2011, + "step": 11820 + }, + { + "epoch": 2.850704904205326, + "grad_norm": 0.12160076946020126, + "learning_rate": 7.559652192467127e-08, + "loss": 0.1983, + "step": 11830 + }, + { + "epoch": 2.853114833112423, + "grad_norm": 0.11282498389482498, + "learning_rate": 7.318736350904798e-08, + "loss": 0.1983, + "step": 11840 + }, + { + "epoch": 2.8555247620195203, + "grad_norm": 0.1345667690038681, + "learning_rate": 7.081693458410977e-08, + "loss": 0.1996, + "step": 11850 + }, + { + "epoch": 2.8579346909266174, + "grad_norm": 0.11672775447368622, + "learning_rate": 6.848525378370995e-08, + "loss": 0.1964, + "step": 11860 + }, + { + "epoch": 2.860344619833715, + "grad_norm": 0.11396300047636032, + "learning_rate": 6.61923394371039e-08, + "loss": 0.1973, + "step": 11870 + }, + { + "epoch": 2.862754548740812, + "grad_norm": 0.1159951463341713, + "learning_rate": 6.393820956880681e-08, + "loss": 0.1979, + "step": 11880 + }, + { + "epoch": 2.8651644776479093, + "grad_norm": 0.11563079059123993, + "learning_rate": 6.172288189844833e-08, + "loss": 0.1992, + "step": 11890 + }, + { + "epoch": 2.867574406555007, + "grad_norm": 0.1158207431435585, + "learning_rate": 5.954637384063766e-08, + "loss": 0.2007, + "step": 11900 + }, + { + "epoch": 2.869984335462104, + "grad_norm": 0.12424372881650925, + "learning_rate": 5.740870250482367e-08, + "loss": 0.1985, + "step": 11910 + }, + { + "epoch": 2.872394264369201, + "grad_norm": 0.11300027370452881, + "learning_rate": 5.530988469516052e-08, + "loss": 0.199, + "step": 11920 + }, + { + "epoch": 2.874804193276298, + "grad_norm": 0.11593002080917358, + "learning_rate": 5.324993691037783e-08, + "loss": 0.1977, + "step": 11930 + }, + { + "epoch": 2.8772141221833953, + "grad_norm": 0.11310574412345886, + "learning_rate": 5.12288753436474e-08, + "loss": 0.1971, + "step": 11940 + }, + { + "epoch": 2.879624051090493, + "grad_norm": 0.11324238777160645, + "learning_rate": 4.924671588246e-08, + "loss": 0.2001, + "step": 11950 + }, + { + "epoch": 2.88203397999759, + "grad_norm": 0.11741005629301071, + "learning_rate": 4.7303474108496e-08, + "loss": 0.1979, + "step": 11960 + }, + { + "epoch": 2.884443908904687, + "grad_norm": 0.11062411963939667, + "learning_rate": 4.539916529750832e-08, + "loss": 0.1936, + "step": 11970 + }, + { + "epoch": 2.8868538378117847, + "grad_norm": 0.12052568048238754, + "learning_rate": 4.353380441919575e-08, + "loss": 0.196, + "step": 11980 + }, + { + "epoch": 2.889263766718882, + "grad_norm": 0.11128173768520355, + "learning_rate": 4.170740613709201e-08, + "loss": 0.1982, + "step": 11990 + }, + { + "epoch": 2.891673695625979, + "grad_norm": 0.1159716323018074, + "learning_rate": 3.9919984808445836e-08, + "loss": 0.1975, + "step": 12000 + }, + { + "epoch": 2.894083624533076, + "grad_norm": 0.12408029288053513, + "learning_rate": 3.817155448410936e-08, + "loss": 0.1969, + "step": 12010 + }, + { + "epoch": 2.8964935534401737, + "grad_norm": 0.11627837270498276, + "learning_rate": 3.6462128908428265e-08, + "loss": 0.1982, + "step": 12020 + }, + { + "epoch": 2.898903482347271, + "grad_norm": 0.12328115850687027, + "learning_rate": 3.479172151913346e-08, + "loss": 0.1976, + "step": 12030 + }, + { + "epoch": 2.901313411254368, + "grad_norm": 0.11239629238843918, + "learning_rate": 3.3160345447235674e-08, + "loss": 0.1938, + "step": 12040 + }, + { + "epoch": 2.9037233401614655, + "grad_norm": 0.11254783719778061, + "learning_rate": 3.156801351692051e-08, + "loss": 0.1948, + "step": 12050 + }, + { + "epoch": 2.9061332690685626, + "grad_norm": 0.1124378964304924, + "learning_rate": 3.0014738245450756e-08, + "loss": 0.1987, + "step": 12060 + }, + { + "epoch": 2.9085431979756597, + "grad_norm": 0.11649011820554733, + "learning_rate": 2.8500531843065893e-08, + "loss": 0.196, + "step": 12070 + }, + { + "epoch": 2.910953126882757, + "grad_norm": 0.10951084643602371, + "learning_rate": 2.702540621288441e-08, + "loss": 0.1951, + "step": 12080 + }, + { + "epoch": 2.913363055789854, + "grad_norm": 0.11133934557437897, + "learning_rate": 2.5589372950815538e-08, + "loss": 0.197, + "step": 12090 + }, + { + "epoch": 2.9157729846969516, + "grad_norm": 0.11958010494709015, + "learning_rate": 2.4192443345462667e-08, + "loss": 0.199, + "step": 12100 + }, + { + "epoch": 2.9181829136040487, + "grad_norm": 0.11115157604217529, + "learning_rate": 2.2834628378037848e-08, + "loss": 0.1991, + "step": 12110 + }, + { + "epoch": 2.920592842511146, + "grad_norm": 0.10967998206615448, + "learning_rate": 2.1515938722272977e-08, + "loss": 0.196, + "step": 12120 + }, + { + "epoch": 2.9230027714182434, + "grad_norm": 0.1160879135131836, + "learning_rate": 2.023638474433931e-08, + "loss": 0.202, + "step": 12130 + }, + { + "epoch": 2.9254127003253405, + "grad_norm": 0.10980487614870071, + "learning_rate": 1.8995976502762526e-08, + "loss": 0.1967, + "step": 12140 + }, + { + "epoch": 2.9278226292324376, + "grad_norm": 0.11112086474895477, + "learning_rate": 1.779472374834612e-08, + "loss": 0.1977, + "step": 12150 + }, + { + "epoch": 2.9302325581395348, + "grad_norm": 0.1170421689748764, + "learning_rate": 1.6632635924092587e-08, + "loss": 0.1986, + "step": 12160 + }, + { + "epoch": 2.932642487046632, + "grad_norm": 0.11265738308429718, + "learning_rate": 1.5509722165131246e-08, + "loss": 0.1976, + "step": 12170 + }, + { + "epoch": 2.9350524159537295, + "grad_norm": 0.11651454865932465, + "learning_rate": 1.4425991298645525e-08, + "loss": 0.2013, + "step": 12180 + }, + { + "epoch": 2.9374623448608266, + "grad_norm": 0.11379573494195938, + "learning_rate": 1.3381451843803572e-08, + "loss": 0.1968, + "step": 12190 + }, + { + "epoch": 2.9398722737679237, + "grad_norm": 0.12176429480314255, + "learning_rate": 1.2376112011691088e-08, + "loss": 0.2, + "step": 12200 + }, + { + "epoch": 2.9422822026750213, + "grad_norm": 0.11730526387691498, + "learning_rate": 1.1409979705246932e-08, + "loss": 0.1987, + "step": 12210 + }, + { + "epoch": 2.9446921315821184, + "grad_norm": 0.11338816583156586, + "learning_rate": 1.0483062519200949e-08, + "loss": 0.1966, + "step": 12220 + }, + { + "epoch": 2.9471020604892155, + "grad_norm": 0.11472780257463455, + "learning_rate": 9.595367740014572e-09, + "loss": 0.1988, + "step": 12230 + }, + { + "epoch": 2.9495119893963127, + "grad_norm": 0.11212562769651413, + "learning_rate": 8.746902345824204e-09, + "loss": 0.1981, + "step": 12240 + }, + { + "epoch": 2.9519219183034098, + "grad_norm": 0.13362792134284973, + "learning_rate": 7.937673006384039e-09, + "loss": 0.1995, + "step": 12250 + }, + { + "epoch": 2.9543318472105073, + "grad_norm": 0.11798904836177826, + "learning_rate": 7.167686083015546e-09, + "loss": 0.1954, + "step": 12260 + }, + { + "epoch": 2.9567417761176045, + "grad_norm": 0.11756514012813568, + "learning_rate": 6.4369476285580656e-09, + "loss": 0.197, + "step": 12270 + }, + { + "epoch": 2.959151705024702, + "grad_norm": 0.11590098589658737, + "learning_rate": 5.7454633873188505e-09, + "loss": 0.1981, + "step": 12280 + }, + { + "epoch": 2.961561633931799, + "grad_norm": 0.11385900527238846, + "learning_rate": 5.09323879503032e-09, + "loss": 0.1999, + "step": 12290 + }, + { + "epoch": 2.9639715628388963, + "grad_norm": 0.12390254437923431, + "learning_rate": 4.480278978804542e-09, + "loss": 0.2028, + "step": 12300 + }, + { + "epoch": 2.9663814917459934, + "grad_norm": 0.11569690704345703, + "learning_rate": 3.906588757097152e-09, + "loss": 0.1962, + "step": 12310 + }, + { + "epoch": 2.9687914206530905, + "grad_norm": 0.11260288953781128, + "learning_rate": 3.372172639664606e-09, + "loss": 0.1978, + "step": 12320 + }, + { + "epoch": 2.971201349560188, + "grad_norm": 0.11671577394008636, + "learning_rate": 2.877034827532543e-09, + "loss": 0.1979, + "step": 12330 + }, + { + "epoch": 2.9736112784672852, + "grad_norm": 0.11633649468421936, + "learning_rate": 2.4211792129608112e-09, + "loss": 0.1958, + "step": 12340 + }, + { + "epoch": 2.9760212073743824, + "grad_norm": 0.11560673266649246, + "learning_rate": 2.004609379413491e-09, + "loss": 0.199, + "step": 12350 + }, + { + "epoch": 2.97843113628148, + "grad_norm": 0.14515136182308197, + "learning_rate": 1.6273286015305866e-09, + "loss": 0.1962, + "step": 12360 + }, + { + "epoch": 2.980841065188577, + "grad_norm": 0.11362209916114807, + "learning_rate": 1.2893398451024886e-09, + "loss": 0.2004, + "step": 12370 + }, + { + "epoch": 2.983250994095674, + "grad_norm": 0.11587734520435333, + "learning_rate": 9.906457670449953e-10, + "loss": 0.1933, + "step": 12380 + }, + { + "epoch": 2.9856609230027713, + "grad_norm": 0.12775199115276337, + "learning_rate": 7.312487153826597e-10, + "loss": 0.1976, + "step": 12390 + }, + { + "epoch": 2.9880708519098684, + "grad_norm": 0.11507666110992432, + "learning_rate": 5.111507292254736e-10, + "loss": 0.1981, + "step": 12400 + }, + { + "epoch": 2.990480780816966, + "grad_norm": 0.11416053771972656, + "learning_rate": 3.3035353875499056e-10, + "loss": 0.1952, + "step": 12410 + }, + { + "epoch": 2.992890709724063, + "grad_norm": 0.11342688649892807, + "learning_rate": 1.8885856521211333e-10, + "loss": 0.1975, + "step": 12420 + }, + { + "epoch": 2.9953006386311603, + "grad_norm": 0.11560092121362686, + "learning_rate": 8.666692088266094e-11, + "loss": 0.1997, + "step": 12430 + }, + { + "epoch": 2.997710567538258, + "grad_norm": 0.13347730040550232, + "learning_rate": 2.377940909237264e-11, + "loss": 0.1993, + "step": 12440 + }, + { + "epoch": 3.0, + "grad_norm": 0.16301463544368744, + "learning_rate": 1.9652419636084773e-13, + "loss": 0.2003, + "step": 12450 + } + ], + "logging_steps": 10, + "max_steps": 12450, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 10000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.276371997398208e+21, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}