diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,84034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.26464769877798927, + "eval_steps": 500, + "global_step": 120000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.205397489816577e-05, + "grad_norm": 6.236355304718018, + "learning_rate": 1.8e-08, + "loss": 0.6077, + "step": 10 + }, + { + "epoch": 4.410794979633154e-05, + "grad_norm": 5.577098369598389, + "learning_rate": 3.7999999999999996e-08, + "loss": 0.5817, + "step": 20 + }, + { + "epoch": 6.616192469449732e-05, + "grad_norm": 6.076129913330078, + "learning_rate": 5.8e-08, + "loss": 0.6055, + "step": 30 + }, + { + "epoch": 8.821589959266308e-05, + "grad_norm": 6.092587947845459, + "learning_rate": 7.8e-08, + "loss": 0.6055, + "step": 40 + }, + { + "epoch": 0.00011026987449082886, + "grad_norm": 5.6150336265563965, + "learning_rate": 9.8e-08, + "loss": 0.6069, + "step": 50 + }, + { + "epoch": 0.00013232384938899463, + "grad_norm": 6.005609035491943, + "learning_rate": 1.18e-07, + "loss": 0.6, + "step": 60 + }, + { + "epoch": 0.0001543778242871604, + "grad_norm": 5.9279866218566895, + "learning_rate": 1.38e-07, + "loss": 0.5884, + "step": 70 + }, + { + "epoch": 0.00017643179918532616, + "grad_norm": 6.284666061401367, + "learning_rate": 1.58e-07, + "loss": 0.6088, + "step": 80 + }, + { + "epoch": 0.00019848577408349195, + "grad_norm": 5.776525497436523, + "learning_rate": 1.78e-07, + "loss": 0.5743, + "step": 90 + }, + { + "epoch": 0.0002205397489816577, + "grad_norm": 5.976769924163818, + "learning_rate": 1.98e-07, + "loss": 0.5452, + "step": 100 + }, + { + "epoch": 0.00024259372387982347, + "grad_norm": 5.471545219421387, + "learning_rate": 2.1800000000000002e-07, + "loss": 0.5672, + "step": 110 + }, + { + "epoch": 0.00026464769877798926, + "grad_norm": 5.748497009277344, + "learning_rate": 2.3800000000000001e-07, + "loss": 0.5688, + "step": 120 + }, + { + "epoch": 0.000286701673676155, + "grad_norm": 4.853139400482178, + "learning_rate": 2.58e-07, + "loss": 0.5151, + "step": 130 + }, + { + "epoch": 0.0003087556485743208, + "grad_norm": 4.23729133605957, + "learning_rate": 2.78e-07, + "loss": 0.4623, + "step": 140 + }, + { + "epoch": 0.0003308096234724866, + "grad_norm": 4.65359354019165, + "learning_rate": 2.9800000000000005e-07, + "loss": 0.4671, + "step": 150 + }, + { + "epoch": 0.0003528635983706523, + "grad_norm": 4.504486560821533, + "learning_rate": 3.18e-07, + "loss": 0.4273, + "step": 160 + }, + { + "epoch": 0.0003749175732688181, + "grad_norm": 4.540933609008789, + "learning_rate": 3.38e-07, + "loss": 0.443, + "step": 170 + }, + { + "epoch": 0.0003969715481669839, + "grad_norm": 2.358808755874634, + "learning_rate": 3.58e-07, + "loss": 0.3548, + "step": 180 + }, + { + "epoch": 0.00041902552306514963, + "grad_norm": 2.272480010986328, + "learning_rate": 3.78e-07, + "loss": 0.2744, + "step": 190 + }, + { + "epoch": 0.0004410794979633154, + "grad_norm": 1.6835079193115234, + "learning_rate": 3.98e-07, + "loss": 0.2561, + "step": 200 + }, + { + "epoch": 0.0004631334728614812, + "grad_norm": 1.4765801429748535, + "learning_rate": 4.18e-07, + "loss": 0.2505, + "step": 210 + }, + { + "epoch": 0.00048518744775964695, + "grad_norm": 1.263611078262329, + "learning_rate": 4.3800000000000003e-07, + "loss": 0.2458, + "step": 220 + }, + { + "epoch": 0.0005072414226578127, + "grad_norm": 0.8305543661117554, + "learning_rate": 4.58e-07, + "loss": 0.2211, + "step": 230 + }, + { + "epoch": 0.0005292953975559785, + "grad_norm": 0.5932561159133911, + "learning_rate": 4.78e-07, + "loss": 0.2046, + "step": 240 + }, + { + "epoch": 0.0005513493724541443, + "grad_norm": 0.4622391164302826, + "learning_rate": 4.98e-07, + "loss": 0.1897, + "step": 250 + }, + { + "epoch": 0.00057340334735231, + "grad_norm": 0.4523414373397827, + "learning_rate": 5.18e-07, + "loss": 0.1791, + "step": 260 + }, + { + "epoch": 0.0005954573222504758, + "grad_norm": 0.4274807274341583, + "learning_rate": 5.38e-07, + "loss": 0.1836, + "step": 270 + }, + { + "epoch": 0.0006175112971486416, + "grad_norm": 0.41153350472450256, + "learning_rate": 5.58e-07, + "loss": 0.164, + "step": 280 + }, + { + "epoch": 0.0006395652720468073, + "grad_norm": 0.41905999183654785, + "learning_rate": 5.78e-07, + "loss": 0.1685, + "step": 290 + }, + { + "epoch": 0.0006616192469449732, + "grad_norm": 0.4363863468170166, + "learning_rate": 5.98e-07, + "loss": 0.1639, + "step": 300 + }, + { + "epoch": 0.0006836732218431389, + "grad_norm": 0.3312281668186188, + "learning_rate": 6.180000000000001e-07, + "loss": 0.164, + "step": 310 + }, + { + "epoch": 0.0007057271967413046, + "grad_norm": 0.369183212518692, + "learning_rate": 6.38e-07, + "loss": 0.1604, + "step": 320 + }, + { + "epoch": 0.0007277811716394705, + "grad_norm": 0.34902021288871765, + "learning_rate": 6.58e-07, + "loss": 0.1648, + "step": 330 + }, + { + "epoch": 0.0007498351465376362, + "grad_norm": 0.4132331907749176, + "learning_rate": 6.78e-07, + "loss": 0.1536, + "step": 340 + }, + { + "epoch": 0.000771889121435802, + "grad_norm": 0.4094333350658417, + "learning_rate": 6.98e-07, + "loss": 0.1464, + "step": 350 + }, + { + "epoch": 0.0007939430963339678, + "grad_norm": 0.36116212606430054, + "learning_rate": 7.18e-07, + "loss": 0.1493, + "step": 360 + }, + { + "epoch": 0.0008159970712321335, + "grad_norm": 0.3136536180973053, + "learning_rate": 7.380000000000001e-07, + "loss": 0.1491, + "step": 370 + }, + { + "epoch": 0.0008380510461302993, + "grad_norm": 0.3362431526184082, + "learning_rate": 7.58e-07, + "loss": 0.1534, + "step": 380 + }, + { + "epoch": 0.0008601050210284651, + "grad_norm": 0.3149309754371643, + "learning_rate": 7.78e-07, + "loss": 0.1444, + "step": 390 + }, + { + "epoch": 0.0008821589959266308, + "grad_norm": 0.3361152708530426, + "learning_rate": 7.98e-07, + "loss": 0.1396, + "step": 400 + }, + { + "epoch": 0.0009042129708247966, + "grad_norm": 0.3823968768119812, + "learning_rate": 8.18e-07, + "loss": 0.1366, + "step": 410 + }, + { + "epoch": 0.0009262669457229624, + "grad_norm": 0.4331970512866974, + "learning_rate": 8.380000000000001e-07, + "loss": 0.1375, + "step": 420 + }, + { + "epoch": 0.0009483209206211282, + "grad_norm": 0.2987608015537262, + "learning_rate": 8.580000000000001e-07, + "loss": 0.1353, + "step": 430 + }, + { + "epoch": 0.0009703748955192939, + "grad_norm": 0.3788529336452484, + "learning_rate": 8.78e-07, + "loss": 0.1369, + "step": 440 + }, + { + "epoch": 0.0009924288704174597, + "grad_norm": 0.39351794123649597, + "learning_rate": 8.98e-07, + "loss": 0.1309, + "step": 450 + }, + { + "epoch": 0.0010144828453156254, + "grad_norm": 0.3603001534938812, + "learning_rate": 9.179999999999999e-07, + "loss": 0.1227, + "step": 460 + }, + { + "epoch": 0.0010365368202137912, + "grad_norm": 0.4162672460079193, + "learning_rate": 9.38e-07, + "loss": 0.1312, + "step": 470 + }, + { + "epoch": 0.001058590795111957, + "grad_norm": 0.294006884098053, + "learning_rate": 9.58e-07, + "loss": 0.1274, + "step": 480 + }, + { + "epoch": 0.0010806447700101227, + "grad_norm": 0.304779589176178, + "learning_rate": 9.78e-07, + "loss": 0.1236, + "step": 490 + }, + { + "epoch": 0.0011026987449082885, + "grad_norm": 0.31698623299598694, + "learning_rate": 9.98e-07, + "loss": 0.1253, + "step": 500 + }, + { + "epoch": 0.0011247527198064544, + "grad_norm": 0.45465099811553955, + "learning_rate": 1.018e-06, + "loss": 0.1243, + "step": 510 + }, + { + "epoch": 0.00114680669470462, + "grad_norm": 0.4005429744720459, + "learning_rate": 1.038e-06, + "loss": 0.1139, + "step": 520 + }, + { + "epoch": 0.0011688606696027858, + "grad_norm": 0.3112233877182007, + "learning_rate": 1.058e-06, + "loss": 0.1098, + "step": 530 + }, + { + "epoch": 0.0011909146445009517, + "grad_norm": 0.34861186146736145, + "learning_rate": 1.0779999999999999e-06, + "loss": 0.123, + "step": 540 + }, + { + "epoch": 0.0012129686193991173, + "grad_norm": 0.33357760310173035, + "learning_rate": 1.0980000000000001e-06, + "loss": 0.1119, + "step": 550 + }, + { + "epoch": 0.0012350225942972832, + "grad_norm": 0.3199368417263031, + "learning_rate": 1.118e-06, + "loss": 0.112, + "step": 560 + }, + { + "epoch": 0.001257076569195449, + "grad_norm": 0.2986534833908081, + "learning_rate": 1.138e-06, + "loss": 0.1158, + "step": 570 + }, + { + "epoch": 0.0012791305440936146, + "grad_norm": 0.3802856504917145, + "learning_rate": 1.1580000000000002e-06, + "loss": 0.1126, + "step": 580 + }, + { + "epoch": 0.0013011845189917805, + "grad_norm": 0.31765979528427124, + "learning_rate": 1.178e-06, + "loss": 0.116, + "step": 590 + }, + { + "epoch": 0.0013232384938899463, + "grad_norm": 0.36523547768592834, + "learning_rate": 1.198e-06, + "loss": 0.111, + "step": 600 + }, + { + "epoch": 0.001345292468788112, + "grad_norm": 0.3730740249156952, + "learning_rate": 1.218e-06, + "loss": 0.1102, + "step": 610 + }, + { + "epoch": 0.0013673464436862778, + "grad_norm": 0.29716721177101135, + "learning_rate": 1.238e-06, + "loss": 0.1165, + "step": 620 + }, + { + "epoch": 0.0013894004185844436, + "grad_norm": 0.3549136221408844, + "learning_rate": 1.258e-06, + "loss": 0.1123, + "step": 630 + }, + { + "epoch": 0.0014114543934826093, + "grad_norm": 0.36589285731315613, + "learning_rate": 1.278e-06, + "loss": 0.1075, + "step": 640 + }, + { + "epoch": 0.001433508368380775, + "grad_norm": 0.35864007472991943, + "learning_rate": 1.2980000000000001e-06, + "loss": 0.1092, + "step": 650 + }, + { + "epoch": 0.001455562343278941, + "grad_norm": 0.2923118472099304, + "learning_rate": 1.318e-06, + "loss": 0.0991, + "step": 660 + }, + { + "epoch": 0.0014776163181771066, + "grad_norm": 0.41723373532295227, + "learning_rate": 1.3380000000000001e-06, + "loss": 0.1094, + "step": 670 + }, + { + "epoch": 0.0014996702930752724, + "grad_norm": 0.33340829610824585, + "learning_rate": 1.358e-06, + "loss": 0.1044, + "step": 680 + }, + { + "epoch": 0.0015217242679734383, + "grad_norm": 0.3534899353981018, + "learning_rate": 1.378e-06, + "loss": 0.1019, + "step": 690 + }, + { + "epoch": 0.001543778242871604, + "grad_norm": 0.3312937319278717, + "learning_rate": 1.3980000000000002e-06, + "loss": 0.1, + "step": 700 + }, + { + "epoch": 0.0015658322177697697, + "grad_norm": 0.3278820812702179, + "learning_rate": 1.418e-06, + "loss": 0.0997, + "step": 710 + }, + { + "epoch": 0.0015878861926679356, + "grad_norm": 0.35403117537498474, + "learning_rate": 1.438e-06, + "loss": 0.0994, + "step": 720 + }, + { + "epoch": 0.0016099401675661012, + "grad_norm": 0.3909393548965454, + "learning_rate": 1.458e-06, + "loss": 0.1017, + "step": 730 + }, + { + "epoch": 0.001631994142464267, + "grad_norm": 0.3699544668197632, + "learning_rate": 1.478e-06, + "loss": 0.1009, + "step": 740 + }, + { + "epoch": 0.001654048117362433, + "grad_norm": 0.4458639621734619, + "learning_rate": 1.498e-06, + "loss": 0.0988, + "step": 750 + }, + { + "epoch": 0.0016761020922605985, + "grad_norm": 0.3848456144332886, + "learning_rate": 1.518e-06, + "loss": 0.0947, + "step": 760 + }, + { + "epoch": 0.0016981560671587644, + "grad_norm": 0.38945040106773376, + "learning_rate": 1.5380000000000001e-06, + "loss": 0.1014, + "step": 770 + }, + { + "epoch": 0.0017202100420569302, + "grad_norm": 0.3110882341861725, + "learning_rate": 1.558e-06, + "loss": 0.1007, + "step": 780 + }, + { + "epoch": 0.0017422640169550958, + "grad_norm": 0.4024699032306671, + "learning_rate": 1.578e-06, + "loss": 0.0996, + "step": 790 + }, + { + "epoch": 0.0017643179918532617, + "grad_norm": 0.44866907596588135, + "learning_rate": 1.598e-06, + "loss": 0.1006, + "step": 800 + }, + { + "epoch": 0.0017863719667514275, + "grad_norm": 0.49260222911834717, + "learning_rate": 1.618e-06, + "loss": 0.0929, + "step": 810 + }, + { + "epoch": 0.0018084259416495932, + "grad_norm": 0.41991692781448364, + "learning_rate": 1.638e-06, + "loss": 0.0921, + "step": 820 + }, + { + "epoch": 0.001830479916547759, + "grad_norm": 0.3339521288871765, + "learning_rate": 1.658e-06, + "loss": 0.0923, + "step": 830 + }, + { + "epoch": 0.0018525338914459248, + "grad_norm": 0.3442651927471161, + "learning_rate": 1.678e-06, + "loss": 0.0939, + "step": 840 + }, + { + "epoch": 0.0018745878663440905, + "grad_norm": 0.33252769708633423, + "learning_rate": 1.6979999999999999e-06, + "loss": 0.1026, + "step": 850 + }, + { + "epoch": 0.0018966418412422563, + "grad_norm": 0.49486759305000305, + "learning_rate": 1.7180000000000001e-06, + "loss": 0.0922, + "step": 860 + }, + { + "epoch": 0.0019186958161404222, + "grad_norm": 0.48068714141845703, + "learning_rate": 1.7380000000000001e-06, + "loss": 0.0943, + "step": 870 + }, + { + "epoch": 0.0019407497910385878, + "grad_norm": 0.3113871216773987, + "learning_rate": 1.758e-06, + "loss": 0.0927, + "step": 880 + }, + { + "epoch": 0.0019628037659367536, + "grad_norm": 0.37896743416786194, + "learning_rate": 1.7780000000000002e-06, + "loss": 0.0944, + "step": 890 + }, + { + "epoch": 0.0019848577408349195, + "grad_norm": 0.3041408061981201, + "learning_rate": 1.798e-06, + "loss": 0.093, + "step": 900 + }, + { + "epoch": 0.0020069117157330853, + "grad_norm": 0.38490772247314453, + "learning_rate": 1.818e-06, + "loss": 0.0932, + "step": 910 + }, + { + "epoch": 0.0020289656906312507, + "grad_norm": 0.4889361262321472, + "learning_rate": 1.838e-06, + "loss": 0.0949, + "step": 920 + }, + { + "epoch": 0.0020510196655294166, + "grad_norm": 0.4176810383796692, + "learning_rate": 1.858e-06, + "loss": 0.0962, + "step": 930 + }, + { + "epoch": 0.0020730736404275824, + "grad_norm": 0.5086779594421387, + "learning_rate": 1.878e-06, + "loss": 0.0887, + "step": 940 + }, + { + "epoch": 0.0020951276153257483, + "grad_norm": 0.48460653424263, + "learning_rate": 1.898e-06, + "loss": 0.0926, + "step": 950 + }, + { + "epoch": 0.002117181590223914, + "grad_norm": 0.3525198996067047, + "learning_rate": 1.918e-06, + "loss": 0.0904, + "step": 960 + }, + { + "epoch": 0.00213923556512208, + "grad_norm": 0.29430606961250305, + "learning_rate": 1.9380000000000003e-06, + "loss": 0.0895, + "step": 970 + }, + { + "epoch": 0.0021612895400202454, + "grad_norm": 0.5151742696762085, + "learning_rate": 1.958e-06, + "loss": 0.0905, + "step": 980 + }, + { + "epoch": 0.002183343514918411, + "grad_norm": 0.2935705780982971, + "learning_rate": 1.978e-06, + "loss": 0.0918, + "step": 990 + }, + { + "epoch": 0.002205397489816577, + "grad_norm": 0.44864049553871155, + "learning_rate": 1.998e-06, + "loss": 0.0909, + "step": 1000 + }, + { + "epoch": 0.002227451464714743, + "grad_norm": 0.46720701456069946, + "learning_rate": 2.018e-06, + "loss": 0.0928, + "step": 1010 + }, + { + "epoch": 0.0022495054396129087, + "grad_norm": 0.2856927514076233, + "learning_rate": 2.038e-06, + "loss": 0.0899, + "step": 1020 + }, + { + "epoch": 0.0022715594145110746, + "grad_norm": 0.29202744364738464, + "learning_rate": 2.058e-06, + "loss": 0.0881, + "step": 1030 + }, + { + "epoch": 0.00229361338940924, + "grad_norm": 0.3498113751411438, + "learning_rate": 2.0780000000000003e-06, + "loss": 0.0861, + "step": 1040 + }, + { + "epoch": 0.002315667364307406, + "grad_norm": 0.5100016593933105, + "learning_rate": 2.098e-06, + "loss": 0.0921, + "step": 1050 + }, + { + "epoch": 0.0023377213392055717, + "grad_norm": 0.42712828516960144, + "learning_rate": 2.118e-06, + "loss": 0.0869, + "step": 1060 + }, + { + "epoch": 0.0023597753141037375, + "grad_norm": 0.3814938962459564, + "learning_rate": 2.138e-06, + "loss": 0.0867, + "step": 1070 + }, + { + "epoch": 0.0023818292890019034, + "grad_norm": 0.29198750853538513, + "learning_rate": 2.158e-06, + "loss": 0.0846, + "step": 1080 + }, + { + "epoch": 0.0024038832639000692, + "grad_norm": 0.44254612922668457, + "learning_rate": 2.178e-06, + "loss": 0.0879, + "step": 1090 + }, + { + "epoch": 0.0024259372387982346, + "grad_norm": 0.385423481464386, + "learning_rate": 2.198e-06, + "loss": 0.0885, + "step": 1100 + }, + { + "epoch": 0.0024479912136964005, + "grad_norm": 0.39866361021995544, + "learning_rate": 2.218e-06, + "loss": 0.0864, + "step": 1110 + }, + { + "epoch": 0.0024700451885945663, + "grad_norm": 0.33310386538505554, + "learning_rate": 2.238e-06, + "loss": 0.0849, + "step": 1120 + }, + { + "epoch": 0.002492099163492732, + "grad_norm": 0.42193976044654846, + "learning_rate": 2.258e-06, + "loss": 0.0918, + "step": 1130 + }, + { + "epoch": 0.002514153138390898, + "grad_norm": 0.4669645130634308, + "learning_rate": 2.278e-06, + "loss": 0.0819, + "step": 1140 + }, + { + "epoch": 0.002536207113289064, + "grad_norm": 0.35376301407814026, + "learning_rate": 2.2980000000000003e-06, + "loss": 0.0851, + "step": 1150 + }, + { + "epoch": 0.0025582610881872293, + "grad_norm": 0.5796065330505371, + "learning_rate": 2.318e-06, + "loss": 0.0905, + "step": 1160 + }, + { + "epoch": 0.002580315063085395, + "grad_norm": 0.5244941711425781, + "learning_rate": 2.338e-06, + "loss": 0.0859, + "step": 1170 + }, + { + "epoch": 0.002602369037983561, + "grad_norm": 0.5381441116333008, + "learning_rate": 2.358e-06, + "loss": 0.0871, + "step": 1180 + }, + { + "epoch": 0.002624423012881727, + "grad_norm": 0.3412414789199829, + "learning_rate": 2.378e-06, + "loss": 0.0858, + "step": 1190 + }, + { + "epoch": 0.0026464769877798926, + "grad_norm": 0.3215349316596985, + "learning_rate": 2.3979999999999997e-06, + "loss": 0.084, + "step": 1200 + }, + { + "epoch": 0.0026685309626780585, + "grad_norm": 0.3245294392108917, + "learning_rate": 2.4180000000000004e-06, + "loss": 0.087, + "step": 1210 + }, + { + "epoch": 0.002690584937576224, + "grad_norm": 0.3702090382575989, + "learning_rate": 2.438e-06, + "loss": 0.0856, + "step": 1220 + }, + { + "epoch": 0.0027126389124743897, + "grad_norm": 0.359549343585968, + "learning_rate": 2.458e-06, + "loss": 0.0839, + "step": 1230 + }, + { + "epoch": 0.0027346928873725556, + "grad_norm": 0.3551362454891205, + "learning_rate": 2.4780000000000002e-06, + "loss": 0.0831, + "step": 1240 + }, + { + "epoch": 0.0027567468622707214, + "grad_norm": 0.33409518003463745, + "learning_rate": 2.498e-06, + "loss": 0.0872, + "step": 1250 + }, + { + "epoch": 0.0027788008371688873, + "grad_norm": 0.374502956867218, + "learning_rate": 2.518e-06, + "loss": 0.084, + "step": 1260 + }, + { + "epoch": 0.002800854812067053, + "grad_norm": 0.36086857318878174, + "learning_rate": 2.538e-06, + "loss": 0.0825, + "step": 1270 + }, + { + "epoch": 0.0028229087869652185, + "grad_norm": 0.29267945885658264, + "learning_rate": 2.5580000000000003e-06, + "loss": 0.0834, + "step": 1280 + }, + { + "epoch": 0.0028449627618633844, + "grad_norm": 0.25817960500717163, + "learning_rate": 2.578e-06, + "loss": 0.0796, + "step": 1290 + }, + { + "epoch": 0.00286701673676155, + "grad_norm": 0.3484591543674469, + "learning_rate": 2.598e-06, + "loss": 0.0787, + "step": 1300 + }, + { + "epoch": 0.002889070711659716, + "grad_norm": 0.35242339968681335, + "learning_rate": 2.618e-06, + "loss": 0.0809, + "step": 1310 + }, + { + "epoch": 0.002911124686557882, + "grad_norm": 0.3299354612827301, + "learning_rate": 2.638e-06, + "loss": 0.0802, + "step": 1320 + }, + { + "epoch": 0.0029331786614560473, + "grad_norm": 0.5549840331077576, + "learning_rate": 2.6580000000000002e-06, + "loss": 0.082, + "step": 1330 + }, + { + "epoch": 0.002955232636354213, + "grad_norm": 0.4164532721042633, + "learning_rate": 2.678e-06, + "loss": 0.082, + "step": 1340 + }, + { + "epoch": 0.002977286611252379, + "grad_norm": 0.3622371554374695, + "learning_rate": 2.6980000000000003e-06, + "loss": 0.0824, + "step": 1350 + }, + { + "epoch": 0.002999340586150545, + "grad_norm": 0.4721771776676178, + "learning_rate": 2.718e-06, + "loss": 0.0791, + "step": 1360 + }, + { + "epoch": 0.0030213945610487107, + "grad_norm": 0.37546437978744507, + "learning_rate": 2.738e-06, + "loss": 0.08, + "step": 1370 + }, + { + "epoch": 0.0030434485359468765, + "grad_norm": 0.36587586998939514, + "learning_rate": 2.758e-06, + "loss": 0.0768, + "step": 1380 + }, + { + "epoch": 0.003065502510845042, + "grad_norm": 0.3034915328025818, + "learning_rate": 2.778e-06, + "loss": 0.0827, + "step": 1390 + }, + { + "epoch": 0.003087556485743208, + "grad_norm": 0.298182874917984, + "learning_rate": 2.798e-06, + "loss": 0.0782, + "step": 1400 + }, + { + "epoch": 0.0031096104606413736, + "grad_norm": 0.2949972450733185, + "learning_rate": 2.818e-06, + "loss": 0.081, + "step": 1410 + }, + { + "epoch": 0.0031316644355395395, + "grad_norm": 0.433998703956604, + "learning_rate": 2.838e-06, + "loss": 0.0799, + "step": 1420 + }, + { + "epoch": 0.0031537184104377053, + "grad_norm": 0.298586368560791, + "learning_rate": 2.858e-06, + "loss": 0.0824, + "step": 1430 + }, + { + "epoch": 0.003175772385335871, + "grad_norm": 0.35934579372406006, + "learning_rate": 2.878e-06, + "loss": 0.0778, + "step": 1440 + }, + { + "epoch": 0.0031978263602340366, + "grad_norm": 0.30846357345581055, + "learning_rate": 2.898e-06, + "loss": 0.0801, + "step": 1450 + }, + { + "epoch": 0.0032198803351322024, + "grad_norm": 0.2565101981163025, + "learning_rate": 2.9180000000000003e-06, + "loss": 0.0798, + "step": 1460 + }, + { + "epoch": 0.0032419343100303683, + "grad_norm": 0.3625105619430542, + "learning_rate": 2.938e-06, + "loss": 0.0822, + "step": 1470 + }, + { + "epoch": 0.003263988284928534, + "grad_norm": 0.44676029682159424, + "learning_rate": 2.958e-06, + "loss": 0.0794, + "step": 1480 + }, + { + "epoch": 0.0032860422598267, + "grad_norm": 0.44718340039253235, + "learning_rate": 2.978e-06, + "loss": 0.0773, + "step": 1490 + }, + { + "epoch": 0.003308096234724866, + "grad_norm": 0.3818584084510803, + "learning_rate": 2.998e-06, + "loss": 0.0805, + "step": 1500 + }, + { + "epoch": 0.003330150209623031, + "grad_norm": 0.29796621203422546, + "learning_rate": 3.0179999999999997e-06, + "loss": 0.0778, + "step": 1510 + }, + { + "epoch": 0.003352204184521197, + "grad_norm": 0.31690311431884766, + "learning_rate": 3.0380000000000004e-06, + "loss": 0.076, + "step": 1520 + }, + { + "epoch": 0.003374258159419363, + "grad_norm": 0.25693565607070923, + "learning_rate": 3.058e-06, + "loss": 0.0777, + "step": 1530 + }, + { + "epoch": 0.0033963121343175287, + "grad_norm": 0.3614344596862793, + "learning_rate": 3.078e-06, + "loss": 0.0781, + "step": 1540 + }, + { + "epoch": 0.0034183661092156946, + "grad_norm": 0.31134164333343506, + "learning_rate": 3.0980000000000002e-06, + "loss": 0.0803, + "step": 1550 + }, + { + "epoch": 0.0034404200841138604, + "grad_norm": 0.2796422839164734, + "learning_rate": 3.118e-06, + "loss": 0.0765, + "step": 1560 + }, + { + "epoch": 0.003462474059012026, + "grad_norm": 0.33609864115715027, + "learning_rate": 3.138e-06, + "loss": 0.0803, + "step": 1570 + }, + { + "epoch": 0.0034845280339101917, + "grad_norm": 0.3802903890609741, + "learning_rate": 3.158e-06, + "loss": 0.0785, + "step": 1580 + }, + { + "epoch": 0.0035065820088083575, + "grad_norm": 0.4035937190055847, + "learning_rate": 3.1780000000000003e-06, + "loss": 0.0745, + "step": 1590 + }, + { + "epoch": 0.0035286359837065234, + "grad_norm": 0.3191973865032196, + "learning_rate": 3.198e-06, + "loss": 0.0766, + "step": 1600 + }, + { + "epoch": 0.0035506899586046892, + "grad_norm": 0.3030049502849579, + "learning_rate": 3.218e-06, + "loss": 0.077, + "step": 1610 + }, + { + "epoch": 0.003572743933502855, + "grad_norm": 0.3728023171424866, + "learning_rate": 3.238e-06, + "loss": 0.0741, + "step": 1620 + }, + { + "epoch": 0.0035947979084010205, + "grad_norm": 0.38205528259277344, + "learning_rate": 3.258e-06, + "loss": 0.0753, + "step": 1630 + }, + { + "epoch": 0.0036168518832991863, + "grad_norm": 0.23988257348537445, + "learning_rate": 3.278e-06, + "loss": 0.076, + "step": 1640 + }, + { + "epoch": 0.003638905858197352, + "grad_norm": 0.35274428129196167, + "learning_rate": 3.298e-06, + "loss": 0.0753, + "step": 1650 + }, + { + "epoch": 0.003660959833095518, + "grad_norm": 0.46815815567970276, + "learning_rate": 3.3180000000000003e-06, + "loss": 0.0807, + "step": 1660 + }, + { + "epoch": 0.003683013807993684, + "grad_norm": 0.42095738649368286, + "learning_rate": 3.338e-06, + "loss": 0.0765, + "step": 1670 + }, + { + "epoch": 0.0037050677828918497, + "grad_norm": 0.3158316910266876, + "learning_rate": 3.358e-06, + "loss": 0.0773, + "step": 1680 + }, + { + "epoch": 0.003727121757790015, + "grad_norm": 0.3424791097640991, + "learning_rate": 3.378e-06, + "loss": 0.0781, + "step": 1690 + }, + { + "epoch": 0.003749175732688181, + "grad_norm": 0.325890451669693, + "learning_rate": 3.3980000000000003e-06, + "loss": 0.0752, + "step": 1700 + }, + { + "epoch": 0.003771229707586347, + "grad_norm": 0.3553590178489685, + "learning_rate": 3.418e-06, + "loss": 0.0737, + "step": 1710 + }, + { + "epoch": 0.0037932836824845126, + "grad_norm": 0.3319801390171051, + "learning_rate": 3.438e-06, + "loss": 0.0787, + "step": 1720 + }, + { + "epoch": 0.0038153376573826785, + "grad_norm": 0.34738534688949585, + "learning_rate": 3.458e-06, + "loss": 0.0794, + "step": 1730 + }, + { + "epoch": 0.0038373916322808443, + "grad_norm": 0.31648749113082886, + "learning_rate": 3.478e-06, + "loss": 0.0743, + "step": 1740 + }, + { + "epoch": 0.0038594456071790097, + "grad_norm": 0.3431965112686157, + "learning_rate": 3.498e-06, + "loss": 0.0744, + "step": 1750 + }, + { + "epoch": 0.0038814995820771756, + "grad_norm": 0.3210974633693695, + "learning_rate": 3.5180000000000005e-06, + "loss": 0.0742, + "step": 1760 + }, + { + "epoch": 0.0039035535569753414, + "grad_norm": 0.2956540584564209, + "learning_rate": 3.5380000000000003e-06, + "loss": 0.0751, + "step": 1770 + }, + { + "epoch": 0.003925607531873507, + "grad_norm": 0.4587995409965515, + "learning_rate": 3.558e-06, + "loss": 0.0742, + "step": 1780 + }, + { + "epoch": 0.003947661506771673, + "grad_norm": 0.28840482234954834, + "learning_rate": 3.5780000000000003e-06, + "loss": 0.0759, + "step": 1790 + }, + { + "epoch": 0.003969715481669839, + "grad_norm": 0.3668798506259918, + "learning_rate": 3.598e-06, + "loss": 0.0766, + "step": 1800 + }, + { + "epoch": 0.003991769456568005, + "grad_norm": 0.3076854348182678, + "learning_rate": 3.618e-06, + "loss": 0.0738, + "step": 1810 + }, + { + "epoch": 0.004013823431466171, + "grad_norm": 0.2947024703025818, + "learning_rate": 3.6379999999999997e-06, + "loss": 0.0729, + "step": 1820 + }, + { + "epoch": 0.0040358774063643365, + "grad_norm": 0.4051259756088257, + "learning_rate": 3.6580000000000004e-06, + "loss": 0.0733, + "step": 1830 + }, + { + "epoch": 0.0040579313812625015, + "grad_norm": 0.5834921598434448, + "learning_rate": 3.678e-06, + "loss": 0.0766, + "step": 1840 + }, + { + "epoch": 0.004079985356160667, + "grad_norm": 0.523248553276062, + "learning_rate": 3.698e-06, + "loss": 0.0718, + "step": 1850 + }, + { + "epoch": 0.004102039331058833, + "grad_norm": 0.5094141960144043, + "learning_rate": 3.7180000000000002e-06, + "loss": 0.0724, + "step": 1860 + }, + { + "epoch": 0.004124093305956999, + "grad_norm": 0.3509286642074585, + "learning_rate": 3.738e-06, + "loss": 0.0699, + "step": 1870 + }, + { + "epoch": 0.004146147280855165, + "grad_norm": 0.36207273602485657, + "learning_rate": 3.758e-06, + "loss": 0.0734, + "step": 1880 + }, + { + "epoch": 0.004168201255753331, + "grad_norm": 0.30276432633399963, + "learning_rate": 3.7780000000000005e-06, + "loss": 0.0726, + "step": 1890 + }, + { + "epoch": 0.0041902552306514965, + "grad_norm": 0.47685667872428894, + "learning_rate": 3.798e-06, + "loss": 0.0712, + "step": 1900 + }, + { + "epoch": 0.004212309205549662, + "grad_norm": 0.2595645785331726, + "learning_rate": 3.818e-06, + "loss": 0.0713, + "step": 1910 + }, + { + "epoch": 0.004234363180447828, + "grad_norm": 0.3619635999202728, + "learning_rate": 3.838e-06, + "loss": 0.0733, + "step": 1920 + }, + { + "epoch": 0.004256417155345994, + "grad_norm": 0.27920055389404297, + "learning_rate": 3.858e-06, + "loss": 0.0706, + "step": 1930 + }, + { + "epoch": 0.00427847113024416, + "grad_norm": 0.40286391973495483, + "learning_rate": 3.878e-06, + "loss": 0.0714, + "step": 1940 + }, + { + "epoch": 0.004300525105142326, + "grad_norm": 0.3221932649612427, + "learning_rate": 3.898000000000001e-06, + "loss": 0.0693, + "step": 1950 + }, + { + "epoch": 0.004322579080040491, + "grad_norm": 0.37792372703552246, + "learning_rate": 3.918e-06, + "loss": 0.0748, + "step": 1960 + }, + { + "epoch": 0.004344633054938657, + "grad_norm": 0.36442190408706665, + "learning_rate": 3.938e-06, + "loss": 0.0676, + "step": 1970 + }, + { + "epoch": 0.004366687029836822, + "grad_norm": 0.39901003241539, + "learning_rate": 3.958000000000001e-06, + "loss": 0.0704, + "step": 1980 + }, + { + "epoch": 0.004388741004734988, + "grad_norm": 0.2873448133468628, + "learning_rate": 3.978e-06, + "loss": 0.0757, + "step": 1990 + }, + { + "epoch": 0.004410794979633154, + "grad_norm": 0.27699071168899536, + "learning_rate": 3.9980000000000005e-06, + "loss": 0.0743, + "step": 2000 + }, + { + "epoch": 0.00443284895453132, + "grad_norm": 0.3708782196044922, + "learning_rate": 4.0179999999999995e-06, + "loss": 0.0718, + "step": 2010 + }, + { + "epoch": 0.004454902929429486, + "grad_norm": 0.2978207767009735, + "learning_rate": 4.038e-06, + "loss": 0.0671, + "step": 2020 + }, + { + "epoch": 0.004476956904327652, + "grad_norm": 0.3575374186038971, + "learning_rate": 4.058e-06, + "loss": 0.0699, + "step": 2030 + }, + { + "epoch": 0.0044990108792258175, + "grad_norm": 0.2912289500236511, + "learning_rate": 4.078e-06, + "loss": 0.0706, + "step": 2040 + }, + { + "epoch": 0.004521064854123983, + "grad_norm": 0.23911058902740479, + "learning_rate": 4.098e-06, + "loss": 0.0681, + "step": 2050 + }, + { + "epoch": 0.004543118829022149, + "grad_norm": 0.3408352732658386, + "learning_rate": 4.118e-06, + "loss": 0.0727, + "step": 2060 + }, + { + "epoch": 0.004565172803920314, + "grad_norm": 0.3528062701225281, + "learning_rate": 4.138e-06, + "loss": 0.0705, + "step": 2070 + }, + { + "epoch": 0.00458722677881848, + "grad_norm": 0.5070157647132874, + "learning_rate": 4.158e-06, + "loss": 0.0693, + "step": 2080 + }, + { + "epoch": 0.004609280753716646, + "grad_norm": 0.37441501021385193, + "learning_rate": 4.1780000000000005e-06, + "loss": 0.0737, + "step": 2090 + }, + { + "epoch": 0.004631334728614812, + "grad_norm": 0.32936862111091614, + "learning_rate": 4.198e-06, + "loss": 0.0705, + "step": 2100 + }, + { + "epoch": 0.0046533887035129775, + "grad_norm": 0.35178640484809875, + "learning_rate": 4.218e-06, + "loss": 0.0692, + "step": 2110 + }, + { + "epoch": 0.004675442678411143, + "grad_norm": 0.3573431968688965, + "learning_rate": 4.238000000000001e-06, + "loss": 0.0712, + "step": 2120 + }, + { + "epoch": 0.004697496653309309, + "grad_norm": 0.2697157859802246, + "learning_rate": 4.258e-06, + "loss": 0.0714, + "step": 2130 + }, + { + "epoch": 0.004719550628207475, + "grad_norm": 0.31113767623901367, + "learning_rate": 4.278e-06, + "loss": 0.0687, + "step": 2140 + }, + { + "epoch": 0.004741604603105641, + "grad_norm": 0.29966020584106445, + "learning_rate": 4.297999999999999e-06, + "loss": 0.0709, + "step": 2150 + }, + { + "epoch": 0.004763658578003807, + "grad_norm": 0.29748237133026123, + "learning_rate": 4.318e-06, + "loss": 0.0712, + "step": 2160 + }, + { + "epoch": 0.004785712552901973, + "grad_norm": 0.3134734630584717, + "learning_rate": 4.338000000000001e-06, + "loss": 0.0704, + "step": 2170 + }, + { + "epoch": 0.0048077665278001384, + "grad_norm": 0.3475766181945801, + "learning_rate": 4.358e-06, + "loss": 0.0693, + "step": 2180 + }, + { + "epoch": 0.004829820502698303, + "grad_norm": 0.3045957088470459, + "learning_rate": 4.378e-06, + "loss": 0.0716, + "step": 2190 + }, + { + "epoch": 0.004851874477596469, + "grad_norm": 0.2981160879135132, + "learning_rate": 4.398e-06, + "loss": 0.0699, + "step": 2200 + }, + { + "epoch": 0.004873928452494635, + "grad_norm": 0.3648008406162262, + "learning_rate": 4.418e-06, + "loss": 0.0685, + "step": 2210 + }, + { + "epoch": 0.004895982427392801, + "grad_norm": 0.26480159163475037, + "learning_rate": 4.438e-06, + "loss": 0.0705, + "step": 2220 + }, + { + "epoch": 0.004918036402290967, + "grad_norm": 0.319692462682724, + "learning_rate": 4.458e-06, + "loss": 0.0703, + "step": 2230 + }, + { + "epoch": 0.004940090377189133, + "grad_norm": 0.3691520690917969, + "learning_rate": 4.478e-06, + "loss": 0.0699, + "step": 2240 + }, + { + "epoch": 0.0049621443520872985, + "grad_norm": 0.30383631587028503, + "learning_rate": 4.498e-06, + "loss": 0.071, + "step": 2250 + }, + { + "epoch": 0.004984198326985464, + "grad_norm": 0.35137876868247986, + "learning_rate": 4.518000000000001e-06, + "loss": 0.0676, + "step": 2260 + }, + { + "epoch": 0.00500625230188363, + "grad_norm": 0.29253238439559937, + "learning_rate": 4.538e-06, + "loss": 0.0689, + "step": 2270 + }, + { + "epoch": 0.005028306276781796, + "grad_norm": 0.3684479892253876, + "learning_rate": 4.558e-06, + "loss": 0.0691, + "step": 2280 + }, + { + "epoch": 0.005050360251679962, + "grad_norm": 0.35132738947868347, + "learning_rate": 4.578000000000001e-06, + "loss": 0.0671, + "step": 2290 + }, + { + "epoch": 0.005072414226578128, + "grad_norm": 0.29923465847969055, + "learning_rate": 4.598e-06, + "loss": 0.0667, + "step": 2300 + }, + { + "epoch": 0.005094468201476293, + "grad_norm": 0.3069044351577759, + "learning_rate": 4.6180000000000005e-06, + "loss": 0.0671, + "step": 2310 + }, + { + "epoch": 0.0051165221763744585, + "grad_norm": 0.35444480180740356, + "learning_rate": 4.6379999999999995e-06, + "loss": 0.0671, + "step": 2320 + }, + { + "epoch": 0.005138576151272624, + "grad_norm": 0.27560463547706604, + "learning_rate": 4.658e-06, + "loss": 0.069, + "step": 2330 + }, + { + "epoch": 0.00516063012617079, + "grad_norm": 0.4283864200115204, + "learning_rate": 4.678e-06, + "loss": 0.0677, + "step": 2340 + }, + { + "epoch": 0.005182684101068956, + "grad_norm": 0.31741178035736084, + "learning_rate": 4.698e-06, + "loss": 0.0712, + "step": 2350 + }, + { + "epoch": 0.005204738075967122, + "grad_norm": 0.2863287627696991, + "learning_rate": 4.718e-06, + "loss": 0.0689, + "step": 2360 + }, + { + "epoch": 0.005226792050865288, + "grad_norm": 0.33595383167266846, + "learning_rate": 4.738e-06, + "loss": 0.0662, + "step": 2370 + }, + { + "epoch": 0.005248846025763454, + "grad_norm": 0.29162922501564026, + "learning_rate": 4.758e-06, + "loss": 0.0706, + "step": 2380 + }, + { + "epoch": 0.0052709000006616194, + "grad_norm": 0.29663971066474915, + "learning_rate": 4.778e-06, + "loss": 0.0675, + "step": 2390 + }, + { + "epoch": 0.005292953975559785, + "grad_norm": 0.37551984190940857, + "learning_rate": 4.7980000000000005e-06, + "loss": 0.0696, + "step": 2400 + }, + { + "epoch": 0.005315007950457951, + "grad_norm": 0.32034194469451904, + "learning_rate": 4.818e-06, + "loss": 0.0694, + "step": 2410 + }, + { + "epoch": 0.005337061925356117, + "grad_norm": 0.30355340242385864, + "learning_rate": 4.838e-06, + "loss": 0.0675, + "step": 2420 + }, + { + "epoch": 0.005359115900254282, + "grad_norm": 0.342400461435318, + "learning_rate": 4.858000000000001e-06, + "loss": 0.0668, + "step": 2430 + }, + { + "epoch": 0.005381169875152448, + "grad_norm": 0.28430891036987305, + "learning_rate": 4.878e-06, + "loss": 0.0661, + "step": 2440 + }, + { + "epoch": 0.005403223850050614, + "grad_norm": 0.25718510150909424, + "learning_rate": 4.898e-06, + "loss": 0.0693, + "step": 2450 + }, + { + "epoch": 0.0054252778249487795, + "grad_norm": 0.2941652834415436, + "learning_rate": 4.917999999999999e-06, + "loss": 0.07, + "step": 2460 + }, + { + "epoch": 0.005447331799846945, + "grad_norm": 0.4158078134059906, + "learning_rate": 4.938e-06, + "loss": 0.0701, + "step": 2470 + }, + { + "epoch": 0.005469385774745111, + "grad_norm": 0.24961020052433014, + "learning_rate": 4.958000000000001e-06, + "loss": 0.0677, + "step": 2480 + }, + { + "epoch": 0.005491439749643277, + "grad_norm": 0.2838428020477295, + "learning_rate": 4.978e-06, + "loss": 0.0683, + "step": 2490 + }, + { + "epoch": 0.005513493724541443, + "grad_norm": 0.30422818660736084, + "learning_rate": 4.998e-06, + "loss": 0.067, + "step": 2500 + }, + { + "epoch": 0.005535547699439609, + "grad_norm": 0.3266562819480896, + "learning_rate": 5.018e-06, + "loss": 0.0677, + "step": 2510 + }, + { + "epoch": 0.0055576016743377745, + "grad_norm": 0.2882653474807739, + "learning_rate": 5.038e-06, + "loss": 0.0676, + "step": 2520 + }, + { + "epoch": 0.00557965564923594, + "grad_norm": 0.3426012694835663, + "learning_rate": 5.0580000000000005e-06, + "loss": 0.0662, + "step": 2530 + }, + { + "epoch": 0.005601709624134106, + "grad_norm": 0.27944085001945496, + "learning_rate": 5.078e-06, + "loss": 0.0648, + "step": 2540 + }, + { + "epoch": 0.005623763599032271, + "grad_norm": 0.32894742488861084, + "learning_rate": 5.098e-06, + "loss": 0.067, + "step": 2550 + }, + { + "epoch": 0.005645817573930437, + "grad_norm": 0.3286929726600647, + "learning_rate": 5.118e-06, + "loss": 0.0683, + "step": 2560 + }, + { + "epoch": 0.005667871548828603, + "grad_norm": 0.27646973729133606, + "learning_rate": 5.138000000000001e-06, + "loss": 0.0689, + "step": 2570 + }, + { + "epoch": 0.005689925523726769, + "grad_norm": 0.24092449247837067, + "learning_rate": 5.1579999999999996e-06, + "loss": 0.0629, + "step": 2580 + }, + { + "epoch": 0.005711979498624935, + "grad_norm": 0.2936400771141052, + "learning_rate": 5.178e-06, + "loss": 0.0641, + "step": 2590 + }, + { + "epoch": 0.0057340334735231, + "grad_norm": 0.3549675941467285, + "learning_rate": 5.198000000000001e-06, + "loss": 0.0717, + "step": 2600 + }, + { + "epoch": 0.005756087448421266, + "grad_norm": 0.34402531385421753, + "learning_rate": 5.218e-06, + "loss": 0.066, + "step": 2610 + }, + { + "epoch": 0.005778141423319432, + "grad_norm": 0.36729779839515686, + "learning_rate": 5.2380000000000005e-06, + "loss": 0.0667, + "step": 2620 + }, + { + "epoch": 0.005800195398217598, + "grad_norm": 0.414663165807724, + "learning_rate": 5.2579999999999995e-06, + "loss": 0.0679, + "step": 2630 + }, + { + "epoch": 0.005822249373115764, + "grad_norm": 0.4288348853588104, + "learning_rate": 5.278e-06, + "loss": 0.0638, + "step": 2640 + }, + { + "epoch": 0.00584430334801393, + "grad_norm": 0.5638272762298584, + "learning_rate": 5.298e-06, + "loss": 0.067, + "step": 2650 + }, + { + "epoch": 0.005866357322912095, + "grad_norm": 0.25538113713264465, + "learning_rate": 5.318e-06, + "loss": 0.0708, + "step": 2660 + }, + { + "epoch": 0.0058884112978102605, + "grad_norm": 0.3583112061023712, + "learning_rate": 5.338e-06, + "loss": 0.065, + "step": 2670 + }, + { + "epoch": 0.005910465272708426, + "grad_norm": 0.28061485290527344, + "learning_rate": 5.358e-06, + "loss": 0.0655, + "step": 2680 + }, + { + "epoch": 0.005932519247606592, + "grad_norm": 0.26373618841171265, + "learning_rate": 5.378e-06, + "loss": 0.0645, + "step": 2690 + }, + { + "epoch": 0.005954573222504758, + "grad_norm": 0.3314564824104309, + "learning_rate": 5.398e-06, + "loss": 0.0646, + "step": 2700 + }, + { + "epoch": 0.005976627197402924, + "grad_norm": 0.2683243751525879, + "learning_rate": 5.4180000000000005e-06, + "loss": 0.0668, + "step": 2710 + }, + { + "epoch": 0.00599868117230109, + "grad_norm": 0.30022430419921875, + "learning_rate": 5.438e-06, + "loss": 0.0629, + "step": 2720 + }, + { + "epoch": 0.0060207351471992555, + "grad_norm": 0.26741671562194824, + "learning_rate": 5.458e-06, + "loss": 0.0623, + "step": 2730 + }, + { + "epoch": 0.006042789122097421, + "grad_norm": 0.25646597146987915, + "learning_rate": 5.478000000000001e-06, + "loss": 0.0655, + "step": 2740 + }, + { + "epoch": 0.006064843096995587, + "grad_norm": 0.38476601243019104, + "learning_rate": 5.498e-06, + "loss": 0.0649, + "step": 2750 + }, + { + "epoch": 0.006086897071893753, + "grad_norm": 0.29161807894706726, + "learning_rate": 5.518e-06, + "loss": 0.0644, + "step": 2760 + }, + { + "epoch": 0.006108951046791919, + "grad_norm": 0.343201607465744, + "learning_rate": 5.537999999999999e-06, + "loss": 0.0656, + "step": 2770 + }, + { + "epoch": 0.006131005021690084, + "grad_norm": 0.3636700510978699, + "learning_rate": 5.558e-06, + "loss": 0.0684, + "step": 2780 + }, + { + "epoch": 0.00615305899658825, + "grad_norm": 0.37437009811401367, + "learning_rate": 5.578000000000001e-06, + "loss": 0.0673, + "step": 2790 + }, + { + "epoch": 0.006175112971486416, + "grad_norm": 0.28337278962135315, + "learning_rate": 5.598e-06, + "loss": 0.0664, + "step": 2800 + }, + { + "epoch": 0.006197166946384581, + "grad_norm": 0.34299546480178833, + "learning_rate": 5.618e-06, + "loss": 0.0661, + "step": 2810 + }, + { + "epoch": 0.006219220921282747, + "grad_norm": 0.30486825108528137, + "learning_rate": 5.638e-06, + "loss": 0.0647, + "step": 2820 + }, + { + "epoch": 0.006241274896180913, + "grad_norm": 0.2943286895751953, + "learning_rate": 5.658e-06, + "loss": 0.064, + "step": 2830 + }, + { + "epoch": 0.006263328871079079, + "grad_norm": 0.39788371324539185, + "learning_rate": 5.6780000000000005e-06, + "loss": 0.0659, + "step": 2840 + }, + { + "epoch": 0.006285382845977245, + "grad_norm": 0.4118451178073883, + "learning_rate": 5.698e-06, + "loss": 0.0646, + "step": 2850 + }, + { + "epoch": 0.006307436820875411, + "grad_norm": 0.4346439838409424, + "learning_rate": 5.718e-06, + "loss": 0.0628, + "step": 2860 + }, + { + "epoch": 0.0063294907957735765, + "grad_norm": 0.3380887806415558, + "learning_rate": 5.738e-06, + "loss": 0.067, + "step": 2870 + }, + { + "epoch": 0.006351544770671742, + "grad_norm": 0.3581794500350952, + "learning_rate": 5.758000000000001e-06, + "loss": 0.0647, + "step": 2880 + }, + { + "epoch": 0.006373598745569908, + "grad_norm": 0.3002529442310333, + "learning_rate": 5.7779999999999996e-06, + "loss": 0.0661, + "step": 2890 + }, + { + "epoch": 0.006395652720468073, + "grad_norm": 0.2853856384754181, + "learning_rate": 5.798e-06, + "loss": 0.0656, + "step": 2900 + }, + { + "epoch": 0.006417706695366239, + "grad_norm": 0.3638043701648712, + "learning_rate": 5.818000000000001e-06, + "loss": 0.0606, + "step": 2910 + }, + { + "epoch": 0.006439760670264405, + "grad_norm": 0.32191896438598633, + "learning_rate": 5.838e-06, + "loss": 0.0632, + "step": 2920 + }, + { + "epoch": 0.006461814645162571, + "grad_norm": 0.2643214166164398, + "learning_rate": 5.8580000000000005e-06, + "loss": 0.0633, + "step": 2930 + }, + { + "epoch": 0.0064838686200607365, + "grad_norm": 0.323943167924881, + "learning_rate": 5.8779999999999995e-06, + "loss": 0.0625, + "step": 2940 + }, + { + "epoch": 0.006505922594958902, + "grad_norm": 0.376961886882782, + "learning_rate": 5.898e-06, + "loss": 0.0616, + "step": 2950 + }, + { + "epoch": 0.006527976569857068, + "grad_norm": 0.266848087310791, + "learning_rate": 5.918000000000001e-06, + "loss": 0.064, + "step": 2960 + }, + { + "epoch": 0.006550030544755234, + "grad_norm": 0.2937847673892975, + "learning_rate": 5.938e-06, + "loss": 0.0635, + "step": 2970 + }, + { + "epoch": 0.0065720845196534, + "grad_norm": 0.2581852376461029, + "learning_rate": 5.958e-06, + "loss": 0.0624, + "step": 2980 + }, + { + "epoch": 0.006594138494551566, + "grad_norm": 0.2565463185310364, + "learning_rate": 5.978e-06, + "loss": 0.063, + "step": 2990 + }, + { + "epoch": 0.006616192469449732, + "grad_norm": 0.267559677362442, + "learning_rate": 5.998e-06, + "loss": 0.0603, + "step": 3000 + }, + { + "epoch": 0.0066382464443478974, + "grad_norm": 0.34105780720710754, + "learning_rate": 6.018e-06, + "loss": 0.0614, + "step": 3010 + }, + { + "epoch": 0.006660300419246062, + "grad_norm": 0.31085067987442017, + "learning_rate": 6.0380000000000005e-06, + "loss": 0.0647, + "step": 3020 + }, + { + "epoch": 0.006682354394144228, + "grad_norm": 0.3096911311149597, + "learning_rate": 6.058e-06, + "loss": 0.0647, + "step": 3030 + }, + { + "epoch": 0.006704408369042394, + "grad_norm": 0.22951480746269226, + "learning_rate": 6.078e-06, + "loss": 0.0635, + "step": 3040 + }, + { + "epoch": 0.00672646234394056, + "grad_norm": 0.23978067934513092, + "learning_rate": 6.098000000000001e-06, + "loss": 0.0623, + "step": 3050 + }, + { + "epoch": 0.006748516318838726, + "grad_norm": 0.29393652081489563, + "learning_rate": 6.118e-06, + "loss": 0.0638, + "step": 3060 + }, + { + "epoch": 0.006770570293736892, + "grad_norm": 0.3090970516204834, + "learning_rate": 6.138e-06, + "loss": 0.0644, + "step": 3070 + }, + { + "epoch": 0.0067926242686350575, + "grad_norm": 0.3653760552406311, + "learning_rate": 6.158e-06, + "loss": 0.0613, + "step": 3080 + }, + { + "epoch": 0.006814678243533223, + "grad_norm": 0.24779744446277618, + "learning_rate": 6.178e-06, + "loss": 0.0616, + "step": 3090 + }, + { + "epoch": 0.006836732218431389, + "grad_norm": 0.3206455707550049, + "learning_rate": 6.198000000000001e-06, + "loss": 0.0628, + "step": 3100 + }, + { + "epoch": 0.006858786193329555, + "grad_norm": 0.2806645333766937, + "learning_rate": 6.218e-06, + "loss": 0.0627, + "step": 3110 + }, + { + "epoch": 0.006880840168227721, + "grad_norm": 0.2615962028503418, + "learning_rate": 6.238e-06, + "loss": 0.0608, + "step": 3120 + }, + { + "epoch": 0.006902894143125887, + "grad_norm": 0.29614076018333435, + "learning_rate": 6.258e-06, + "loss": 0.0609, + "step": 3130 + }, + { + "epoch": 0.006924948118024052, + "grad_norm": 0.23091615736484528, + "learning_rate": 6.278e-06, + "loss": 0.0614, + "step": 3140 + }, + { + "epoch": 0.0069470020929222175, + "grad_norm": 0.37498924136161804, + "learning_rate": 6.2980000000000005e-06, + "loss": 0.0619, + "step": 3150 + }, + { + "epoch": 0.006969056067820383, + "grad_norm": 0.2550981938838959, + "learning_rate": 6.318e-06, + "loss": 0.06, + "step": 3160 + }, + { + "epoch": 0.006991110042718549, + "grad_norm": 0.25921159982681274, + "learning_rate": 6.338e-06, + "loss": 0.0596, + "step": 3170 + }, + { + "epoch": 0.007013164017616715, + "grad_norm": 0.28145331144332886, + "learning_rate": 6.358e-06, + "loss": 0.0636, + "step": 3180 + }, + { + "epoch": 0.007035217992514881, + "grad_norm": 0.23595786094665527, + "learning_rate": 6.378000000000001e-06, + "loss": 0.0594, + "step": 3190 + }, + { + "epoch": 0.007057271967413047, + "grad_norm": 0.33188968896865845, + "learning_rate": 6.3979999999999996e-06, + "loss": 0.065, + "step": 3200 + }, + { + "epoch": 0.007079325942311213, + "grad_norm": 0.24808284640312195, + "learning_rate": 6.418e-06, + "loss": 0.0664, + "step": 3210 + }, + { + "epoch": 0.0071013799172093784, + "grad_norm": 0.3112703561782837, + "learning_rate": 6.438000000000001e-06, + "loss": 0.0613, + "step": 3220 + }, + { + "epoch": 0.007123433892107544, + "grad_norm": 0.2585701644420624, + "learning_rate": 6.458e-06, + "loss": 0.0639, + "step": 3230 + }, + { + "epoch": 0.00714548786700571, + "grad_norm": 0.3886573016643524, + "learning_rate": 6.4780000000000005e-06, + "loss": 0.061, + "step": 3240 + }, + { + "epoch": 0.007167541841903876, + "grad_norm": 0.35702627897262573, + "learning_rate": 6.4979999999999994e-06, + "loss": 0.0663, + "step": 3250 + }, + { + "epoch": 0.007189595816802041, + "grad_norm": 0.32805517315864563, + "learning_rate": 6.518e-06, + "loss": 0.0628, + "step": 3260 + }, + { + "epoch": 0.007211649791700207, + "grad_norm": 0.3252144455909729, + "learning_rate": 6.538000000000001e-06, + "loss": 0.0579, + "step": 3270 + }, + { + "epoch": 0.007233703766598373, + "grad_norm": 0.2851217985153198, + "learning_rate": 6.558e-06, + "loss": 0.0632, + "step": 3280 + }, + { + "epoch": 0.0072557577414965385, + "grad_norm": 0.2506609857082367, + "learning_rate": 6.578e-06, + "loss": 0.0615, + "step": 3290 + }, + { + "epoch": 0.007277811716394704, + "grad_norm": 0.2701919972896576, + "learning_rate": 6.598e-06, + "loss": 0.0608, + "step": 3300 + }, + { + "epoch": 0.00729986569129287, + "grad_norm": 0.33619505167007446, + "learning_rate": 6.618e-06, + "loss": 0.0629, + "step": 3310 + }, + { + "epoch": 0.007321919666191036, + "grad_norm": 0.26977840065956116, + "learning_rate": 6.638e-06, + "loss": 0.0625, + "step": 3320 + }, + { + "epoch": 0.007343973641089202, + "grad_norm": 0.3098174035549164, + "learning_rate": 6.6580000000000005e-06, + "loss": 0.0631, + "step": 3330 + }, + { + "epoch": 0.007366027615987368, + "grad_norm": 0.26077786087989807, + "learning_rate": 6.678e-06, + "loss": 0.0627, + "step": 3340 + }, + { + "epoch": 0.0073880815908855335, + "grad_norm": 0.3439631760120392, + "learning_rate": 6.698e-06, + "loss": 0.0621, + "step": 3350 + }, + { + "epoch": 0.007410135565783699, + "grad_norm": 0.28788596391677856, + "learning_rate": 6.718000000000001e-06, + "loss": 0.0649, + "step": 3360 + }, + { + "epoch": 0.007432189540681864, + "grad_norm": 0.23983387649059296, + "learning_rate": 6.738e-06, + "loss": 0.0604, + "step": 3370 + }, + { + "epoch": 0.00745424351558003, + "grad_norm": 0.27308568358421326, + "learning_rate": 6.758e-06, + "loss": 0.0641, + "step": 3380 + }, + { + "epoch": 0.007476297490478196, + "grad_norm": 0.26832854747772217, + "learning_rate": 6.778000000000001e-06, + "loss": 0.065, + "step": 3390 + }, + { + "epoch": 0.007498351465376362, + "grad_norm": 0.30068346858024597, + "learning_rate": 6.798e-06, + "loss": 0.0622, + "step": 3400 + }, + { + "epoch": 0.007520405440274528, + "grad_norm": 0.2998885214328766, + "learning_rate": 6.818000000000001e-06, + "loss": 0.0623, + "step": 3410 + }, + { + "epoch": 0.007542459415172694, + "grad_norm": 0.3420950174331665, + "learning_rate": 6.838e-06, + "loss": 0.063, + "step": 3420 + }, + { + "epoch": 0.007564513390070859, + "grad_norm": 0.3471826910972595, + "learning_rate": 6.858e-06, + "loss": 0.0608, + "step": 3430 + }, + { + "epoch": 0.007586567364969025, + "grad_norm": 0.242545023560524, + "learning_rate": 6.878e-06, + "loss": 0.0617, + "step": 3440 + }, + { + "epoch": 0.007608621339867191, + "grad_norm": 0.2787421941757202, + "learning_rate": 6.898e-06, + "loss": 0.0626, + "step": 3450 + }, + { + "epoch": 0.007630675314765357, + "grad_norm": 0.2496810257434845, + "learning_rate": 6.9180000000000005e-06, + "loss": 0.0664, + "step": 3460 + }, + { + "epoch": 0.007652729289663523, + "grad_norm": 0.26940447092056274, + "learning_rate": 6.938e-06, + "loss": 0.0611, + "step": 3470 + }, + { + "epoch": 0.007674783264561689, + "grad_norm": 0.32606521248817444, + "learning_rate": 6.958e-06, + "loss": 0.0608, + "step": 3480 + }, + { + "epoch": 0.007696837239459854, + "grad_norm": 0.31853097677230835, + "learning_rate": 6.978e-06, + "loss": 0.0632, + "step": 3490 + }, + { + "epoch": 0.0077188912143580195, + "grad_norm": 0.2706216275691986, + "learning_rate": 6.998000000000001e-06, + "loss": 0.0592, + "step": 3500 + }, + { + "epoch": 0.007740945189256185, + "grad_norm": 0.3453659415245056, + "learning_rate": 7.0179999999999996e-06, + "loss": 0.0593, + "step": 3510 + }, + { + "epoch": 0.007762999164154351, + "grad_norm": 0.2361634224653244, + "learning_rate": 7.038e-06, + "loss": 0.06, + "step": 3520 + }, + { + "epoch": 0.007785053139052517, + "grad_norm": 0.31715482473373413, + "learning_rate": 7.058000000000001e-06, + "loss": 0.0623, + "step": 3530 + }, + { + "epoch": 0.007807107113950683, + "grad_norm": 0.2563883364200592, + "learning_rate": 7.078e-06, + "loss": 0.0644, + "step": 3540 + }, + { + "epoch": 0.007829161088848848, + "grad_norm": 0.2685178518295288, + "learning_rate": 7.0980000000000005e-06, + "loss": 0.0613, + "step": 3550 + }, + { + "epoch": 0.007851215063747015, + "grad_norm": 0.3310301899909973, + "learning_rate": 7.1179999999999994e-06, + "loss": 0.06, + "step": 3560 + }, + { + "epoch": 0.00787326903864518, + "grad_norm": 0.2616768777370453, + "learning_rate": 7.138e-06, + "loss": 0.0592, + "step": 3570 + }, + { + "epoch": 0.007895323013543346, + "grad_norm": 0.3107730448246002, + "learning_rate": 7.158000000000001e-06, + "loss": 0.059, + "step": 3580 + }, + { + "epoch": 0.007917376988441511, + "grad_norm": 0.2765670716762543, + "learning_rate": 7.178e-06, + "loss": 0.0612, + "step": 3590 + }, + { + "epoch": 0.007939430963339678, + "grad_norm": 0.31119582056999207, + "learning_rate": 7.198e-06, + "loss": 0.0592, + "step": 3600 + }, + { + "epoch": 0.007961484938237843, + "grad_norm": 0.4273374676704407, + "learning_rate": 7.218e-06, + "loss": 0.0603, + "step": 3610 + }, + { + "epoch": 0.00798353891313601, + "grad_norm": 0.38916483521461487, + "learning_rate": 7.238e-06, + "loss": 0.0622, + "step": 3620 + }, + { + "epoch": 0.008005592888034175, + "grad_norm": 0.2611677348613739, + "learning_rate": 7.258e-06, + "loss": 0.0568, + "step": 3630 + }, + { + "epoch": 0.008027646862932341, + "grad_norm": 0.39021772146224976, + "learning_rate": 7.2780000000000005e-06, + "loss": 0.0619, + "step": 3640 + }, + { + "epoch": 0.008049700837830506, + "grad_norm": 0.5480630993843079, + "learning_rate": 7.298e-06, + "loss": 0.0628, + "step": 3650 + }, + { + "epoch": 0.008071754812728673, + "grad_norm": 0.24865207076072693, + "learning_rate": 7.318e-06, + "loss": 0.062, + "step": 3660 + }, + { + "epoch": 0.008093808787626838, + "grad_norm": 0.3047155439853668, + "learning_rate": 7.338000000000001e-06, + "loss": 0.0602, + "step": 3670 + }, + { + "epoch": 0.008115862762525003, + "grad_norm": 0.26974332332611084, + "learning_rate": 7.358e-06, + "loss": 0.0644, + "step": 3680 + }, + { + "epoch": 0.00813791673742317, + "grad_norm": 0.35667678713798523, + "learning_rate": 7.378e-06, + "loss": 0.0618, + "step": 3690 + }, + { + "epoch": 0.008159970712321335, + "grad_norm": 0.21324078738689423, + "learning_rate": 7.398000000000001e-06, + "loss": 0.0605, + "step": 3700 + }, + { + "epoch": 0.008182024687219501, + "grad_norm": 0.28584450483322144, + "learning_rate": 7.418e-06, + "loss": 0.0602, + "step": 3710 + }, + { + "epoch": 0.008204078662117666, + "grad_norm": 0.27948451042175293, + "learning_rate": 7.438000000000001e-06, + "loss": 0.0627, + "step": 3720 + }, + { + "epoch": 0.008226132637015833, + "grad_norm": 0.24823912978172302, + "learning_rate": 7.4579999999999996e-06, + "loss": 0.0609, + "step": 3730 + }, + { + "epoch": 0.008248186611913998, + "grad_norm": 0.45835080742836, + "learning_rate": 7.478e-06, + "loss": 0.0599, + "step": 3740 + }, + { + "epoch": 0.008270240586812165, + "grad_norm": 0.30171751976013184, + "learning_rate": 7.498e-06, + "loss": 0.0593, + "step": 3750 + }, + { + "epoch": 0.00829229456171033, + "grad_norm": 0.2666544020175934, + "learning_rate": 7.518e-06, + "loss": 0.0609, + "step": 3760 + }, + { + "epoch": 0.008314348536608496, + "grad_norm": 0.3067034184932709, + "learning_rate": 7.538000000000001e-06, + "loss": 0.0587, + "step": 3770 + }, + { + "epoch": 0.008336402511506661, + "grad_norm": 0.2935773432254791, + "learning_rate": 7.558e-06, + "loss": 0.0604, + "step": 3780 + }, + { + "epoch": 0.008358456486404826, + "grad_norm": 0.2924072742462158, + "learning_rate": 7.578e-06, + "loss": 0.0578, + "step": 3790 + }, + { + "epoch": 0.008380510461302993, + "grad_norm": 0.26772022247314453, + "learning_rate": 7.597999999999999e-06, + "loss": 0.0588, + "step": 3800 + }, + { + "epoch": 0.008402564436201158, + "grad_norm": 0.23002734780311584, + "learning_rate": 7.618000000000001e-06, + "loss": 0.0571, + "step": 3810 + }, + { + "epoch": 0.008424618411099325, + "grad_norm": 0.24257367849349976, + "learning_rate": 7.638e-06, + "loss": 0.0574, + "step": 3820 + }, + { + "epoch": 0.00844667238599749, + "grad_norm": 0.3015168309211731, + "learning_rate": 7.658e-06, + "loss": 0.0617, + "step": 3830 + }, + { + "epoch": 0.008468726360895656, + "grad_norm": 0.28673112392425537, + "learning_rate": 7.678e-06, + "loss": 0.0617, + "step": 3840 + }, + { + "epoch": 0.008490780335793821, + "grad_norm": 0.25286832451820374, + "learning_rate": 7.698e-06, + "loss": 0.0607, + "step": 3850 + }, + { + "epoch": 0.008512834310691988, + "grad_norm": 0.31783536076545715, + "learning_rate": 7.718e-06, + "loss": 0.0587, + "step": 3860 + }, + { + "epoch": 0.008534888285590153, + "grad_norm": 0.26434755325317383, + "learning_rate": 7.738000000000001e-06, + "loss": 0.0586, + "step": 3870 + }, + { + "epoch": 0.00855694226048832, + "grad_norm": 0.22747065126895905, + "learning_rate": 7.758000000000001e-06, + "loss": 0.0602, + "step": 3880 + }, + { + "epoch": 0.008578996235386485, + "grad_norm": 0.3660389482975006, + "learning_rate": 7.777999999999999e-06, + "loss": 0.0608, + "step": 3890 + }, + { + "epoch": 0.008601050210284652, + "grad_norm": 0.31740331649780273, + "learning_rate": 7.798e-06, + "loss": 0.0595, + "step": 3900 + }, + { + "epoch": 0.008623104185182816, + "grad_norm": 0.32500943541526794, + "learning_rate": 7.818e-06, + "loss": 0.0565, + "step": 3910 + }, + { + "epoch": 0.008645158160080981, + "grad_norm": 0.32588082551956177, + "learning_rate": 7.838e-06, + "loss": 0.0624, + "step": 3920 + }, + { + "epoch": 0.008667212134979148, + "grad_norm": 0.3482590615749359, + "learning_rate": 7.858e-06, + "loss": 0.0596, + "step": 3930 + }, + { + "epoch": 0.008689266109877313, + "grad_norm": 0.29964977502822876, + "learning_rate": 7.878e-06, + "loss": 0.0578, + "step": 3940 + }, + { + "epoch": 0.00871132008477548, + "grad_norm": 0.4074782133102417, + "learning_rate": 7.898e-06, + "loss": 0.0623, + "step": 3950 + }, + { + "epoch": 0.008733374059673645, + "grad_norm": 0.29136916995048523, + "learning_rate": 7.918000000000001e-06, + "loss": 0.0601, + "step": 3960 + }, + { + "epoch": 0.008755428034571812, + "grad_norm": 0.24187502264976501, + "learning_rate": 7.938000000000001e-06, + "loss": 0.0577, + "step": 3970 + }, + { + "epoch": 0.008777482009469977, + "grad_norm": 0.37700366973876953, + "learning_rate": 7.957999999999999e-06, + "loss": 0.0601, + "step": 3980 + }, + { + "epoch": 0.008799535984368143, + "grad_norm": 0.2947405278682709, + "learning_rate": 7.978e-06, + "loss": 0.0594, + "step": 3990 + }, + { + "epoch": 0.008821589959266308, + "grad_norm": 0.21882131695747375, + "learning_rate": 7.998e-06, + "loss": 0.0574, + "step": 4000 + }, + { + "epoch": 0.008843643934164475, + "grad_norm": 0.22814391553401947, + "learning_rate": 8.018e-06, + "loss": 0.0577, + "step": 4010 + }, + { + "epoch": 0.00886569790906264, + "grad_norm": 0.22362060844898224, + "learning_rate": 8.038000000000002e-06, + "loss": 0.0595, + "step": 4020 + }, + { + "epoch": 0.008887751883960805, + "grad_norm": 0.28541770577430725, + "learning_rate": 8.058e-06, + "loss": 0.0584, + "step": 4030 + }, + { + "epoch": 0.008909805858858972, + "grad_norm": 0.2697514593601227, + "learning_rate": 8.078e-06, + "loss": 0.0628, + "step": 4040 + }, + { + "epoch": 0.008931859833757137, + "grad_norm": 0.20919664204120636, + "learning_rate": 8.098000000000001e-06, + "loss": 0.0566, + "step": 4050 + }, + { + "epoch": 0.008953913808655303, + "grad_norm": 0.2683131694793701, + "learning_rate": 8.118000000000001e-06, + "loss": 0.0606, + "step": 4060 + }, + { + "epoch": 0.008975967783553468, + "grad_norm": 0.23620542883872986, + "learning_rate": 8.138e-06, + "loss": 0.0559, + "step": 4070 + }, + { + "epoch": 0.008998021758451635, + "grad_norm": 0.2825395464897156, + "learning_rate": 8.158e-06, + "loss": 0.0594, + "step": 4080 + }, + { + "epoch": 0.0090200757333498, + "grad_norm": 0.30868908762931824, + "learning_rate": 8.178e-06, + "loss": 0.0593, + "step": 4090 + }, + { + "epoch": 0.009042129708247967, + "grad_norm": 0.3286108672618866, + "learning_rate": 8.198e-06, + "loss": 0.062, + "step": 4100 + }, + { + "epoch": 0.009064183683146132, + "grad_norm": 0.33780381083488464, + "learning_rate": 8.218e-06, + "loss": 0.0596, + "step": 4110 + }, + { + "epoch": 0.009086237658044298, + "grad_norm": 0.2622765600681305, + "learning_rate": 8.238e-06, + "loss": 0.0589, + "step": 4120 + }, + { + "epoch": 0.009108291632942463, + "grad_norm": 0.24875041842460632, + "learning_rate": 8.258e-06, + "loss": 0.0588, + "step": 4130 + }, + { + "epoch": 0.009130345607840628, + "grad_norm": 0.2365342378616333, + "learning_rate": 8.278e-06, + "loss": 0.0596, + "step": 4140 + }, + { + "epoch": 0.009152399582738795, + "grad_norm": 0.23841072618961334, + "learning_rate": 8.298000000000001e-06, + "loss": 0.0605, + "step": 4150 + }, + { + "epoch": 0.00917445355763696, + "grad_norm": 0.29331308603286743, + "learning_rate": 8.318e-06, + "loss": 0.0573, + "step": 4160 + }, + { + "epoch": 0.009196507532535127, + "grad_norm": 0.21334050595760345, + "learning_rate": 8.337999999999999e-06, + "loss": 0.0588, + "step": 4170 + }, + { + "epoch": 0.009218561507433292, + "grad_norm": 0.21543589234352112, + "learning_rate": 8.358e-06, + "loss": 0.0567, + "step": 4180 + }, + { + "epoch": 0.009240615482331458, + "grad_norm": 0.30685189366340637, + "learning_rate": 8.378e-06, + "loss": 0.0572, + "step": 4190 + }, + { + "epoch": 0.009262669457229623, + "grad_norm": 0.2933172881603241, + "learning_rate": 8.398e-06, + "loss": 0.0624, + "step": 4200 + }, + { + "epoch": 0.00928472343212779, + "grad_norm": 0.3261091411113739, + "learning_rate": 8.418000000000001e-06, + "loss": 0.0571, + "step": 4210 + }, + { + "epoch": 0.009306777407025955, + "grad_norm": 0.2657065689563751, + "learning_rate": 8.438e-06, + "loss": 0.0555, + "step": 4220 + }, + { + "epoch": 0.009328831381924122, + "grad_norm": 0.23623140156269073, + "learning_rate": 8.458e-06, + "loss": 0.0595, + "step": 4230 + }, + { + "epoch": 0.009350885356822287, + "grad_norm": 0.2751134932041168, + "learning_rate": 8.478e-06, + "loss": 0.0585, + "step": 4240 + }, + { + "epoch": 0.009372939331720453, + "grad_norm": 0.28332117199897766, + "learning_rate": 8.498e-06, + "loss": 0.0609, + "step": 4250 + }, + { + "epoch": 0.009394993306618618, + "grad_norm": 0.2717749774456024, + "learning_rate": 8.518e-06, + "loss": 0.0586, + "step": 4260 + }, + { + "epoch": 0.009417047281516783, + "grad_norm": 0.29004380106925964, + "learning_rate": 8.538e-06, + "loss": 0.0603, + "step": 4270 + }, + { + "epoch": 0.00943910125641495, + "grad_norm": 0.29459598660469055, + "learning_rate": 8.558e-06, + "loss": 0.0605, + "step": 4280 + }, + { + "epoch": 0.009461155231313115, + "grad_norm": 0.2577609419822693, + "learning_rate": 8.578e-06, + "loss": 0.0613, + "step": 4290 + }, + { + "epoch": 0.009483209206211282, + "grad_norm": 0.31732645630836487, + "learning_rate": 8.598000000000001e-06, + "loss": 0.0574, + "step": 4300 + }, + { + "epoch": 0.009505263181109447, + "grad_norm": 0.2260037213563919, + "learning_rate": 8.618e-06, + "loss": 0.0592, + "step": 4310 + }, + { + "epoch": 0.009527317156007614, + "grad_norm": 0.25486475229263306, + "learning_rate": 8.638e-06, + "loss": 0.0607, + "step": 4320 + }, + { + "epoch": 0.009549371130905778, + "grad_norm": 0.24553680419921875, + "learning_rate": 8.658e-06, + "loss": 0.0557, + "step": 4330 + }, + { + "epoch": 0.009571425105803945, + "grad_norm": 0.21142174303531647, + "learning_rate": 8.678e-06, + "loss": 0.0608, + "step": 4340 + }, + { + "epoch": 0.00959347908070211, + "grad_norm": 0.274966835975647, + "learning_rate": 8.698e-06, + "loss": 0.0556, + "step": 4350 + }, + { + "epoch": 0.009615533055600277, + "grad_norm": 0.21358132362365723, + "learning_rate": 8.718e-06, + "loss": 0.057, + "step": 4360 + }, + { + "epoch": 0.009637587030498442, + "grad_norm": 0.24318403005599976, + "learning_rate": 8.738e-06, + "loss": 0.0598, + "step": 4370 + }, + { + "epoch": 0.009659641005396607, + "grad_norm": 0.22157375514507294, + "learning_rate": 8.758e-06, + "loss": 0.0562, + "step": 4380 + }, + { + "epoch": 0.009681694980294774, + "grad_norm": 0.22918131947517395, + "learning_rate": 8.778000000000001e-06, + "loss": 0.0615, + "step": 4390 + }, + { + "epoch": 0.009703748955192939, + "grad_norm": 0.33655476570129395, + "learning_rate": 8.798000000000001e-06, + "loss": 0.0558, + "step": 4400 + }, + { + "epoch": 0.009725802930091105, + "grad_norm": 0.2451484501361847, + "learning_rate": 8.818e-06, + "loss": 0.0578, + "step": 4410 + }, + { + "epoch": 0.00974785690498927, + "grad_norm": 0.2485387623310089, + "learning_rate": 8.837999999999999e-06, + "loss": 0.0577, + "step": 4420 + }, + { + "epoch": 0.009769910879887437, + "grad_norm": 0.2955739498138428, + "learning_rate": 8.858e-06, + "loss": 0.0547, + "step": 4430 + }, + { + "epoch": 0.009791964854785602, + "grad_norm": 0.24922801554203033, + "learning_rate": 8.878e-06, + "loss": 0.0571, + "step": 4440 + }, + { + "epoch": 0.009814018829683769, + "grad_norm": 0.24564331769943237, + "learning_rate": 8.898e-06, + "loss": 0.0578, + "step": 4450 + }, + { + "epoch": 0.009836072804581934, + "grad_norm": 0.35592734813690186, + "learning_rate": 8.918e-06, + "loss": 0.0574, + "step": 4460 + }, + { + "epoch": 0.0098581267794801, + "grad_norm": 0.2930261492729187, + "learning_rate": 8.938e-06, + "loss": 0.0606, + "step": 4470 + }, + { + "epoch": 0.009880180754378265, + "grad_norm": 0.28131476044654846, + "learning_rate": 8.958e-06, + "loss": 0.0569, + "step": 4480 + }, + { + "epoch": 0.009902234729276432, + "grad_norm": 0.23457996547222137, + "learning_rate": 8.978000000000001e-06, + "loss": 0.0594, + "step": 4490 + }, + { + "epoch": 0.009924288704174597, + "grad_norm": 0.29706552624702454, + "learning_rate": 8.998000000000001e-06, + "loss": 0.0574, + "step": 4500 + }, + { + "epoch": 0.009946342679072762, + "grad_norm": 0.3209041357040405, + "learning_rate": 9.017999999999999e-06, + "loss": 0.0581, + "step": 4510 + }, + { + "epoch": 0.009968396653970929, + "grad_norm": 0.23292528092861176, + "learning_rate": 9.038e-06, + "loss": 0.0595, + "step": 4520 + }, + { + "epoch": 0.009990450628869094, + "grad_norm": 0.23914776742458344, + "learning_rate": 9.058e-06, + "loss": 0.0583, + "step": 4530 + }, + { + "epoch": 0.01001250460376726, + "grad_norm": 0.29901155829429626, + "learning_rate": 9.078e-06, + "loss": 0.0574, + "step": 4540 + }, + { + "epoch": 0.010034558578665425, + "grad_norm": 0.28524908423423767, + "learning_rate": 9.098e-06, + "loss": 0.0577, + "step": 4550 + }, + { + "epoch": 0.010056612553563592, + "grad_norm": 0.2554095983505249, + "learning_rate": 9.118e-06, + "loss": 0.0583, + "step": 4560 + }, + { + "epoch": 0.010078666528461757, + "grad_norm": 0.3629189133644104, + "learning_rate": 9.138e-06, + "loss": 0.0574, + "step": 4570 + }, + { + "epoch": 0.010100720503359924, + "grad_norm": 0.35136839747428894, + "learning_rate": 9.158000000000001e-06, + "loss": 0.0557, + "step": 4580 + }, + { + "epoch": 0.010122774478258089, + "grad_norm": 0.25652170181274414, + "learning_rate": 9.178000000000001e-06, + "loss": 0.0565, + "step": 4590 + }, + { + "epoch": 0.010144828453156255, + "grad_norm": 0.25941789150238037, + "learning_rate": 9.197999999999999e-06, + "loss": 0.0582, + "step": 4600 + }, + { + "epoch": 0.01016688242805442, + "grad_norm": 0.299848735332489, + "learning_rate": 9.218e-06, + "loss": 0.06, + "step": 4610 + }, + { + "epoch": 0.010188936402952585, + "grad_norm": 0.25899359583854675, + "learning_rate": 9.238e-06, + "loss": 0.0558, + "step": 4620 + }, + { + "epoch": 0.010210990377850752, + "grad_norm": 0.2373044192790985, + "learning_rate": 9.258e-06, + "loss": 0.0574, + "step": 4630 + }, + { + "epoch": 0.010233044352748917, + "grad_norm": 0.3429427742958069, + "learning_rate": 9.278000000000002e-06, + "loss": 0.058, + "step": 4640 + }, + { + "epoch": 0.010255098327647084, + "grad_norm": 0.22446012496948242, + "learning_rate": 9.298e-06, + "loss": 0.0585, + "step": 4650 + }, + { + "epoch": 0.010277152302545249, + "grad_norm": 0.23050202429294586, + "learning_rate": 9.318e-06, + "loss": 0.0542, + "step": 4660 + }, + { + "epoch": 0.010299206277443415, + "grad_norm": 0.22613675892353058, + "learning_rate": 9.338000000000001e-06, + "loss": 0.0587, + "step": 4670 + }, + { + "epoch": 0.01032126025234158, + "grad_norm": 0.22374628484249115, + "learning_rate": 9.358000000000001e-06, + "loss": 0.0562, + "step": 4680 + }, + { + "epoch": 0.010343314227239747, + "grad_norm": 0.2375580072402954, + "learning_rate": 9.378e-06, + "loss": 0.0538, + "step": 4690 + }, + { + "epoch": 0.010365368202137912, + "grad_norm": 0.3429473042488098, + "learning_rate": 9.398e-06, + "loss": 0.0565, + "step": 4700 + }, + { + "epoch": 0.010387422177036079, + "grad_norm": 0.2861204445362091, + "learning_rate": 9.418e-06, + "loss": 0.0565, + "step": 4710 + }, + { + "epoch": 0.010409476151934244, + "grad_norm": 0.2674434185028076, + "learning_rate": 9.438e-06, + "loss": 0.0586, + "step": 4720 + }, + { + "epoch": 0.010431530126832409, + "grad_norm": 0.2558889389038086, + "learning_rate": 9.458e-06, + "loss": 0.0561, + "step": 4730 + }, + { + "epoch": 0.010453584101730575, + "grad_norm": 0.24445241689682007, + "learning_rate": 9.478e-06, + "loss": 0.0569, + "step": 4740 + }, + { + "epoch": 0.01047563807662874, + "grad_norm": 0.20070789754390717, + "learning_rate": 9.498e-06, + "loss": 0.0558, + "step": 4750 + }, + { + "epoch": 0.010497692051526907, + "grad_norm": 0.28781846165657043, + "learning_rate": 9.518e-06, + "loss": 0.0612, + "step": 4760 + }, + { + "epoch": 0.010519746026425072, + "grad_norm": 0.22213181853294373, + "learning_rate": 9.538e-06, + "loss": 0.0548, + "step": 4770 + }, + { + "epoch": 0.010541800001323239, + "grad_norm": 0.2788512408733368, + "learning_rate": 9.558e-06, + "loss": 0.0576, + "step": 4780 + }, + { + "epoch": 0.010563853976221404, + "grad_norm": 0.21963061392307281, + "learning_rate": 9.577999999999999e-06, + "loss": 0.0581, + "step": 4790 + }, + { + "epoch": 0.01058590795111957, + "grad_norm": 0.23587004840373993, + "learning_rate": 9.598e-06, + "loss": 0.0538, + "step": 4800 + }, + { + "epoch": 0.010607961926017736, + "grad_norm": 0.2832496166229248, + "learning_rate": 9.618e-06, + "loss": 0.054, + "step": 4810 + }, + { + "epoch": 0.010630015900915902, + "grad_norm": 0.20732246339321136, + "learning_rate": 9.638e-06, + "loss": 0.0571, + "step": 4820 + }, + { + "epoch": 0.010652069875814067, + "grad_norm": 0.17400626838207245, + "learning_rate": 9.658000000000001e-06, + "loss": 0.055, + "step": 4830 + }, + { + "epoch": 0.010674123850712234, + "grad_norm": 0.260433167219162, + "learning_rate": 9.678e-06, + "loss": 0.0565, + "step": 4840 + }, + { + "epoch": 0.010696177825610399, + "grad_norm": 0.3052978217601776, + "learning_rate": 9.698e-06, + "loss": 0.0555, + "step": 4850 + }, + { + "epoch": 0.010718231800508564, + "grad_norm": 0.20729346573352814, + "learning_rate": 9.718e-06, + "loss": 0.055, + "step": 4860 + }, + { + "epoch": 0.01074028577540673, + "grad_norm": 0.2476194202899933, + "learning_rate": 9.738e-06, + "loss": 0.0561, + "step": 4870 + }, + { + "epoch": 0.010762339750304896, + "grad_norm": 0.2633949816226959, + "learning_rate": 9.758e-06, + "loss": 0.0565, + "step": 4880 + }, + { + "epoch": 0.010784393725203062, + "grad_norm": 0.27239900827407837, + "learning_rate": 9.778e-06, + "loss": 0.0586, + "step": 4890 + }, + { + "epoch": 0.010806447700101227, + "grad_norm": 0.28755971789360046, + "learning_rate": 9.798e-06, + "loss": 0.0556, + "step": 4900 + }, + { + "epoch": 0.010828501674999394, + "grad_norm": 0.28479138016700745, + "learning_rate": 9.818e-06, + "loss": 0.0552, + "step": 4910 + }, + { + "epoch": 0.010850555649897559, + "grad_norm": 0.2488296777009964, + "learning_rate": 9.838000000000001e-06, + "loss": 0.0571, + "step": 4920 + }, + { + "epoch": 0.010872609624795726, + "grad_norm": 0.23646678030490875, + "learning_rate": 9.858000000000001e-06, + "loss": 0.0566, + "step": 4930 + }, + { + "epoch": 0.01089466359969389, + "grad_norm": 0.2702755928039551, + "learning_rate": 9.878e-06, + "loss": 0.0564, + "step": 4940 + }, + { + "epoch": 0.010916717574592057, + "grad_norm": 0.2656896412372589, + "learning_rate": 9.898e-06, + "loss": 0.0581, + "step": 4950 + }, + { + "epoch": 0.010938771549490222, + "grad_norm": 0.2322610765695572, + "learning_rate": 9.918e-06, + "loss": 0.058, + "step": 4960 + }, + { + "epoch": 0.010960825524388387, + "grad_norm": 0.19388830661773682, + "learning_rate": 9.938e-06, + "loss": 0.0571, + "step": 4970 + }, + { + "epoch": 0.010982879499286554, + "grad_norm": 0.33587324619293213, + "learning_rate": 9.958e-06, + "loss": 0.0577, + "step": 4980 + }, + { + "epoch": 0.011004933474184719, + "grad_norm": 0.2569821774959564, + "learning_rate": 9.978e-06, + "loss": 0.054, + "step": 4990 + }, + { + "epoch": 0.011026987449082886, + "grad_norm": 0.21988193690776825, + "learning_rate": 9.998e-06, + "loss": 0.0554, + "step": 5000 + }, + { + "epoch": 0.01104904142398105, + "grad_norm": 0.24852299690246582, + "learning_rate": 1.0018000000000001e-05, + "loss": 0.0543, + "step": 5010 + }, + { + "epoch": 0.011071095398879217, + "grad_norm": 0.25232943892478943, + "learning_rate": 1.0038000000000001e-05, + "loss": 0.0557, + "step": 5020 + }, + { + "epoch": 0.011093149373777382, + "grad_norm": 0.26900073885917664, + "learning_rate": 1.0058e-05, + "loss": 0.0549, + "step": 5030 + }, + { + "epoch": 0.011115203348675549, + "grad_norm": 0.23082850873470306, + "learning_rate": 1.0077999999999999e-05, + "loss": 0.0586, + "step": 5040 + }, + { + "epoch": 0.011137257323573714, + "grad_norm": 0.294994980096817, + "learning_rate": 1.0098e-05, + "loss": 0.0583, + "step": 5050 + }, + { + "epoch": 0.01115931129847188, + "grad_norm": 0.2888437807559967, + "learning_rate": 1.0118e-05, + "loss": 0.0574, + "step": 5060 + }, + { + "epoch": 0.011181365273370046, + "grad_norm": 0.33343419432640076, + "learning_rate": 1.0138e-05, + "loss": 0.0541, + "step": 5070 + }, + { + "epoch": 0.011203419248268212, + "grad_norm": 0.33629223704338074, + "learning_rate": 1.0158e-05, + "loss": 0.0516, + "step": 5080 + }, + { + "epoch": 0.011225473223166377, + "grad_norm": 0.23765337467193604, + "learning_rate": 1.0178e-05, + "loss": 0.0554, + "step": 5090 + }, + { + "epoch": 0.011247527198064542, + "grad_norm": 0.2315746545791626, + "learning_rate": 1.0198e-05, + "loss": 0.054, + "step": 5100 + }, + { + "epoch": 0.011269581172962709, + "grad_norm": 0.2640497088432312, + "learning_rate": 1.0218000000000001e-05, + "loss": 0.0572, + "step": 5110 + }, + { + "epoch": 0.011291635147860874, + "grad_norm": 0.22572806477546692, + "learning_rate": 1.0238000000000001e-05, + "loss": 0.0552, + "step": 5120 + }, + { + "epoch": 0.01131368912275904, + "grad_norm": 0.32811635732650757, + "learning_rate": 1.0257999999999999e-05, + "loss": 0.0543, + "step": 5130 + }, + { + "epoch": 0.011335743097657206, + "grad_norm": 0.2973591983318329, + "learning_rate": 1.0278e-05, + "loss": 0.0537, + "step": 5140 + }, + { + "epoch": 0.011357797072555373, + "grad_norm": 0.2621936798095703, + "learning_rate": 1.0298e-05, + "loss": 0.0577, + "step": 5150 + }, + { + "epoch": 0.011379851047453537, + "grad_norm": 0.2829953730106354, + "learning_rate": 1.0318e-05, + "loss": 0.0558, + "step": 5160 + }, + { + "epoch": 0.011401905022351704, + "grad_norm": 0.2446788102388382, + "learning_rate": 1.0338e-05, + "loss": 0.056, + "step": 5170 + }, + { + "epoch": 0.01142395899724987, + "grad_norm": 0.28703954815864563, + "learning_rate": 1.0358e-05, + "loss": 0.0538, + "step": 5180 + }, + { + "epoch": 0.011446012972148036, + "grad_norm": 0.2455293834209442, + "learning_rate": 1.0378e-05, + "loss": 0.0549, + "step": 5190 + }, + { + "epoch": 0.0114680669470462, + "grad_norm": 0.268676221370697, + "learning_rate": 1.0398000000000001e-05, + "loss": 0.056, + "step": 5200 + }, + { + "epoch": 0.011490120921944366, + "grad_norm": 0.24970373511314392, + "learning_rate": 1.0418000000000001e-05, + "loss": 0.0551, + "step": 5210 + }, + { + "epoch": 0.011512174896842533, + "grad_norm": 0.22058038413524628, + "learning_rate": 1.0437999999999999e-05, + "loss": 0.0527, + "step": 5220 + }, + { + "epoch": 0.011534228871740698, + "grad_norm": 0.21639348566532135, + "learning_rate": 1.0458e-05, + "loss": 0.0566, + "step": 5230 + }, + { + "epoch": 0.011556282846638864, + "grad_norm": 0.23265330493450165, + "learning_rate": 1.0478e-05, + "loss": 0.0546, + "step": 5240 + }, + { + "epoch": 0.01157833682153703, + "grad_norm": 0.32153233885765076, + "learning_rate": 1.0498e-05, + "loss": 0.0582, + "step": 5250 + }, + { + "epoch": 0.011600390796435196, + "grad_norm": 0.26935774087905884, + "learning_rate": 1.0518000000000002e-05, + "loss": 0.0551, + "step": 5260 + }, + { + "epoch": 0.011622444771333361, + "grad_norm": 0.24868260324001312, + "learning_rate": 1.0538e-05, + "loss": 0.0573, + "step": 5270 + }, + { + "epoch": 0.011644498746231528, + "grad_norm": 0.21609461307525635, + "learning_rate": 1.0558e-05, + "loss": 0.0549, + "step": 5280 + }, + { + "epoch": 0.011666552721129693, + "grad_norm": 0.3015119731426239, + "learning_rate": 1.0578000000000001e-05, + "loss": 0.0566, + "step": 5290 + }, + { + "epoch": 0.01168860669602786, + "grad_norm": 0.30674347281455994, + "learning_rate": 1.0598000000000001e-05, + "loss": 0.0558, + "step": 5300 + }, + { + "epoch": 0.011710660670926024, + "grad_norm": 0.18593810498714447, + "learning_rate": 1.0618e-05, + "loss": 0.0551, + "step": 5310 + }, + { + "epoch": 0.01173271464582419, + "grad_norm": 0.31692370772361755, + "learning_rate": 1.0638e-05, + "loss": 0.0559, + "step": 5320 + }, + { + "epoch": 0.011754768620722356, + "grad_norm": 0.2365453690290451, + "learning_rate": 1.0658e-05, + "loss": 0.0569, + "step": 5330 + }, + { + "epoch": 0.011776822595620521, + "grad_norm": 0.18835842609405518, + "learning_rate": 1.0678e-05, + "loss": 0.0558, + "step": 5340 + }, + { + "epoch": 0.011798876570518688, + "grad_norm": 0.2204076200723648, + "learning_rate": 1.0698e-05, + "loss": 0.0553, + "step": 5350 + }, + { + "epoch": 0.011820930545416853, + "grad_norm": 0.23166564106941223, + "learning_rate": 1.0718000000000001e-05, + "loss": 0.0544, + "step": 5360 + }, + { + "epoch": 0.01184298452031502, + "grad_norm": 0.22818811237812042, + "learning_rate": 1.0738e-05, + "loss": 0.0548, + "step": 5370 + }, + { + "epoch": 0.011865038495213184, + "grad_norm": 0.2560960352420807, + "learning_rate": 1.0758e-05, + "loss": 0.0564, + "step": 5380 + }, + { + "epoch": 0.011887092470111351, + "grad_norm": 0.2721211910247803, + "learning_rate": 1.0778e-05, + "loss": 0.0533, + "step": 5390 + }, + { + "epoch": 0.011909146445009516, + "grad_norm": 0.22009602189064026, + "learning_rate": 1.0798e-05, + "loss": 0.0538, + "step": 5400 + }, + { + "epoch": 0.011931200419907683, + "grad_norm": 0.29308605194091797, + "learning_rate": 1.0817999999999999e-05, + "loss": 0.0552, + "step": 5410 + }, + { + "epoch": 0.011953254394805848, + "grad_norm": 0.27353501319885254, + "learning_rate": 1.0838e-05, + "loss": 0.0556, + "step": 5420 + }, + { + "epoch": 0.011975308369704014, + "grad_norm": 0.23343591392040253, + "learning_rate": 1.0858e-05, + "loss": 0.0539, + "step": 5430 + }, + { + "epoch": 0.01199736234460218, + "grad_norm": 0.24155740439891815, + "learning_rate": 1.0878e-05, + "loss": 0.0551, + "step": 5440 + }, + { + "epoch": 0.012019416319500344, + "grad_norm": 0.2803206145763397, + "learning_rate": 1.0898000000000001e-05, + "loss": 0.0536, + "step": 5450 + }, + { + "epoch": 0.012041470294398511, + "grad_norm": 0.3495491147041321, + "learning_rate": 1.0918e-05, + "loss": 0.0557, + "step": 5460 + }, + { + "epoch": 0.012063524269296676, + "grad_norm": 0.2705005705356598, + "learning_rate": 1.0938e-05, + "loss": 0.0571, + "step": 5470 + }, + { + "epoch": 0.012085578244194843, + "grad_norm": 0.2523742914199829, + "learning_rate": 1.0958e-05, + "loss": 0.0567, + "step": 5480 + }, + { + "epoch": 0.012107632219093008, + "grad_norm": 0.2744428813457489, + "learning_rate": 1.0978e-05, + "loss": 0.0593, + "step": 5490 + }, + { + "epoch": 0.012129686193991174, + "grad_norm": 0.21891586482524872, + "learning_rate": 1.0998e-05, + "loss": 0.0557, + "step": 5500 + }, + { + "epoch": 0.01215174016888934, + "grad_norm": 0.30228638648986816, + "learning_rate": 1.1018e-05, + "loss": 0.056, + "step": 5510 + }, + { + "epoch": 0.012173794143787506, + "grad_norm": 0.27503079175949097, + "learning_rate": 1.1038e-05, + "loss": 0.0534, + "step": 5520 + }, + { + "epoch": 0.012195848118685671, + "grad_norm": 0.2332761585712433, + "learning_rate": 1.1058e-05, + "loss": 0.0555, + "step": 5530 + }, + { + "epoch": 0.012217902093583838, + "grad_norm": 0.2660626173019409, + "learning_rate": 1.1078000000000001e-05, + "loss": 0.0559, + "step": 5540 + }, + { + "epoch": 0.012239956068482003, + "grad_norm": 0.2079872339963913, + "learning_rate": 1.1098000000000001e-05, + "loss": 0.0533, + "step": 5550 + }, + { + "epoch": 0.012262010043380168, + "grad_norm": 0.2143123894929886, + "learning_rate": 1.1118e-05, + "loss": 0.0543, + "step": 5560 + }, + { + "epoch": 0.012284064018278334, + "grad_norm": 0.24103057384490967, + "learning_rate": 1.1138e-05, + "loss": 0.0533, + "step": 5570 + }, + { + "epoch": 0.0123061179931765, + "grad_norm": 0.1660945862531662, + "learning_rate": 1.1158e-05, + "loss": 0.053, + "step": 5580 + }, + { + "epoch": 0.012328171968074666, + "grad_norm": 0.18919117748737335, + "learning_rate": 1.1178e-05, + "loss": 0.056, + "step": 5590 + }, + { + "epoch": 0.012350225942972831, + "grad_norm": 0.21702130138874054, + "learning_rate": 1.1198e-05, + "loss": 0.0557, + "step": 5600 + }, + { + "epoch": 0.012372279917870998, + "grad_norm": 0.2108684778213501, + "learning_rate": 1.1218e-05, + "loss": 0.0544, + "step": 5610 + }, + { + "epoch": 0.012394333892769163, + "grad_norm": 0.23368525505065918, + "learning_rate": 1.1238e-05, + "loss": 0.0554, + "step": 5620 + }, + { + "epoch": 0.01241638786766733, + "grad_norm": 0.183770552277565, + "learning_rate": 1.1258000000000001e-05, + "loss": 0.0539, + "step": 5630 + }, + { + "epoch": 0.012438441842565495, + "grad_norm": 0.22425153851509094, + "learning_rate": 1.1278000000000001e-05, + "loss": 0.0555, + "step": 5640 + }, + { + "epoch": 0.012460495817463661, + "grad_norm": 0.21193422377109528, + "learning_rate": 1.1298e-05, + "loss": 0.0531, + "step": 5650 + }, + { + "epoch": 0.012482549792361826, + "grad_norm": 0.2662297189235687, + "learning_rate": 1.1317999999999999e-05, + "loss": 0.0544, + "step": 5660 + }, + { + "epoch": 0.012504603767259993, + "grad_norm": 0.24672529101371765, + "learning_rate": 1.1338e-05, + "loss": 0.0555, + "step": 5670 + }, + { + "epoch": 0.012526657742158158, + "grad_norm": 0.23796379566192627, + "learning_rate": 1.1358e-05, + "loss": 0.0578, + "step": 5680 + }, + { + "epoch": 0.012548711717056323, + "grad_norm": 0.2757438123226166, + "learning_rate": 1.1378e-05, + "loss": 0.0545, + "step": 5690 + }, + { + "epoch": 0.01257076569195449, + "grad_norm": 0.2230820208787918, + "learning_rate": 1.1398e-05, + "loss": 0.057, + "step": 5700 + }, + { + "epoch": 0.012592819666852655, + "grad_norm": 0.2829890549182892, + "learning_rate": 1.1418e-05, + "loss": 0.0555, + "step": 5710 + }, + { + "epoch": 0.012614873641750821, + "grad_norm": 0.25633668899536133, + "learning_rate": 1.1438e-05, + "loss": 0.0535, + "step": 5720 + }, + { + "epoch": 0.012636927616648986, + "grad_norm": 0.29842695593833923, + "learning_rate": 1.1458000000000001e-05, + "loss": 0.0569, + "step": 5730 + }, + { + "epoch": 0.012658981591547153, + "grad_norm": 0.19899365305900574, + "learning_rate": 1.1478000000000001e-05, + "loss": 0.0547, + "step": 5740 + }, + { + "epoch": 0.012681035566445318, + "grad_norm": 0.22494958341121674, + "learning_rate": 1.1497999999999999e-05, + "loss": 0.053, + "step": 5750 + }, + { + "epoch": 0.012703089541343485, + "grad_norm": 0.22715908288955688, + "learning_rate": 1.1518e-05, + "loss": 0.0544, + "step": 5760 + }, + { + "epoch": 0.01272514351624165, + "grad_norm": 0.2688244581222534, + "learning_rate": 1.1538e-05, + "loss": 0.0572, + "step": 5770 + }, + { + "epoch": 0.012747197491139816, + "grad_norm": 0.27353107929229736, + "learning_rate": 1.1558e-05, + "loss": 0.056, + "step": 5780 + }, + { + "epoch": 0.012769251466037981, + "grad_norm": 0.22430983185768127, + "learning_rate": 1.1578000000000002e-05, + "loss": 0.0547, + "step": 5790 + }, + { + "epoch": 0.012791305440936146, + "grad_norm": 0.2459115982055664, + "learning_rate": 1.1598e-05, + "loss": 0.053, + "step": 5800 + }, + { + "epoch": 0.012813359415834313, + "grad_norm": 0.22110265493392944, + "learning_rate": 1.1618e-05, + "loss": 0.0544, + "step": 5810 + }, + { + "epoch": 0.012835413390732478, + "grad_norm": 0.22485661506652832, + "learning_rate": 1.1638000000000001e-05, + "loss": 0.0523, + "step": 5820 + }, + { + "epoch": 0.012857467365630645, + "grad_norm": 0.17196382582187653, + "learning_rate": 1.1658000000000001e-05, + "loss": 0.0515, + "step": 5830 + }, + { + "epoch": 0.01287952134052881, + "grad_norm": 0.3035365641117096, + "learning_rate": 1.1677999999999999e-05, + "loss": 0.0556, + "step": 5840 + }, + { + "epoch": 0.012901575315426976, + "grad_norm": 0.21505942940711975, + "learning_rate": 1.1698e-05, + "loss": 0.0538, + "step": 5850 + }, + { + "epoch": 0.012923629290325141, + "grad_norm": 0.23731686174869537, + "learning_rate": 1.1718e-05, + "loss": 0.0525, + "step": 5860 + }, + { + "epoch": 0.012945683265223308, + "grad_norm": 0.2423177808523178, + "learning_rate": 1.1738e-05, + "loss": 0.0541, + "step": 5870 + }, + { + "epoch": 0.012967737240121473, + "grad_norm": 0.22558912634849548, + "learning_rate": 1.1758000000000002e-05, + "loss": 0.0548, + "step": 5880 + }, + { + "epoch": 0.01298979121501964, + "grad_norm": 0.20690171420574188, + "learning_rate": 1.1778e-05, + "loss": 0.0539, + "step": 5890 + }, + { + "epoch": 0.013011845189917805, + "grad_norm": 0.2036699801683426, + "learning_rate": 1.1798e-05, + "loss": 0.0543, + "step": 5900 + }, + { + "epoch": 0.013033899164815971, + "grad_norm": 0.20415185391902924, + "learning_rate": 1.1818000000000001e-05, + "loss": 0.0535, + "step": 5910 + }, + { + "epoch": 0.013055953139714136, + "grad_norm": 0.3156754970550537, + "learning_rate": 1.1838e-05, + "loss": 0.0547, + "step": 5920 + }, + { + "epoch": 0.013078007114612301, + "grad_norm": 0.20219798386096954, + "learning_rate": 1.1858e-05, + "loss": 0.0543, + "step": 5930 + }, + { + "epoch": 0.013100061089510468, + "grad_norm": 0.21625296771526337, + "learning_rate": 1.1878e-05, + "loss": 0.0553, + "step": 5940 + }, + { + "epoch": 0.013122115064408633, + "grad_norm": 0.24593745172023773, + "learning_rate": 1.1898e-05, + "loss": 0.0531, + "step": 5950 + }, + { + "epoch": 0.0131441690393068, + "grad_norm": 0.2308315485715866, + "learning_rate": 1.1918e-05, + "loss": 0.0554, + "step": 5960 + }, + { + "epoch": 0.013166223014204965, + "grad_norm": 0.19488172233104706, + "learning_rate": 1.1938e-05, + "loss": 0.0593, + "step": 5970 + }, + { + "epoch": 0.013188276989103132, + "grad_norm": 0.18813557922840118, + "learning_rate": 1.1958000000000001e-05, + "loss": 0.0557, + "step": 5980 + }, + { + "epoch": 0.013210330964001296, + "grad_norm": 0.23395954072475433, + "learning_rate": 1.1978e-05, + "loss": 0.0543, + "step": 5990 + }, + { + "epoch": 0.013232384938899463, + "grad_norm": 0.24725018441677094, + "learning_rate": 1.1998e-05, + "loss": 0.0553, + "step": 6000 + }, + { + "epoch": 0.013254438913797628, + "grad_norm": 0.23884902894496918, + "learning_rate": 1.2018e-05, + "loss": 0.0551, + "step": 6010 + }, + { + "epoch": 0.013276492888695795, + "grad_norm": 0.23261256515979767, + "learning_rate": 1.2038e-05, + "loss": 0.0523, + "step": 6020 + }, + { + "epoch": 0.01329854686359396, + "grad_norm": 0.19332483410835266, + "learning_rate": 1.2058e-05, + "loss": 0.0541, + "step": 6030 + }, + { + "epoch": 0.013320600838492125, + "grad_norm": 0.19036760926246643, + "learning_rate": 1.2078e-05, + "loss": 0.052, + "step": 6040 + }, + { + "epoch": 0.013342654813390292, + "grad_norm": 0.1950320452451706, + "learning_rate": 1.2098e-05, + "loss": 0.0529, + "step": 6050 + }, + { + "epoch": 0.013364708788288457, + "grad_norm": 0.25137197971343994, + "learning_rate": 1.2118e-05, + "loss": 0.0542, + "step": 6060 + }, + { + "epoch": 0.013386762763186623, + "grad_norm": 0.21463464200496674, + "learning_rate": 1.2138000000000001e-05, + "loss": 0.0523, + "step": 6070 + }, + { + "epoch": 0.013408816738084788, + "grad_norm": 0.21450018882751465, + "learning_rate": 1.2158e-05, + "loss": 0.0537, + "step": 6080 + }, + { + "epoch": 0.013430870712982955, + "grad_norm": 0.21272356808185577, + "learning_rate": 1.2178e-05, + "loss": 0.0543, + "step": 6090 + }, + { + "epoch": 0.01345292468788112, + "grad_norm": 0.17603573203086853, + "learning_rate": 1.2198e-05, + "loss": 0.0523, + "step": 6100 + }, + { + "epoch": 0.013474978662779287, + "grad_norm": 0.2698224186897278, + "learning_rate": 1.2218e-05, + "loss": 0.0558, + "step": 6110 + }, + { + "epoch": 0.013497032637677452, + "grad_norm": 0.239791139960289, + "learning_rate": 1.2238e-05, + "loss": 0.0542, + "step": 6120 + }, + { + "epoch": 0.013519086612575618, + "grad_norm": 0.19685699045658112, + "learning_rate": 1.2258e-05, + "loss": 0.0522, + "step": 6130 + }, + { + "epoch": 0.013541140587473783, + "grad_norm": 0.1942872256040573, + "learning_rate": 1.2278e-05, + "loss": 0.0535, + "step": 6140 + }, + { + "epoch": 0.013563194562371948, + "grad_norm": 0.23316338658332825, + "learning_rate": 1.2298e-05, + "loss": 0.0553, + "step": 6150 + }, + { + "epoch": 0.013585248537270115, + "grad_norm": 0.22797027230262756, + "learning_rate": 1.2318000000000001e-05, + "loss": 0.0531, + "step": 6160 + }, + { + "epoch": 0.01360730251216828, + "grad_norm": 0.2039550095796585, + "learning_rate": 1.2338000000000001e-05, + "loss": 0.0572, + "step": 6170 + }, + { + "epoch": 0.013629356487066447, + "grad_norm": 0.2054409384727478, + "learning_rate": 1.2358e-05, + "loss": 0.0533, + "step": 6180 + }, + { + "epoch": 0.013651410461964612, + "grad_norm": 0.2624484896659851, + "learning_rate": 1.2378e-05, + "loss": 0.0534, + "step": 6190 + }, + { + "epoch": 0.013673464436862778, + "grad_norm": 0.23179051280021667, + "learning_rate": 1.2398e-05, + "loss": 0.0542, + "step": 6200 + }, + { + "epoch": 0.013695518411760943, + "grad_norm": 0.19968749582767487, + "learning_rate": 1.2418e-05, + "loss": 0.0496, + "step": 6210 + }, + { + "epoch": 0.01371757238665911, + "grad_norm": 0.22836701571941376, + "learning_rate": 1.2438000000000002e-05, + "loss": 0.0534, + "step": 6220 + }, + { + "epoch": 0.013739626361557275, + "grad_norm": 0.2507542371749878, + "learning_rate": 1.2458e-05, + "loss": 0.0521, + "step": 6230 + }, + { + "epoch": 0.013761680336455442, + "grad_norm": 0.23295515775680542, + "learning_rate": 1.2478e-05, + "loss": 0.0551, + "step": 6240 + }, + { + "epoch": 0.013783734311353607, + "grad_norm": 0.25672855973243713, + "learning_rate": 1.2498000000000001e-05, + "loss": 0.0538, + "step": 6250 + }, + { + "epoch": 0.013805788286251773, + "grad_norm": 0.19772526621818542, + "learning_rate": 1.2518000000000001e-05, + "loss": 0.0522, + "step": 6260 + }, + { + "epoch": 0.013827842261149938, + "grad_norm": 0.2178756296634674, + "learning_rate": 1.2538e-05, + "loss": 0.0533, + "step": 6270 + }, + { + "epoch": 0.013849896236048103, + "grad_norm": 0.20835131406784058, + "learning_rate": 1.2558e-05, + "loss": 0.0532, + "step": 6280 + }, + { + "epoch": 0.01387195021094627, + "grad_norm": 0.2244410216808319, + "learning_rate": 1.2578e-05, + "loss": 0.0532, + "step": 6290 + }, + { + "epoch": 0.013894004185844435, + "grad_norm": 0.21213115751743317, + "learning_rate": 1.2598e-05, + "loss": 0.0534, + "step": 6300 + }, + { + "epoch": 0.013916058160742602, + "grad_norm": 0.19685019552707672, + "learning_rate": 1.2618e-05, + "loss": 0.051, + "step": 6310 + }, + { + "epoch": 0.013938112135640767, + "grad_norm": 0.25899818539619446, + "learning_rate": 1.2638e-05, + "loss": 0.0537, + "step": 6320 + }, + { + "epoch": 0.013960166110538933, + "grad_norm": 0.20324259996414185, + "learning_rate": 1.2658e-05, + "loss": 0.0524, + "step": 6330 + }, + { + "epoch": 0.013982220085437098, + "grad_norm": 0.2674635350704193, + "learning_rate": 1.2678e-05, + "loss": 0.0529, + "step": 6340 + }, + { + "epoch": 0.014004274060335265, + "grad_norm": 0.23732565343379974, + "learning_rate": 1.2698000000000001e-05, + "loss": 0.0533, + "step": 6350 + }, + { + "epoch": 0.01402632803523343, + "grad_norm": 0.22770585119724274, + "learning_rate": 1.2718000000000001e-05, + "loss": 0.0546, + "step": 6360 + }, + { + "epoch": 0.014048382010131597, + "grad_norm": 0.21605637669563293, + "learning_rate": 1.2737999999999999e-05, + "loss": 0.0527, + "step": 6370 + }, + { + "epoch": 0.014070435985029762, + "grad_norm": 0.2328803390264511, + "learning_rate": 1.2758e-05, + "loss": 0.0572, + "step": 6380 + }, + { + "epoch": 0.014092489959927927, + "grad_norm": 0.2304060459136963, + "learning_rate": 1.2778e-05, + "loss": 0.0527, + "step": 6390 + }, + { + "epoch": 0.014114543934826093, + "grad_norm": 0.28768107295036316, + "learning_rate": 1.2798e-05, + "loss": 0.0538, + "step": 6400 + }, + { + "epoch": 0.014136597909724258, + "grad_norm": 0.2566186785697937, + "learning_rate": 1.2818000000000002e-05, + "loss": 0.0506, + "step": 6410 + }, + { + "epoch": 0.014158651884622425, + "grad_norm": 0.27003908157348633, + "learning_rate": 1.2838e-05, + "loss": 0.0523, + "step": 6420 + }, + { + "epoch": 0.01418070585952059, + "grad_norm": 0.21128253638744354, + "learning_rate": 1.2858e-05, + "loss": 0.0545, + "step": 6430 + }, + { + "epoch": 0.014202759834418757, + "grad_norm": 0.19891968369483948, + "learning_rate": 1.2878000000000001e-05, + "loss": 0.0532, + "step": 6440 + }, + { + "epoch": 0.014224813809316922, + "grad_norm": 0.25419580936431885, + "learning_rate": 1.2898000000000001e-05, + "loss": 0.0521, + "step": 6450 + }, + { + "epoch": 0.014246867784215089, + "grad_norm": 0.26541832089424133, + "learning_rate": 1.2917999999999999e-05, + "loss": 0.0546, + "step": 6460 + }, + { + "epoch": 0.014268921759113254, + "grad_norm": 0.2809012830257416, + "learning_rate": 1.2938e-05, + "loss": 0.0559, + "step": 6470 + }, + { + "epoch": 0.01429097573401142, + "grad_norm": 0.24063530564308167, + "learning_rate": 1.2958e-05, + "loss": 0.0528, + "step": 6480 + }, + { + "epoch": 0.014313029708909585, + "grad_norm": 0.2232658714056015, + "learning_rate": 1.2978e-05, + "loss": 0.0533, + "step": 6490 + }, + { + "epoch": 0.014335083683807752, + "grad_norm": 0.2619054615497589, + "learning_rate": 1.2998000000000002e-05, + "loss": 0.0534, + "step": 6500 + }, + { + "epoch": 0.014357137658705917, + "grad_norm": 0.25805097818374634, + "learning_rate": 1.3018e-05, + "loss": 0.0546, + "step": 6510 + }, + { + "epoch": 0.014379191633604082, + "grad_norm": 0.27423855662345886, + "learning_rate": 1.3038e-05, + "loss": 0.0528, + "step": 6520 + }, + { + "epoch": 0.014401245608502249, + "grad_norm": 0.30606332421302795, + "learning_rate": 1.3058000000000001e-05, + "loss": 0.054, + "step": 6530 + }, + { + "epoch": 0.014423299583400414, + "grad_norm": 0.22839264571666718, + "learning_rate": 1.3078e-05, + "loss": 0.0516, + "step": 6540 + }, + { + "epoch": 0.01444535355829858, + "grad_norm": 0.2497618943452835, + "learning_rate": 1.3098e-05, + "loss": 0.0522, + "step": 6550 + }, + { + "epoch": 0.014467407533196745, + "grad_norm": 0.24815215170383453, + "learning_rate": 1.3118e-05, + "loss": 0.0541, + "step": 6560 + }, + { + "epoch": 0.014489461508094912, + "grad_norm": 0.25625473260879517, + "learning_rate": 1.3138e-05, + "loss": 0.0549, + "step": 6570 + }, + { + "epoch": 0.014511515482993077, + "grad_norm": 0.2164168655872345, + "learning_rate": 1.3158e-05, + "loss": 0.0549, + "step": 6580 + }, + { + "epoch": 0.014533569457891244, + "grad_norm": 0.24251316487789154, + "learning_rate": 1.3178000000000002e-05, + "loss": 0.0559, + "step": 6590 + }, + { + "epoch": 0.014555623432789409, + "grad_norm": 0.2130284160375595, + "learning_rate": 1.3198000000000001e-05, + "loss": 0.0534, + "step": 6600 + }, + { + "epoch": 0.014577677407687575, + "grad_norm": 0.2537199854850769, + "learning_rate": 1.3218e-05, + "loss": 0.0527, + "step": 6610 + }, + { + "epoch": 0.01459973138258574, + "grad_norm": 0.2530772089958191, + "learning_rate": 1.3238e-05, + "loss": 0.0526, + "step": 6620 + }, + { + "epoch": 0.014621785357483905, + "grad_norm": 0.20588862895965576, + "learning_rate": 1.3258e-05, + "loss": 0.054, + "step": 6630 + }, + { + "epoch": 0.014643839332382072, + "grad_norm": 0.2314484715461731, + "learning_rate": 1.3278e-05, + "loss": 0.0498, + "step": 6640 + }, + { + "epoch": 0.014665893307280237, + "grad_norm": 0.3414452373981476, + "learning_rate": 1.3298e-05, + "loss": 0.0528, + "step": 6650 + }, + { + "epoch": 0.014687947282178404, + "grad_norm": 0.272035151720047, + "learning_rate": 1.3318e-05, + "loss": 0.0545, + "step": 6660 + }, + { + "epoch": 0.014710001257076569, + "grad_norm": 0.21264702081680298, + "learning_rate": 1.3338e-05, + "loss": 0.051, + "step": 6670 + }, + { + "epoch": 0.014732055231974735, + "grad_norm": 0.18512214720249176, + "learning_rate": 1.3358e-05, + "loss": 0.0544, + "step": 6680 + }, + { + "epoch": 0.0147541092068729, + "grad_norm": 0.24913454055786133, + "learning_rate": 1.3378000000000001e-05, + "loss": 0.0525, + "step": 6690 + }, + { + "epoch": 0.014776163181771067, + "grad_norm": 0.18428218364715576, + "learning_rate": 1.3398e-05, + "loss": 0.0515, + "step": 6700 + }, + { + "epoch": 0.014798217156669232, + "grad_norm": 0.18678995966911316, + "learning_rate": 1.3418e-05, + "loss": 0.054, + "step": 6710 + }, + { + "epoch": 0.014820271131567399, + "grad_norm": 0.184461310505867, + "learning_rate": 1.3438e-05, + "loss": 0.054, + "step": 6720 + }, + { + "epoch": 0.014842325106465564, + "grad_norm": 0.18939700722694397, + "learning_rate": 1.3458e-05, + "loss": 0.0519, + "step": 6730 + }, + { + "epoch": 0.014864379081363729, + "grad_norm": 0.25126150250434875, + "learning_rate": 1.3478e-05, + "loss": 0.054, + "step": 6740 + }, + { + "epoch": 0.014886433056261895, + "grad_norm": 0.19192638993263245, + "learning_rate": 1.3498e-05, + "loss": 0.0509, + "step": 6750 + }, + { + "epoch": 0.01490848703116006, + "grad_norm": 0.2094321846961975, + "learning_rate": 1.3518e-05, + "loss": 0.0533, + "step": 6760 + }, + { + "epoch": 0.014930541006058227, + "grad_norm": 0.21215103566646576, + "learning_rate": 1.3538e-05, + "loss": 0.052, + "step": 6770 + }, + { + "epoch": 0.014952594980956392, + "grad_norm": 0.2387271225452423, + "learning_rate": 1.3558000000000001e-05, + "loss": 0.0525, + "step": 6780 + }, + { + "epoch": 0.014974648955854559, + "grad_norm": 0.21928320825099945, + "learning_rate": 1.3578000000000001e-05, + "loss": 0.0526, + "step": 6790 + }, + { + "epoch": 0.014996702930752724, + "grad_norm": 0.21776659786701202, + "learning_rate": 1.3598e-05, + "loss": 0.0548, + "step": 6800 + }, + { + "epoch": 0.01501875690565089, + "grad_norm": 0.21616902947425842, + "learning_rate": 1.3618e-05, + "loss": 0.0519, + "step": 6810 + }, + { + "epoch": 0.015040810880549055, + "grad_norm": 0.23736758530139923, + "learning_rate": 1.3638e-05, + "loss": 0.0518, + "step": 6820 + }, + { + "epoch": 0.015062864855447222, + "grad_norm": 0.2864232659339905, + "learning_rate": 1.3658e-05, + "loss": 0.0541, + "step": 6830 + }, + { + "epoch": 0.015084918830345387, + "grad_norm": 0.21740347146987915, + "learning_rate": 1.3678000000000002e-05, + "loss": 0.0541, + "step": 6840 + }, + { + "epoch": 0.015106972805243554, + "grad_norm": 0.18633809685707092, + "learning_rate": 1.3698e-05, + "loss": 0.0521, + "step": 6850 + }, + { + "epoch": 0.015129026780141719, + "grad_norm": 0.2286366969347, + "learning_rate": 1.3718e-05, + "loss": 0.0505, + "step": 6860 + }, + { + "epoch": 0.015151080755039884, + "grad_norm": 0.19989734888076782, + "learning_rate": 1.3738000000000001e-05, + "loss": 0.0523, + "step": 6870 + }, + { + "epoch": 0.01517313472993805, + "grad_norm": 0.22600312530994415, + "learning_rate": 1.3758000000000001e-05, + "loss": 0.0549, + "step": 6880 + }, + { + "epoch": 0.015195188704836216, + "grad_norm": 0.22751250863075256, + "learning_rate": 1.3778e-05, + "loss": 0.0529, + "step": 6890 + }, + { + "epoch": 0.015217242679734382, + "grad_norm": 0.20790846645832062, + "learning_rate": 1.3798e-05, + "loss": 0.0511, + "step": 6900 + }, + { + "epoch": 0.015239296654632547, + "grad_norm": 0.2456839382648468, + "learning_rate": 1.3818e-05, + "loss": 0.0498, + "step": 6910 + }, + { + "epoch": 0.015261350629530714, + "grad_norm": 0.2444455623626709, + "learning_rate": 1.3838e-05, + "loss": 0.0551, + "step": 6920 + }, + { + "epoch": 0.015283404604428879, + "grad_norm": 0.2307882159948349, + "learning_rate": 1.3858e-05, + "loss": 0.0478, + "step": 6930 + }, + { + "epoch": 0.015305458579327046, + "grad_norm": 0.2712048292160034, + "learning_rate": 1.3878e-05, + "loss": 0.0541, + "step": 6940 + }, + { + "epoch": 0.01532751255422521, + "grad_norm": 0.19443735480308533, + "learning_rate": 1.3898e-05, + "loss": 0.0516, + "step": 6950 + }, + { + "epoch": 0.015349566529123377, + "grad_norm": 0.24664384126663208, + "learning_rate": 1.3918e-05, + "loss": 0.0533, + "step": 6960 + }, + { + "epoch": 0.015371620504021542, + "grad_norm": 0.24078591167926788, + "learning_rate": 1.3938000000000001e-05, + "loss": 0.05, + "step": 6970 + }, + { + "epoch": 0.015393674478919707, + "grad_norm": 0.19158951938152313, + "learning_rate": 1.3958000000000001e-05, + "loss": 0.0519, + "step": 6980 + }, + { + "epoch": 0.015415728453817874, + "grad_norm": 0.19449582695960999, + "learning_rate": 1.3977999999999999e-05, + "loss": 0.0522, + "step": 6990 + }, + { + "epoch": 0.015437782428716039, + "grad_norm": 0.23425239324569702, + "learning_rate": 1.3998e-05, + "loss": 0.0516, + "step": 7000 + }, + { + "epoch": 0.015459836403614206, + "grad_norm": 0.19884148240089417, + "learning_rate": 1.4018e-05, + "loss": 0.0519, + "step": 7010 + }, + { + "epoch": 0.01548189037851237, + "grad_norm": 0.19722257554531097, + "learning_rate": 1.4038e-05, + "loss": 0.0499, + "step": 7020 + }, + { + "epoch": 0.015503944353410537, + "grad_norm": 0.215051531791687, + "learning_rate": 1.4058000000000002e-05, + "loss": 0.0524, + "step": 7030 + }, + { + "epoch": 0.015525998328308702, + "grad_norm": 0.19781139492988586, + "learning_rate": 1.4078e-05, + "loss": 0.0512, + "step": 7040 + }, + { + "epoch": 0.015548052303206869, + "grad_norm": 0.19600249826908112, + "learning_rate": 1.4098e-05, + "loss": 0.0487, + "step": 7050 + }, + { + "epoch": 0.015570106278105034, + "grad_norm": 0.2286180555820465, + "learning_rate": 1.4118000000000001e-05, + "loss": 0.0523, + "step": 7060 + }, + { + "epoch": 0.0155921602530032, + "grad_norm": 0.21966098248958588, + "learning_rate": 1.4138e-05, + "loss": 0.053, + "step": 7070 + }, + { + "epoch": 0.015614214227901366, + "grad_norm": 0.31877315044403076, + "learning_rate": 1.4158e-05, + "loss": 0.0517, + "step": 7080 + }, + { + "epoch": 0.015636268202799532, + "grad_norm": 0.258185476064682, + "learning_rate": 1.4178e-05, + "loss": 0.0521, + "step": 7090 + }, + { + "epoch": 0.015658322177697696, + "grad_norm": 0.26160821318626404, + "learning_rate": 1.4198e-05, + "loss": 0.0522, + "step": 7100 + }, + { + "epoch": 0.015680376152595862, + "grad_norm": 0.203607439994812, + "learning_rate": 1.4218e-05, + "loss": 0.0547, + "step": 7110 + }, + { + "epoch": 0.01570243012749403, + "grad_norm": 0.17326895892620087, + "learning_rate": 1.4238000000000002e-05, + "loss": 0.0525, + "step": 7120 + }, + { + "epoch": 0.015724484102392196, + "grad_norm": 0.22587984800338745, + "learning_rate": 1.4258e-05, + "loss": 0.0506, + "step": 7130 + }, + { + "epoch": 0.01574653807729036, + "grad_norm": 0.24968160688877106, + "learning_rate": 1.4278e-05, + "loss": 0.052, + "step": 7140 + }, + { + "epoch": 0.015768592052188526, + "grad_norm": 0.2030867040157318, + "learning_rate": 1.4298000000000001e-05, + "loss": 0.0497, + "step": 7150 + }, + { + "epoch": 0.015790646027086692, + "grad_norm": 0.210909903049469, + "learning_rate": 1.4318e-05, + "loss": 0.0527, + "step": 7160 + }, + { + "epoch": 0.01581270000198486, + "grad_norm": 0.20292481780052185, + "learning_rate": 1.4338e-05, + "loss": 0.0525, + "step": 7170 + }, + { + "epoch": 0.015834753976883022, + "grad_norm": 0.26816526055336, + "learning_rate": 1.4358e-05, + "loss": 0.0507, + "step": 7180 + }, + { + "epoch": 0.01585680795178119, + "grad_norm": 0.20499949157238007, + "learning_rate": 1.4378e-05, + "loss": 0.0529, + "step": 7190 + }, + { + "epoch": 0.015878861926679356, + "grad_norm": 0.21028561890125275, + "learning_rate": 1.4398e-05, + "loss": 0.0533, + "step": 7200 + }, + { + "epoch": 0.015900915901577523, + "grad_norm": 0.18625155091285706, + "learning_rate": 1.4418000000000002e-05, + "loss": 0.049, + "step": 7210 + }, + { + "epoch": 0.015922969876475686, + "grad_norm": 0.19303570687770844, + "learning_rate": 1.4438000000000001e-05, + "loss": 0.051, + "step": 7220 + }, + { + "epoch": 0.015945023851373853, + "grad_norm": 0.21123197674751282, + "learning_rate": 1.4458e-05, + "loss": 0.0512, + "step": 7230 + }, + { + "epoch": 0.01596707782627202, + "grad_norm": 0.2795341908931732, + "learning_rate": 1.4478e-05, + "loss": 0.0513, + "step": 7240 + }, + { + "epoch": 0.015989131801170182, + "grad_norm": 0.1799897849559784, + "learning_rate": 1.4498e-05, + "loss": 0.0523, + "step": 7250 + }, + { + "epoch": 0.01601118577606835, + "grad_norm": 0.20411722362041473, + "learning_rate": 1.4518e-05, + "loss": 0.0512, + "step": 7260 + }, + { + "epoch": 0.016033239750966516, + "grad_norm": 0.20573630928993225, + "learning_rate": 1.4538e-05, + "loss": 0.0495, + "step": 7270 + }, + { + "epoch": 0.016055293725864683, + "grad_norm": 0.23036569356918335, + "learning_rate": 1.4558e-05, + "loss": 0.0529, + "step": 7280 + }, + { + "epoch": 0.016077347700762846, + "grad_norm": 0.19112274050712585, + "learning_rate": 1.4578e-05, + "loss": 0.0506, + "step": 7290 + }, + { + "epoch": 0.016099401675661013, + "grad_norm": 0.266683965921402, + "learning_rate": 1.4598e-05, + "loss": 0.0537, + "step": 7300 + }, + { + "epoch": 0.01612145565055918, + "grad_norm": 0.24860864877700806, + "learning_rate": 1.4618000000000001e-05, + "loss": 0.051, + "step": 7310 + }, + { + "epoch": 0.016143509625457346, + "grad_norm": 0.2478235512971878, + "learning_rate": 1.4638e-05, + "loss": 0.0539, + "step": 7320 + }, + { + "epoch": 0.01616556360035551, + "grad_norm": 0.22833551466464996, + "learning_rate": 1.4658e-05, + "loss": 0.0498, + "step": 7330 + }, + { + "epoch": 0.016187617575253676, + "grad_norm": 0.24763043224811554, + "learning_rate": 1.4678e-05, + "loss": 0.055, + "step": 7340 + }, + { + "epoch": 0.016209671550151843, + "grad_norm": 0.30790987610816956, + "learning_rate": 1.4698e-05, + "loss": 0.052, + "step": 7350 + }, + { + "epoch": 0.016231725525050006, + "grad_norm": 0.2595881223678589, + "learning_rate": 1.4718e-05, + "loss": 0.0527, + "step": 7360 + }, + { + "epoch": 0.016253779499948173, + "grad_norm": 0.3177628219127655, + "learning_rate": 1.4738e-05, + "loss": 0.0511, + "step": 7370 + }, + { + "epoch": 0.01627583347484634, + "grad_norm": 0.30119454860687256, + "learning_rate": 1.4758e-05, + "loss": 0.0507, + "step": 7380 + }, + { + "epoch": 0.016297887449744506, + "grad_norm": 0.228239506483078, + "learning_rate": 1.4778e-05, + "loss": 0.0516, + "step": 7390 + }, + { + "epoch": 0.01631994142464267, + "grad_norm": 0.1759062260389328, + "learning_rate": 1.4798000000000001e-05, + "loss": 0.0525, + "step": 7400 + }, + { + "epoch": 0.016341995399540836, + "grad_norm": 0.25458383560180664, + "learning_rate": 1.4818000000000001e-05, + "loss": 0.0501, + "step": 7410 + }, + { + "epoch": 0.016364049374439003, + "grad_norm": 0.20893266797065735, + "learning_rate": 1.4838e-05, + "loss": 0.0525, + "step": 7420 + }, + { + "epoch": 0.01638610334933717, + "grad_norm": 0.22534596920013428, + "learning_rate": 1.4858e-05, + "loss": 0.0488, + "step": 7430 + }, + { + "epoch": 0.016408157324235333, + "grad_norm": 0.23035256564617157, + "learning_rate": 1.4878e-05, + "loss": 0.053, + "step": 7440 + }, + { + "epoch": 0.0164302112991335, + "grad_norm": 0.21633362770080566, + "learning_rate": 1.4898e-05, + "loss": 0.0522, + "step": 7450 + }, + { + "epoch": 0.016452265274031666, + "grad_norm": 0.21311159431934357, + "learning_rate": 1.4918000000000002e-05, + "loss": 0.052, + "step": 7460 + }, + { + "epoch": 0.01647431924892983, + "grad_norm": 0.19857266545295715, + "learning_rate": 1.4938e-05, + "loss": 0.0506, + "step": 7470 + }, + { + "epoch": 0.016496373223827996, + "grad_norm": 0.22794677317142487, + "learning_rate": 1.4958e-05, + "loss": 0.0512, + "step": 7480 + }, + { + "epoch": 0.016518427198726163, + "grad_norm": 0.1994977593421936, + "learning_rate": 1.4978000000000001e-05, + "loss": 0.0554, + "step": 7490 + }, + { + "epoch": 0.01654048117362433, + "grad_norm": 0.25342124700546265, + "learning_rate": 1.4998000000000001e-05, + "loss": 0.0497, + "step": 7500 + }, + { + "epoch": 0.016562535148522493, + "grad_norm": 0.21941567957401276, + "learning_rate": 1.5018000000000001e-05, + "loss": 0.0542, + "step": 7510 + }, + { + "epoch": 0.01658458912342066, + "grad_norm": 0.2513224184513092, + "learning_rate": 1.5037999999999999e-05, + "loss": 0.0506, + "step": 7520 + }, + { + "epoch": 0.016606643098318826, + "grad_norm": 0.17372111976146698, + "learning_rate": 1.5058e-05, + "loss": 0.0526, + "step": 7530 + }, + { + "epoch": 0.016628697073216993, + "grad_norm": 0.27871736884117126, + "learning_rate": 1.5078000000000002e-05, + "loss": 0.0556, + "step": 7540 + }, + { + "epoch": 0.016650751048115156, + "grad_norm": 0.2620364725589752, + "learning_rate": 1.5098e-05, + "loss": 0.0543, + "step": 7550 + }, + { + "epoch": 0.016672805023013323, + "grad_norm": 0.2295795977115631, + "learning_rate": 1.5118e-05, + "loss": 0.0505, + "step": 7560 + }, + { + "epoch": 0.01669485899791149, + "grad_norm": 0.24145501852035522, + "learning_rate": 1.5138000000000001e-05, + "loss": 0.0502, + "step": 7570 + }, + { + "epoch": 0.016716912972809653, + "grad_norm": 0.17568273842334747, + "learning_rate": 1.5158e-05, + "loss": 0.0474, + "step": 7580 + }, + { + "epoch": 0.01673896694770782, + "grad_norm": 0.2156372219324112, + "learning_rate": 1.5178000000000001e-05, + "loss": 0.0501, + "step": 7590 + }, + { + "epoch": 0.016761020922605986, + "grad_norm": 0.21067404747009277, + "learning_rate": 1.5198000000000003e-05, + "loss": 0.0493, + "step": 7600 + }, + { + "epoch": 0.016783074897504153, + "grad_norm": 0.20680232346057892, + "learning_rate": 1.5217999999999999e-05, + "loss": 0.0508, + "step": 7610 + }, + { + "epoch": 0.016805128872402316, + "grad_norm": 0.19222471117973328, + "learning_rate": 1.5238e-05, + "loss": 0.0523, + "step": 7620 + }, + { + "epoch": 0.016827182847300483, + "grad_norm": 0.15709199011325836, + "learning_rate": 1.5258000000000002e-05, + "loss": 0.0501, + "step": 7630 + }, + { + "epoch": 0.01684923682219865, + "grad_norm": 0.21644054353237152, + "learning_rate": 1.5278e-05, + "loss": 0.0517, + "step": 7640 + }, + { + "epoch": 0.016871290797096816, + "grad_norm": 0.2347736656665802, + "learning_rate": 1.5298e-05, + "loss": 0.0511, + "step": 7650 + }, + { + "epoch": 0.01689334477199498, + "grad_norm": 0.18600808084011078, + "learning_rate": 1.5318e-05, + "loss": 0.049, + "step": 7660 + }, + { + "epoch": 0.016915398746893146, + "grad_norm": 0.25950267910957336, + "learning_rate": 1.5338e-05, + "loss": 0.0532, + "step": 7670 + }, + { + "epoch": 0.016937452721791313, + "grad_norm": 0.1718023270368576, + "learning_rate": 1.5358e-05, + "loss": 0.0524, + "step": 7680 + }, + { + "epoch": 0.016959506696689476, + "grad_norm": 0.28055238723754883, + "learning_rate": 1.5377999999999997e-05, + "loss": 0.0524, + "step": 7690 + }, + { + "epoch": 0.016981560671587643, + "grad_norm": 0.17155461013317108, + "learning_rate": 1.5398e-05, + "loss": 0.0506, + "step": 7700 + }, + { + "epoch": 0.01700361464648581, + "grad_norm": 0.19725483655929565, + "learning_rate": 1.5418e-05, + "loss": 0.0496, + "step": 7710 + }, + { + "epoch": 0.017025668621383976, + "grad_norm": 0.24731144309043884, + "learning_rate": 1.5438e-05, + "loss": 0.0525, + "step": 7720 + }, + { + "epoch": 0.01704772259628214, + "grad_norm": 0.18755152821540833, + "learning_rate": 1.5458e-05, + "loss": 0.0512, + "step": 7730 + }, + { + "epoch": 0.017069776571180306, + "grad_norm": 0.18122684955596924, + "learning_rate": 1.5478e-05, + "loss": 0.0504, + "step": 7740 + }, + { + "epoch": 0.017091830546078473, + "grad_norm": 0.16906939446926117, + "learning_rate": 1.5498e-05, + "loss": 0.0515, + "step": 7750 + }, + { + "epoch": 0.01711388452097664, + "grad_norm": 0.23409870266914368, + "learning_rate": 1.5518e-05, + "loss": 0.0513, + "step": 7760 + }, + { + "epoch": 0.017135938495874803, + "grad_norm": 0.22171127796173096, + "learning_rate": 1.5538000000000003e-05, + "loss": 0.0506, + "step": 7770 + }, + { + "epoch": 0.01715799247077297, + "grad_norm": 0.34373483061790466, + "learning_rate": 1.5558e-05, + "loss": 0.0528, + "step": 7780 + }, + { + "epoch": 0.017180046445671136, + "grad_norm": 0.18574070930480957, + "learning_rate": 1.5578e-05, + "loss": 0.0537, + "step": 7790 + }, + { + "epoch": 0.017202100420569303, + "grad_norm": 0.22858242690563202, + "learning_rate": 1.5598000000000002e-05, + "loss": 0.0521, + "step": 7800 + }, + { + "epoch": 0.017224154395467466, + "grad_norm": 0.18675969541072845, + "learning_rate": 1.5618e-05, + "loss": 0.0538, + "step": 7810 + }, + { + "epoch": 0.017246208370365633, + "grad_norm": 0.23186562955379486, + "learning_rate": 1.5638000000000002e-05, + "loss": 0.0497, + "step": 7820 + }, + { + "epoch": 0.0172682623452638, + "grad_norm": 0.2526072859764099, + "learning_rate": 1.5658e-05, + "loss": 0.0512, + "step": 7830 + }, + { + "epoch": 0.017290316320161963, + "grad_norm": 0.2183874100446701, + "learning_rate": 1.5677999999999998e-05, + "loss": 0.051, + "step": 7840 + }, + { + "epoch": 0.01731237029506013, + "grad_norm": 0.17222356796264648, + "learning_rate": 1.5698e-05, + "loss": 0.0498, + "step": 7850 + }, + { + "epoch": 0.017334424269958296, + "grad_norm": 0.20113737881183624, + "learning_rate": 1.5718e-05, + "loss": 0.05, + "step": 7860 + }, + { + "epoch": 0.017356478244856463, + "grad_norm": 0.17191410064697266, + "learning_rate": 1.5737999999999997e-05, + "loss": 0.051, + "step": 7870 + }, + { + "epoch": 0.017378532219754626, + "grad_norm": 0.250554621219635, + "learning_rate": 1.5758e-05, + "loss": 0.0513, + "step": 7880 + }, + { + "epoch": 0.017400586194652793, + "grad_norm": 0.20576544106006622, + "learning_rate": 1.5778e-05, + "loss": 0.0526, + "step": 7890 + }, + { + "epoch": 0.01742264016955096, + "grad_norm": 0.19663357734680176, + "learning_rate": 1.5798e-05, + "loss": 0.0528, + "step": 7900 + }, + { + "epoch": 0.017444694144449126, + "grad_norm": 0.16757600009441376, + "learning_rate": 1.5818e-05, + "loss": 0.0487, + "step": 7910 + }, + { + "epoch": 0.01746674811934729, + "grad_norm": 0.1591968834400177, + "learning_rate": 1.5838e-05, + "loss": 0.0495, + "step": 7920 + }, + { + "epoch": 0.017488802094245456, + "grad_norm": 0.1614818125963211, + "learning_rate": 1.5858e-05, + "loss": 0.0526, + "step": 7930 + }, + { + "epoch": 0.017510856069143623, + "grad_norm": 0.19728083908557892, + "learning_rate": 1.5878e-05, + "loss": 0.0498, + "step": 7940 + }, + { + "epoch": 0.017532910044041786, + "grad_norm": 0.17271815240383148, + "learning_rate": 1.5898000000000003e-05, + "loss": 0.0495, + "step": 7950 + }, + { + "epoch": 0.017554964018939953, + "grad_norm": 0.19307036697864532, + "learning_rate": 1.5918e-05, + "loss": 0.0501, + "step": 7960 + }, + { + "epoch": 0.01757701799383812, + "grad_norm": 0.2211480289697647, + "learning_rate": 1.5938e-05, + "loss": 0.0507, + "step": 7970 + }, + { + "epoch": 0.017599071968736286, + "grad_norm": 0.1850193589925766, + "learning_rate": 1.5958000000000002e-05, + "loss": 0.0514, + "step": 7980 + }, + { + "epoch": 0.01762112594363445, + "grad_norm": 0.18562085926532745, + "learning_rate": 1.5978e-05, + "loss": 0.0521, + "step": 7990 + }, + { + "epoch": 0.017643179918532616, + "grad_norm": 0.1822761744260788, + "learning_rate": 1.5998e-05, + "loss": 0.0509, + "step": 8000 + }, + { + "epoch": 0.017665233893430783, + "grad_norm": 0.1951671987771988, + "learning_rate": 1.6018e-05, + "loss": 0.0512, + "step": 8010 + }, + { + "epoch": 0.01768728786832895, + "grad_norm": 0.2130981832742691, + "learning_rate": 1.6037999999999998e-05, + "loss": 0.0481, + "step": 8020 + }, + { + "epoch": 0.017709341843227113, + "grad_norm": 0.18469052016735077, + "learning_rate": 1.6058e-05, + "loss": 0.0521, + "step": 8030 + }, + { + "epoch": 0.01773139581812528, + "grad_norm": 0.22373712062835693, + "learning_rate": 1.6078e-05, + "loss": 0.0524, + "step": 8040 + }, + { + "epoch": 0.017753449793023447, + "grad_norm": 0.2073076367378235, + "learning_rate": 1.6098e-05, + "loss": 0.0503, + "step": 8050 + }, + { + "epoch": 0.01777550376792161, + "grad_norm": 0.20461128652095795, + "learning_rate": 1.6118e-05, + "loss": 0.0522, + "step": 8060 + }, + { + "epoch": 0.017797557742819776, + "grad_norm": 0.21583066880702972, + "learning_rate": 1.6138e-05, + "loss": 0.0496, + "step": 8070 + }, + { + "epoch": 0.017819611717717943, + "grad_norm": 0.2775830328464508, + "learning_rate": 1.6158e-05, + "loss": 0.0515, + "step": 8080 + }, + { + "epoch": 0.01784166569261611, + "grad_norm": 0.22770071029663086, + "learning_rate": 1.6178e-05, + "loss": 0.0519, + "step": 8090 + }, + { + "epoch": 0.017863719667514273, + "grad_norm": 0.24268120527267456, + "learning_rate": 1.6198000000000003e-05, + "loss": 0.0546, + "step": 8100 + }, + { + "epoch": 0.01788577364241244, + "grad_norm": 0.24623814225196838, + "learning_rate": 1.6218e-05, + "loss": 0.0533, + "step": 8110 + }, + { + "epoch": 0.017907827617310607, + "grad_norm": 0.2034955769777298, + "learning_rate": 1.6238e-05, + "loss": 0.048, + "step": 8120 + }, + { + "epoch": 0.017929881592208773, + "grad_norm": 0.2053639143705368, + "learning_rate": 1.6258000000000003e-05, + "loss": 0.053, + "step": 8130 + }, + { + "epoch": 0.017951935567106937, + "grad_norm": 0.22012478113174438, + "learning_rate": 1.6278e-05, + "loss": 0.053, + "step": 8140 + }, + { + "epoch": 0.017973989542005103, + "grad_norm": 0.2735626697540283, + "learning_rate": 1.6298000000000002e-05, + "loss": 0.0531, + "step": 8150 + }, + { + "epoch": 0.01799604351690327, + "grad_norm": 0.17728598415851593, + "learning_rate": 1.6318000000000002e-05, + "loss": 0.0479, + "step": 8160 + }, + { + "epoch": 0.018018097491801433, + "grad_norm": 0.1967795342206955, + "learning_rate": 1.6338e-05, + "loss": 0.051, + "step": 8170 + }, + { + "epoch": 0.0180401514666996, + "grad_norm": 0.2169613540172577, + "learning_rate": 1.6358e-05, + "loss": 0.0506, + "step": 8180 + }, + { + "epoch": 0.018062205441597767, + "grad_norm": 0.18675784766674042, + "learning_rate": 1.6378e-05, + "loss": 0.0507, + "step": 8190 + }, + { + "epoch": 0.018084259416495933, + "grad_norm": 0.24386359751224518, + "learning_rate": 1.6398e-05, + "loss": 0.052, + "step": 8200 + }, + { + "epoch": 0.018106313391394097, + "grad_norm": 0.22122615575790405, + "learning_rate": 1.6418e-05, + "loss": 0.0494, + "step": 8210 + }, + { + "epoch": 0.018128367366292263, + "grad_norm": 0.290922075510025, + "learning_rate": 1.6438e-05, + "loss": 0.0528, + "step": 8220 + }, + { + "epoch": 0.01815042134119043, + "grad_norm": 0.1794348508119583, + "learning_rate": 1.6458e-05, + "loss": 0.051, + "step": 8230 + }, + { + "epoch": 0.018172475316088597, + "grad_norm": 0.20874358713626862, + "learning_rate": 1.6478e-05, + "loss": 0.0507, + "step": 8240 + }, + { + "epoch": 0.01819452929098676, + "grad_norm": 0.1648881584405899, + "learning_rate": 1.6498000000000004e-05, + "loss": 0.0487, + "step": 8250 + }, + { + "epoch": 0.018216583265884927, + "grad_norm": 0.16724713146686554, + "learning_rate": 1.6518e-05, + "loss": 0.0502, + "step": 8260 + }, + { + "epoch": 0.018238637240783093, + "grad_norm": 0.17653889954090118, + "learning_rate": 1.6538e-05, + "loss": 0.0485, + "step": 8270 + }, + { + "epoch": 0.018260691215681257, + "grad_norm": 0.21956099569797516, + "learning_rate": 1.6558000000000003e-05, + "loss": 0.0497, + "step": 8280 + }, + { + "epoch": 0.018282745190579423, + "grad_norm": 0.1995537430047989, + "learning_rate": 1.6578e-05, + "loss": 0.0486, + "step": 8290 + }, + { + "epoch": 0.01830479916547759, + "grad_norm": 0.211398184299469, + "learning_rate": 1.6598e-05, + "loss": 0.0528, + "step": 8300 + }, + { + "epoch": 0.018326853140375757, + "grad_norm": 0.178161159157753, + "learning_rate": 1.6618000000000003e-05, + "loss": 0.0518, + "step": 8310 + }, + { + "epoch": 0.01834890711527392, + "grad_norm": 0.20575934648513794, + "learning_rate": 1.6638e-05, + "loss": 0.0536, + "step": 8320 + }, + { + "epoch": 0.018370961090172087, + "grad_norm": 0.16224327683448792, + "learning_rate": 1.6658000000000002e-05, + "loss": 0.0504, + "step": 8330 + }, + { + "epoch": 0.018393015065070253, + "grad_norm": 0.208085298538208, + "learning_rate": 1.6678e-05, + "loss": 0.0507, + "step": 8340 + }, + { + "epoch": 0.01841506903996842, + "grad_norm": 0.16996918618679047, + "learning_rate": 1.6698e-05, + "loss": 0.0512, + "step": 8350 + }, + { + "epoch": 0.018437123014866583, + "grad_norm": 0.17696531116962433, + "learning_rate": 1.6718e-05, + "loss": 0.0479, + "step": 8360 + }, + { + "epoch": 0.01845917698976475, + "grad_norm": 0.20204488933086395, + "learning_rate": 1.6737999999999998e-05, + "loss": 0.0486, + "step": 8370 + }, + { + "epoch": 0.018481230964662917, + "grad_norm": 0.18792542815208435, + "learning_rate": 1.6758e-05, + "loss": 0.0522, + "step": 8380 + }, + { + "epoch": 0.018503284939561083, + "grad_norm": 0.2497435361146927, + "learning_rate": 1.6778e-05, + "loss": 0.0522, + "step": 8390 + }, + { + "epoch": 0.018525338914459247, + "grad_norm": 0.14774838089942932, + "learning_rate": 1.6797999999999997e-05, + "loss": 0.0516, + "step": 8400 + }, + { + "epoch": 0.018547392889357413, + "grad_norm": 0.1824364960193634, + "learning_rate": 1.6818e-05, + "loss": 0.0514, + "step": 8410 + }, + { + "epoch": 0.01856944686425558, + "grad_norm": 0.20960180461406708, + "learning_rate": 1.6838e-05, + "loss": 0.0495, + "step": 8420 + }, + { + "epoch": 0.018591500839153743, + "grad_norm": 0.22910189628601074, + "learning_rate": 1.6858e-05, + "loss": 0.048, + "step": 8430 + }, + { + "epoch": 0.01861355481405191, + "grad_norm": 0.1530093252658844, + "learning_rate": 1.6878e-05, + "loss": 0.05, + "step": 8440 + }, + { + "epoch": 0.018635608788950077, + "grad_norm": 0.3316331207752228, + "learning_rate": 1.6898e-05, + "loss": 0.0504, + "step": 8450 + }, + { + "epoch": 0.018657662763848244, + "grad_norm": 0.16836705803871155, + "learning_rate": 1.6918e-05, + "loss": 0.047, + "step": 8460 + }, + { + "epoch": 0.018679716738746407, + "grad_norm": 0.2637791037559509, + "learning_rate": 1.6938e-05, + "loss": 0.049, + "step": 8470 + }, + { + "epoch": 0.018701770713644573, + "grad_norm": 0.1473717987537384, + "learning_rate": 1.6958000000000003e-05, + "loss": 0.0501, + "step": 8480 + }, + { + "epoch": 0.01872382468854274, + "grad_norm": 0.19454646110534668, + "learning_rate": 1.6978e-05, + "loss": 0.0513, + "step": 8490 + }, + { + "epoch": 0.018745878663440907, + "grad_norm": 0.2653292417526245, + "learning_rate": 1.6998e-05, + "loss": 0.0525, + "step": 8500 + }, + { + "epoch": 0.01876793263833907, + "grad_norm": 0.22744891047477722, + "learning_rate": 1.7018000000000002e-05, + "loss": 0.0505, + "step": 8510 + }, + { + "epoch": 0.018789986613237237, + "grad_norm": 0.18945972621440887, + "learning_rate": 1.7038e-05, + "loss": 0.0542, + "step": 8520 + }, + { + "epoch": 0.018812040588135404, + "grad_norm": 0.1801786571741104, + "learning_rate": 1.7058e-05, + "loss": 0.0553, + "step": 8530 + }, + { + "epoch": 0.018834094563033567, + "grad_norm": 0.16825315356254578, + "learning_rate": 1.7078e-05, + "loss": 0.0491, + "step": 8540 + }, + { + "epoch": 0.018856148537931734, + "grad_norm": 0.177957683801651, + "learning_rate": 1.7097999999999998e-05, + "loss": 0.0491, + "step": 8550 + }, + { + "epoch": 0.0188782025128299, + "grad_norm": 0.17480723559856415, + "learning_rate": 1.7118e-05, + "loss": 0.051, + "step": 8560 + }, + { + "epoch": 0.018900256487728067, + "grad_norm": 0.21035151183605194, + "learning_rate": 1.7138e-05, + "loss": 0.0502, + "step": 8570 + }, + { + "epoch": 0.01892231046262623, + "grad_norm": 0.1988653540611267, + "learning_rate": 1.7158e-05, + "loss": 0.0477, + "step": 8580 + }, + { + "epoch": 0.018944364437524397, + "grad_norm": 0.21370139718055725, + "learning_rate": 1.7178e-05, + "loss": 0.0539, + "step": 8590 + }, + { + "epoch": 0.018966418412422564, + "grad_norm": 0.1911020427942276, + "learning_rate": 1.7198e-05, + "loss": 0.0485, + "step": 8600 + }, + { + "epoch": 0.01898847238732073, + "grad_norm": 0.1517949402332306, + "learning_rate": 1.7218e-05, + "loss": 0.0516, + "step": 8610 + }, + { + "epoch": 0.019010526362218894, + "grad_norm": 0.1789768785238266, + "learning_rate": 1.7238e-05, + "loss": 0.0495, + "step": 8620 + }, + { + "epoch": 0.01903258033711706, + "grad_norm": 0.20637740194797516, + "learning_rate": 1.7258000000000003e-05, + "loss": 0.054, + "step": 8630 + }, + { + "epoch": 0.019054634312015227, + "grad_norm": 0.27681583166122437, + "learning_rate": 1.7278e-05, + "loss": 0.0509, + "step": 8640 + }, + { + "epoch": 0.01907668828691339, + "grad_norm": 0.1670789271593094, + "learning_rate": 1.7298e-05, + "loss": 0.0502, + "step": 8650 + }, + { + "epoch": 0.019098742261811557, + "grad_norm": 0.2354547381401062, + "learning_rate": 1.7318000000000003e-05, + "loss": 0.0504, + "step": 8660 + }, + { + "epoch": 0.019120796236709724, + "grad_norm": 0.20992477238178253, + "learning_rate": 1.7338e-05, + "loss": 0.0522, + "step": 8670 + }, + { + "epoch": 0.01914285021160789, + "grad_norm": 0.20988547801971436, + "learning_rate": 1.7358000000000002e-05, + "loss": 0.0519, + "step": 8680 + }, + { + "epoch": 0.019164904186506054, + "grad_norm": 0.23811131715774536, + "learning_rate": 1.7378000000000002e-05, + "loss": 0.0506, + "step": 8690 + }, + { + "epoch": 0.01918695816140422, + "grad_norm": 0.22906741499900818, + "learning_rate": 1.7398e-05, + "loss": 0.0526, + "step": 8700 + }, + { + "epoch": 0.019209012136302387, + "grad_norm": 0.2873428165912628, + "learning_rate": 1.7418e-05, + "loss": 0.0497, + "step": 8710 + }, + { + "epoch": 0.019231066111200554, + "grad_norm": 0.2366154044866562, + "learning_rate": 1.7438e-05, + "loss": 0.0501, + "step": 8720 + }, + { + "epoch": 0.019253120086098717, + "grad_norm": 0.20502245426177979, + "learning_rate": 1.7457999999999998e-05, + "loss": 0.0505, + "step": 8730 + }, + { + "epoch": 0.019275174060996884, + "grad_norm": 0.17709694802761078, + "learning_rate": 1.7478e-05, + "loss": 0.05, + "step": 8740 + }, + { + "epoch": 0.01929722803589505, + "grad_norm": 0.1461147516965866, + "learning_rate": 1.7498e-05, + "loss": 0.0486, + "step": 8750 + }, + { + "epoch": 0.019319282010793214, + "grad_norm": 0.22212602198123932, + "learning_rate": 1.7518e-05, + "loss": 0.0493, + "step": 8760 + }, + { + "epoch": 0.01934133598569138, + "grad_norm": 0.18392440676689148, + "learning_rate": 1.7538e-05, + "loss": 0.0529, + "step": 8770 + }, + { + "epoch": 0.019363389960589547, + "grad_norm": 0.17573848366737366, + "learning_rate": 1.7558e-05, + "loss": 0.0481, + "step": 8780 + }, + { + "epoch": 0.019385443935487714, + "grad_norm": 0.2041192501783371, + "learning_rate": 1.7578e-05, + "loss": 0.0516, + "step": 8790 + }, + { + "epoch": 0.019407497910385877, + "grad_norm": 0.19026292860507965, + "learning_rate": 1.7598e-05, + "loss": 0.0508, + "step": 8800 + }, + { + "epoch": 0.019429551885284044, + "grad_norm": 0.22351211309432983, + "learning_rate": 1.7618000000000003e-05, + "loss": 0.0481, + "step": 8810 + }, + { + "epoch": 0.01945160586018221, + "grad_norm": 0.18880511820316315, + "learning_rate": 1.7638e-05, + "loss": 0.0522, + "step": 8820 + }, + { + "epoch": 0.019473659835080377, + "grad_norm": 0.20961204171180725, + "learning_rate": 1.7658e-05, + "loss": 0.0515, + "step": 8830 + }, + { + "epoch": 0.01949571380997854, + "grad_norm": 0.23655501008033752, + "learning_rate": 1.7678000000000003e-05, + "loss": 0.0486, + "step": 8840 + }, + { + "epoch": 0.019517767784876707, + "grad_norm": 0.1962282955646515, + "learning_rate": 1.7698e-05, + "loss": 0.0505, + "step": 8850 + }, + { + "epoch": 0.019539821759774874, + "grad_norm": 0.1869167983531952, + "learning_rate": 1.7718000000000002e-05, + "loss": 0.0485, + "step": 8860 + }, + { + "epoch": 0.019561875734673037, + "grad_norm": 0.18166905641555786, + "learning_rate": 1.7738000000000002e-05, + "loss": 0.0506, + "step": 8870 + }, + { + "epoch": 0.019583929709571204, + "grad_norm": 0.1856757551431656, + "learning_rate": 1.7758e-05, + "loss": 0.0492, + "step": 8880 + }, + { + "epoch": 0.01960598368446937, + "grad_norm": 0.18212006986141205, + "learning_rate": 1.7778e-05, + "loss": 0.0484, + "step": 8890 + }, + { + "epoch": 0.019628037659367537, + "grad_norm": 0.22456404566764832, + "learning_rate": 1.7798e-05, + "loss": 0.0495, + "step": 8900 + }, + { + "epoch": 0.0196500916342657, + "grad_norm": 0.191492959856987, + "learning_rate": 1.7818e-05, + "loss": 0.0522, + "step": 8910 + }, + { + "epoch": 0.019672145609163867, + "grad_norm": 0.16763117909431458, + "learning_rate": 1.7838e-05, + "loss": 0.0498, + "step": 8920 + }, + { + "epoch": 0.019694199584062034, + "grad_norm": 0.19515149295330048, + "learning_rate": 1.7858e-05, + "loss": 0.0508, + "step": 8930 + }, + { + "epoch": 0.0197162535589602, + "grad_norm": 0.17422647774219513, + "learning_rate": 1.7878e-05, + "loss": 0.0481, + "step": 8940 + }, + { + "epoch": 0.019738307533858364, + "grad_norm": 0.2544606626033783, + "learning_rate": 1.7898e-05, + "loss": 0.0487, + "step": 8950 + }, + { + "epoch": 0.01976036150875653, + "grad_norm": 0.1491369754076004, + "learning_rate": 1.7918e-05, + "loss": 0.0504, + "step": 8960 + }, + { + "epoch": 0.019782415483654697, + "grad_norm": 0.2196090668439865, + "learning_rate": 1.7938e-05, + "loss": 0.0498, + "step": 8970 + }, + { + "epoch": 0.019804469458552864, + "grad_norm": 0.22110803425312042, + "learning_rate": 1.7958e-05, + "loss": 0.0497, + "step": 8980 + }, + { + "epoch": 0.019826523433451027, + "grad_norm": 0.1765090525150299, + "learning_rate": 1.7978e-05, + "loss": 0.0495, + "step": 8990 + }, + { + "epoch": 0.019848577408349194, + "grad_norm": 0.18661172688007355, + "learning_rate": 1.7998e-05, + "loss": 0.0501, + "step": 9000 + }, + { + "epoch": 0.01987063138324736, + "grad_norm": 0.2336031198501587, + "learning_rate": 1.8018000000000003e-05, + "loss": 0.0519, + "step": 9010 + }, + { + "epoch": 0.019892685358145524, + "grad_norm": 0.26082780957221985, + "learning_rate": 1.8038e-05, + "loss": 0.0527, + "step": 9020 + }, + { + "epoch": 0.01991473933304369, + "grad_norm": 0.23908807337284088, + "learning_rate": 1.8058e-05, + "loss": 0.0509, + "step": 9030 + }, + { + "epoch": 0.019936793307941857, + "grad_norm": 0.2248666137456894, + "learning_rate": 1.8078000000000002e-05, + "loss": 0.0507, + "step": 9040 + }, + { + "epoch": 0.019958847282840024, + "grad_norm": 0.20799098908901215, + "learning_rate": 1.8098e-05, + "loss": 0.0476, + "step": 9050 + }, + { + "epoch": 0.019980901257738187, + "grad_norm": 0.2259189635515213, + "learning_rate": 1.8118000000000002e-05, + "loss": 0.0495, + "step": 9060 + }, + { + "epoch": 0.020002955232636354, + "grad_norm": 0.23631151020526886, + "learning_rate": 1.8138e-05, + "loss": 0.0508, + "step": 9070 + }, + { + "epoch": 0.02002500920753452, + "grad_norm": 0.23596030473709106, + "learning_rate": 1.8157999999999998e-05, + "loss": 0.0495, + "step": 9080 + }, + { + "epoch": 0.020047063182432687, + "grad_norm": 0.19828318059444427, + "learning_rate": 1.8178e-05, + "loss": 0.0529, + "step": 9090 + }, + { + "epoch": 0.02006911715733085, + "grad_norm": 0.2315138280391693, + "learning_rate": 1.8198e-05, + "loss": 0.05, + "step": 9100 + }, + { + "epoch": 0.020091171132229017, + "grad_norm": 0.18951283395290375, + "learning_rate": 1.8218e-05, + "loss": 0.0496, + "step": 9110 + }, + { + "epoch": 0.020113225107127184, + "grad_norm": 0.18038369715213776, + "learning_rate": 1.8238e-05, + "loss": 0.0481, + "step": 9120 + }, + { + "epoch": 0.020135279082025347, + "grad_norm": 0.19254563748836517, + "learning_rate": 1.8258e-05, + "loss": 0.0486, + "step": 9130 + }, + { + "epoch": 0.020157333056923514, + "grad_norm": 0.22452792525291443, + "learning_rate": 1.8278e-05, + "loss": 0.0521, + "step": 9140 + }, + { + "epoch": 0.02017938703182168, + "grad_norm": 0.21819712221622467, + "learning_rate": 1.8298e-05, + "loss": 0.0503, + "step": 9150 + }, + { + "epoch": 0.020201441006719847, + "grad_norm": 0.19390645623207092, + "learning_rate": 1.8318e-05, + "loss": 0.049, + "step": 9160 + }, + { + "epoch": 0.02022349498161801, + "grad_norm": 0.27922093868255615, + "learning_rate": 1.8338e-05, + "loss": 0.0517, + "step": 9170 + }, + { + "epoch": 0.020245548956516177, + "grad_norm": 0.19590623676776886, + "learning_rate": 1.8358e-05, + "loss": 0.0496, + "step": 9180 + }, + { + "epoch": 0.020267602931414344, + "grad_norm": 0.21026885509490967, + "learning_rate": 1.8378000000000003e-05, + "loss": 0.0523, + "step": 9190 + }, + { + "epoch": 0.02028965690631251, + "grad_norm": 0.25095078349113464, + "learning_rate": 1.8398e-05, + "loss": 0.0517, + "step": 9200 + }, + { + "epoch": 0.020311710881210674, + "grad_norm": 0.20122851431369781, + "learning_rate": 1.8418e-05, + "loss": 0.0511, + "step": 9210 + }, + { + "epoch": 0.02033376485610884, + "grad_norm": 0.16463373601436615, + "learning_rate": 1.8438000000000002e-05, + "loss": 0.0497, + "step": 9220 + }, + { + "epoch": 0.020355818831007007, + "grad_norm": 0.2619516849517822, + "learning_rate": 1.8458e-05, + "loss": 0.0521, + "step": 9230 + }, + { + "epoch": 0.02037787280590517, + "grad_norm": 0.17346227169036865, + "learning_rate": 1.8478e-05, + "loss": 0.0489, + "step": 9240 + }, + { + "epoch": 0.020399926780803337, + "grad_norm": 0.20858269929885864, + "learning_rate": 1.8498e-05, + "loss": 0.0495, + "step": 9250 + }, + { + "epoch": 0.020421980755701504, + "grad_norm": 0.1451946645975113, + "learning_rate": 1.8517999999999998e-05, + "loss": 0.0497, + "step": 9260 + }, + { + "epoch": 0.02044403473059967, + "grad_norm": 0.1900469958782196, + "learning_rate": 1.8538e-05, + "loss": 0.0495, + "step": 9270 + }, + { + "epoch": 0.020466088705497834, + "grad_norm": 0.1955447494983673, + "learning_rate": 1.8558e-05, + "loss": 0.05, + "step": 9280 + }, + { + "epoch": 0.020488142680396, + "grad_norm": 0.23196479678153992, + "learning_rate": 1.8578e-05, + "loss": 0.0492, + "step": 9290 + }, + { + "epoch": 0.020510196655294168, + "grad_norm": 0.23215754330158234, + "learning_rate": 1.8598e-05, + "loss": 0.0508, + "step": 9300 + }, + { + "epoch": 0.020532250630192334, + "grad_norm": 0.20955269038677216, + "learning_rate": 1.8618e-05, + "loss": 0.05, + "step": 9310 + }, + { + "epoch": 0.020554304605090497, + "grad_norm": 0.19217514991760254, + "learning_rate": 1.8638e-05, + "loss": 0.0487, + "step": 9320 + }, + { + "epoch": 0.020576358579988664, + "grad_norm": 0.1922231912612915, + "learning_rate": 1.8658e-05, + "loss": 0.0472, + "step": 9330 + }, + { + "epoch": 0.02059841255488683, + "grad_norm": 0.21684613823890686, + "learning_rate": 1.8678000000000003e-05, + "loss": 0.0481, + "step": 9340 + }, + { + "epoch": 0.020620466529784994, + "grad_norm": 0.22603511810302734, + "learning_rate": 1.8698e-05, + "loss": 0.0506, + "step": 9350 + }, + { + "epoch": 0.02064252050468316, + "grad_norm": 0.22875544428825378, + "learning_rate": 1.8718e-05, + "loss": 0.0502, + "step": 9360 + }, + { + "epoch": 0.020664574479581328, + "grad_norm": 0.258407860994339, + "learning_rate": 1.8738000000000003e-05, + "loss": 0.0493, + "step": 9370 + }, + { + "epoch": 0.020686628454479494, + "grad_norm": 0.19087934494018555, + "learning_rate": 1.8758e-05, + "loss": 0.0506, + "step": 9380 + }, + { + "epoch": 0.020708682429377658, + "grad_norm": 0.22071613371372223, + "learning_rate": 1.8778000000000002e-05, + "loss": 0.0498, + "step": 9390 + }, + { + "epoch": 0.020730736404275824, + "grad_norm": 0.17279088497161865, + "learning_rate": 1.8798000000000002e-05, + "loss": 0.05, + "step": 9400 + }, + { + "epoch": 0.02075279037917399, + "grad_norm": 0.248427152633667, + "learning_rate": 1.8818e-05, + "loss": 0.05, + "step": 9410 + }, + { + "epoch": 0.020774844354072158, + "grad_norm": 0.19504941999912262, + "learning_rate": 1.8838e-05, + "loss": 0.0514, + "step": 9420 + }, + { + "epoch": 0.02079689832897032, + "grad_norm": 0.19061261415481567, + "learning_rate": 1.8858e-05, + "loss": 0.0506, + "step": 9430 + }, + { + "epoch": 0.020818952303868488, + "grad_norm": 0.17964744567871094, + "learning_rate": 1.8878e-05, + "loss": 0.0475, + "step": 9440 + }, + { + "epoch": 0.020841006278766654, + "grad_norm": 0.23665229976177216, + "learning_rate": 1.8898e-05, + "loss": 0.0467, + "step": 9450 + }, + { + "epoch": 0.020863060253664818, + "grad_norm": 0.1656801849603653, + "learning_rate": 1.8918e-05, + "loss": 0.0479, + "step": 9460 + }, + { + "epoch": 0.020885114228562984, + "grad_norm": 0.18836580216884613, + "learning_rate": 1.8938e-05, + "loss": 0.0486, + "step": 9470 + }, + { + "epoch": 0.02090716820346115, + "grad_norm": 0.22727029025554657, + "learning_rate": 1.8958e-05, + "loss": 0.0518, + "step": 9480 + }, + { + "epoch": 0.020929222178359318, + "grad_norm": 0.24575424194335938, + "learning_rate": 1.8978000000000004e-05, + "loss": 0.0508, + "step": 9490 + }, + { + "epoch": 0.02095127615325748, + "grad_norm": 0.16673029959201813, + "learning_rate": 1.8998e-05, + "loss": 0.0501, + "step": 9500 + }, + { + "epoch": 0.020973330128155648, + "grad_norm": 0.17572931945323944, + "learning_rate": 1.9018e-05, + "loss": 0.0472, + "step": 9510 + }, + { + "epoch": 0.020995384103053814, + "grad_norm": 0.23918063938617706, + "learning_rate": 1.9038000000000003e-05, + "loss": 0.0511, + "step": 9520 + }, + { + "epoch": 0.02101743807795198, + "grad_norm": 0.1687391996383667, + "learning_rate": 1.9058e-05, + "loss": 0.0517, + "step": 9530 + }, + { + "epoch": 0.021039492052850144, + "grad_norm": 0.21532194316387177, + "learning_rate": 1.9078000000000003e-05, + "loss": 0.0516, + "step": 9540 + }, + { + "epoch": 0.02106154602774831, + "grad_norm": 0.2210373878479004, + "learning_rate": 1.9098000000000002e-05, + "loss": 0.0503, + "step": 9550 + }, + { + "epoch": 0.021083600002646478, + "grad_norm": 0.17883194983005524, + "learning_rate": 1.9118e-05, + "loss": 0.053, + "step": 9560 + }, + { + "epoch": 0.021105653977544644, + "grad_norm": 0.1916123330593109, + "learning_rate": 1.9138000000000002e-05, + "loss": 0.0524, + "step": 9570 + }, + { + "epoch": 0.021127707952442808, + "grad_norm": 0.18982501327991486, + "learning_rate": 1.9158e-05, + "loss": 0.0526, + "step": 9580 + }, + { + "epoch": 0.021149761927340974, + "grad_norm": 0.22015298902988434, + "learning_rate": 1.9178e-05, + "loss": 0.0504, + "step": 9590 + }, + { + "epoch": 0.02117181590223914, + "grad_norm": 0.18745458126068115, + "learning_rate": 1.9198e-05, + "loss": 0.0501, + "step": 9600 + }, + { + "epoch": 0.021193869877137304, + "grad_norm": 0.2445875108242035, + "learning_rate": 1.9217999999999998e-05, + "loss": 0.0494, + "step": 9610 + }, + { + "epoch": 0.02121592385203547, + "grad_norm": 0.1785116046667099, + "learning_rate": 1.9238e-05, + "loss": 0.0494, + "step": 9620 + }, + { + "epoch": 0.021237977826933638, + "grad_norm": 0.21113619208335876, + "learning_rate": 1.9258e-05, + "loss": 0.0522, + "step": 9630 + }, + { + "epoch": 0.021260031801831804, + "grad_norm": 0.13445542752742767, + "learning_rate": 1.9277999999999997e-05, + "loss": 0.049, + "step": 9640 + }, + { + "epoch": 0.021282085776729968, + "grad_norm": 0.1814514398574829, + "learning_rate": 1.9298e-05, + "loss": 0.0482, + "step": 9650 + }, + { + "epoch": 0.021304139751628134, + "grad_norm": 0.23049083352088928, + "learning_rate": 1.9318e-05, + "loss": 0.05, + "step": 9660 + }, + { + "epoch": 0.0213261937265263, + "grad_norm": 0.16451312601566315, + "learning_rate": 1.9338e-05, + "loss": 0.0483, + "step": 9670 + }, + { + "epoch": 0.021348247701424468, + "grad_norm": 0.14778511226177216, + "learning_rate": 1.9358e-05, + "loss": 0.0505, + "step": 9680 + }, + { + "epoch": 0.02137030167632263, + "grad_norm": 0.20989590883255005, + "learning_rate": 1.9378e-05, + "loss": 0.0508, + "step": 9690 + }, + { + "epoch": 0.021392355651220798, + "grad_norm": 0.23264288902282715, + "learning_rate": 1.9398e-05, + "loss": 0.0508, + "step": 9700 + }, + { + "epoch": 0.021414409626118965, + "grad_norm": 0.1757354438304901, + "learning_rate": 1.9418e-05, + "loss": 0.0487, + "step": 9710 + }, + { + "epoch": 0.021436463601017128, + "grad_norm": 0.17921409010887146, + "learning_rate": 1.9438000000000003e-05, + "loss": 0.0526, + "step": 9720 + }, + { + "epoch": 0.021458517575915294, + "grad_norm": 0.19662487506866455, + "learning_rate": 1.9458e-05, + "loss": 0.0472, + "step": 9730 + }, + { + "epoch": 0.02148057155081346, + "grad_norm": 0.1552070677280426, + "learning_rate": 1.9478e-05, + "loss": 0.0487, + "step": 9740 + }, + { + "epoch": 0.021502625525711628, + "grad_norm": 0.17669935524463654, + "learning_rate": 1.9498000000000002e-05, + "loss": 0.0474, + "step": 9750 + }, + { + "epoch": 0.02152467950060979, + "grad_norm": 0.1983584463596344, + "learning_rate": 1.9518e-05, + "loss": 0.0521, + "step": 9760 + }, + { + "epoch": 0.021546733475507958, + "grad_norm": 0.15450799465179443, + "learning_rate": 1.9538e-05, + "loss": 0.0504, + "step": 9770 + }, + { + "epoch": 0.021568787450406125, + "grad_norm": 0.15555554628372192, + "learning_rate": 1.9558e-05, + "loss": 0.0512, + "step": 9780 + }, + { + "epoch": 0.02159084142530429, + "grad_norm": 0.18397216498851776, + "learning_rate": 1.9577999999999998e-05, + "loss": 0.0492, + "step": 9790 + }, + { + "epoch": 0.021612895400202455, + "grad_norm": 0.18548254668712616, + "learning_rate": 1.9598e-05, + "loss": 0.0481, + "step": 9800 + }, + { + "epoch": 0.02163494937510062, + "grad_norm": 0.18992213904857635, + "learning_rate": 1.9618e-05, + "loss": 0.0481, + "step": 9810 + }, + { + "epoch": 0.021657003349998788, + "grad_norm": 0.21335342526435852, + "learning_rate": 1.9638e-05, + "loss": 0.0486, + "step": 9820 + }, + { + "epoch": 0.02167905732489695, + "grad_norm": 0.25130826234817505, + "learning_rate": 1.9658e-05, + "loss": 0.0498, + "step": 9830 + }, + { + "epoch": 0.021701111299795118, + "grad_norm": 0.2218753844499588, + "learning_rate": 1.9678e-05, + "loss": 0.0514, + "step": 9840 + }, + { + "epoch": 0.021723165274693285, + "grad_norm": 0.25070348381996155, + "learning_rate": 1.9698e-05, + "loss": 0.0472, + "step": 9850 + }, + { + "epoch": 0.02174521924959145, + "grad_norm": 0.19227108359336853, + "learning_rate": 1.9718e-05, + "loss": 0.0484, + "step": 9860 + }, + { + "epoch": 0.021767273224489615, + "grad_norm": 0.1966792792081833, + "learning_rate": 1.9738000000000003e-05, + "loss": 0.0492, + "step": 9870 + }, + { + "epoch": 0.02178932719938778, + "grad_norm": 0.17220856249332428, + "learning_rate": 1.9758e-05, + "loss": 0.0513, + "step": 9880 + }, + { + "epoch": 0.021811381174285948, + "grad_norm": 0.16641879081726074, + "learning_rate": 1.9778e-05, + "loss": 0.05, + "step": 9890 + }, + { + "epoch": 0.021833435149184115, + "grad_norm": 0.2707882821559906, + "learning_rate": 1.9798000000000003e-05, + "loss": 0.0493, + "step": 9900 + }, + { + "epoch": 0.021855489124082278, + "grad_norm": 0.21127432584762573, + "learning_rate": 1.9818e-05, + "loss": 0.0493, + "step": 9910 + }, + { + "epoch": 0.021877543098980445, + "grad_norm": 0.18143242597579956, + "learning_rate": 1.9838000000000002e-05, + "loss": 0.0503, + "step": 9920 + }, + { + "epoch": 0.02189959707387861, + "grad_norm": 0.21321292221546173, + "learning_rate": 1.9858000000000002e-05, + "loss": 0.0505, + "step": 9930 + }, + { + "epoch": 0.021921651048776775, + "grad_norm": 0.20511817932128906, + "learning_rate": 1.9878e-05, + "loss": 0.0481, + "step": 9940 + }, + { + "epoch": 0.02194370502367494, + "grad_norm": 0.16584230959415436, + "learning_rate": 1.9898e-05, + "loss": 0.0477, + "step": 9950 + }, + { + "epoch": 0.021965758998573108, + "grad_norm": 0.2295343577861786, + "learning_rate": 1.9918e-05, + "loss": 0.0508, + "step": 9960 + }, + { + "epoch": 0.021987812973471275, + "grad_norm": 0.17134727537631989, + "learning_rate": 1.9938e-05, + "loss": 0.0496, + "step": 9970 + }, + { + "epoch": 0.022009866948369438, + "grad_norm": 0.20069468021392822, + "learning_rate": 1.9958e-05, + "loss": 0.048, + "step": 9980 + }, + { + "epoch": 0.022031920923267605, + "grad_norm": 0.1808074563741684, + "learning_rate": 1.9978e-05, + "loss": 0.0494, + "step": 9990 + }, + { + "epoch": 0.02205397489816577, + "grad_norm": 0.18685005605220795, + "learning_rate": 1.9998e-05, + "loss": 0.0481, + "step": 10000 + }, + { + "epoch": 0.022076028873063938, + "grad_norm": 0.1772066354751587, + "learning_rate": 2.0018e-05, + "loss": 0.049, + "step": 10010 + }, + { + "epoch": 0.0220980828479621, + "grad_norm": 0.1527770310640335, + "learning_rate": 2.0038e-05, + "loss": 0.0487, + "step": 10020 + }, + { + "epoch": 0.022120136822860268, + "grad_norm": 0.20679737627506256, + "learning_rate": 2.0058e-05, + "loss": 0.0456, + "step": 10030 + }, + { + "epoch": 0.022142190797758435, + "grad_norm": 0.1857873946428299, + "learning_rate": 2.0078e-05, + "loss": 0.0511, + "step": 10040 + }, + { + "epoch": 0.022164244772656598, + "grad_norm": 0.19803349673748016, + "learning_rate": 2.0098000000000003e-05, + "loss": 0.0468, + "step": 10050 + }, + { + "epoch": 0.022186298747554765, + "grad_norm": 0.18127350509166718, + "learning_rate": 2.0118e-05, + "loss": 0.0497, + "step": 10060 + }, + { + "epoch": 0.02220835272245293, + "grad_norm": 0.17000463604927063, + "learning_rate": 2.0138e-05, + "loss": 0.0476, + "step": 10070 + }, + { + "epoch": 0.022230406697351098, + "grad_norm": 0.17621953785419464, + "learning_rate": 2.0158000000000002e-05, + "loss": 0.0499, + "step": 10080 + }, + { + "epoch": 0.02225246067224926, + "grad_norm": 0.1674092710018158, + "learning_rate": 2.0178e-05, + "loss": 0.0492, + "step": 10090 + }, + { + "epoch": 0.022274514647147428, + "grad_norm": 0.17152445018291473, + "learning_rate": 2.0198000000000002e-05, + "loss": 0.0494, + "step": 10100 + }, + { + "epoch": 0.022296568622045595, + "grad_norm": 0.2390759289264679, + "learning_rate": 2.0218000000000002e-05, + "loss": 0.0512, + "step": 10110 + }, + { + "epoch": 0.02231862259694376, + "grad_norm": 0.18954242765903473, + "learning_rate": 2.0238e-05, + "loss": 0.0461, + "step": 10120 + }, + { + "epoch": 0.022340676571841925, + "grad_norm": 0.18735921382904053, + "learning_rate": 2.0258e-05, + "loss": 0.0483, + "step": 10130 + }, + { + "epoch": 0.02236273054674009, + "grad_norm": 0.14901567995548248, + "learning_rate": 2.0278e-05, + "loss": 0.047, + "step": 10140 + }, + { + "epoch": 0.022384784521638258, + "grad_norm": 0.15322142839431763, + "learning_rate": 2.0298e-05, + "loss": 0.0512, + "step": 10150 + }, + { + "epoch": 0.022406838496536425, + "grad_norm": 0.18785470724105835, + "learning_rate": 2.0318e-05, + "loss": 0.0513, + "step": 10160 + }, + { + "epoch": 0.022428892471434588, + "grad_norm": 0.22472339868545532, + "learning_rate": 2.0338e-05, + "loss": 0.0473, + "step": 10170 + }, + { + "epoch": 0.022450946446332755, + "grad_norm": 0.15203773975372314, + "learning_rate": 2.0358e-05, + "loss": 0.0494, + "step": 10180 + }, + { + "epoch": 0.02247300042123092, + "grad_norm": 0.150726780295372, + "learning_rate": 2.0378e-05, + "loss": 0.05, + "step": 10190 + }, + { + "epoch": 0.022495054396129085, + "grad_norm": 0.23769798874855042, + "learning_rate": 2.0398e-05, + "loss": 0.0485, + "step": 10200 + }, + { + "epoch": 0.02251710837102725, + "grad_norm": 0.20335830748081207, + "learning_rate": 2.0418e-05, + "loss": 0.0475, + "step": 10210 + }, + { + "epoch": 0.022539162345925418, + "grad_norm": 0.2180626094341278, + "learning_rate": 2.0438e-05, + "loss": 0.0482, + "step": 10220 + }, + { + "epoch": 0.022561216320823585, + "grad_norm": 0.1984626054763794, + "learning_rate": 2.0458e-05, + "loss": 0.0488, + "step": 10230 + }, + { + "epoch": 0.022583270295721748, + "grad_norm": 0.21870310604572296, + "learning_rate": 2.0478e-05, + "loss": 0.0534, + "step": 10240 + }, + { + "epoch": 0.022605324270619915, + "grad_norm": 0.22140388190746307, + "learning_rate": 2.0498000000000003e-05, + "loss": 0.0466, + "step": 10250 + }, + { + "epoch": 0.02262737824551808, + "grad_norm": 0.2380853146314621, + "learning_rate": 2.0518e-05, + "loss": 0.046, + "step": 10260 + }, + { + "epoch": 0.02264943222041625, + "grad_norm": 0.20633681118488312, + "learning_rate": 2.0538e-05, + "loss": 0.0493, + "step": 10270 + }, + { + "epoch": 0.02267148619531441, + "grad_norm": 0.18667104840278625, + "learning_rate": 2.0558000000000002e-05, + "loss": 0.0466, + "step": 10280 + }, + { + "epoch": 0.02269354017021258, + "grad_norm": 0.2053709626197815, + "learning_rate": 2.0578e-05, + "loss": 0.0495, + "step": 10290 + }, + { + "epoch": 0.022715594145110745, + "grad_norm": 0.16069483757019043, + "learning_rate": 2.0598e-05, + "loss": 0.0503, + "step": 10300 + }, + { + "epoch": 0.022737648120008908, + "grad_norm": 0.18811656534671783, + "learning_rate": 2.0618e-05, + "loss": 0.0454, + "step": 10310 + }, + { + "epoch": 0.022759702094907075, + "grad_norm": 0.23188036680221558, + "learning_rate": 2.0637999999999998e-05, + "loss": 0.0502, + "step": 10320 + }, + { + "epoch": 0.02278175606980524, + "grad_norm": 0.19029781222343445, + "learning_rate": 2.0658e-05, + "loss": 0.0485, + "step": 10330 + }, + { + "epoch": 0.02280381004470341, + "grad_norm": 0.1909770667552948, + "learning_rate": 2.0678e-05, + "loss": 0.0501, + "step": 10340 + }, + { + "epoch": 0.02282586401960157, + "grad_norm": 0.22514839470386505, + "learning_rate": 2.0698e-05, + "loss": 0.0494, + "step": 10350 + }, + { + "epoch": 0.02284791799449974, + "grad_norm": 0.18072979152202606, + "learning_rate": 2.0718e-05, + "loss": 0.0473, + "step": 10360 + }, + { + "epoch": 0.022869971969397905, + "grad_norm": 0.163665309548378, + "learning_rate": 2.0738e-05, + "loss": 0.0475, + "step": 10370 + }, + { + "epoch": 0.022892025944296072, + "grad_norm": 0.2425445318222046, + "learning_rate": 2.0758e-05, + "loss": 0.0462, + "step": 10380 + }, + { + "epoch": 0.022914079919194235, + "grad_norm": 0.17270097136497498, + "learning_rate": 2.0778e-05, + "loss": 0.0508, + "step": 10390 + }, + { + "epoch": 0.0229361338940924, + "grad_norm": 0.19995832443237305, + "learning_rate": 2.0798000000000003e-05, + "loss": 0.0468, + "step": 10400 + }, + { + "epoch": 0.02295818786899057, + "grad_norm": 0.15067650377750397, + "learning_rate": 2.0818e-05, + "loss": 0.0465, + "step": 10410 + }, + { + "epoch": 0.02298024184388873, + "grad_norm": 0.14614807069301605, + "learning_rate": 2.0838e-05, + "loss": 0.0472, + "step": 10420 + }, + { + "epoch": 0.0230022958187869, + "grad_norm": 0.1670185923576355, + "learning_rate": 2.0858000000000003e-05, + "loss": 0.0503, + "step": 10430 + }, + { + "epoch": 0.023024349793685065, + "grad_norm": 0.1944218873977661, + "learning_rate": 2.0878e-05, + "loss": 0.048, + "step": 10440 + }, + { + "epoch": 0.023046403768583232, + "grad_norm": 0.16585247218608856, + "learning_rate": 2.0898e-05, + "loss": 0.0489, + "step": 10450 + }, + { + "epoch": 0.023068457743481395, + "grad_norm": 0.2261451929807663, + "learning_rate": 2.0918000000000002e-05, + "loss": 0.0504, + "step": 10460 + }, + { + "epoch": 0.023090511718379562, + "grad_norm": 0.17988885939121246, + "learning_rate": 2.0938e-05, + "loss": 0.0489, + "step": 10470 + }, + { + "epoch": 0.02311256569327773, + "grad_norm": 0.15646110475063324, + "learning_rate": 2.0958e-05, + "loss": 0.048, + "step": 10480 + }, + { + "epoch": 0.023134619668175895, + "grad_norm": 0.19575640559196472, + "learning_rate": 2.0978e-05, + "loss": 0.0512, + "step": 10490 + }, + { + "epoch": 0.02315667364307406, + "grad_norm": 0.1687140166759491, + "learning_rate": 2.0997999999999998e-05, + "loss": 0.0471, + "step": 10500 + }, + { + "epoch": 0.023178727617972225, + "grad_norm": 0.2728213965892792, + "learning_rate": 2.1018e-05, + "loss": 0.0491, + "step": 10510 + }, + { + "epoch": 0.023200781592870392, + "grad_norm": 0.17385078966617584, + "learning_rate": 2.1038e-05, + "loss": 0.0481, + "step": 10520 + }, + { + "epoch": 0.023222835567768555, + "grad_norm": 0.1771552711725235, + "learning_rate": 2.1058e-05, + "loss": 0.0485, + "step": 10530 + }, + { + "epoch": 0.023244889542666722, + "grad_norm": 0.15053334832191467, + "learning_rate": 2.1078e-05, + "loss": 0.0471, + "step": 10540 + }, + { + "epoch": 0.02326694351756489, + "grad_norm": 0.2436646819114685, + "learning_rate": 2.1098e-05, + "loss": 0.0483, + "step": 10550 + }, + { + "epoch": 0.023288997492463055, + "grad_norm": 0.17017178237438202, + "learning_rate": 2.1118e-05, + "loss": 0.0471, + "step": 10560 + }, + { + "epoch": 0.02331105146736122, + "grad_norm": 0.17163541913032532, + "learning_rate": 2.1138e-05, + "loss": 0.0489, + "step": 10570 + }, + { + "epoch": 0.023333105442259385, + "grad_norm": 0.14704903960227966, + "learning_rate": 2.1158000000000003e-05, + "loss": 0.0463, + "step": 10580 + }, + { + "epoch": 0.023355159417157552, + "grad_norm": 0.1805671602487564, + "learning_rate": 2.1178e-05, + "loss": 0.0476, + "step": 10590 + }, + { + "epoch": 0.02337721339205572, + "grad_norm": 0.19752465188503265, + "learning_rate": 2.1198e-05, + "loss": 0.0489, + "step": 10600 + }, + { + "epoch": 0.023399267366953882, + "grad_norm": 0.18833889067173004, + "learning_rate": 2.1218000000000003e-05, + "loss": 0.0462, + "step": 10610 + }, + { + "epoch": 0.02342132134185205, + "grad_norm": 0.19475503265857697, + "learning_rate": 2.1238e-05, + "loss": 0.0482, + "step": 10620 + }, + { + "epoch": 0.023443375316750215, + "grad_norm": 0.1598215252161026, + "learning_rate": 2.1258000000000002e-05, + "loss": 0.0476, + "step": 10630 + }, + { + "epoch": 0.02346542929164838, + "grad_norm": 0.1758633852005005, + "learning_rate": 2.1278000000000002e-05, + "loss": 0.0481, + "step": 10640 + }, + { + "epoch": 0.023487483266546545, + "grad_norm": 0.15949837863445282, + "learning_rate": 2.1298e-05, + "loss": 0.0484, + "step": 10650 + }, + { + "epoch": 0.023509537241444712, + "grad_norm": 0.15200668573379517, + "learning_rate": 2.1318e-05, + "loss": 0.0481, + "step": 10660 + }, + { + "epoch": 0.02353159121634288, + "grad_norm": 0.16905927658081055, + "learning_rate": 2.1338e-05, + "loss": 0.0493, + "step": 10670 + }, + { + "epoch": 0.023553645191241042, + "grad_norm": 0.16548028588294983, + "learning_rate": 2.1358e-05, + "loss": 0.0485, + "step": 10680 + }, + { + "epoch": 0.02357569916613921, + "grad_norm": 0.1654634326696396, + "learning_rate": 2.1378e-05, + "loss": 0.0476, + "step": 10690 + }, + { + "epoch": 0.023597753141037375, + "grad_norm": 0.18695658445358276, + "learning_rate": 2.1398e-05, + "loss": 0.0473, + "step": 10700 + }, + { + "epoch": 0.023619807115935542, + "grad_norm": 0.17557375133037567, + "learning_rate": 2.1418e-05, + "loss": 0.0498, + "step": 10710 + }, + { + "epoch": 0.023641861090833705, + "grad_norm": 0.16490408778190613, + "learning_rate": 2.1438e-05, + "loss": 0.0462, + "step": 10720 + }, + { + "epoch": 0.023663915065731872, + "grad_norm": 0.16451086103916168, + "learning_rate": 2.1458000000000004e-05, + "loss": 0.0482, + "step": 10730 + }, + { + "epoch": 0.02368596904063004, + "grad_norm": 0.18840213119983673, + "learning_rate": 2.1478e-05, + "loss": 0.0482, + "step": 10740 + }, + { + "epoch": 0.023708023015528205, + "grad_norm": 0.1361098289489746, + "learning_rate": 2.1498e-05, + "loss": 0.051, + "step": 10750 + }, + { + "epoch": 0.02373007699042637, + "grad_norm": 0.18924064934253693, + "learning_rate": 2.1518000000000003e-05, + "loss": 0.0476, + "step": 10760 + }, + { + "epoch": 0.023752130965324535, + "grad_norm": 0.16066554188728333, + "learning_rate": 2.1538e-05, + "loss": 0.0464, + "step": 10770 + }, + { + "epoch": 0.023774184940222702, + "grad_norm": 0.20514018833637238, + "learning_rate": 2.1558000000000003e-05, + "loss": 0.0494, + "step": 10780 + }, + { + "epoch": 0.023796238915120865, + "grad_norm": 0.1884949505329132, + "learning_rate": 2.1578000000000002e-05, + "loss": 0.0505, + "step": 10790 + }, + { + "epoch": 0.023818292890019032, + "grad_norm": 0.14585211873054504, + "learning_rate": 2.1598e-05, + "loss": 0.0445, + "step": 10800 + }, + { + "epoch": 0.0238403468649172, + "grad_norm": 0.18810704350471497, + "learning_rate": 2.1618000000000002e-05, + "loss": 0.047, + "step": 10810 + }, + { + "epoch": 0.023862400839815365, + "grad_norm": 0.15430139005184174, + "learning_rate": 2.1638e-05, + "loss": 0.046, + "step": 10820 + }, + { + "epoch": 0.02388445481471353, + "grad_norm": 0.15502601861953735, + "learning_rate": 2.1658e-05, + "loss": 0.0464, + "step": 10830 + }, + { + "epoch": 0.023906508789611695, + "grad_norm": 0.25740236043930054, + "learning_rate": 2.1678e-05, + "loss": 0.0479, + "step": 10840 + }, + { + "epoch": 0.023928562764509862, + "grad_norm": 0.17758746445178986, + "learning_rate": 2.1697999999999998e-05, + "loss": 0.0482, + "step": 10850 + }, + { + "epoch": 0.02395061673940803, + "grad_norm": 0.23290343582630157, + "learning_rate": 2.1718e-05, + "loss": 0.0485, + "step": 10860 + }, + { + "epoch": 0.023972670714306192, + "grad_norm": 0.16877150535583496, + "learning_rate": 2.1738e-05, + "loss": 0.0477, + "step": 10870 + }, + { + "epoch": 0.02399472468920436, + "grad_norm": 0.17846450209617615, + "learning_rate": 2.1757999999999997e-05, + "loss": 0.0481, + "step": 10880 + }, + { + "epoch": 0.024016778664102525, + "grad_norm": 0.14905153214931488, + "learning_rate": 2.1778e-05, + "loss": 0.0489, + "step": 10890 + }, + { + "epoch": 0.02403883263900069, + "grad_norm": 0.20902332663536072, + "learning_rate": 2.1798e-05, + "loss": 0.0471, + "step": 10900 + }, + { + "epoch": 0.024060886613898855, + "grad_norm": 0.18222330510616302, + "learning_rate": 2.1818e-05, + "loss": 0.0457, + "step": 10910 + }, + { + "epoch": 0.024082940588797022, + "grad_norm": 0.19547228515148163, + "learning_rate": 2.1838e-05, + "loss": 0.047, + "step": 10920 + }, + { + "epoch": 0.02410499456369519, + "grad_norm": 0.17157557606697083, + "learning_rate": 2.1858e-05, + "loss": 0.0489, + "step": 10930 + }, + { + "epoch": 0.024127048538593352, + "grad_norm": 0.18091177940368652, + "learning_rate": 2.1878e-05, + "loss": 0.0479, + "step": 10940 + }, + { + "epoch": 0.02414910251349152, + "grad_norm": 0.27385467290878296, + "learning_rate": 2.1898e-05, + "loss": 0.0494, + "step": 10950 + }, + { + "epoch": 0.024171156488389686, + "grad_norm": 0.21602196991443634, + "learning_rate": 2.1918000000000003e-05, + "loss": 0.046, + "step": 10960 + }, + { + "epoch": 0.024193210463287852, + "grad_norm": 0.16964338719844818, + "learning_rate": 2.1938e-05, + "loss": 0.051, + "step": 10970 + }, + { + "epoch": 0.024215264438186015, + "grad_norm": 0.19888249039649963, + "learning_rate": 2.1958e-05, + "loss": 0.0477, + "step": 10980 + }, + { + "epoch": 0.024237318413084182, + "grad_norm": 0.2390305995941162, + "learning_rate": 2.1978000000000002e-05, + "loss": 0.0507, + "step": 10990 + }, + { + "epoch": 0.02425937238798235, + "grad_norm": 0.15828152000904083, + "learning_rate": 2.1998e-05, + "loss": 0.0481, + "step": 11000 + }, + { + "epoch": 0.024281426362880512, + "grad_norm": 0.17327257990837097, + "learning_rate": 2.2018e-05, + "loss": 0.0482, + "step": 11010 + }, + { + "epoch": 0.02430348033777868, + "grad_norm": 0.18878963589668274, + "learning_rate": 2.2038e-05, + "loss": 0.047, + "step": 11020 + }, + { + "epoch": 0.024325534312676846, + "grad_norm": 0.17152118682861328, + "learning_rate": 2.2057999999999998e-05, + "loss": 0.0468, + "step": 11030 + }, + { + "epoch": 0.024347588287575012, + "grad_norm": 0.18071164190769196, + "learning_rate": 2.2078e-05, + "loss": 0.0471, + "step": 11040 + }, + { + "epoch": 0.024369642262473176, + "grad_norm": 0.22051694989204407, + "learning_rate": 2.2098e-05, + "loss": 0.0466, + "step": 11050 + }, + { + "epoch": 0.024391696237371342, + "grad_norm": 0.17515048384666443, + "learning_rate": 2.2118e-05, + "loss": 0.0484, + "step": 11060 + }, + { + "epoch": 0.02441375021226951, + "grad_norm": 0.19428913295269012, + "learning_rate": 2.2138e-05, + "loss": 0.0495, + "step": 11070 + }, + { + "epoch": 0.024435804187167676, + "grad_norm": 0.1667558252811432, + "learning_rate": 2.2158e-05, + "loss": 0.0495, + "step": 11080 + }, + { + "epoch": 0.02445785816206584, + "grad_norm": 0.16836239397525787, + "learning_rate": 2.2178e-05, + "loss": 0.0513, + "step": 11090 + }, + { + "epoch": 0.024479912136964006, + "grad_norm": 0.1914311796426773, + "learning_rate": 2.2198e-05, + "loss": 0.0467, + "step": 11100 + }, + { + "epoch": 0.024501966111862172, + "grad_norm": 0.18111860752105713, + "learning_rate": 2.2218000000000003e-05, + "loss": 0.0485, + "step": 11110 + }, + { + "epoch": 0.024524020086760336, + "grad_norm": 0.1606244295835495, + "learning_rate": 2.2238e-05, + "loss": 0.0464, + "step": 11120 + }, + { + "epoch": 0.024546074061658502, + "grad_norm": 0.17347489297389984, + "learning_rate": 2.2258e-05, + "loss": 0.0486, + "step": 11130 + }, + { + "epoch": 0.02456812803655667, + "grad_norm": 0.16496120393276215, + "learning_rate": 2.2278000000000003e-05, + "loss": 0.0495, + "step": 11140 + }, + { + "epoch": 0.024590182011454836, + "grad_norm": 0.19144372642040253, + "learning_rate": 2.2298e-05, + "loss": 0.0495, + "step": 11150 + }, + { + "epoch": 0.024612235986353, + "grad_norm": 0.16001437604427338, + "learning_rate": 2.2318000000000002e-05, + "loss": 0.0485, + "step": 11160 + }, + { + "epoch": 0.024634289961251166, + "grad_norm": 0.18742331862449646, + "learning_rate": 2.2338000000000002e-05, + "loss": 0.0498, + "step": 11170 + }, + { + "epoch": 0.024656343936149332, + "grad_norm": 0.19301818311214447, + "learning_rate": 2.2358e-05, + "loss": 0.0479, + "step": 11180 + }, + { + "epoch": 0.0246783979110475, + "grad_norm": 0.17428570985794067, + "learning_rate": 2.2378e-05, + "loss": 0.0466, + "step": 11190 + }, + { + "epoch": 0.024700451885945662, + "grad_norm": 0.20296214520931244, + "learning_rate": 2.2398e-05, + "loss": 0.0489, + "step": 11200 + }, + { + "epoch": 0.02472250586084383, + "grad_norm": 0.16715574264526367, + "learning_rate": 2.2418e-05, + "loss": 0.0513, + "step": 11210 + }, + { + "epoch": 0.024744559835741996, + "grad_norm": 0.15701182186603546, + "learning_rate": 2.2438e-05, + "loss": 0.0463, + "step": 11220 + }, + { + "epoch": 0.024766613810640162, + "grad_norm": 0.17995859682559967, + "learning_rate": 2.2458e-05, + "loss": 0.0494, + "step": 11230 + }, + { + "epoch": 0.024788667785538326, + "grad_norm": 0.17961110174655914, + "learning_rate": 2.2478e-05, + "loss": 0.0507, + "step": 11240 + }, + { + "epoch": 0.024810721760436492, + "grad_norm": 0.21997955441474915, + "learning_rate": 2.2498e-05, + "loss": 0.0467, + "step": 11250 + }, + { + "epoch": 0.02483277573533466, + "grad_norm": 0.19542783498764038, + "learning_rate": 2.2518e-05, + "loss": 0.0497, + "step": 11260 + }, + { + "epoch": 0.024854829710232822, + "grad_norm": 0.2070036679506302, + "learning_rate": 2.2538e-05, + "loss": 0.0481, + "step": 11270 + }, + { + "epoch": 0.02487688368513099, + "grad_norm": 0.21796326339244843, + "learning_rate": 2.2558e-05, + "loss": 0.0479, + "step": 11280 + }, + { + "epoch": 0.024898937660029156, + "grad_norm": 0.18486756086349487, + "learning_rate": 2.2578000000000003e-05, + "loss": 0.0469, + "step": 11290 + }, + { + "epoch": 0.024920991634927323, + "grad_norm": 0.1682298481464386, + "learning_rate": 2.2598e-05, + "loss": 0.0462, + "step": 11300 + }, + { + "epoch": 0.024943045609825486, + "grad_norm": 0.19520972669124603, + "learning_rate": 2.2618e-05, + "loss": 0.049, + "step": 11310 + }, + { + "epoch": 0.024965099584723652, + "grad_norm": 0.1276034414768219, + "learning_rate": 2.2638000000000002e-05, + "loss": 0.0462, + "step": 11320 + }, + { + "epoch": 0.02498715355962182, + "grad_norm": 0.1589232087135315, + "learning_rate": 2.2658e-05, + "loss": 0.0485, + "step": 11330 + }, + { + "epoch": 0.025009207534519986, + "grad_norm": 0.19714654982089996, + "learning_rate": 2.2678000000000002e-05, + "loss": 0.0476, + "step": 11340 + }, + { + "epoch": 0.02503126150941815, + "grad_norm": 0.2243887484073639, + "learning_rate": 2.2698000000000002e-05, + "loss": 0.0479, + "step": 11350 + }, + { + "epoch": 0.025053315484316316, + "grad_norm": 0.14408992230892181, + "learning_rate": 2.2718e-05, + "loss": 0.0494, + "step": 11360 + }, + { + "epoch": 0.025075369459214483, + "grad_norm": 0.16619734466075897, + "learning_rate": 2.2738e-05, + "loss": 0.0499, + "step": 11370 + }, + { + "epoch": 0.025097423434112646, + "grad_norm": 0.1806904673576355, + "learning_rate": 2.2758e-05, + "loss": 0.0473, + "step": 11380 + }, + { + "epoch": 0.025119477409010812, + "grad_norm": 0.18133051693439484, + "learning_rate": 2.2778e-05, + "loss": 0.0503, + "step": 11390 + }, + { + "epoch": 0.02514153138390898, + "grad_norm": 0.15475305914878845, + "learning_rate": 2.2798e-05, + "loss": 0.0496, + "step": 11400 + }, + { + "epoch": 0.025163585358807146, + "grad_norm": 0.20978540182113647, + "learning_rate": 2.2818e-05, + "loss": 0.0465, + "step": 11410 + }, + { + "epoch": 0.02518563933370531, + "grad_norm": 0.18403570353984833, + "learning_rate": 2.2838e-05, + "loss": 0.0488, + "step": 11420 + }, + { + "epoch": 0.025207693308603476, + "grad_norm": 0.22812379896640778, + "learning_rate": 2.2858e-05, + "loss": 0.0488, + "step": 11430 + }, + { + "epoch": 0.025229747283501643, + "grad_norm": 0.16179832816123962, + "learning_rate": 2.2878e-05, + "loss": 0.0501, + "step": 11440 + }, + { + "epoch": 0.02525180125839981, + "grad_norm": 0.11788656562566757, + "learning_rate": 2.2898e-05, + "loss": 0.0466, + "step": 11450 + }, + { + "epoch": 0.025273855233297973, + "grad_norm": 0.15431715548038483, + "learning_rate": 2.2918e-05, + "loss": 0.0498, + "step": 11460 + }, + { + "epoch": 0.02529590920819614, + "grad_norm": 0.20999209582805634, + "learning_rate": 2.2938e-05, + "loss": 0.05, + "step": 11470 + }, + { + "epoch": 0.025317963183094306, + "grad_norm": 0.18578512966632843, + "learning_rate": 2.2958e-05, + "loss": 0.0464, + "step": 11480 + }, + { + "epoch": 0.02534001715799247, + "grad_norm": 0.21402034163475037, + "learning_rate": 2.2978000000000003e-05, + "loss": 0.0474, + "step": 11490 + }, + { + "epoch": 0.025362071132890636, + "grad_norm": 0.1923270970582962, + "learning_rate": 2.2998e-05, + "loss": 0.0479, + "step": 11500 + }, + { + "epoch": 0.025384125107788803, + "grad_norm": 0.15953326225280762, + "learning_rate": 2.3018e-05, + "loss": 0.0471, + "step": 11510 + }, + { + "epoch": 0.02540617908268697, + "grad_norm": 0.20601199567317963, + "learning_rate": 2.3038000000000002e-05, + "loss": 0.0477, + "step": 11520 + }, + { + "epoch": 0.025428233057585133, + "grad_norm": 0.19278888404369354, + "learning_rate": 2.3058e-05, + "loss": 0.0467, + "step": 11530 + }, + { + "epoch": 0.0254502870324833, + "grad_norm": 0.1802232712507248, + "learning_rate": 2.3078e-05, + "loss": 0.0482, + "step": 11540 + }, + { + "epoch": 0.025472341007381466, + "grad_norm": 0.2403274029493332, + "learning_rate": 2.3098e-05, + "loss": 0.0486, + "step": 11550 + }, + { + "epoch": 0.025494394982279633, + "grad_norm": 0.17138680815696716, + "learning_rate": 2.3117999999999998e-05, + "loss": 0.0465, + "step": 11560 + }, + { + "epoch": 0.025516448957177796, + "grad_norm": 0.17042399942874908, + "learning_rate": 2.3138e-05, + "loss": 0.0475, + "step": 11570 + }, + { + "epoch": 0.025538502932075963, + "grad_norm": 0.22894760966300964, + "learning_rate": 2.3158e-05, + "loss": 0.0506, + "step": 11580 + }, + { + "epoch": 0.02556055690697413, + "grad_norm": 0.24378104507923126, + "learning_rate": 2.3178e-05, + "loss": 0.0482, + "step": 11590 + }, + { + "epoch": 0.025582610881872293, + "grad_norm": 0.16220594942569733, + "learning_rate": 2.3198e-05, + "loss": 0.0474, + "step": 11600 + }, + { + "epoch": 0.02560466485677046, + "grad_norm": 0.20351342856884003, + "learning_rate": 2.3218e-05, + "loss": 0.0452, + "step": 11610 + }, + { + "epoch": 0.025626718831668626, + "grad_norm": 0.18298695981502533, + "learning_rate": 2.3238e-05, + "loss": 0.0468, + "step": 11620 + }, + { + "epoch": 0.025648772806566793, + "grad_norm": 0.22364875674247742, + "learning_rate": 2.3258e-05, + "loss": 0.0493, + "step": 11630 + }, + { + "epoch": 0.025670826781464956, + "grad_norm": 0.18178071081638336, + "learning_rate": 2.3278000000000003e-05, + "loss": 0.0471, + "step": 11640 + }, + { + "epoch": 0.025692880756363123, + "grad_norm": 0.19209255278110504, + "learning_rate": 2.3298e-05, + "loss": 0.0469, + "step": 11650 + }, + { + "epoch": 0.02571493473126129, + "grad_norm": 0.20178760588169098, + "learning_rate": 2.3318e-05, + "loss": 0.0468, + "step": 11660 + }, + { + "epoch": 0.025736988706159456, + "grad_norm": 0.14861422777175903, + "learning_rate": 2.3338000000000003e-05, + "loss": 0.0484, + "step": 11670 + }, + { + "epoch": 0.02575904268105762, + "grad_norm": 0.20645657181739807, + "learning_rate": 2.3358e-05, + "loss": 0.0461, + "step": 11680 + }, + { + "epoch": 0.025781096655955786, + "grad_norm": 0.18411985039710999, + "learning_rate": 2.3378000000000002e-05, + "loss": 0.0481, + "step": 11690 + }, + { + "epoch": 0.025803150630853953, + "grad_norm": 0.14574643969535828, + "learning_rate": 2.3398000000000002e-05, + "loss": 0.0456, + "step": 11700 + }, + { + "epoch": 0.025825204605752116, + "grad_norm": 0.18001149594783783, + "learning_rate": 2.3418e-05, + "loss": 0.047, + "step": 11710 + }, + { + "epoch": 0.025847258580650283, + "grad_norm": 0.19095087051391602, + "learning_rate": 2.3438e-05, + "loss": 0.0474, + "step": 11720 + }, + { + "epoch": 0.02586931255554845, + "grad_norm": 0.16112889349460602, + "learning_rate": 2.3458e-05, + "loss": 0.0441, + "step": 11730 + }, + { + "epoch": 0.025891366530446616, + "grad_norm": 0.20157410204410553, + "learning_rate": 2.3477999999999998e-05, + "loss": 0.0482, + "step": 11740 + }, + { + "epoch": 0.02591342050534478, + "grad_norm": 0.17402979731559753, + "learning_rate": 2.3498e-05, + "loss": 0.0475, + "step": 11750 + }, + { + "epoch": 0.025935474480242946, + "grad_norm": 0.13885000348091125, + "learning_rate": 2.3518e-05, + "loss": 0.0499, + "step": 11760 + }, + { + "epoch": 0.025957528455141113, + "grad_norm": 0.16214601695537567, + "learning_rate": 2.3538e-05, + "loss": 0.0478, + "step": 11770 + }, + { + "epoch": 0.02597958243003928, + "grad_norm": 0.17525658011436462, + "learning_rate": 2.3558e-05, + "loss": 0.0486, + "step": 11780 + }, + { + "epoch": 0.026001636404937443, + "grad_norm": 0.18905441462993622, + "learning_rate": 2.3578e-05, + "loss": 0.0472, + "step": 11790 + }, + { + "epoch": 0.02602369037983561, + "grad_norm": 0.15374496579170227, + "learning_rate": 2.3598e-05, + "loss": 0.0466, + "step": 11800 + }, + { + "epoch": 0.026045744354733776, + "grad_norm": 0.23952621221542358, + "learning_rate": 2.3618e-05, + "loss": 0.0473, + "step": 11810 + }, + { + "epoch": 0.026067798329631943, + "grad_norm": 0.15349437296390533, + "learning_rate": 2.3638000000000003e-05, + "loss": 0.0472, + "step": 11820 + }, + { + "epoch": 0.026089852304530106, + "grad_norm": 0.19122211635112762, + "learning_rate": 2.3658e-05, + "loss": 0.051, + "step": 11830 + }, + { + "epoch": 0.026111906279428273, + "grad_norm": 0.17018869519233704, + "learning_rate": 2.3678e-05, + "loss": 0.0471, + "step": 11840 + }, + { + "epoch": 0.02613396025432644, + "grad_norm": 0.1723955124616623, + "learning_rate": 2.3698000000000002e-05, + "loss": 0.0447, + "step": 11850 + }, + { + "epoch": 0.026156014229224603, + "grad_norm": 0.2125282734632492, + "learning_rate": 2.3718e-05, + "loss": 0.047, + "step": 11860 + }, + { + "epoch": 0.02617806820412277, + "grad_norm": 0.12752532958984375, + "learning_rate": 2.3738000000000002e-05, + "loss": 0.0476, + "step": 11870 + }, + { + "epoch": 0.026200122179020936, + "grad_norm": 0.15404167771339417, + "learning_rate": 2.3758000000000002e-05, + "loss": 0.0472, + "step": 11880 + }, + { + "epoch": 0.026222176153919103, + "grad_norm": 0.1817760467529297, + "learning_rate": 2.3778e-05, + "loss": 0.0471, + "step": 11890 + }, + { + "epoch": 0.026244230128817266, + "grad_norm": 0.18650531768798828, + "learning_rate": 2.3798e-05, + "loss": 0.0484, + "step": 11900 + }, + { + "epoch": 0.026266284103715433, + "grad_norm": 0.19768834114074707, + "learning_rate": 2.3818e-05, + "loss": 0.0494, + "step": 11910 + }, + { + "epoch": 0.0262883380786136, + "grad_norm": 0.22243338823318481, + "learning_rate": 2.3838e-05, + "loss": 0.0482, + "step": 11920 + }, + { + "epoch": 0.026310392053511766, + "grad_norm": 0.2107769101858139, + "learning_rate": 2.3858e-05, + "loss": 0.0473, + "step": 11930 + }, + { + "epoch": 0.02633244602840993, + "grad_norm": 0.18164761364459991, + "learning_rate": 2.3878e-05, + "loss": 0.0471, + "step": 11940 + }, + { + "epoch": 0.026354500003308096, + "grad_norm": 0.18395747244358063, + "learning_rate": 2.3898e-05, + "loss": 0.051, + "step": 11950 + }, + { + "epoch": 0.026376553978206263, + "grad_norm": 0.18146191537380219, + "learning_rate": 2.3918e-05, + "loss": 0.0474, + "step": 11960 + }, + { + "epoch": 0.026398607953104426, + "grad_norm": 0.21583440899848938, + "learning_rate": 2.3938000000000004e-05, + "loss": 0.0454, + "step": 11970 + }, + { + "epoch": 0.026420661928002593, + "grad_norm": 0.19847795367240906, + "learning_rate": 2.3958e-05, + "loss": 0.0496, + "step": 11980 + }, + { + "epoch": 0.02644271590290076, + "grad_norm": 0.20956003665924072, + "learning_rate": 2.3978e-05, + "loss": 0.0483, + "step": 11990 + }, + { + "epoch": 0.026464769877798926, + "grad_norm": 0.1837162971496582, + "learning_rate": 2.3998000000000003e-05, + "loss": 0.048, + "step": 12000 + }, + { + "epoch": 0.02648682385269709, + "grad_norm": 0.2473433017730713, + "learning_rate": 2.4018e-05, + "loss": 0.0489, + "step": 12010 + }, + { + "epoch": 0.026508877827595256, + "grad_norm": 0.17767247557640076, + "learning_rate": 2.4038000000000003e-05, + "loss": 0.044, + "step": 12020 + }, + { + "epoch": 0.026530931802493423, + "grad_norm": 0.20458216965198517, + "learning_rate": 2.4058000000000002e-05, + "loss": 0.0459, + "step": 12030 + }, + { + "epoch": 0.02655298577739159, + "grad_norm": 0.14820057153701782, + "learning_rate": 2.4078e-05, + "loss": 0.0467, + "step": 12040 + }, + { + "epoch": 0.026575039752289753, + "grad_norm": 0.1739119589328766, + "learning_rate": 2.4098000000000002e-05, + "loss": 0.0477, + "step": 12050 + }, + { + "epoch": 0.02659709372718792, + "grad_norm": 0.1872083991765976, + "learning_rate": 2.4118000000000002e-05, + "loss": 0.0461, + "step": 12060 + }, + { + "epoch": 0.026619147702086086, + "grad_norm": 0.1331469565629959, + "learning_rate": 2.4138e-05, + "loss": 0.049, + "step": 12070 + }, + { + "epoch": 0.02664120167698425, + "grad_norm": 0.20496827363967896, + "learning_rate": 2.4158e-05, + "loss": 0.048, + "step": 12080 + }, + { + "epoch": 0.026663255651882416, + "grad_norm": 0.18722055852413177, + "learning_rate": 2.4177999999999998e-05, + "loss": 0.0496, + "step": 12090 + }, + { + "epoch": 0.026685309626780583, + "grad_norm": 0.19980549812316895, + "learning_rate": 2.4198e-05, + "loss": 0.0475, + "step": 12100 + }, + { + "epoch": 0.02670736360167875, + "grad_norm": 0.1486787050962448, + "learning_rate": 2.4218e-05, + "loss": 0.0452, + "step": 12110 + }, + { + "epoch": 0.026729417576576913, + "grad_norm": 0.17807623744010925, + "learning_rate": 2.4238e-05, + "loss": 0.0467, + "step": 12120 + }, + { + "epoch": 0.02675147155147508, + "grad_norm": 0.19242319464683533, + "learning_rate": 2.4258e-05, + "loss": 0.0455, + "step": 12130 + }, + { + "epoch": 0.026773525526373246, + "grad_norm": 0.2036859095096588, + "learning_rate": 2.4278e-05, + "loss": 0.0454, + "step": 12140 + }, + { + "epoch": 0.026795579501271413, + "grad_norm": 0.17255178093910217, + "learning_rate": 2.4298e-05, + "loss": 0.0478, + "step": 12150 + }, + { + "epoch": 0.026817633476169576, + "grad_norm": 0.16618643701076508, + "learning_rate": 2.4318e-05, + "loss": 0.05, + "step": 12160 + }, + { + "epoch": 0.026839687451067743, + "grad_norm": 0.17081230878829956, + "learning_rate": 2.4338e-05, + "loss": 0.0472, + "step": 12170 + }, + { + "epoch": 0.02686174142596591, + "grad_norm": 0.19131812453269958, + "learning_rate": 2.4358e-05, + "loss": 0.0463, + "step": 12180 + }, + { + "epoch": 0.026883795400864073, + "grad_norm": 0.20334111154079437, + "learning_rate": 2.4378e-05, + "loss": 0.0473, + "step": 12190 + }, + { + "epoch": 0.02690584937576224, + "grad_norm": 0.1858380287885666, + "learning_rate": 2.4398000000000003e-05, + "loss": 0.0499, + "step": 12200 + }, + { + "epoch": 0.026927903350660407, + "grad_norm": 0.18815471231937408, + "learning_rate": 2.4418e-05, + "loss": 0.0486, + "step": 12210 + }, + { + "epoch": 0.026949957325558573, + "grad_norm": 0.1764150708913803, + "learning_rate": 2.4438e-05, + "loss": 0.0497, + "step": 12220 + }, + { + "epoch": 0.026972011300456736, + "grad_norm": 0.17557373642921448, + "learning_rate": 2.4458000000000002e-05, + "loss": 0.0468, + "step": 12230 + }, + { + "epoch": 0.026994065275354903, + "grad_norm": 0.1843852549791336, + "learning_rate": 2.4478e-05, + "loss": 0.0483, + "step": 12240 + }, + { + "epoch": 0.02701611925025307, + "grad_norm": 0.18512511253356934, + "learning_rate": 2.4498e-05, + "loss": 0.0465, + "step": 12250 + }, + { + "epoch": 0.027038173225151237, + "grad_norm": 0.19700002670288086, + "learning_rate": 2.4518e-05, + "loss": 0.049, + "step": 12260 + }, + { + "epoch": 0.0270602272000494, + "grad_norm": 0.22664037346839905, + "learning_rate": 2.4537999999999998e-05, + "loss": 0.045, + "step": 12270 + }, + { + "epoch": 0.027082281174947567, + "grad_norm": 0.20212730765342712, + "learning_rate": 2.4558e-05, + "loss": 0.0484, + "step": 12280 + }, + { + "epoch": 0.027104335149845733, + "grad_norm": 0.1963186413049698, + "learning_rate": 2.4578e-05, + "loss": 0.0499, + "step": 12290 + }, + { + "epoch": 0.027126389124743897, + "grad_norm": 0.19785037636756897, + "learning_rate": 2.4598e-05, + "loss": 0.049, + "step": 12300 + }, + { + "epoch": 0.027148443099642063, + "grad_norm": 0.14279329776763916, + "learning_rate": 2.4618e-05, + "loss": 0.0481, + "step": 12310 + }, + { + "epoch": 0.02717049707454023, + "grad_norm": 0.15178143978118896, + "learning_rate": 2.4638e-05, + "loss": 0.0463, + "step": 12320 + }, + { + "epoch": 0.027192551049438397, + "grad_norm": 0.15772607922554016, + "learning_rate": 2.4658e-05, + "loss": 0.0468, + "step": 12330 + }, + { + "epoch": 0.02721460502433656, + "grad_norm": 0.1621614694595337, + "learning_rate": 2.4678e-05, + "loss": 0.0462, + "step": 12340 + }, + { + "epoch": 0.027236658999234727, + "grad_norm": 0.2164505571126938, + "learning_rate": 2.4698000000000003e-05, + "loss": 0.049, + "step": 12350 + }, + { + "epoch": 0.027258712974132893, + "grad_norm": 0.1571478694677353, + "learning_rate": 2.4718e-05, + "loss": 0.0481, + "step": 12360 + }, + { + "epoch": 0.02728076694903106, + "grad_norm": 0.15718615055084229, + "learning_rate": 2.4738e-05, + "loss": 0.0471, + "step": 12370 + }, + { + "epoch": 0.027302820923929223, + "grad_norm": 0.23459330201148987, + "learning_rate": 2.4758000000000002e-05, + "loss": 0.0454, + "step": 12380 + }, + { + "epoch": 0.02732487489882739, + "grad_norm": 0.17911528050899506, + "learning_rate": 2.4778e-05, + "loss": 0.0484, + "step": 12390 + }, + { + "epoch": 0.027346928873725557, + "grad_norm": 0.1898423135280609, + "learning_rate": 2.4798000000000002e-05, + "loss": 0.0499, + "step": 12400 + }, + { + "epoch": 0.027368982848623723, + "grad_norm": 0.16852670907974243, + "learning_rate": 2.4818000000000002e-05, + "loss": 0.0493, + "step": 12410 + }, + { + "epoch": 0.027391036823521887, + "grad_norm": 0.1785620152950287, + "learning_rate": 2.4838e-05, + "loss": 0.0474, + "step": 12420 + }, + { + "epoch": 0.027413090798420053, + "grad_norm": 0.12684215605258942, + "learning_rate": 2.4858e-05, + "loss": 0.0465, + "step": 12430 + }, + { + "epoch": 0.02743514477331822, + "grad_norm": 0.1842130869626999, + "learning_rate": 2.4878e-05, + "loss": 0.0454, + "step": 12440 + }, + { + "epoch": 0.027457198748216383, + "grad_norm": 0.16878929734230042, + "learning_rate": 2.4898e-05, + "loss": 0.0475, + "step": 12450 + }, + { + "epoch": 0.02747925272311455, + "grad_norm": 0.13551223278045654, + "learning_rate": 2.4918e-05, + "loss": 0.0497, + "step": 12460 + }, + { + "epoch": 0.027501306698012717, + "grad_norm": 0.1683470755815506, + "learning_rate": 2.4938e-05, + "loss": 0.0492, + "step": 12470 + }, + { + "epoch": 0.027523360672910883, + "grad_norm": 0.1873762160539627, + "learning_rate": 2.4958e-05, + "loss": 0.0473, + "step": 12480 + }, + { + "epoch": 0.027545414647809047, + "grad_norm": 0.16687490046024323, + "learning_rate": 2.4978e-05, + "loss": 0.046, + "step": 12490 + }, + { + "epoch": 0.027567468622707213, + "grad_norm": 0.14075641334056854, + "learning_rate": 2.4998000000000004e-05, + "loss": 0.0498, + "step": 12500 + }, + { + "epoch": 0.02758952259760538, + "grad_norm": 0.16070406138896942, + "learning_rate": 2.5018e-05, + "loss": 0.0454, + "step": 12510 + }, + { + "epoch": 0.027611576572503547, + "grad_norm": 0.18618714809417725, + "learning_rate": 2.5038e-05, + "loss": 0.0466, + "step": 12520 + }, + { + "epoch": 0.02763363054740171, + "grad_norm": 0.14739008247852325, + "learning_rate": 2.5058000000000003e-05, + "loss": 0.0475, + "step": 12530 + }, + { + "epoch": 0.027655684522299877, + "grad_norm": 0.25327304005622864, + "learning_rate": 2.5078e-05, + "loss": 0.047, + "step": 12540 + }, + { + "epoch": 0.027677738497198043, + "grad_norm": 0.15328551828861237, + "learning_rate": 2.5098000000000003e-05, + "loss": 0.0482, + "step": 12550 + }, + { + "epoch": 0.027699792472096207, + "grad_norm": 0.24242539703845978, + "learning_rate": 2.5118000000000002e-05, + "loss": 0.0495, + "step": 12560 + }, + { + "epoch": 0.027721846446994373, + "grad_norm": 0.20177684724330902, + "learning_rate": 2.5138e-05, + "loss": 0.0496, + "step": 12570 + }, + { + "epoch": 0.02774390042189254, + "grad_norm": 0.15746141970157623, + "learning_rate": 2.5158000000000002e-05, + "loss": 0.0478, + "step": 12580 + }, + { + "epoch": 0.027765954396790707, + "grad_norm": 0.20668038725852966, + "learning_rate": 2.5178000000000002e-05, + "loss": 0.046, + "step": 12590 + }, + { + "epoch": 0.02778800837168887, + "grad_norm": 0.23516622185707092, + "learning_rate": 2.5197999999999998e-05, + "loss": 0.0466, + "step": 12600 + }, + { + "epoch": 0.027810062346587037, + "grad_norm": 0.16114771366119385, + "learning_rate": 2.5218e-05, + "loss": 0.0479, + "step": 12610 + }, + { + "epoch": 0.027832116321485204, + "grad_norm": 0.15172336995601654, + "learning_rate": 2.5238e-05, + "loss": 0.047, + "step": 12620 + }, + { + "epoch": 0.02785417029638337, + "grad_norm": 0.22268320620059967, + "learning_rate": 2.5258e-05, + "loss": 0.0479, + "step": 12630 + }, + { + "epoch": 0.027876224271281533, + "grad_norm": 0.13002991676330566, + "learning_rate": 2.5278e-05, + "loss": 0.0486, + "step": 12640 + }, + { + "epoch": 0.0278982782461797, + "grad_norm": 0.20059365034103394, + "learning_rate": 2.5298e-05, + "loss": 0.0442, + "step": 12650 + }, + { + "epoch": 0.027920332221077867, + "grad_norm": 0.1846139132976532, + "learning_rate": 2.5318e-05, + "loss": 0.0469, + "step": 12660 + }, + { + "epoch": 0.02794238619597603, + "grad_norm": 0.12366990745067596, + "learning_rate": 2.5338e-05, + "loss": 0.047, + "step": 12670 + }, + { + "epoch": 0.027964440170874197, + "grad_norm": 0.194268137216568, + "learning_rate": 2.5358000000000004e-05, + "loss": 0.0465, + "step": 12680 + }, + { + "epoch": 0.027986494145772364, + "grad_norm": 0.12836451828479767, + "learning_rate": 2.5378e-05, + "loss": 0.0501, + "step": 12690 + }, + { + "epoch": 0.02800854812067053, + "grad_norm": 0.17423059046268463, + "learning_rate": 2.5398e-05, + "loss": 0.0452, + "step": 12700 + }, + { + "epoch": 0.028030602095568694, + "grad_norm": 0.15684765577316284, + "learning_rate": 2.5418e-05, + "loss": 0.0493, + "step": 12710 + }, + { + "epoch": 0.02805265607046686, + "grad_norm": 0.18591587245464325, + "learning_rate": 2.5438e-05, + "loss": 0.0465, + "step": 12720 + }, + { + "epoch": 0.028074710045365027, + "grad_norm": 0.15811437368392944, + "learning_rate": 2.5458000000000003e-05, + "loss": 0.0452, + "step": 12730 + }, + { + "epoch": 0.028096764020263194, + "grad_norm": 0.16863958537578583, + "learning_rate": 2.5478e-05, + "loss": 0.0493, + "step": 12740 + }, + { + "epoch": 0.028118817995161357, + "grad_norm": 0.19352257251739502, + "learning_rate": 2.5498e-05, + "loss": 0.0478, + "step": 12750 + }, + { + "epoch": 0.028140871970059524, + "grad_norm": 0.14174148440361023, + "learning_rate": 2.5518000000000002e-05, + "loss": 0.0463, + "step": 12760 + }, + { + "epoch": 0.02816292594495769, + "grad_norm": 0.22428496181964874, + "learning_rate": 2.5538e-05, + "loss": 0.0482, + "step": 12770 + }, + { + "epoch": 0.028184979919855854, + "grad_norm": 0.18139560520648956, + "learning_rate": 2.5558e-05, + "loss": 0.048, + "step": 12780 + }, + { + "epoch": 0.02820703389475402, + "grad_norm": 0.14287497103214264, + "learning_rate": 2.5578e-05, + "loss": 0.0473, + "step": 12790 + }, + { + "epoch": 0.028229087869652187, + "grad_norm": 0.15960693359375, + "learning_rate": 2.5597999999999998e-05, + "loss": 0.0495, + "step": 12800 + }, + { + "epoch": 0.028251141844550354, + "grad_norm": 0.16851718723773956, + "learning_rate": 2.5618e-05, + "loss": 0.0459, + "step": 12810 + }, + { + "epoch": 0.028273195819448517, + "grad_norm": 0.10872913897037506, + "learning_rate": 2.5638e-05, + "loss": 0.048, + "step": 12820 + }, + { + "epoch": 0.028295249794346684, + "grad_norm": 0.15800996124744415, + "learning_rate": 2.5658e-05, + "loss": 0.0463, + "step": 12830 + }, + { + "epoch": 0.02831730376924485, + "grad_norm": 0.20372240245342255, + "learning_rate": 2.5678e-05, + "loss": 0.0467, + "step": 12840 + }, + { + "epoch": 0.028339357744143017, + "grad_norm": 0.16997572779655457, + "learning_rate": 2.5698e-05, + "loss": 0.0479, + "step": 12850 + }, + { + "epoch": 0.02836141171904118, + "grad_norm": 0.15036799013614655, + "learning_rate": 2.5718e-05, + "loss": 0.0464, + "step": 12860 + }, + { + "epoch": 0.028383465693939347, + "grad_norm": 0.17047515511512756, + "learning_rate": 2.5738e-05, + "loss": 0.0455, + "step": 12870 + }, + { + "epoch": 0.028405519668837514, + "grad_norm": 0.169579416513443, + "learning_rate": 2.5758000000000003e-05, + "loss": 0.0449, + "step": 12880 + }, + { + "epoch": 0.028427573643735677, + "grad_norm": 0.12181471288204193, + "learning_rate": 2.5778e-05, + "loss": 0.0482, + "step": 12890 + }, + { + "epoch": 0.028449627618633844, + "grad_norm": 0.18738201260566711, + "learning_rate": 2.5798e-05, + "loss": 0.0474, + "step": 12900 + }, + { + "epoch": 0.02847168159353201, + "grad_norm": 0.1519399732351303, + "learning_rate": 2.5818000000000003e-05, + "loss": 0.0438, + "step": 12910 + }, + { + "epoch": 0.028493735568430177, + "grad_norm": 0.14400826394557953, + "learning_rate": 2.5838e-05, + "loss": 0.0456, + "step": 12920 + }, + { + "epoch": 0.02851578954332834, + "grad_norm": 0.20642298460006714, + "learning_rate": 2.5858000000000002e-05, + "loss": 0.0461, + "step": 12930 + }, + { + "epoch": 0.028537843518226507, + "grad_norm": 0.16632826626300812, + "learning_rate": 2.5878000000000002e-05, + "loss": 0.0457, + "step": 12940 + }, + { + "epoch": 0.028559897493124674, + "grad_norm": 0.2142360657453537, + "learning_rate": 2.5898e-05, + "loss": 0.0472, + "step": 12950 + }, + { + "epoch": 0.02858195146802284, + "grad_norm": 0.17036837339401245, + "learning_rate": 2.5918e-05, + "loss": 0.0448, + "step": 12960 + }, + { + "epoch": 0.028604005442921004, + "grad_norm": 0.15447752177715302, + "learning_rate": 2.5938e-05, + "loss": 0.0472, + "step": 12970 + }, + { + "epoch": 0.02862605941781917, + "grad_norm": 0.15439659357070923, + "learning_rate": 2.5958e-05, + "loss": 0.0456, + "step": 12980 + }, + { + "epoch": 0.028648113392717337, + "grad_norm": 0.1630573570728302, + "learning_rate": 2.5978e-05, + "loss": 0.0491, + "step": 12990 + }, + { + "epoch": 0.028670167367615504, + "grad_norm": 0.17772436141967773, + "learning_rate": 2.5998e-05, + "loss": 0.0478, + "step": 13000 + }, + { + "epoch": 0.028692221342513667, + "grad_norm": 0.17211408913135529, + "learning_rate": 2.6018e-05, + "loss": 0.0449, + "step": 13010 + }, + { + "epoch": 0.028714275317411834, + "grad_norm": 0.20315749943256378, + "learning_rate": 2.6038e-05, + "loss": 0.0485, + "step": 13020 + }, + { + "epoch": 0.02873632929231, + "grad_norm": 0.18563857674598694, + "learning_rate": 2.6058e-05, + "loss": 0.0468, + "step": 13030 + }, + { + "epoch": 0.028758383267208164, + "grad_norm": 0.1626054048538208, + "learning_rate": 2.6078e-05, + "loss": 0.0456, + "step": 13040 + }, + { + "epoch": 0.02878043724210633, + "grad_norm": 0.18349170684814453, + "learning_rate": 2.6098e-05, + "loss": 0.0431, + "step": 13050 + }, + { + "epoch": 0.028802491217004497, + "grad_norm": 0.18530650436878204, + "learning_rate": 2.6118000000000003e-05, + "loss": 0.0485, + "step": 13060 + }, + { + "epoch": 0.028824545191902664, + "grad_norm": 0.14321914315223694, + "learning_rate": 2.6138e-05, + "loss": 0.045, + "step": 13070 + }, + { + "epoch": 0.028846599166800827, + "grad_norm": 0.15629467368125916, + "learning_rate": 2.6158e-05, + "loss": 0.0467, + "step": 13080 + }, + { + "epoch": 0.028868653141698994, + "grad_norm": 0.200401172041893, + "learning_rate": 2.6178000000000002e-05, + "loss": 0.0485, + "step": 13090 + }, + { + "epoch": 0.02889070711659716, + "grad_norm": 0.16690517961978912, + "learning_rate": 2.6198e-05, + "loss": 0.0472, + "step": 13100 + }, + { + "epoch": 0.028912761091495327, + "grad_norm": 0.20065024495124817, + "learning_rate": 2.6218000000000002e-05, + "loss": 0.0481, + "step": 13110 + }, + { + "epoch": 0.02893481506639349, + "grad_norm": 0.13851718604564667, + "learning_rate": 2.6238000000000002e-05, + "loss": 0.0476, + "step": 13120 + }, + { + "epoch": 0.028956869041291657, + "grad_norm": 0.1910465508699417, + "learning_rate": 2.6257999999999998e-05, + "loss": 0.0493, + "step": 13130 + }, + { + "epoch": 0.028978923016189824, + "grad_norm": 0.2520337700843811, + "learning_rate": 2.6278e-05, + "loss": 0.0504, + "step": 13140 + }, + { + "epoch": 0.029000976991087987, + "grad_norm": 0.19681622087955475, + "learning_rate": 2.6298e-05, + "loss": 0.0456, + "step": 13150 + }, + { + "epoch": 0.029023030965986154, + "grad_norm": 0.1660032868385315, + "learning_rate": 2.6318e-05, + "loss": 0.0479, + "step": 13160 + }, + { + "epoch": 0.02904508494088432, + "grad_norm": 0.15975527465343475, + "learning_rate": 2.6338e-05, + "loss": 0.0501, + "step": 13170 + }, + { + "epoch": 0.029067138915782487, + "grad_norm": 0.18404024839401245, + "learning_rate": 2.6358e-05, + "loss": 0.0439, + "step": 13180 + }, + { + "epoch": 0.02908919289068065, + "grad_norm": 0.2019631713628769, + "learning_rate": 2.6378e-05, + "loss": 0.0466, + "step": 13190 + }, + { + "epoch": 0.029111246865578817, + "grad_norm": 0.15521033108234406, + "learning_rate": 2.6398e-05, + "loss": 0.0469, + "step": 13200 + }, + { + "epoch": 0.029133300840476984, + "grad_norm": 0.22537927329540253, + "learning_rate": 2.6418000000000004e-05, + "loss": 0.0437, + "step": 13210 + }, + { + "epoch": 0.02915535481537515, + "grad_norm": 0.19045570492744446, + "learning_rate": 2.6438e-05, + "loss": 0.046, + "step": 13220 + }, + { + "epoch": 0.029177408790273314, + "grad_norm": 0.1534426063299179, + "learning_rate": 2.6458e-05, + "loss": 0.0457, + "step": 13230 + }, + { + "epoch": 0.02919946276517148, + "grad_norm": 0.14271676540374756, + "learning_rate": 2.6478000000000003e-05, + "loss": 0.0481, + "step": 13240 + }, + { + "epoch": 0.029221516740069647, + "grad_norm": 0.16379901766777039, + "learning_rate": 2.6498e-05, + "loss": 0.048, + "step": 13250 + }, + { + "epoch": 0.02924357071496781, + "grad_norm": 0.15865832567214966, + "learning_rate": 2.6518000000000003e-05, + "loss": 0.0461, + "step": 13260 + }, + { + "epoch": 0.029265624689865977, + "grad_norm": 0.19453302025794983, + "learning_rate": 2.6538000000000002e-05, + "loss": 0.047, + "step": 13270 + }, + { + "epoch": 0.029287678664764144, + "grad_norm": 0.1891113668680191, + "learning_rate": 2.6558e-05, + "loss": 0.0479, + "step": 13280 + }, + { + "epoch": 0.02930973263966231, + "grad_norm": 0.15556982159614563, + "learning_rate": 2.6578000000000002e-05, + "loss": 0.0488, + "step": 13290 + }, + { + "epoch": 0.029331786614560474, + "grad_norm": 0.20502987504005432, + "learning_rate": 2.6598000000000002e-05, + "loss": 0.0469, + "step": 13300 + }, + { + "epoch": 0.02935384058945864, + "grad_norm": 0.17239804565906525, + "learning_rate": 2.6618e-05, + "loss": 0.0458, + "step": 13310 + }, + { + "epoch": 0.029375894564356807, + "grad_norm": 0.15920491516590118, + "learning_rate": 2.6638e-05, + "loss": 0.0487, + "step": 13320 + }, + { + "epoch": 0.029397948539254974, + "grad_norm": 0.15464897453784943, + "learning_rate": 2.6657999999999998e-05, + "loss": 0.0459, + "step": 13330 + }, + { + "epoch": 0.029420002514153137, + "grad_norm": 0.17027169466018677, + "learning_rate": 2.6678e-05, + "loss": 0.0494, + "step": 13340 + }, + { + "epoch": 0.029442056489051304, + "grad_norm": 0.15137545764446259, + "learning_rate": 2.6698e-05, + "loss": 0.0475, + "step": 13350 + }, + { + "epoch": 0.02946411046394947, + "grad_norm": 0.14185847342014313, + "learning_rate": 2.6718e-05, + "loss": 0.0476, + "step": 13360 + }, + { + "epoch": 0.029486164438847634, + "grad_norm": 0.1434077024459839, + "learning_rate": 2.6738e-05, + "loss": 0.0485, + "step": 13370 + }, + { + "epoch": 0.0295082184137458, + "grad_norm": 0.21498259902000427, + "learning_rate": 2.6758e-05, + "loss": 0.0488, + "step": 13380 + }, + { + "epoch": 0.029530272388643967, + "grad_norm": 0.180231973528862, + "learning_rate": 2.6778e-05, + "loss": 0.0472, + "step": 13390 + }, + { + "epoch": 0.029552326363542134, + "grad_norm": 0.17247654497623444, + "learning_rate": 2.6798e-05, + "loss": 0.0508, + "step": 13400 + }, + { + "epoch": 0.029574380338440297, + "grad_norm": 0.12864989042282104, + "learning_rate": 2.6818e-05, + "loss": 0.0457, + "step": 13410 + }, + { + "epoch": 0.029596434313338464, + "grad_norm": 0.15071746706962585, + "learning_rate": 2.6838e-05, + "loss": 0.0458, + "step": 13420 + }, + { + "epoch": 0.02961848828823663, + "grad_norm": 0.15236200392246246, + "learning_rate": 2.6858e-05, + "loss": 0.0487, + "step": 13430 + }, + { + "epoch": 0.029640542263134798, + "grad_norm": 0.14639902114868164, + "learning_rate": 2.6878000000000003e-05, + "loss": 0.047, + "step": 13440 + }, + { + "epoch": 0.02966259623803296, + "grad_norm": 0.21576742827892303, + "learning_rate": 2.6898e-05, + "loss": 0.045, + "step": 13450 + }, + { + "epoch": 0.029684650212931128, + "grad_norm": 0.16702035069465637, + "learning_rate": 2.6918e-05, + "loss": 0.0483, + "step": 13460 + }, + { + "epoch": 0.029706704187829294, + "grad_norm": 0.13116976618766785, + "learning_rate": 2.6938000000000002e-05, + "loss": 0.0492, + "step": 13470 + }, + { + "epoch": 0.029728758162727457, + "grad_norm": 0.16232626140117645, + "learning_rate": 2.6958e-05, + "loss": 0.0459, + "step": 13480 + }, + { + "epoch": 0.029750812137625624, + "grad_norm": 0.17902302742004395, + "learning_rate": 2.6978e-05, + "loss": 0.0459, + "step": 13490 + }, + { + "epoch": 0.02977286611252379, + "grad_norm": 0.13475121557712555, + "learning_rate": 2.6998e-05, + "loss": 0.0441, + "step": 13500 + }, + { + "epoch": 0.029794920087421958, + "grad_norm": 0.21501848101615906, + "learning_rate": 2.7017999999999998e-05, + "loss": 0.0455, + "step": 13510 + }, + { + "epoch": 0.02981697406232012, + "grad_norm": 0.1480705291032791, + "learning_rate": 2.7038e-05, + "loss": 0.0472, + "step": 13520 + }, + { + "epoch": 0.029839028037218288, + "grad_norm": 0.142201766371727, + "learning_rate": 2.7058e-05, + "loss": 0.048, + "step": 13530 + }, + { + "epoch": 0.029861082012116454, + "grad_norm": 0.1497841775417328, + "learning_rate": 2.7078e-05, + "loss": 0.0492, + "step": 13540 + }, + { + "epoch": 0.02988313598701462, + "grad_norm": 0.14443975687026978, + "learning_rate": 2.7098e-05, + "loss": 0.0455, + "step": 13550 + }, + { + "epoch": 0.029905189961912784, + "grad_norm": 0.14485104382038116, + "learning_rate": 2.7118e-05, + "loss": 0.0467, + "step": 13560 + }, + { + "epoch": 0.02992724393681095, + "grad_norm": 0.1865907460451126, + "learning_rate": 2.7138e-05, + "loss": 0.0468, + "step": 13570 + }, + { + "epoch": 0.029949297911709118, + "grad_norm": 0.19188910722732544, + "learning_rate": 2.7158e-05, + "loss": 0.0446, + "step": 13580 + }, + { + "epoch": 0.029971351886607284, + "grad_norm": 0.1548086702823639, + "learning_rate": 2.7178000000000003e-05, + "loss": 0.046, + "step": 13590 + }, + { + "epoch": 0.029993405861505448, + "grad_norm": 0.14782312512397766, + "learning_rate": 2.7198e-05, + "loss": 0.0466, + "step": 13600 + }, + { + "epoch": 0.030015459836403614, + "grad_norm": 0.18447528779506683, + "learning_rate": 2.7218e-05, + "loss": 0.0449, + "step": 13610 + }, + { + "epoch": 0.03003751381130178, + "grad_norm": 0.1385408341884613, + "learning_rate": 2.7238000000000002e-05, + "loss": 0.0461, + "step": 13620 + }, + { + "epoch": 0.030059567786199944, + "grad_norm": 0.20159070193767548, + "learning_rate": 2.7258e-05, + "loss": 0.047, + "step": 13630 + }, + { + "epoch": 0.03008162176109811, + "grad_norm": 0.14971517026424408, + "learning_rate": 2.7278000000000002e-05, + "loss": 0.0456, + "step": 13640 + }, + { + "epoch": 0.030103675735996278, + "grad_norm": 0.20525360107421875, + "learning_rate": 2.7298000000000002e-05, + "loss": 0.0456, + "step": 13650 + }, + { + "epoch": 0.030125729710894444, + "grad_norm": 0.22105693817138672, + "learning_rate": 2.7318e-05, + "loss": 0.0463, + "step": 13660 + }, + { + "epoch": 0.030147783685792608, + "grad_norm": 0.12940587103366852, + "learning_rate": 2.7338e-05, + "loss": 0.0458, + "step": 13670 + }, + { + "epoch": 0.030169837660690774, + "grad_norm": 0.19698452949523926, + "learning_rate": 2.7358e-05, + "loss": 0.045, + "step": 13680 + }, + { + "epoch": 0.03019189163558894, + "grad_norm": 0.16359055042266846, + "learning_rate": 2.7378e-05, + "loss": 0.0455, + "step": 13690 + }, + { + "epoch": 0.030213945610487108, + "grad_norm": 0.15211759507656097, + "learning_rate": 2.7398e-05, + "loss": 0.0458, + "step": 13700 + }, + { + "epoch": 0.03023599958538527, + "grad_norm": 0.1702694445848465, + "learning_rate": 2.7418e-05, + "loss": 0.0459, + "step": 13710 + }, + { + "epoch": 0.030258053560283438, + "grad_norm": 0.15375538170337677, + "learning_rate": 2.7438e-05, + "loss": 0.0487, + "step": 13720 + }, + { + "epoch": 0.030280107535181604, + "grad_norm": 0.24555186927318573, + "learning_rate": 2.7458e-05, + "loss": 0.0477, + "step": 13730 + }, + { + "epoch": 0.030302161510079768, + "grad_norm": 0.23133176565170288, + "learning_rate": 2.7478000000000004e-05, + "loss": 0.0471, + "step": 13740 + }, + { + "epoch": 0.030324215484977934, + "grad_norm": 0.17136001586914062, + "learning_rate": 2.7498e-05, + "loss": 0.0473, + "step": 13750 + }, + { + "epoch": 0.0303462694598761, + "grad_norm": 0.18315254151821136, + "learning_rate": 2.7518e-05, + "loss": 0.0502, + "step": 13760 + }, + { + "epoch": 0.030368323434774268, + "grad_norm": 0.19147442281246185, + "learning_rate": 2.7538000000000003e-05, + "loss": 0.0463, + "step": 13770 + }, + { + "epoch": 0.03039037740967243, + "grad_norm": 0.17874620854854584, + "learning_rate": 2.7558e-05, + "loss": 0.0475, + "step": 13780 + }, + { + "epoch": 0.030412431384570598, + "grad_norm": 0.18643783032894135, + "learning_rate": 2.7578000000000003e-05, + "loss": 0.0469, + "step": 13790 + }, + { + "epoch": 0.030434485359468764, + "grad_norm": 0.170736163854599, + "learning_rate": 2.7598000000000002e-05, + "loss": 0.0483, + "step": 13800 + }, + { + "epoch": 0.03045653933436693, + "grad_norm": 0.17115595936775208, + "learning_rate": 2.7618e-05, + "loss": 0.0476, + "step": 13810 + }, + { + "epoch": 0.030478593309265094, + "grad_norm": 0.201929971575737, + "learning_rate": 2.7638000000000002e-05, + "loss": 0.0509, + "step": 13820 + }, + { + "epoch": 0.03050064728416326, + "grad_norm": 0.1426149308681488, + "learning_rate": 2.7658000000000002e-05, + "loss": 0.0459, + "step": 13830 + }, + { + "epoch": 0.030522701259061428, + "grad_norm": 0.1687866747379303, + "learning_rate": 2.7678e-05, + "loss": 0.0449, + "step": 13840 + }, + { + "epoch": 0.03054475523395959, + "grad_norm": 0.15439383685588837, + "learning_rate": 2.7698e-05, + "loss": 0.0472, + "step": 13850 + }, + { + "epoch": 0.030566809208857758, + "grad_norm": 0.18992900848388672, + "learning_rate": 2.7718e-05, + "loss": 0.0485, + "step": 13860 + }, + { + "epoch": 0.030588863183755925, + "grad_norm": 0.20212410390377045, + "learning_rate": 2.7738e-05, + "loss": 0.0488, + "step": 13870 + }, + { + "epoch": 0.03061091715865409, + "grad_norm": 0.1494836062192917, + "learning_rate": 2.7758e-05, + "loss": 0.0463, + "step": 13880 + }, + { + "epoch": 0.030632971133552254, + "grad_norm": 0.19130119681358337, + "learning_rate": 2.7778e-05, + "loss": 0.048, + "step": 13890 + }, + { + "epoch": 0.03065502510845042, + "grad_norm": 0.16722241044044495, + "learning_rate": 2.7798e-05, + "loss": 0.0459, + "step": 13900 + }, + { + "epoch": 0.030677079083348588, + "grad_norm": 0.1748635172843933, + "learning_rate": 2.7818e-05, + "loss": 0.0472, + "step": 13910 + }, + { + "epoch": 0.030699133058246755, + "grad_norm": 0.13833864033222198, + "learning_rate": 2.7838000000000004e-05, + "loss": 0.0464, + "step": 13920 + }, + { + "epoch": 0.030721187033144918, + "grad_norm": 0.17684286832809448, + "learning_rate": 2.7858e-05, + "loss": 0.0455, + "step": 13930 + }, + { + "epoch": 0.030743241008043085, + "grad_norm": 0.21147967875003815, + "learning_rate": 2.7878e-05, + "loss": 0.0466, + "step": 13940 + }, + { + "epoch": 0.03076529498294125, + "grad_norm": 0.16949915885925293, + "learning_rate": 2.7898e-05, + "loss": 0.0478, + "step": 13950 + }, + { + "epoch": 0.030787348957839415, + "grad_norm": 0.1611425131559372, + "learning_rate": 2.7918e-05, + "loss": 0.0503, + "step": 13960 + }, + { + "epoch": 0.03080940293273758, + "grad_norm": 0.14303962886333466, + "learning_rate": 2.7938000000000003e-05, + "loss": 0.0471, + "step": 13970 + }, + { + "epoch": 0.030831456907635748, + "grad_norm": 0.18714861571788788, + "learning_rate": 2.7958e-05, + "loss": 0.0467, + "step": 13980 + }, + { + "epoch": 0.030853510882533915, + "grad_norm": 0.23042020201683044, + "learning_rate": 2.7978e-05, + "loss": 0.0449, + "step": 13990 + }, + { + "epoch": 0.030875564857432078, + "grad_norm": 0.1547527313232422, + "learning_rate": 2.7998000000000002e-05, + "loss": 0.0486, + "step": 14000 + }, + { + "epoch": 0.030897618832330245, + "grad_norm": 0.15742161870002747, + "learning_rate": 2.8018e-05, + "loss": 0.0471, + "step": 14010 + }, + { + "epoch": 0.03091967280722841, + "grad_norm": 0.17717087268829346, + "learning_rate": 2.8038e-05, + "loss": 0.0471, + "step": 14020 + }, + { + "epoch": 0.030941726782126578, + "grad_norm": 0.16724812984466553, + "learning_rate": 2.8058e-05, + "loss": 0.0481, + "step": 14030 + }, + { + "epoch": 0.03096378075702474, + "grad_norm": 0.12492561340332031, + "learning_rate": 2.8077999999999998e-05, + "loss": 0.0442, + "step": 14040 + }, + { + "epoch": 0.030985834731922908, + "grad_norm": 0.1656019389629364, + "learning_rate": 2.8098e-05, + "loss": 0.0468, + "step": 14050 + }, + { + "epoch": 0.031007888706821075, + "grad_norm": 0.1847297102212906, + "learning_rate": 2.8118e-05, + "loss": 0.0493, + "step": 14060 + }, + { + "epoch": 0.031029942681719238, + "grad_norm": 0.1562245488166809, + "learning_rate": 2.8138e-05, + "loss": 0.0474, + "step": 14070 + }, + { + "epoch": 0.031051996656617405, + "grad_norm": 0.1872711181640625, + "learning_rate": 2.8158e-05, + "loss": 0.0464, + "step": 14080 + }, + { + "epoch": 0.03107405063151557, + "grad_norm": 0.2119067758321762, + "learning_rate": 2.8178e-05, + "loss": 0.049, + "step": 14090 + }, + { + "epoch": 0.031096104606413738, + "grad_norm": 0.19016295671463013, + "learning_rate": 2.8198e-05, + "loss": 0.0446, + "step": 14100 + }, + { + "epoch": 0.0311181585813119, + "grad_norm": 0.14369706809520721, + "learning_rate": 2.8218e-05, + "loss": 0.0488, + "step": 14110 + }, + { + "epoch": 0.031140212556210068, + "grad_norm": 0.16937874257564545, + "learning_rate": 2.8238000000000003e-05, + "loss": 0.0446, + "step": 14120 + }, + { + "epoch": 0.031162266531108235, + "grad_norm": 0.19514939188957214, + "learning_rate": 2.8258e-05, + "loss": 0.0474, + "step": 14130 + }, + { + "epoch": 0.0311843205060064, + "grad_norm": 0.18840616941452026, + "learning_rate": 2.8278e-05, + "loss": 0.047, + "step": 14140 + }, + { + "epoch": 0.031206374480904565, + "grad_norm": 0.17779630422592163, + "learning_rate": 2.8298000000000002e-05, + "loss": 0.0444, + "step": 14150 + }, + { + "epoch": 0.03122842845580273, + "grad_norm": 0.13793502748012543, + "learning_rate": 2.8318e-05, + "loss": 0.046, + "step": 14160 + }, + { + "epoch": 0.031250482430700895, + "grad_norm": 0.16791298985481262, + "learning_rate": 2.8338000000000002e-05, + "loss": 0.0468, + "step": 14170 + }, + { + "epoch": 0.031272536405599065, + "grad_norm": 0.18301376700401306, + "learning_rate": 2.8358000000000002e-05, + "loss": 0.0455, + "step": 14180 + }, + { + "epoch": 0.03129459038049723, + "grad_norm": 0.16503553092479706, + "learning_rate": 2.8378e-05, + "loss": 0.0457, + "step": 14190 + }, + { + "epoch": 0.03131664435539539, + "grad_norm": 0.12639553844928741, + "learning_rate": 2.8398e-05, + "loss": 0.0448, + "step": 14200 + }, + { + "epoch": 0.03133869833029356, + "grad_norm": 0.16540560126304626, + "learning_rate": 2.8418e-05, + "loss": 0.0441, + "step": 14210 + }, + { + "epoch": 0.031360752305191725, + "grad_norm": 0.15314680337905884, + "learning_rate": 2.8438e-05, + "loss": 0.0443, + "step": 14220 + }, + { + "epoch": 0.031382806280089895, + "grad_norm": 0.15915250778198242, + "learning_rate": 2.8458e-05, + "loss": 0.0455, + "step": 14230 + }, + { + "epoch": 0.03140486025498806, + "grad_norm": 0.1479964703321457, + "learning_rate": 2.8478e-05, + "loss": 0.0449, + "step": 14240 + }, + { + "epoch": 0.03142691422988622, + "grad_norm": 0.16534070670604706, + "learning_rate": 2.8498e-05, + "loss": 0.0474, + "step": 14250 + }, + { + "epoch": 0.03144896820478439, + "grad_norm": 0.20025178790092468, + "learning_rate": 2.8518e-05, + "loss": 0.0455, + "step": 14260 + }, + { + "epoch": 0.031471022179682555, + "grad_norm": 0.1956511288881302, + "learning_rate": 2.8538e-05, + "loss": 0.0443, + "step": 14270 + }, + { + "epoch": 0.03149307615458072, + "grad_norm": 0.17185235023498535, + "learning_rate": 2.8558e-05, + "loss": 0.0473, + "step": 14280 + }, + { + "epoch": 0.03151513012947889, + "grad_norm": 0.19284214079380035, + "learning_rate": 2.8578e-05, + "loss": 0.0488, + "step": 14290 + }, + { + "epoch": 0.03153718410437705, + "grad_norm": 0.11601866036653519, + "learning_rate": 2.8598000000000003e-05, + "loss": 0.0447, + "step": 14300 + }, + { + "epoch": 0.031559238079275215, + "grad_norm": 0.1352609246969223, + "learning_rate": 2.8618e-05, + "loss": 0.045, + "step": 14310 + }, + { + "epoch": 0.031581292054173385, + "grad_norm": 0.16082115471363068, + "learning_rate": 2.8638e-05, + "loss": 0.0473, + "step": 14320 + }, + { + "epoch": 0.03160334602907155, + "grad_norm": 0.15041808784008026, + "learning_rate": 2.8658000000000002e-05, + "loss": 0.0422, + "step": 14330 + }, + { + "epoch": 0.03162540000396972, + "grad_norm": 0.13859325647354126, + "learning_rate": 2.8678e-05, + "loss": 0.0443, + "step": 14340 + }, + { + "epoch": 0.03164745397886788, + "grad_norm": 0.20022419095039368, + "learning_rate": 2.8698000000000002e-05, + "loss": 0.0493, + "step": 14350 + }, + { + "epoch": 0.031669507953766045, + "grad_norm": 0.16862517595291138, + "learning_rate": 2.8718000000000002e-05, + "loss": 0.0453, + "step": 14360 + }, + { + "epoch": 0.031691561928664215, + "grad_norm": 0.17139111459255219, + "learning_rate": 2.8737999999999998e-05, + "loss": 0.0442, + "step": 14370 + }, + { + "epoch": 0.03171361590356238, + "grad_norm": 0.13291610777378082, + "learning_rate": 2.8758e-05, + "loss": 0.0484, + "step": 14380 + }, + { + "epoch": 0.03173566987846054, + "grad_norm": 0.1339268684387207, + "learning_rate": 2.8778e-05, + "loss": 0.0454, + "step": 14390 + }, + { + "epoch": 0.03175772385335871, + "grad_norm": 0.19643299281597137, + "learning_rate": 2.8798e-05, + "loss": 0.0457, + "step": 14400 + }, + { + "epoch": 0.031779777828256875, + "grad_norm": 0.20999270677566528, + "learning_rate": 2.8818e-05, + "loss": 0.0493, + "step": 14410 + }, + { + "epoch": 0.031801831803155045, + "grad_norm": 0.19520923495292664, + "learning_rate": 2.8838e-05, + "loss": 0.047, + "step": 14420 + }, + { + "epoch": 0.03182388577805321, + "grad_norm": 0.10664965957403183, + "learning_rate": 2.8858e-05, + "loss": 0.0448, + "step": 14430 + }, + { + "epoch": 0.03184593975295137, + "grad_norm": 0.18497170507907867, + "learning_rate": 2.8878e-05, + "loss": 0.048, + "step": 14440 + }, + { + "epoch": 0.03186799372784954, + "grad_norm": 0.18721210956573486, + "learning_rate": 2.8898000000000004e-05, + "loss": 0.0437, + "step": 14450 + }, + { + "epoch": 0.031890047702747705, + "grad_norm": 0.13650934398174286, + "learning_rate": 2.8918e-05, + "loss": 0.0459, + "step": 14460 + }, + { + "epoch": 0.03191210167764587, + "grad_norm": 0.14168265461921692, + "learning_rate": 2.8938e-05, + "loss": 0.0459, + "step": 14470 + }, + { + "epoch": 0.03193415565254404, + "grad_norm": 0.17137876152992249, + "learning_rate": 2.8958000000000003e-05, + "loss": 0.05, + "step": 14480 + }, + { + "epoch": 0.0319562096274422, + "grad_norm": 0.14616720378398895, + "learning_rate": 2.8978e-05, + "loss": 0.0457, + "step": 14490 + }, + { + "epoch": 0.031978263602340365, + "grad_norm": 0.15721125900745392, + "learning_rate": 2.8998000000000003e-05, + "loss": 0.0461, + "step": 14500 + }, + { + "epoch": 0.032000317577238535, + "grad_norm": 0.1745813637971878, + "learning_rate": 2.9018000000000002e-05, + "loss": 0.0464, + "step": 14510 + }, + { + "epoch": 0.0320223715521367, + "grad_norm": 0.16590188443660736, + "learning_rate": 2.9038e-05, + "loss": 0.0456, + "step": 14520 + }, + { + "epoch": 0.03204442552703487, + "grad_norm": 0.1968233436346054, + "learning_rate": 2.9058000000000002e-05, + "loss": 0.048, + "step": 14530 + }, + { + "epoch": 0.03206647950193303, + "grad_norm": 0.138576477766037, + "learning_rate": 2.9078000000000002e-05, + "loss": 0.0458, + "step": 14540 + }, + { + "epoch": 0.032088533476831195, + "grad_norm": 0.14617830514907837, + "learning_rate": 2.9098e-05, + "loss": 0.0448, + "step": 14550 + }, + { + "epoch": 0.032110587451729365, + "grad_norm": 0.189032644033432, + "learning_rate": 2.9118e-05, + "loss": 0.0459, + "step": 14560 + }, + { + "epoch": 0.03213264142662753, + "grad_norm": 0.23984314501285553, + "learning_rate": 2.9137999999999998e-05, + "loss": 0.0463, + "step": 14570 + }, + { + "epoch": 0.03215469540152569, + "grad_norm": 0.15773019194602966, + "learning_rate": 2.9158e-05, + "loss": 0.0472, + "step": 14580 + }, + { + "epoch": 0.03217674937642386, + "grad_norm": 0.14879532158374786, + "learning_rate": 2.9178e-05, + "loss": 0.0462, + "step": 14590 + }, + { + "epoch": 0.032198803351322025, + "grad_norm": 0.15320642292499542, + "learning_rate": 2.9198e-05, + "loss": 0.046, + "step": 14600 + }, + { + "epoch": 0.03222085732622019, + "grad_norm": 0.15112537145614624, + "learning_rate": 2.9218e-05, + "loss": 0.0464, + "step": 14610 + }, + { + "epoch": 0.03224291130111836, + "grad_norm": 0.14901308715343475, + "learning_rate": 2.9238e-05, + "loss": 0.0483, + "step": 14620 + }, + { + "epoch": 0.03226496527601652, + "grad_norm": 0.15828612446784973, + "learning_rate": 2.9258e-05, + "loss": 0.045, + "step": 14630 + }, + { + "epoch": 0.03228701925091469, + "grad_norm": 0.13923898339271545, + "learning_rate": 2.9278e-05, + "loss": 0.0473, + "step": 14640 + }, + { + "epoch": 0.032309073225812855, + "grad_norm": 0.15514932572841644, + "learning_rate": 2.9298000000000003e-05, + "loss": 0.0458, + "step": 14650 + }, + { + "epoch": 0.03233112720071102, + "grad_norm": 0.15768370032310486, + "learning_rate": 2.9318e-05, + "loss": 0.0488, + "step": 14660 + }, + { + "epoch": 0.03235318117560919, + "grad_norm": 0.18186424672603607, + "learning_rate": 2.9338e-05, + "loss": 0.0468, + "step": 14670 + }, + { + "epoch": 0.03237523515050735, + "grad_norm": 0.13170965015888214, + "learning_rate": 2.9358000000000003e-05, + "loss": 0.0481, + "step": 14680 + }, + { + "epoch": 0.032397289125405515, + "grad_norm": 0.14656053483486176, + "learning_rate": 2.9378e-05, + "loss": 0.044, + "step": 14690 + }, + { + "epoch": 0.032419343100303685, + "grad_norm": 0.14418958127498627, + "learning_rate": 2.9398000000000002e-05, + "loss": 0.046, + "step": 14700 + }, + { + "epoch": 0.03244139707520185, + "grad_norm": 0.14936614036560059, + "learning_rate": 2.9418000000000002e-05, + "loss": 0.0475, + "step": 14710 + }, + { + "epoch": 0.03246345105010001, + "grad_norm": 0.16814082860946655, + "learning_rate": 2.9438e-05, + "loss": 0.0484, + "step": 14720 + }, + { + "epoch": 0.03248550502499818, + "grad_norm": 0.18004222214221954, + "learning_rate": 2.9458e-05, + "loss": 0.0471, + "step": 14730 + }, + { + "epoch": 0.032507558999896345, + "grad_norm": 0.19229580461978912, + "learning_rate": 2.9478e-05, + "loss": 0.0457, + "step": 14740 + }, + { + "epoch": 0.032529612974794515, + "grad_norm": 0.13171425461769104, + "learning_rate": 2.9497999999999998e-05, + "loss": 0.0458, + "step": 14750 + }, + { + "epoch": 0.03255166694969268, + "grad_norm": 0.19796909391880035, + "learning_rate": 2.9518e-05, + "loss": 0.0491, + "step": 14760 + }, + { + "epoch": 0.03257372092459084, + "grad_norm": 0.18899787962436676, + "learning_rate": 2.9538e-05, + "loss": 0.0449, + "step": 14770 + }, + { + "epoch": 0.03259577489948901, + "grad_norm": 0.156744122505188, + "learning_rate": 2.9558e-05, + "loss": 0.0456, + "step": 14780 + }, + { + "epoch": 0.032617828874387175, + "grad_norm": 0.14983601868152618, + "learning_rate": 2.9578e-05, + "loss": 0.0467, + "step": 14790 + }, + { + "epoch": 0.03263988284928534, + "grad_norm": 0.18528655171394348, + "learning_rate": 2.9598e-05, + "loss": 0.0462, + "step": 14800 + }, + { + "epoch": 0.03266193682418351, + "grad_norm": 0.14782869815826416, + "learning_rate": 2.9618e-05, + "loss": 0.0466, + "step": 14810 + }, + { + "epoch": 0.03268399079908167, + "grad_norm": 0.22728100419044495, + "learning_rate": 2.9638e-05, + "loss": 0.0481, + "step": 14820 + }, + { + "epoch": 0.032706044773979835, + "grad_norm": 0.19546189904212952, + "learning_rate": 2.9658000000000003e-05, + "loss": 0.0463, + "step": 14830 + }, + { + "epoch": 0.032728098748878005, + "grad_norm": 0.1453908085823059, + "learning_rate": 2.9678e-05, + "loss": 0.0457, + "step": 14840 + }, + { + "epoch": 0.03275015272377617, + "grad_norm": 0.1387641280889511, + "learning_rate": 2.9698e-05, + "loss": 0.0441, + "step": 14850 + }, + { + "epoch": 0.03277220669867434, + "grad_norm": 0.13814206421375275, + "learning_rate": 2.9718000000000002e-05, + "loss": 0.0455, + "step": 14860 + }, + { + "epoch": 0.0327942606735725, + "grad_norm": 0.13723410665988922, + "learning_rate": 2.9738e-05, + "loss": 0.0445, + "step": 14870 + }, + { + "epoch": 0.032816314648470665, + "grad_norm": 0.14278635382652283, + "learning_rate": 2.9758000000000002e-05, + "loss": 0.0462, + "step": 14880 + }, + { + "epoch": 0.032838368623368835, + "grad_norm": 0.17922042310237885, + "learning_rate": 2.9778000000000002e-05, + "loss": 0.0479, + "step": 14890 + }, + { + "epoch": 0.032860422598267, + "grad_norm": 0.17110882699489594, + "learning_rate": 2.9797999999999998e-05, + "loss": 0.0468, + "step": 14900 + }, + { + "epoch": 0.03288247657316516, + "grad_norm": 0.15550518035888672, + "learning_rate": 2.9818e-05, + "loss": 0.0453, + "step": 14910 + }, + { + "epoch": 0.03290453054806333, + "grad_norm": 0.17610272765159607, + "learning_rate": 2.9838e-05, + "loss": 0.0451, + "step": 14920 + }, + { + "epoch": 0.032926584522961495, + "grad_norm": 0.1489662081003189, + "learning_rate": 2.9858e-05, + "loss": 0.0468, + "step": 14930 + }, + { + "epoch": 0.03294863849785966, + "grad_norm": 0.1523844301700592, + "learning_rate": 2.9878e-05, + "loss": 0.0447, + "step": 14940 + }, + { + "epoch": 0.03297069247275783, + "grad_norm": 0.1355103999376297, + "learning_rate": 2.9898e-05, + "loss": 0.0443, + "step": 14950 + }, + { + "epoch": 0.03299274644765599, + "grad_norm": 0.211969792842865, + "learning_rate": 2.9918e-05, + "loss": 0.0457, + "step": 14960 + }, + { + "epoch": 0.03301480042255416, + "grad_norm": 0.16637063026428223, + "learning_rate": 2.9938e-05, + "loss": 0.0484, + "step": 14970 + }, + { + "epoch": 0.033036854397452325, + "grad_norm": 0.15146584808826447, + "learning_rate": 2.9958000000000004e-05, + "loss": 0.0481, + "step": 14980 + }, + { + "epoch": 0.03305890837235049, + "grad_norm": 0.15329474210739136, + "learning_rate": 2.9978e-05, + "loss": 0.0459, + "step": 14990 + }, + { + "epoch": 0.03308096234724866, + "grad_norm": 0.15692239999771118, + "learning_rate": 2.9998e-05, + "loss": 0.0442, + "step": 15000 + }, + { + "epoch": 0.03310301632214682, + "grad_norm": 0.13887931406497955, + "learning_rate": 2.9999999926183014e-05, + "loss": 0.046, + "step": 15010 + }, + { + "epoch": 0.033125070297044985, + "grad_norm": 0.1864013522863388, + "learning_rate": 2.999999967101319e-05, + "loss": 0.0463, + "step": 15020 + }, + { + "epoch": 0.033147124271943156, + "grad_norm": 0.10857298970222473, + "learning_rate": 2.9999999233579203e-05, + "loss": 0.0446, + "step": 15030 + }, + { + "epoch": 0.03316917824684132, + "grad_norm": 0.23157966136932373, + "learning_rate": 2.999999861388107e-05, + "loss": 0.0474, + "step": 15040 + }, + { + "epoch": 0.03319123222173948, + "grad_norm": 0.16245384514331818, + "learning_rate": 2.9999997811918786e-05, + "loss": 0.0495, + "step": 15050 + }, + { + "epoch": 0.03321328619663765, + "grad_norm": 0.12926553189754486, + "learning_rate": 2.9999996827692363e-05, + "loss": 0.0454, + "step": 15060 + }, + { + "epoch": 0.033235340171535815, + "grad_norm": 0.15237529575824738, + "learning_rate": 2.999999566120182e-05, + "loss": 0.0487, + "step": 15070 + }, + { + "epoch": 0.033257394146433986, + "grad_norm": 0.17366312444210052, + "learning_rate": 2.999999431244717e-05, + "loss": 0.0448, + "step": 15080 + }, + { + "epoch": 0.03327944812133215, + "grad_norm": 0.15386749804019928, + "learning_rate": 2.999999278142842e-05, + "loss": 0.049, + "step": 15090 + }, + { + "epoch": 0.03330150209623031, + "grad_norm": 0.14204446971416473, + "learning_rate": 2.9999991068145606e-05, + "loss": 0.0452, + "step": 15100 + }, + { + "epoch": 0.03332355607112848, + "grad_norm": 0.17382098734378815, + "learning_rate": 2.999998917259873e-05, + "loss": 0.0446, + "step": 15110 + }, + { + "epoch": 0.033345610046026646, + "grad_norm": 0.16209334135055542, + "learning_rate": 2.9999987094787824e-05, + "loss": 0.0462, + "step": 15120 + }, + { + "epoch": 0.03336766402092481, + "grad_norm": 0.1455903947353363, + "learning_rate": 2.9999984834712914e-05, + "loss": 0.0468, + "step": 15130 + }, + { + "epoch": 0.03338971799582298, + "grad_norm": 0.17260028421878815, + "learning_rate": 2.9999982392374025e-05, + "loss": 0.0468, + "step": 15140 + }, + { + "epoch": 0.03341177197072114, + "grad_norm": 0.14122024178504944, + "learning_rate": 2.999997976777119e-05, + "loss": 0.0471, + "step": 15150 + }, + { + "epoch": 0.033433825945619305, + "grad_norm": 0.14442487061023712, + "learning_rate": 2.9999976960904437e-05, + "loss": 0.0441, + "step": 15160 + }, + { + "epoch": 0.033455879920517476, + "grad_norm": 0.21378545463085175, + "learning_rate": 2.99999739717738e-05, + "loss": 0.0471, + "step": 15170 + }, + { + "epoch": 0.03347793389541564, + "grad_norm": 0.16575412452220917, + "learning_rate": 2.9999970800379316e-05, + "loss": 0.0465, + "step": 15180 + }, + { + "epoch": 0.03349998787031381, + "grad_norm": 0.15736399590969086, + "learning_rate": 2.999996744672103e-05, + "loss": 0.0451, + "step": 15190 + }, + { + "epoch": 0.03352204184521197, + "grad_norm": 0.15028883516788483, + "learning_rate": 2.999996391079897e-05, + "loss": 0.0476, + "step": 15200 + }, + { + "epoch": 0.033544095820110136, + "grad_norm": 0.1738930493593216, + "learning_rate": 2.999996019261319e-05, + "loss": 0.0503, + "step": 15210 + }, + { + "epoch": 0.033566149795008306, + "grad_norm": 0.15197120606899261, + "learning_rate": 2.999995629216373e-05, + "loss": 0.0439, + "step": 15220 + }, + { + "epoch": 0.03358820376990647, + "grad_norm": 0.16266201436519623, + "learning_rate": 2.9999952209450644e-05, + "loss": 0.0464, + "step": 15230 + }, + { + "epoch": 0.03361025774480463, + "grad_norm": 0.11937083303928375, + "learning_rate": 2.999994794447397e-05, + "loss": 0.0443, + "step": 15240 + }, + { + "epoch": 0.0336323117197028, + "grad_norm": 0.15876245498657227, + "learning_rate": 2.999994349723377e-05, + "loss": 0.0442, + "step": 15250 + }, + { + "epoch": 0.033654365694600966, + "grad_norm": 0.13293586671352386, + "learning_rate": 2.999993886773009e-05, + "loss": 0.0456, + "step": 15260 + }, + { + "epoch": 0.03367641966949913, + "grad_norm": 0.13078777492046356, + "learning_rate": 2.9999934055962996e-05, + "loss": 0.0468, + "step": 15270 + }, + { + "epoch": 0.0336984736443973, + "grad_norm": 0.1664464771747589, + "learning_rate": 2.9999929061932535e-05, + "loss": 0.0434, + "step": 15280 + }, + { + "epoch": 0.03372052761929546, + "grad_norm": 0.2157633751630783, + "learning_rate": 2.999992388563878e-05, + "loss": 0.0445, + "step": 15290 + }, + { + "epoch": 0.03374258159419363, + "grad_norm": 0.20275279879570007, + "learning_rate": 2.9999918527081782e-05, + "loss": 0.0463, + "step": 15300 + }, + { + "epoch": 0.033764635569091796, + "grad_norm": 0.148171529173851, + "learning_rate": 2.9999912986261613e-05, + "loss": 0.0474, + "step": 15310 + }, + { + "epoch": 0.03378668954398996, + "grad_norm": 0.12368163466453552, + "learning_rate": 2.999990726317834e-05, + "loss": 0.0433, + "step": 15320 + }, + { + "epoch": 0.03380874351888813, + "grad_norm": 0.1698303073644638, + "learning_rate": 2.9999901357832033e-05, + "loss": 0.0445, + "step": 15330 + }, + { + "epoch": 0.03383079749378629, + "grad_norm": 0.1843125969171524, + "learning_rate": 2.9999895270222762e-05, + "loss": 0.0452, + "step": 15340 + }, + { + "epoch": 0.033852851468684456, + "grad_norm": 0.15280534327030182, + "learning_rate": 2.9999889000350594e-05, + "loss": 0.0464, + "step": 15350 + }, + { + "epoch": 0.033874905443582626, + "grad_norm": 0.15575696527957916, + "learning_rate": 2.9999882548215622e-05, + "loss": 0.0463, + "step": 15360 + }, + { + "epoch": 0.03389695941848079, + "grad_norm": 0.13653725385665894, + "learning_rate": 2.9999875913817913e-05, + "loss": 0.0474, + "step": 15370 + }, + { + "epoch": 0.03391901339337895, + "grad_norm": 0.16028818488121033, + "learning_rate": 2.9999869097157544e-05, + "loss": 0.0463, + "step": 15380 + }, + { + "epoch": 0.03394106736827712, + "grad_norm": 0.18218259513378143, + "learning_rate": 2.999986209823461e-05, + "loss": 0.0445, + "step": 15390 + }, + { + "epoch": 0.033963121343175286, + "grad_norm": 0.1623292714357376, + "learning_rate": 2.9999854917049183e-05, + "loss": 0.0453, + "step": 15400 + }, + { + "epoch": 0.033985175318073456, + "grad_norm": 0.13847419619560242, + "learning_rate": 2.9999847553601363e-05, + "loss": 0.0438, + "step": 15410 + }, + { + "epoch": 0.03400722929297162, + "grad_norm": 0.1381678581237793, + "learning_rate": 2.9999840007891226e-05, + "loss": 0.0441, + "step": 15420 + }, + { + "epoch": 0.03402928326786978, + "grad_norm": 0.10970080643892288, + "learning_rate": 2.9999832279918878e-05, + "loss": 0.0463, + "step": 15430 + }, + { + "epoch": 0.03405133724276795, + "grad_norm": 0.14500144124031067, + "learning_rate": 2.9999824369684395e-05, + "loss": 0.0451, + "step": 15440 + }, + { + "epoch": 0.034073391217666116, + "grad_norm": 0.18109162151813507, + "learning_rate": 2.9999816277187897e-05, + "loss": 0.0487, + "step": 15450 + }, + { + "epoch": 0.03409544519256428, + "grad_norm": 0.1303819715976715, + "learning_rate": 2.999980800242946e-05, + "loss": 0.0426, + "step": 15460 + }, + { + "epoch": 0.03411749916746245, + "grad_norm": 0.12526026368141174, + "learning_rate": 2.9999799545409195e-05, + "loss": 0.0467, + "step": 15470 + }, + { + "epoch": 0.03413955314236061, + "grad_norm": 0.12968847155570984, + "learning_rate": 2.9999790906127203e-05, + "loss": 0.047, + "step": 15480 + }, + { + "epoch": 0.034161607117258776, + "grad_norm": 0.14515769481658936, + "learning_rate": 2.9999782084583597e-05, + "loss": 0.0464, + "step": 15490 + }, + { + "epoch": 0.034183661092156946, + "grad_norm": 0.12359938770532608, + "learning_rate": 2.999977308077847e-05, + "loss": 0.0437, + "step": 15500 + }, + { + "epoch": 0.03420571506705511, + "grad_norm": 0.12878717482089996, + "learning_rate": 2.9999763894711933e-05, + "loss": 0.0472, + "step": 15510 + }, + { + "epoch": 0.03422776904195328, + "grad_norm": 0.16466914117336273, + "learning_rate": 2.999975452638411e-05, + "loss": 0.0463, + "step": 15520 + }, + { + "epoch": 0.03424982301685144, + "grad_norm": 0.1387498676776886, + "learning_rate": 2.999974497579511e-05, + "loss": 0.0489, + "step": 15530 + }, + { + "epoch": 0.034271876991749606, + "grad_norm": 0.196039617061615, + "learning_rate": 2.9999735242945037e-05, + "loss": 0.0449, + "step": 15540 + }, + { + "epoch": 0.034293930966647776, + "grad_norm": 0.14948910474777222, + "learning_rate": 2.999972532783402e-05, + "loss": 0.0457, + "step": 15550 + }, + { + "epoch": 0.03431598494154594, + "grad_norm": 0.17398568987846375, + "learning_rate": 2.9999715230462182e-05, + "loss": 0.0474, + "step": 15560 + }, + { + "epoch": 0.0343380389164441, + "grad_norm": 0.12802262604236603, + "learning_rate": 2.9999704950829646e-05, + "loss": 0.0472, + "step": 15570 + }, + { + "epoch": 0.03436009289134227, + "grad_norm": 0.13282813131809235, + "learning_rate": 2.9999694488936525e-05, + "loss": 0.0438, + "step": 15580 + }, + { + "epoch": 0.034382146866240436, + "grad_norm": 0.19716425240039825, + "learning_rate": 2.999968384478296e-05, + "loss": 0.0452, + "step": 15590 + }, + { + "epoch": 0.034404200841138606, + "grad_norm": 0.18410833179950714, + "learning_rate": 2.9999673018369068e-05, + "loss": 0.0456, + "step": 15600 + }, + { + "epoch": 0.03442625481603677, + "grad_norm": 0.18138228356838226, + "learning_rate": 2.9999662009694993e-05, + "loss": 0.0464, + "step": 15610 + }, + { + "epoch": 0.03444830879093493, + "grad_norm": 0.18573223054409027, + "learning_rate": 2.999965081876086e-05, + "loss": 0.0427, + "step": 15620 + }, + { + "epoch": 0.0344703627658331, + "grad_norm": 0.1661141812801361, + "learning_rate": 2.9999639445566808e-05, + "loss": 0.044, + "step": 15630 + }, + { + "epoch": 0.034492416740731266, + "grad_norm": 0.13738515973091125, + "learning_rate": 2.9999627890112972e-05, + "loss": 0.0451, + "step": 15640 + }, + { + "epoch": 0.03451447071562943, + "grad_norm": 0.1697273552417755, + "learning_rate": 2.99996161523995e-05, + "loss": 0.0463, + "step": 15650 + }, + { + "epoch": 0.0345365246905276, + "grad_norm": 0.14832450449466705, + "learning_rate": 2.9999604232426526e-05, + "loss": 0.0486, + "step": 15660 + }, + { + "epoch": 0.03455857866542576, + "grad_norm": 0.15451723337173462, + "learning_rate": 2.9999592130194197e-05, + "loss": 0.0481, + "step": 15670 + }, + { + "epoch": 0.034580632640323926, + "grad_norm": 0.16160430014133453, + "learning_rate": 2.9999579845702668e-05, + "loss": 0.0435, + "step": 15680 + }, + { + "epoch": 0.034602686615222096, + "grad_norm": 0.22431713342666626, + "learning_rate": 2.9999567378952074e-05, + "loss": 0.0464, + "step": 15690 + }, + { + "epoch": 0.03462474059012026, + "grad_norm": 0.2042677402496338, + "learning_rate": 2.9999554729942583e-05, + "loss": 0.0461, + "step": 15700 + }, + { + "epoch": 0.03464679456501843, + "grad_norm": 0.1463322788476944, + "learning_rate": 2.9999541898674335e-05, + "loss": 0.0452, + "step": 15710 + }, + { + "epoch": 0.03466884853991659, + "grad_norm": 0.1465679407119751, + "learning_rate": 2.999952888514749e-05, + "loss": 0.0453, + "step": 15720 + }, + { + "epoch": 0.034690902514814756, + "grad_norm": 0.17681355774402618, + "learning_rate": 2.999951568936221e-05, + "loss": 0.0458, + "step": 15730 + }, + { + "epoch": 0.034712956489712926, + "grad_norm": 0.1781127154827118, + "learning_rate": 2.999950231131865e-05, + "loss": 0.047, + "step": 15740 + }, + { + "epoch": 0.03473501046461109, + "grad_norm": 0.14723040163516998, + "learning_rate": 2.999948875101698e-05, + "loss": 0.0454, + "step": 15750 + }, + { + "epoch": 0.03475706443950925, + "grad_norm": 0.17343075573444366, + "learning_rate": 2.9999475008457354e-05, + "loss": 0.047, + "step": 15760 + }, + { + "epoch": 0.03477911841440742, + "grad_norm": 0.16784629225730896, + "learning_rate": 2.9999461083639946e-05, + "loss": 0.0451, + "step": 15770 + }, + { + "epoch": 0.034801172389305586, + "grad_norm": 0.20920410752296448, + "learning_rate": 2.9999446976564924e-05, + "loss": 0.0464, + "step": 15780 + }, + { + "epoch": 0.03482322636420375, + "grad_norm": 0.1892329901456833, + "learning_rate": 2.9999432687232464e-05, + "loss": 0.0447, + "step": 15790 + }, + { + "epoch": 0.03484528033910192, + "grad_norm": 0.12959614396095276, + "learning_rate": 2.9999418215642732e-05, + "loss": 0.0443, + "step": 15800 + }, + { + "epoch": 0.03486733431400008, + "grad_norm": 0.16924886405467987, + "learning_rate": 2.99994035617959e-05, + "loss": 0.043, + "step": 15810 + }, + { + "epoch": 0.03488938828889825, + "grad_norm": 0.1865646094083786, + "learning_rate": 2.9999388725692164e-05, + "loss": 0.0436, + "step": 15820 + }, + { + "epoch": 0.034911442263796416, + "grad_norm": 0.14287611842155457, + "learning_rate": 2.9999373707331687e-05, + "loss": 0.0446, + "step": 15830 + }, + { + "epoch": 0.03493349623869458, + "grad_norm": 0.14014513790607452, + "learning_rate": 2.999935850671466e-05, + "loss": 0.0456, + "step": 15840 + }, + { + "epoch": 0.03495555021359275, + "grad_norm": 0.14021261036396027, + "learning_rate": 2.9999343123841265e-05, + "loss": 0.0442, + "step": 15850 + }, + { + "epoch": 0.03497760418849091, + "grad_norm": 0.16304688155651093, + "learning_rate": 2.999932755871169e-05, + "loss": 0.0449, + "step": 15860 + }, + { + "epoch": 0.034999658163389076, + "grad_norm": 0.1607753485441208, + "learning_rate": 2.999931181132612e-05, + "loss": 0.048, + "step": 15870 + }, + { + "epoch": 0.035021712138287246, + "grad_norm": 0.21680933237075806, + "learning_rate": 2.9999295881684756e-05, + "loss": 0.0487, + "step": 15880 + }, + { + "epoch": 0.03504376611318541, + "grad_norm": 0.17246560752391815, + "learning_rate": 2.9999279769787776e-05, + "loss": 0.0484, + "step": 15890 + }, + { + "epoch": 0.03506582008808357, + "grad_norm": 0.15946480631828308, + "learning_rate": 2.999926347563539e-05, + "loss": 0.0467, + "step": 15900 + }, + { + "epoch": 0.03508787406298174, + "grad_norm": 0.20313313603401184, + "learning_rate": 2.999924699922779e-05, + "loss": 0.046, + "step": 15910 + }, + { + "epoch": 0.035109928037879906, + "grad_norm": 0.17009039223194122, + "learning_rate": 2.9999230340565183e-05, + "loss": 0.0482, + "step": 15920 + }, + { + "epoch": 0.035131982012778076, + "grad_norm": 0.1558743119239807, + "learning_rate": 2.9999213499647762e-05, + "loss": 0.0466, + "step": 15930 + }, + { + "epoch": 0.03515403598767624, + "grad_norm": 0.17878901958465576, + "learning_rate": 2.9999196476475736e-05, + "loss": 0.0456, + "step": 15940 + }, + { + "epoch": 0.0351760899625744, + "grad_norm": 0.16840359568595886, + "learning_rate": 2.9999179271049308e-05, + "loss": 0.0452, + "step": 15950 + }, + { + "epoch": 0.03519814393747257, + "grad_norm": 0.18718630075454712, + "learning_rate": 2.9999161883368695e-05, + "loss": 0.0444, + "step": 15960 + }, + { + "epoch": 0.035220197912370736, + "grad_norm": 0.13419488072395325, + "learning_rate": 2.99991443134341e-05, + "loss": 0.0463, + "step": 15970 + }, + { + "epoch": 0.0352422518872689, + "grad_norm": 0.12599915266036987, + "learning_rate": 2.9999126561245742e-05, + "loss": 0.0476, + "step": 15980 + }, + { + "epoch": 0.03526430586216707, + "grad_norm": 0.1641278862953186, + "learning_rate": 2.9999108626803833e-05, + "loss": 0.0454, + "step": 15990 + }, + { + "epoch": 0.03528635983706523, + "grad_norm": 0.11996762454509735, + "learning_rate": 2.9999090510108596e-05, + "loss": 0.0434, + "step": 16000 + }, + { + "epoch": 0.035308413811963396, + "grad_norm": 0.12569352984428406, + "learning_rate": 2.9999072211160244e-05, + "loss": 0.046, + "step": 16010 + }, + { + "epoch": 0.035330467786861566, + "grad_norm": 0.18336810171604156, + "learning_rate": 2.9999053729959e-05, + "loss": 0.0449, + "step": 16020 + }, + { + "epoch": 0.03535252176175973, + "grad_norm": 0.1706746518611908, + "learning_rate": 2.99990350665051e-05, + "loss": 0.0473, + "step": 16030 + }, + { + "epoch": 0.0353745757366579, + "grad_norm": 0.19142910838127136, + "learning_rate": 2.9999016220798757e-05, + "loss": 0.0446, + "step": 16040 + }, + { + "epoch": 0.03539662971155606, + "grad_norm": 0.1327306628227234, + "learning_rate": 2.999899719284021e-05, + "loss": 0.0466, + "step": 16050 + }, + { + "epoch": 0.035418683686454226, + "grad_norm": 0.15019507706165314, + "learning_rate": 2.999897798262968e-05, + "loss": 0.0444, + "step": 16060 + }, + { + "epoch": 0.035440737661352396, + "grad_norm": 0.1271306723356247, + "learning_rate": 2.999895859016741e-05, + "loss": 0.0465, + "step": 16070 + }, + { + "epoch": 0.03546279163625056, + "grad_norm": 0.13765139877796173, + "learning_rate": 2.9998939015453633e-05, + "loss": 0.0441, + "step": 16080 + }, + { + "epoch": 0.03548484561114872, + "grad_norm": 0.14900951087474823, + "learning_rate": 2.9998919258488584e-05, + "loss": 0.0428, + "step": 16090 + }, + { + "epoch": 0.03550689958604689, + "grad_norm": 0.1522381603717804, + "learning_rate": 2.9998899319272507e-05, + "loss": 0.0441, + "step": 16100 + }, + { + "epoch": 0.035528953560945056, + "grad_norm": 0.14380134642124176, + "learning_rate": 2.9998879197805637e-05, + "loss": 0.0459, + "step": 16110 + }, + { + "epoch": 0.03555100753584322, + "grad_norm": 0.14031602442264557, + "learning_rate": 2.9998858894088226e-05, + "loss": 0.0451, + "step": 16120 + }, + { + "epoch": 0.03557306151074139, + "grad_norm": 0.19203117489814758, + "learning_rate": 2.9998838408120517e-05, + "loss": 0.0461, + "step": 16130 + }, + { + "epoch": 0.03559511548563955, + "grad_norm": 0.15345744788646698, + "learning_rate": 2.999881773990276e-05, + "loss": 0.044, + "step": 16140 + }, + { + "epoch": 0.03561716946053772, + "grad_norm": 0.16763542592525482, + "learning_rate": 2.9998796889435207e-05, + "loss": 0.0459, + "step": 16150 + }, + { + "epoch": 0.035639223435435886, + "grad_norm": 0.16214267909526825, + "learning_rate": 2.999877585671811e-05, + "loss": 0.0469, + "step": 16160 + }, + { + "epoch": 0.03566127741033405, + "grad_norm": 0.21455839276313782, + "learning_rate": 2.999875464175173e-05, + "loss": 0.0471, + "step": 16170 + }, + { + "epoch": 0.03568333138523222, + "grad_norm": 0.15191969275474548, + "learning_rate": 2.9998733244536316e-05, + "loss": 0.0442, + "step": 16180 + }, + { + "epoch": 0.03570538536013038, + "grad_norm": 0.16129592061042786, + "learning_rate": 2.999871166507213e-05, + "loss": 0.0427, + "step": 16190 + }, + { + "epoch": 0.035727439335028546, + "grad_norm": 0.19265569746494293, + "learning_rate": 2.9998689903359437e-05, + "loss": 0.0453, + "step": 16200 + }, + { + "epoch": 0.035749493309926716, + "grad_norm": 0.15155267715454102, + "learning_rate": 2.99986679593985e-05, + "loss": 0.047, + "step": 16210 + }, + { + "epoch": 0.03577154728482488, + "grad_norm": 0.1705935001373291, + "learning_rate": 2.9998645833189587e-05, + "loss": 0.0452, + "step": 16220 + }, + { + "epoch": 0.03579360125972304, + "grad_norm": 0.17889384925365448, + "learning_rate": 2.999862352473297e-05, + "loss": 0.0439, + "step": 16230 + }, + { + "epoch": 0.03581565523462121, + "grad_norm": 0.15854282677173615, + "learning_rate": 2.9998601034028916e-05, + "loss": 0.0464, + "step": 16240 + }, + { + "epoch": 0.035837709209519376, + "grad_norm": 0.18022547662258148, + "learning_rate": 2.999857836107769e-05, + "loss": 0.047, + "step": 16250 + }, + { + "epoch": 0.03585976318441755, + "grad_norm": 0.1588965505361557, + "learning_rate": 2.9998555505879584e-05, + "loss": 0.045, + "step": 16260 + }, + { + "epoch": 0.03588181715931571, + "grad_norm": 0.16062650084495544, + "learning_rate": 2.9998532468434866e-05, + "loss": 0.0457, + "step": 16270 + }, + { + "epoch": 0.03590387113421387, + "grad_norm": 0.1591169536113739, + "learning_rate": 2.9998509248743814e-05, + "loss": 0.0445, + "step": 16280 + }, + { + "epoch": 0.03592592510911204, + "grad_norm": 0.12987175583839417, + "learning_rate": 2.9998485846806718e-05, + "loss": 0.0471, + "step": 16290 + }, + { + "epoch": 0.035947979084010206, + "grad_norm": 0.13039925694465637, + "learning_rate": 2.9998462262623853e-05, + "loss": 0.0433, + "step": 16300 + }, + { + "epoch": 0.03597003305890837, + "grad_norm": 0.13507625460624695, + "learning_rate": 2.9998438496195514e-05, + "loss": 0.0434, + "step": 16310 + }, + { + "epoch": 0.03599208703380654, + "grad_norm": 0.18318918347358704, + "learning_rate": 2.9998414547521988e-05, + "loss": 0.0444, + "step": 16320 + }, + { + "epoch": 0.0360141410087047, + "grad_norm": 0.11629536747932434, + "learning_rate": 2.999839041660356e-05, + "loss": 0.0454, + "step": 16330 + }, + { + "epoch": 0.036036194983602866, + "grad_norm": 0.18742625415325165, + "learning_rate": 2.9998366103440532e-05, + "loss": 0.0471, + "step": 16340 + }, + { + "epoch": 0.03605824895850104, + "grad_norm": 0.1466389149427414, + "learning_rate": 2.9998341608033195e-05, + "loss": 0.0452, + "step": 16350 + }, + { + "epoch": 0.0360803029333992, + "grad_norm": 0.14826372265815735, + "learning_rate": 2.9998316930381844e-05, + "loss": 0.0443, + "step": 16360 + }, + { + "epoch": 0.03610235690829737, + "grad_norm": 0.13518868386745453, + "learning_rate": 2.9998292070486784e-05, + "loss": 0.0458, + "step": 16370 + }, + { + "epoch": 0.03612441088319553, + "grad_norm": 0.15793339908123016, + "learning_rate": 2.999826702834831e-05, + "loss": 0.0446, + "step": 16380 + }, + { + "epoch": 0.036146464858093696, + "grad_norm": 0.16417062282562256, + "learning_rate": 2.9998241803966735e-05, + "loss": 0.0464, + "step": 16390 + }, + { + "epoch": 0.03616851883299187, + "grad_norm": 0.15750570595264435, + "learning_rate": 2.999821639734236e-05, + "loss": 0.0437, + "step": 16400 + }, + { + "epoch": 0.03619057280789003, + "grad_norm": 0.17567047476768494, + "learning_rate": 2.9998190808475495e-05, + "loss": 0.0444, + "step": 16410 + }, + { + "epoch": 0.03621262678278819, + "grad_norm": 0.15609505772590637, + "learning_rate": 2.9998165037366453e-05, + "loss": 0.0428, + "step": 16420 + }, + { + "epoch": 0.03623468075768636, + "grad_norm": 0.20168183743953705, + "learning_rate": 2.999813908401554e-05, + "loss": 0.0463, + "step": 16430 + }, + { + "epoch": 0.03625673473258453, + "grad_norm": 0.23743057250976562, + "learning_rate": 2.9998112948423082e-05, + "loss": 0.0462, + "step": 16440 + }, + { + "epoch": 0.03627878870748269, + "grad_norm": 0.1525774747133255, + "learning_rate": 2.999808663058939e-05, + "loss": 0.0444, + "step": 16450 + }, + { + "epoch": 0.03630084268238086, + "grad_norm": 0.15683147311210632, + "learning_rate": 2.999806013051478e-05, + "loss": 0.0452, + "step": 16460 + }, + { + "epoch": 0.03632289665727902, + "grad_norm": 0.17385423183441162, + "learning_rate": 2.9998033448199586e-05, + "loss": 0.0449, + "step": 16470 + }, + { + "epoch": 0.03634495063217719, + "grad_norm": 0.13668817281723022, + "learning_rate": 2.9998006583644125e-05, + "loss": 0.0437, + "step": 16480 + }, + { + "epoch": 0.03636700460707536, + "grad_norm": 0.15778274834156036, + "learning_rate": 2.9997979536848722e-05, + "loss": 0.0441, + "step": 16490 + }, + { + "epoch": 0.03638905858197352, + "grad_norm": 0.1915811449289322, + "learning_rate": 2.99979523078137e-05, + "loss": 0.0432, + "step": 16500 + }, + { + "epoch": 0.03641111255687169, + "grad_norm": 0.12772348523139954, + "learning_rate": 2.9997924896539403e-05, + "loss": 0.0453, + "step": 16510 + }, + { + "epoch": 0.03643316653176985, + "grad_norm": 0.1776987761259079, + "learning_rate": 2.9997897303026164e-05, + "loss": 0.0442, + "step": 16520 + }, + { + "epoch": 0.03645522050666802, + "grad_norm": 0.15850423276424408, + "learning_rate": 2.9997869527274304e-05, + "loss": 0.0449, + "step": 16530 + }, + { + "epoch": 0.03647727448156619, + "grad_norm": 0.14122767746448517, + "learning_rate": 2.999784156928417e-05, + "loss": 0.0434, + "step": 16540 + }, + { + "epoch": 0.03649932845646435, + "grad_norm": 0.17538826167583466, + "learning_rate": 2.9997813429056105e-05, + "loss": 0.0449, + "step": 16550 + }, + { + "epoch": 0.03652138243136251, + "grad_norm": 0.15619058907032013, + "learning_rate": 2.999778510659044e-05, + "loss": 0.0426, + "step": 16560 + }, + { + "epoch": 0.03654343640626068, + "grad_norm": 0.12638789415359497, + "learning_rate": 2.999775660188753e-05, + "loss": 0.0457, + "step": 16570 + }, + { + "epoch": 0.03656549038115885, + "grad_norm": 0.15829405188560486, + "learning_rate": 2.999772791494772e-05, + "loss": 0.0434, + "step": 16580 + }, + { + "epoch": 0.03658754435605702, + "grad_norm": 0.15936708450317383, + "learning_rate": 2.999769904577135e-05, + "loss": 0.0436, + "step": 16590 + }, + { + "epoch": 0.03660959833095518, + "grad_norm": 0.13326296210289001, + "learning_rate": 2.9997669994358778e-05, + "loss": 0.0449, + "step": 16600 + }, + { + "epoch": 0.03663165230585334, + "grad_norm": 0.1434566229581833, + "learning_rate": 2.9997640760710354e-05, + "loss": 0.046, + "step": 16610 + }, + { + "epoch": 0.036653706280751513, + "grad_norm": 0.13554736971855164, + "learning_rate": 2.9997611344826438e-05, + "loss": 0.0456, + "step": 16620 + }, + { + "epoch": 0.03667576025564968, + "grad_norm": 0.1383960098028183, + "learning_rate": 2.999758174670738e-05, + "loss": 0.0455, + "step": 16630 + }, + { + "epoch": 0.03669781423054784, + "grad_norm": 0.13953112065792084, + "learning_rate": 2.999755196635355e-05, + "loss": 0.0454, + "step": 16640 + }, + { + "epoch": 0.03671986820544601, + "grad_norm": 0.15564095973968506, + "learning_rate": 2.9997522003765297e-05, + "loss": 0.0456, + "step": 16650 + }, + { + "epoch": 0.03674192218034417, + "grad_norm": 0.22008678317070007, + "learning_rate": 2.9997491858942997e-05, + "loss": 0.0457, + "step": 16660 + }, + { + "epoch": 0.03676397615524234, + "grad_norm": 0.11372203379869461, + "learning_rate": 2.9997461531887006e-05, + "loss": 0.0435, + "step": 16670 + }, + { + "epoch": 0.03678603013014051, + "grad_norm": 0.1285763382911682, + "learning_rate": 2.99974310225977e-05, + "loss": 0.0456, + "step": 16680 + }, + { + "epoch": 0.03680808410503867, + "grad_norm": 0.14139454066753387, + "learning_rate": 2.9997400331075447e-05, + "loss": 0.0421, + "step": 16690 + }, + { + "epoch": 0.03683013807993684, + "grad_norm": 0.15731723606586456, + "learning_rate": 2.999736945732062e-05, + "loss": 0.0423, + "step": 16700 + }, + { + "epoch": 0.036852192054835003, + "grad_norm": 0.14442016184329987, + "learning_rate": 2.9997338401333594e-05, + "loss": 0.045, + "step": 16710 + }, + { + "epoch": 0.03687424602973317, + "grad_norm": 0.12351729720830917, + "learning_rate": 2.9997307163114748e-05, + "loss": 0.0451, + "step": 16720 + }, + { + "epoch": 0.03689630000463134, + "grad_norm": 0.124122753739357, + "learning_rate": 2.999727574266446e-05, + "loss": 0.0457, + "step": 16730 + }, + { + "epoch": 0.0369183539795295, + "grad_norm": 0.14406822621822357, + "learning_rate": 2.999724413998311e-05, + "loss": 0.0472, + "step": 16740 + }, + { + "epoch": 0.03694040795442766, + "grad_norm": 0.15739116072654724, + "learning_rate": 2.9997212355071086e-05, + "loss": 0.0455, + "step": 16750 + }, + { + "epoch": 0.036962461929325834, + "grad_norm": 0.15658704936504364, + "learning_rate": 2.9997180387928767e-05, + "loss": 0.047, + "step": 16760 + }, + { + "epoch": 0.036984515904224, + "grad_norm": 0.13239139318466187, + "learning_rate": 2.9997148238556556e-05, + "loss": 0.0461, + "step": 16770 + }, + { + "epoch": 0.03700656987912217, + "grad_norm": 0.14501775801181793, + "learning_rate": 2.9997115906954828e-05, + "loss": 0.044, + "step": 16780 + }, + { + "epoch": 0.03702862385402033, + "grad_norm": 0.14607492089271545, + "learning_rate": 2.999708339312398e-05, + "loss": 0.0454, + "step": 16790 + }, + { + "epoch": 0.037050677828918493, + "grad_norm": 0.13692402839660645, + "learning_rate": 2.9997050697064413e-05, + "loss": 0.0435, + "step": 16800 + }, + { + "epoch": 0.037072731803816664, + "grad_norm": 0.1539285033941269, + "learning_rate": 2.9997017818776522e-05, + "loss": 0.0456, + "step": 16810 + }, + { + "epoch": 0.03709478577871483, + "grad_norm": 0.13916811347007751, + "learning_rate": 2.9996984758260703e-05, + "loss": 0.0469, + "step": 16820 + }, + { + "epoch": 0.03711683975361299, + "grad_norm": 0.13688598573207855, + "learning_rate": 2.999695151551736e-05, + "loss": 0.0453, + "step": 16830 + }, + { + "epoch": 0.03713889372851116, + "grad_norm": 0.17836499214172363, + "learning_rate": 2.9996918090546894e-05, + "loss": 0.0425, + "step": 16840 + }, + { + "epoch": 0.037160947703409324, + "grad_norm": 0.19256529211997986, + "learning_rate": 2.9996884483349716e-05, + "loss": 0.0461, + "step": 16850 + }, + { + "epoch": 0.03718300167830749, + "grad_norm": 0.17246316373348236, + "learning_rate": 2.999685069392623e-05, + "loss": 0.0441, + "step": 16860 + }, + { + "epoch": 0.03720505565320566, + "grad_norm": 0.15534093976020813, + "learning_rate": 2.999681672227685e-05, + "loss": 0.0447, + "step": 16870 + }, + { + "epoch": 0.03722710962810382, + "grad_norm": 0.15006183087825775, + "learning_rate": 2.999678256840199e-05, + "loss": 0.0449, + "step": 16880 + }, + { + "epoch": 0.03724916360300199, + "grad_norm": 0.1648184061050415, + "learning_rate": 2.9996748232302057e-05, + "loss": 0.0441, + "step": 16890 + }, + { + "epoch": 0.037271217577900154, + "grad_norm": 0.1795004904270172, + "learning_rate": 2.999671371397748e-05, + "loss": 0.0446, + "step": 16900 + }, + { + "epoch": 0.03729327155279832, + "grad_norm": 0.11655654013156891, + "learning_rate": 2.999667901342867e-05, + "loss": 0.0441, + "step": 16910 + }, + { + "epoch": 0.03731532552769649, + "grad_norm": 0.1752614676952362, + "learning_rate": 2.999664413065605e-05, + "loss": 0.0447, + "step": 16920 + }, + { + "epoch": 0.03733737950259465, + "grad_norm": 0.13533827662467957, + "learning_rate": 2.9996609065660044e-05, + "loss": 0.0435, + "step": 16930 + }, + { + "epoch": 0.037359433477492814, + "grad_norm": 0.16120217740535736, + "learning_rate": 2.9996573818441082e-05, + "loss": 0.0457, + "step": 16940 + }, + { + "epoch": 0.037381487452390984, + "grad_norm": 0.1793200969696045, + "learning_rate": 2.9996538388999583e-05, + "loss": 0.0471, + "step": 16950 + }, + { + "epoch": 0.03740354142728915, + "grad_norm": 0.13402876257896423, + "learning_rate": 2.999650277733599e-05, + "loss": 0.043, + "step": 16960 + }, + { + "epoch": 0.03742559540218731, + "grad_norm": 0.1503273844718933, + "learning_rate": 2.9996466983450724e-05, + "loss": 0.0445, + "step": 16970 + }, + { + "epoch": 0.03744764937708548, + "grad_norm": 0.1315179169178009, + "learning_rate": 2.999643100734423e-05, + "loss": 0.0451, + "step": 16980 + }, + { + "epoch": 0.037469703351983644, + "grad_norm": 0.14840875566005707, + "learning_rate": 2.9996394849016935e-05, + "loss": 0.0452, + "step": 16990 + }, + { + "epoch": 0.037491757326881814, + "grad_norm": 0.1715621054172516, + "learning_rate": 2.9996358508469285e-05, + "loss": 0.0454, + "step": 17000 + }, + { + "epoch": 0.03751381130177998, + "grad_norm": 0.14651672542095184, + "learning_rate": 2.999632198570172e-05, + "loss": 0.0445, + "step": 17010 + }, + { + "epoch": 0.03753586527667814, + "grad_norm": 0.16682806611061096, + "learning_rate": 2.999628528071469e-05, + "loss": 0.0467, + "step": 17020 + }, + { + "epoch": 0.03755791925157631, + "grad_norm": 0.14330869913101196, + "learning_rate": 2.999624839350863e-05, + "loss": 0.0467, + "step": 17030 + }, + { + "epoch": 0.037579973226474474, + "grad_norm": 0.11560671776533127, + "learning_rate": 2.9996211324083994e-05, + "loss": 0.0455, + "step": 17040 + }, + { + "epoch": 0.03760202720137264, + "grad_norm": 0.14582975208759308, + "learning_rate": 2.999617407244123e-05, + "loss": 0.0449, + "step": 17050 + }, + { + "epoch": 0.03762408117627081, + "grad_norm": 0.19716574251651764, + "learning_rate": 2.9996136638580796e-05, + "loss": 0.0455, + "step": 17060 + }, + { + "epoch": 0.03764613515116897, + "grad_norm": 0.13577675819396973, + "learning_rate": 2.9996099022503137e-05, + "loss": 0.0461, + "step": 17070 + }, + { + "epoch": 0.037668189126067134, + "grad_norm": 0.12997108697891235, + "learning_rate": 2.9996061224208726e-05, + "loss": 0.0444, + "step": 17080 + }, + { + "epoch": 0.037690243100965304, + "grad_norm": 0.14072662591934204, + "learning_rate": 2.9996023243698e-05, + "loss": 0.0434, + "step": 17090 + }, + { + "epoch": 0.03771229707586347, + "grad_norm": 0.14203424751758575, + "learning_rate": 2.999598508097144e-05, + "loss": 0.0445, + "step": 17100 + }, + { + "epoch": 0.03773435105076164, + "grad_norm": 0.11189981549978256, + "learning_rate": 2.9995946736029504e-05, + "loss": 0.0433, + "step": 17110 + }, + { + "epoch": 0.0377564050256598, + "grad_norm": 0.14970284700393677, + "learning_rate": 2.9995908208872657e-05, + "loss": 0.0442, + "step": 17120 + }, + { + "epoch": 0.037778459000557964, + "grad_norm": 0.18838968873023987, + "learning_rate": 2.999586949950136e-05, + "loss": 0.0433, + "step": 17130 + }, + { + "epoch": 0.037800512975456134, + "grad_norm": 0.13015882670879364, + "learning_rate": 2.9995830607916096e-05, + "loss": 0.0441, + "step": 17140 + }, + { + "epoch": 0.0378225669503543, + "grad_norm": 0.14355885982513428, + "learning_rate": 2.999579153411733e-05, + "loss": 0.0444, + "step": 17150 + }, + { + "epoch": 0.03784462092525246, + "grad_norm": 0.16889743506908417, + "learning_rate": 2.9995752278105542e-05, + "loss": 0.0449, + "step": 17160 + }, + { + "epoch": 0.03786667490015063, + "grad_norm": 0.17769432067871094, + "learning_rate": 2.9995712839881202e-05, + "loss": 0.0464, + "step": 17170 + }, + { + "epoch": 0.037888728875048794, + "grad_norm": 0.12694409489631653, + "learning_rate": 2.9995673219444794e-05, + "loss": 0.0453, + "step": 17180 + }, + { + "epoch": 0.03791078284994696, + "grad_norm": 0.13167746365070343, + "learning_rate": 2.9995633416796793e-05, + "loss": 0.0434, + "step": 17190 + }, + { + "epoch": 0.03793283682484513, + "grad_norm": 0.2431311458349228, + "learning_rate": 2.9995593431937694e-05, + "loss": 0.0441, + "step": 17200 + }, + { + "epoch": 0.03795489079974329, + "grad_norm": 0.16347244381904602, + "learning_rate": 2.9995553264867974e-05, + "loss": 0.0444, + "step": 17210 + }, + { + "epoch": 0.03797694477464146, + "grad_norm": 0.17749664187431335, + "learning_rate": 2.9995512915588125e-05, + "loss": 0.044, + "step": 17220 + }, + { + "epoch": 0.037998998749539624, + "grad_norm": 0.13029932975769043, + "learning_rate": 2.9995472384098634e-05, + "loss": 0.0438, + "step": 17230 + }, + { + "epoch": 0.03802105272443779, + "grad_norm": 0.11888851225376129, + "learning_rate": 2.99954316704e-05, + "loss": 0.0451, + "step": 17240 + }, + { + "epoch": 0.03804310669933596, + "grad_norm": 0.13815763592720032, + "learning_rate": 2.9995390774492707e-05, + "loss": 0.0452, + "step": 17250 + }, + { + "epoch": 0.03806516067423412, + "grad_norm": 0.14667131006717682, + "learning_rate": 2.9995349696377263e-05, + "loss": 0.0435, + "step": 17260 + }, + { + "epoch": 0.038087214649132284, + "grad_norm": 0.1913997232913971, + "learning_rate": 2.999530843605416e-05, + "loss": 0.0472, + "step": 17270 + }, + { + "epoch": 0.038109268624030454, + "grad_norm": 0.19202370941638947, + "learning_rate": 2.99952669935239e-05, + "loss": 0.0472, + "step": 17280 + }, + { + "epoch": 0.03813132259892862, + "grad_norm": 0.18227945268154144, + "learning_rate": 2.999522536878699e-05, + "loss": 0.043, + "step": 17290 + }, + { + "epoch": 0.03815337657382678, + "grad_norm": 0.13738375902175903, + "learning_rate": 2.9995183561843934e-05, + "loss": 0.0457, + "step": 17300 + }, + { + "epoch": 0.03817543054872495, + "grad_norm": 0.13874191045761108, + "learning_rate": 2.999514157269524e-05, + "loss": 0.0442, + "step": 17310 + }, + { + "epoch": 0.038197484523623114, + "grad_norm": 0.15438248217105865, + "learning_rate": 2.999509940134142e-05, + "loss": 0.0443, + "step": 17320 + }, + { + "epoch": 0.038219538498521284, + "grad_norm": 0.13309475779533386, + "learning_rate": 2.9995057047782978e-05, + "loss": 0.0432, + "step": 17330 + }, + { + "epoch": 0.03824159247341945, + "grad_norm": 0.12283775955438614, + "learning_rate": 2.999501451202044e-05, + "loss": 0.0463, + "step": 17340 + }, + { + "epoch": 0.03826364644831761, + "grad_norm": 0.15343381464481354, + "learning_rate": 2.9994971794054324e-05, + "loss": 0.045, + "step": 17350 + }, + { + "epoch": 0.03828570042321578, + "grad_norm": 0.141991525888443, + "learning_rate": 2.999492889388513e-05, + "loss": 0.0431, + "step": 17360 + }, + { + "epoch": 0.038307754398113944, + "grad_norm": 0.1427912563085556, + "learning_rate": 2.9994885811513402e-05, + "loss": 0.0468, + "step": 17370 + }, + { + "epoch": 0.03832980837301211, + "grad_norm": 0.1266181915998459, + "learning_rate": 2.999484254693965e-05, + "loss": 0.0451, + "step": 17380 + }, + { + "epoch": 0.03835186234791028, + "grad_norm": 0.16878646612167358, + "learning_rate": 2.9994799100164407e-05, + "loss": 0.045, + "step": 17390 + }, + { + "epoch": 0.03837391632280844, + "grad_norm": 0.17879042029380798, + "learning_rate": 2.9994755471188196e-05, + "loss": 0.0443, + "step": 17400 + }, + { + "epoch": 0.038395970297706604, + "grad_norm": 0.12150581926107407, + "learning_rate": 2.9994711660011545e-05, + "loss": 0.0471, + "step": 17410 + }, + { + "epoch": 0.038418024272604774, + "grad_norm": 0.1590297371149063, + "learning_rate": 2.9994667666634993e-05, + "loss": 0.0447, + "step": 17420 + }, + { + "epoch": 0.03844007824750294, + "grad_norm": 0.12306741625070572, + "learning_rate": 2.999462349105907e-05, + "loss": 0.0444, + "step": 17430 + }, + { + "epoch": 0.03846213222240111, + "grad_norm": 0.12822410464286804, + "learning_rate": 2.9994579133284315e-05, + "loss": 0.0445, + "step": 17440 + }, + { + "epoch": 0.03848418619729927, + "grad_norm": 0.13222207129001617, + "learning_rate": 2.9994534593311267e-05, + "loss": 0.0437, + "step": 17450 + }, + { + "epoch": 0.038506240172197434, + "grad_norm": 0.18047775328159332, + "learning_rate": 2.9994489871140466e-05, + "loss": 0.0438, + "step": 17460 + }, + { + "epoch": 0.038528294147095604, + "grad_norm": 0.13266661763191223, + "learning_rate": 2.9994444966772458e-05, + "loss": 0.0457, + "step": 17470 + }, + { + "epoch": 0.03855034812199377, + "grad_norm": 0.16737423837184906, + "learning_rate": 2.9994399880207785e-05, + "loss": 0.0437, + "step": 17480 + }, + { + "epoch": 0.03857240209689193, + "grad_norm": 0.17700886726379395, + "learning_rate": 2.9994354611446996e-05, + "loss": 0.045, + "step": 17490 + }, + { + "epoch": 0.0385944560717901, + "grad_norm": 0.14156101644039154, + "learning_rate": 2.9994309160490642e-05, + "loss": 0.047, + "step": 17500 + }, + { + "epoch": 0.038616510046688264, + "grad_norm": 0.16606906056404114, + "learning_rate": 2.9994263527339274e-05, + "loss": 0.0441, + "step": 17510 + }, + { + "epoch": 0.03863856402158643, + "grad_norm": 0.11966013163328171, + "learning_rate": 2.9994217711993447e-05, + "loss": 0.0429, + "step": 17520 + }, + { + "epoch": 0.0386606179964846, + "grad_norm": 0.1553391069173813, + "learning_rate": 2.9994171714453716e-05, + "loss": 0.0442, + "step": 17530 + }, + { + "epoch": 0.03868267197138276, + "grad_norm": 0.16773711144924164, + "learning_rate": 2.9994125534720647e-05, + "loss": 0.0441, + "step": 17540 + }, + { + "epoch": 0.03870472594628093, + "grad_norm": 0.1601998656988144, + "learning_rate": 2.9994079172794796e-05, + "loss": 0.0437, + "step": 17550 + }, + { + "epoch": 0.038726779921179094, + "grad_norm": 0.12203298509120941, + "learning_rate": 2.9994032628676722e-05, + "loss": 0.0447, + "step": 17560 + }, + { + "epoch": 0.03874883389607726, + "grad_norm": 0.1549472212791443, + "learning_rate": 2.9993985902366998e-05, + "loss": 0.0441, + "step": 17570 + }, + { + "epoch": 0.03877088787097543, + "grad_norm": 0.12436886131763458, + "learning_rate": 2.9993938993866185e-05, + "loss": 0.0439, + "step": 17580 + }, + { + "epoch": 0.03879294184587359, + "grad_norm": 0.12133403867483139, + "learning_rate": 2.9993891903174856e-05, + "loss": 0.0421, + "step": 17590 + }, + { + "epoch": 0.038814995820771754, + "grad_norm": 0.11527027189731598, + "learning_rate": 2.999384463029359e-05, + "loss": 0.0421, + "step": 17600 + }, + { + "epoch": 0.038837049795669924, + "grad_norm": 0.1391354352235794, + "learning_rate": 2.999379717522295e-05, + "loss": 0.0454, + "step": 17610 + }, + { + "epoch": 0.03885910377056809, + "grad_norm": 0.1703055500984192, + "learning_rate": 2.999374953796352e-05, + "loss": 0.0465, + "step": 17620 + }, + { + "epoch": 0.03888115774546625, + "grad_norm": 0.1302184909582138, + "learning_rate": 2.9993701718515877e-05, + "loss": 0.043, + "step": 17630 + }, + { + "epoch": 0.03890321172036442, + "grad_norm": 0.16566447913646698, + "learning_rate": 2.99936537168806e-05, + "loss": 0.0471, + "step": 17640 + }, + { + "epoch": 0.038925265695262584, + "grad_norm": 0.1491473764181137, + "learning_rate": 2.999360553305828e-05, + "loss": 0.0461, + "step": 17650 + }, + { + "epoch": 0.038947319670160754, + "grad_norm": 0.12381955981254578, + "learning_rate": 2.9993557167049486e-05, + "loss": 0.0436, + "step": 17660 + }, + { + "epoch": 0.03896937364505892, + "grad_norm": 0.12897422909736633, + "learning_rate": 2.9993508618854823e-05, + "loss": 0.0447, + "step": 17670 + }, + { + "epoch": 0.03899142761995708, + "grad_norm": 0.14306406676769257, + "learning_rate": 2.999345988847487e-05, + "loss": 0.0467, + "step": 17680 + }, + { + "epoch": 0.03901348159485525, + "grad_norm": 0.1559290736913681, + "learning_rate": 2.9993410975910227e-05, + "loss": 0.0448, + "step": 17690 + }, + { + "epoch": 0.039035535569753414, + "grad_norm": 0.1189289540052414, + "learning_rate": 2.9993361881161487e-05, + "loss": 0.0439, + "step": 17700 + }, + { + "epoch": 0.03905758954465158, + "grad_norm": 0.12291364371776581, + "learning_rate": 2.9993312604229238e-05, + "loss": 0.0462, + "step": 17710 + }, + { + "epoch": 0.03907964351954975, + "grad_norm": 0.16162703931331635, + "learning_rate": 2.9993263145114086e-05, + "loss": 0.0446, + "step": 17720 + }, + { + "epoch": 0.03910169749444791, + "grad_norm": 0.12684771418571472, + "learning_rate": 2.9993213503816635e-05, + "loss": 0.0433, + "step": 17730 + }, + { + "epoch": 0.039123751469346074, + "grad_norm": 0.16293397545814514, + "learning_rate": 2.9993163680337478e-05, + "loss": 0.0443, + "step": 17740 + }, + { + "epoch": 0.039145805444244244, + "grad_norm": 0.1600823998451233, + "learning_rate": 2.999311367467723e-05, + "loss": 0.0434, + "step": 17750 + }, + { + "epoch": 0.03916785941914241, + "grad_norm": 0.14080633223056793, + "learning_rate": 2.9993063486836497e-05, + "loss": 0.0438, + "step": 17760 + }, + { + "epoch": 0.03918991339404058, + "grad_norm": 0.1848950982093811, + "learning_rate": 2.999301311681588e-05, + "loss": 0.045, + "step": 17770 + }, + { + "epoch": 0.03921196736893874, + "grad_norm": 0.18590280413627625, + "learning_rate": 2.9992962564616003e-05, + "loss": 0.045, + "step": 17780 + }, + { + "epoch": 0.039234021343836904, + "grad_norm": 0.11885136365890503, + "learning_rate": 2.999291183023748e-05, + "loss": 0.0467, + "step": 17790 + }, + { + "epoch": 0.039256075318735074, + "grad_norm": 0.1620529443025589, + "learning_rate": 2.9992860913680914e-05, + "loss": 0.0464, + "step": 17800 + }, + { + "epoch": 0.03927812929363324, + "grad_norm": 0.1205454021692276, + "learning_rate": 2.9992809814946936e-05, + "loss": 0.0456, + "step": 17810 + }, + { + "epoch": 0.0393001832685314, + "grad_norm": 0.16362564265727997, + "learning_rate": 2.9992758534036157e-05, + "loss": 0.0439, + "step": 17820 + }, + { + "epoch": 0.03932223724342957, + "grad_norm": 0.16726753115653992, + "learning_rate": 2.9992707070949212e-05, + "loss": 0.0434, + "step": 17830 + }, + { + "epoch": 0.039344291218327734, + "grad_norm": 0.15604940056800842, + "learning_rate": 2.9992655425686723e-05, + "loss": 0.0417, + "step": 17840 + }, + { + "epoch": 0.039366345193225905, + "grad_norm": 0.15055251121520996, + "learning_rate": 2.999260359824931e-05, + "loss": 0.0461, + "step": 17850 + }, + { + "epoch": 0.03938839916812407, + "grad_norm": 0.13415409624576569, + "learning_rate": 2.9992551588637612e-05, + "loss": 0.0426, + "step": 17860 + }, + { + "epoch": 0.03941045314302223, + "grad_norm": 0.1261167973279953, + "learning_rate": 2.9992499396852254e-05, + "loss": 0.0447, + "step": 17870 + }, + { + "epoch": 0.0394325071179204, + "grad_norm": 0.12726472318172455, + "learning_rate": 2.9992447022893877e-05, + "loss": 0.0434, + "step": 17880 + }, + { + "epoch": 0.039454561092818564, + "grad_norm": 0.1190476194024086, + "learning_rate": 2.9992394466763107e-05, + "loss": 0.0429, + "step": 17890 + }, + { + "epoch": 0.03947661506771673, + "grad_norm": 0.15304580330848694, + "learning_rate": 2.9992341728460596e-05, + "loss": 0.0439, + "step": 17900 + }, + { + "epoch": 0.0394986690426149, + "grad_norm": 0.13381344079971313, + "learning_rate": 2.9992288807986973e-05, + "loss": 0.0454, + "step": 17910 + }, + { + "epoch": 0.03952072301751306, + "grad_norm": 0.11642707884311676, + "learning_rate": 2.9992235705342888e-05, + "loss": 0.0416, + "step": 17920 + }, + { + "epoch": 0.039542776992411224, + "grad_norm": 0.15968577563762665, + "learning_rate": 2.9992182420528983e-05, + "loss": 0.0448, + "step": 17930 + }, + { + "epoch": 0.039564830967309395, + "grad_norm": 0.14048413932323456, + "learning_rate": 2.999212895354591e-05, + "loss": 0.0441, + "step": 17940 + }, + { + "epoch": 0.03958688494220756, + "grad_norm": 0.1387387365102768, + "learning_rate": 2.999207530439431e-05, + "loss": 0.0436, + "step": 17950 + }, + { + "epoch": 0.03960893891710573, + "grad_norm": 0.13805602490901947, + "learning_rate": 2.9992021473074844e-05, + "loss": 0.044, + "step": 17960 + }, + { + "epoch": 0.03963099289200389, + "grad_norm": 0.15499714016914368, + "learning_rate": 2.999196745958816e-05, + "loss": 0.0436, + "step": 17970 + }, + { + "epoch": 0.039653046866902054, + "grad_norm": 0.15932464599609375, + "learning_rate": 2.999191326393492e-05, + "loss": 0.0441, + "step": 17980 + }, + { + "epoch": 0.039675100841800225, + "grad_norm": 0.16604937613010406, + "learning_rate": 2.9991858886115778e-05, + "loss": 0.0455, + "step": 17990 + }, + { + "epoch": 0.03969715481669839, + "grad_norm": 0.16352349519729614, + "learning_rate": 2.9991804326131392e-05, + "loss": 0.0456, + "step": 18000 + }, + { + "epoch": 0.03971920879159655, + "grad_norm": 0.17992252111434937, + "learning_rate": 2.9991749583982434e-05, + "loss": 0.0456, + "step": 18010 + }, + { + "epoch": 0.03974126276649472, + "grad_norm": 0.14291416108608246, + "learning_rate": 2.9991694659669557e-05, + "loss": 0.0431, + "step": 18020 + }, + { + "epoch": 0.039763316741392885, + "grad_norm": 0.1520717442035675, + "learning_rate": 2.9991639553193443e-05, + "loss": 0.0424, + "step": 18030 + }, + { + "epoch": 0.03978537071629105, + "grad_norm": 0.1287989318370819, + "learning_rate": 2.999158426455475e-05, + "loss": 0.0462, + "step": 18040 + }, + { + "epoch": 0.03980742469118922, + "grad_norm": 0.12750302255153656, + "learning_rate": 2.9991528793754154e-05, + "loss": 0.0428, + "step": 18050 + }, + { + "epoch": 0.03982947866608738, + "grad_norm": 0.14513130486011505, + "learning_rate": 2.999147314079233e-05, + "loss": 0.0454, + "step": 18060 + }, + { + "epoch": 0.03985153264098555, + "grad_norm": 0.12974560260772705, + "learning_rate": 2.9991417305669954e-05, + "loss": 0.0418, + "step": 18070 + }, + { + "epoch": 0.039873586615883715, + "grad_norm": 0.11617918312549591, + "learning_rate": 2.99913612883877e-05, + "loss": 0.043, + "step": 18080 + }, + { + "epoch": 0.03989564059078188, + "grad_norm": 0.16006949543952942, + "learning_rate": 2.9991305088946255e-05, + "loss": 0.044, + "step": 18090 + }, + { + "epoch": 0.03991769456568005, + "grad_norm": 0.12068171054124832, + "learning_rate": 2.99912487073463e-05, + "loss": 0.0423, + "step": 18100 + }, + { + "epoch": 0.03993974854057821, + "grad_norm": 0.16942191123962402, + "learning_rate": 2.9991192143588513e-05, + "loss": 0.0463, + "step": 18110 + }, + { + "epoch": 0.039961802515476375, + "grad_norm": 0.14090265333652496, + "learning_rate": 2.9991135397673594e-05, + "loss": 0.0461, + "step": 18120 + }, + { + "epoch": 0.039983856490374545, + "grad_norm": 0.12045764923095703, + "learning_rate": 2.9991078469602223e-05, + "loss": 0.0429, + "step": 18130 + }, + { + "epoch": 0.04000591046527271, + "grad_norm": 0.12622511386871338, + "learning_rate": 2.9991021359375096e-05, + "loss": 0.0444, + "step": 18140 + }, + { + "epoch": 0.04002796444017087, + "grad_norm": 0.15737897157669067, + "learning_rate": 2.9990964066992906e-05, + "loss": 0.0444, + "step": 18150 + }, + { + "epoch": 0.04005001841506904, + "grad_norm": 0.12559294700622559, + "learning_rate": 2.999090659245635e-05, + "loss": 0.0435, + "step": 18160 + }, + { + "epoch": 0.040072072389967205, + "grad_norm": 0.1852211207151413, + "learning_rate": 2.999084893576612e-05, + "loss": 0.0438, + "step": 18170 + }, + { + "epoch": 0.040094126364865375, + "grad_norm": 0.1423761397600174, + "learning_rate": 2.9990791096922924e-05, + "loss": 0.0418, + "step": 18180 + }, + { + "epoch": 0.04011618033976354, + "grad_norm": 0.11744251102209091, + "learning_rate": 2.9990733075927462e-05, + "loss": 0.0449, + "step": 18190 + }, + { + "epoch": 0.0401382343146617, + "grad_norm": 0.1504257470369339, + "learning_rate": 2.999067487278044e-05, + "loss": 0.0465, + "step": 18200 + }, + { + "epoch": 0.04016028828955987, + "grad_norm": 0.1363408863544464, + "learning_rate": 2.999061648748257e-05, + "loss": 0.0443, + "step": 18210 + }, + { + "epoch": 0.040182342264458035, + "grad_norm": 0.147630512714386, + "learning_rate": 2.999055792003455e-05, + "loss": 0.0452, + "step": 18220 + }, + { + "epoch": 0.0402043962393562, + "grad_norm": 0.14608390629291534, + "learning_rate": 2.99904991704371e-05, + "loss": 0.0451, + "step": 18230 + }, + { + "epoch": 0.04022645021425437, + "grad_norm": 0.1571732759475708, + "learning_rate": 2.999044023869093e-05, + "loss": 0.0411, + "step": 18240 + }, + { + "epoch": 0.04024850418915253, + "grad_norm": 0.1395224779844284, + "learning_rate": 2.9990381124796757e-05, + "loss": 0.0457, + "step": 18250 + }, + { + "epoch": 0.040270558164050695, + "grad_norm": 0.1738761067390442, + "learning_rate": 2.9990321828755305e-05, + "loss": 0.0428, + "step": 18260 + }, + { + "epoch": 0.040292612138948865, + "grad_norm": 0.17801891267299652, + "learning_rate": 2.9990262350567285e-05, + "loss": 0.0446, + "step": 18270 + }, + { + "epoch": 0.04031466611384703, + "grad_norm": 0.12102679908275604, + "learning_rate": 2.9990202690233425e-05, + "loss": 0.046, + "step": 18280 + }, + { + "epoch": 0.0403367200887452, + "grad_norm": 0.14792431890964508, + "learning_rate": 2.9990142847754454e-05, + "loss": 0.0419, + "step": 18290 + }, + { + "epoch": 0.04035877406364336, + "grad_norm": 0.13881243765354156, + "learning_rate": 2.9990082823131087e-05, + "loss": 0.0444, + "step": 18300 + }, + { + "epoch": 0.040380828038541525, + "grad_norm": 0.16386081278324127, + "learning_rate": 2.9990022616364062e-05, + "loss": 0.0451, + "step": 18310 + }, + { + "epoch": 0.040402882013439695, + "grad_norm": 0.1277780532836914, + "learning_rate": 2.998996222745412e-05, + "loss": 0.0455, + "step": 18320 + }, + { + "epoch": 0.04042493598833786, + "grad_norm": 0.1404598504304886, + "learning_rate": 2.998990165640197e-05, + "loss": 0.0452, + "step": 18330 + }, + { + "epoch": 0.04044698996323602, + "grad_norm": 0.12719765305519104, + "learning_rate": 2.9989840903208367e-05, + "loss": 0.0426, + "step": 18340 + }, + { + "epoch": 0.04046904393813419, + "grad_norm": 0.14630275964736938, + "learning_rate": 2.9989779967874047e-05, + "loss": 0.0442, + "step": 18350 + }, + { + "epoch": 0.040491097913032355, + "grad_norm": 0.11759409308433533, + "learning_rate": 2.9989718850399745e-05, + "loss": 0.0431, + "step": 18360 + }, + { + "epoch": 0.04051315188793052, + "grad_norm": 0.15649837255477905, + "learning_rate": 2.9989657550786207e-05, + "loss": 0.0458, + "step": 18370 + }, + { + "epoch": 0.04053520586282869, + "grad_norm": 0.14868782460689545, + "learning_rate": 2.9989596069034175e-05, + "loss": 0.0422, + "step": 18380 + }, + { + "epoch": 0.04055725983772685, + "grad_norm": 0.20002233982086182, + "learning_rate": 2.99895344051444e-05, + "loss": 0.0447, + "step": 18390 + }, + { + "epoch": 0.04057931381262502, + "grad_norm": 0.12322783470153809, + "learning_rate": 2.998947255911763e-05, + "loss": 0.0418, + "step": 18400 + }, + { + "epoch": 0.040601367787523185, + "grad_norm": 0.13516053557395935, + "learning_rate": 2.998941053095462e-05, + "loss": 0.0462, + "step": 18410 + }, + { + "epoch": 0.04062342176242135, + "grad_norm": 0.1559779942035675, + "learning_rate": 2.9989348320656115e-05, + "loss": 0.0449, + "step": 18420 + }, + { + "epoch": 0.04064547573731952, + "grad_norm": 0.18609538674354553, + "learning_rate": 2.9989285928222875e-05, + "loss": 0.0434, + "step": 18430 + }, + { + "epoch": 0.04066752971221768, + "grad_norm": 0.15587666630744934, + "learning_rate": 2.998922335365566e-05, + "loss": 0.0428, + "step": 18440 + }, + { + "epoch": 0.040689583687115845, + "grad_norm": 0.09558484703302383, + "learning_rate": 2.998916059695523e-05, + "loss": 0.043, + "step": 18450 + }, + { + "epoch": 0.040711637662014015, + "grad_norm": 0.16992299258708954, + "learning_rate": 2.9989097658122346e-05, + "loss": 0.0438, + "step": 18460 + }, + { + "epoch": 0.04073369163691218, + "grad_norm": 0.16411836445331573, + "learning_rate": 2.9989034537157768e-05, + "loss": 0.0457, + "step": 18470 + }, + { + "epoch": 0.04075574561181034, + "grad_norm": 0.1645163744688034, + "learning_rate": 2.9988971234062276e-05, + "loss": 0.0448, + "step": 18480 + }, + { + "epoch": 0.04077779958670851, + "grad_norm": 0.18205547332763672, + "learning_rate": 2.9988907748836627e-05, + "loss": 0.0423, + "step": 18490 + }, + { + "epoch": 0.040799853561606675, + "grad_norm": 0.18471196293830872, + "learning_rate": 2.99888440814816e-05, + "loss": 0.0436, + "step": 18500 + }, + { + "epoch": 0.040821907536504845, + "grad_norm": 0.11469350755214691, + "learning_rate": 2.9988780231997966e-05, + "loss": 0.0418, + "step": 18510 + }, + { + "epoch": 0.04084396151140301, + "grad_norm": 0.12717978656291962, + "learning_rate": 2.9988716200386498e-05, + "loss": 0.0447, + "step": 18520 + }, + { + "epoch": 0.04086601548630117, + "grad_norm": 0.14069652557373047, + "learning_rate": 2.9988651986647977e-05, + "loss": 0.0432, + "step": 18530 + }, + { + "epoch": 0.04088806946119934, + "grad_norm": 0.09781837463378906, + "learning_rate": 2.9988587590783184e-05, + "loss": 0.0424, + "step": 18540 + }, + { + "epoch": 0.040910123436097505, + "grad_norm": 0.13555948436260223, + "learning_rate": 2.9988523012792897e-05, + "loss": 0.0434, + "step": 18550 + }, + { + "epoch": 0.04093217741099567, + "grad_norm": 0.14132043719291687, + "learning_rate": 2.9988458252677905e-05, + "loss": 0.0444, + "step": 18560 + }, + { + "epoch": 0.04095423138589384, + "grad_norm": 0.19137978553771973, + "learning_rate": 2.9988393310438998e-05, + "loss": 0.042, + "step": 18570 + }, + { + "epoch": 0.040976285360792, + "grad_norm": 0.18839135766029358, + "learning_rate": 2.9988328186076956e-05, + "loss": 0.0434, + "step": 18580 + }, + { + "epoch": 0.040998339335690165, + "grad_norm": 0.15363135933876038, + "learning_rate": 2.998826287959258e-05, + "loss": 0.0446, + "step": 18590 + }, + { + "epoch": 0.041020393310588335, + "grad_norm": 0.17956086993217468, + "learning_rate": 2.9988197390986655e-05, + "loss": 0.0444, + "step": 18600 + }, + { + "epoch": 0.0410424472854865, + "grad_norm": 0.158932626247406, + "learning_rate": 2.9988131720259982e-05, + "loss": 0.0434, + "step": 18610 + }, + { + "epoch": 0.04106450126038467, + "grad_norm": 0.16856230795383453, + "learning_rate": 2.9988065867413355e-05, + "loss": 0.045, + "step": 18620 + }, + { + "epoch": 0.04108655523528283, + "grad_norm": 0.14811895787715912, + "learning_rate": 2.9987999832447577e-05, + "loss": 0.0454, + "step": 18630 + }, + { + "epoch": 0.041108609210180995, + "grad_norm": 0.13001573085784912, + "learning_rate": 2.9987933615363454e-05, + "loss": 0.0446, + "step": 18640 + }, + { + "epoch": 0.041130663185079165, + "grad_norm": 0.16481848061084747, + "learning_rate": 2.9987867216161783e-05, + "loss": 0.0445, + "step": 18650 + }, + { + "epoch": 0.04115271715997733, + "grad_norm": 0.18321062624454498, + "learning_rate": 2.9987800634843375e-05, + "loss": 0.0436, + "step": 18660 + }, + { + "epoch": 0.04117477113487549, + "grad_norm": 0.14475645124912262, + "learning_rate": 2.998773387140904e-05, + "loss": 0.0455, + "step": 18670 + }, + { + "epoch": 0.04119682510977366, + "grad_norm": 0.1655343472957611, + "learning_rate": 2.9987666925859586e-05, + "loss": 0.0438, + "step": 18680 + }, + { + "epoch": 0.041218879084671825, + "grad_norm": 0.17365579307079315, + "learning_rate": 2.998759979819583e-05, + "loss": 0.0425, + "step": 18690 + }, + { + "epoch": 0.04124093305956999, + "grad_norm": 0.12879890203475952, + "learning_rate": 2.9987532488418586e-05, + "loss": 0.0422, + "step": 18700 + }, + { + "epoch": 0.04126298703446816, + "grad_norm": 0.2000253051519394, + "learning_rate": 2.998746499652867e-05, + "loss": 0.0448, + "step": 18710 + }, + { + "epoch": 0.04128504100936632, + "grad_norm": 0.13663922250270844, + "learning_rate": 2.9987397322526902e-05, + "loss": 0.0437, + "step": 18720 + }, + { + "epoch": 0.04130709498426449, + "grad_norm": 0.1578352153301239, + "learning_rate": 2.9987329466414108e-05, + "loss": 0.045, + "step": 18730 + }, + { + "epoch": 0.041329148959162655, + "grad_norm": 0.148896262049675, + "learning_rate": 2.998726142819111e-05, + "loss": 0.0413, + "step": 18740 + }, + { + "epoch": 0.04135120293406082, + "grad_norm": 0.1604834944009781, + "learning_rate": 2.998719320785874e-05, + "loss": 0.0446, + "step": 18750 + }, + { + "epoch": 0.04137325690895899, + "grad_norm": 0.16164597868919373, + "learning_rate": 2.9987124805417817e-05, + "loss": 0.0461, + "step": 18760 + }, + { + "epoch": 0.04139531088385715, + "grad_norm": 0.1550665944814682, + "learning_rate": 2.9987056220869177e-05, + "loss": 0.0426, + "step": 18770 + }, + { + "epoch": 0.041417364858755315, + "grad_norm": 0.15223196148872375, + "learning_rate": 2.9986987454213656e-05, + "loss": 0.0452, + "step": 18780 + }, + { + "epoch": 0.041439418833653485, + "grad_norm": 0.14017735421657562, + "learning_rate": 2.9986918505452087e-05, + "loss": 0.044, + "step": 18790 + }, + { + "epoch": 0.04146147280855165, + "grad_norm": 0.11510568857192993, + "learning_rate": 2.9986849374585308e-05, + "loss": 0.0464, + "step": 18800 + }, + { + "epoch": 0.04148352678344981, + "grad_norm": 0.17361609637737274, + "learning_rate": 2.998678006161416e-05, + "loss": 0.0449, + "step": 18810 + }, + { + "epoch": 0.04150558075834798, + "grad_norm": 0.12921549379825592, + "learning_rate": 2.9986710566539484e-05, + "loss": 0.0419, + "step": 18820 + }, + { + "epoch": 0.041527634733246145, + "grad_norm": 0.13947393000125885, + "learning_rate": 2.9986640889362122e-05, + "loss": 0.0437, + "step": 18830 + }, + { + "epoch": 0.041549688708144315, + "grad_norm": 0.1556612104177475, + "learning_rate": 2.9986571030082927e-05, + "loss": 0.0417, + "step": 18840 + }, + { + "epoch": 0.04157174268304248, + "grad_norm": 0.12043892592191696, + "learning_rate": 2.9986500988702747e-05, + "loss": 0.0436, + "step": 18850 + }, + { + "epoch": 0.04159379665794064, + "grad_norm": 0.14535388350486755, + "learning_rate": 2.9986430765222423e-05, + "loss": 0.0441, + "step": 18860 + }, + { + "epoch": 0.04161585063283881, + "grad_norm": 0.16298623383045197, + "learning_rate": 2.9986360359642822e-05, + "loss": 0.0453, + "step": 18870 + }, + { + "epoch": 0.041637904607736975, + "grad_norm": 0.14481554925441742, + "learning_rate": 2.998628977196479e-05, + "loss": 0.0436, + "step": 18880 + }, + { + "epoch": 0.04165995858263514, + "grad_norm": 0.15747632086277008, + "learning_rate": 2.9986219002189187e-05, + "loss": 0.0439, + "step": 18890 + }, + { + "epoch": 0.04168201255753331, + "grad_norm": 0.13717198371887207, + "learning_rate": 2.9986148050316876e-05, + "loss": 0.0427, + "step": 18900 + }, + { + "epoch": 0.04170406653243147, + "grad_norm": 0.11807756125926971, + "learning_rate": 2.998607691634871e-05, + "loss": 0.0442, + "step": 18910 + }, + { + "epoch": 0.041726120507329635, + "grad_norm": 0.17223821580410004, + "learning_rate": 2.998600560028557e-05, + "loss": 0.0441, + "step": 18920 + }, + { + "epoch": 0.041748174482227805, + "grad_norm": 0.1335803121328354, + "learning_rate": 2.998593410212831e-05, + "loss": 0.0443, + "step": 18930 + }, + { + "epoch": 0.04177022845712597, + "grad_norm": 0.17243985831737518, + "learning_rate": 2.99858624218778e-05, + "loss": 0.045, + "step": 18940 + }, + { + "epoch": 0.04179228243202414, + "grad_norm": 0.15684108436107635, + "learning_rate": 2.9985790559534914e-05, + "loss": 0.0442, + "step": 18950 + }, + { + "epoch": 0.0418143364069223, + "grad_norm": 0.16449791193008423, + "learning_rate": 2.9985718515100524e-05, + "loss": 0.0446, + "step": 18960 + }, + { + "epoch": 0.041836390381820465, + "grad_norm": 0.1816624104976654, + "learning_rate": 2.9985646288575505e-05, + "loss": 0.0426, + "step": 18970 + }, + { + "epoch": 0.041858444356718635, + "grad_norm": 0.15125305950641632, + "learning_rate": 2.9985573879960732e-05, + "loss": 0.042, + "step": 18980 + }, + { + "epoch": 0.0418804983316168, + "grad_norm": 0.13877061009407043, + "learning_rate": 2.998550128925709e-05, + "loss": 0.0451, + "step": 18990 + }, + { + "epoch": 0.04190255230651496, + "grad_norm": 0.17699554562568665, + "learning_rate": 2.998542851646546e-05, + "loss": 0.0432, + "step": 19000 + }, + { + "epoch": 0.04192460628141313, + "grad_norm": 0.13231970369815826, + "learning_rate": 2.9985355561586723e-05, + "loss": 0.0426, + "step": 19010 + }, + { + "epoch": 0.041946660256311295, + "grad_norm": 0.17503751814365387, + "learning_rate": 2.9985282424621763e-05, + "loss": 0.0423, + "step": 19020 + }, + { + "epoch": 0.041968714231209465, + "grad_norm": 0.1431511640548706, + "learning_rate": 2.998520910557148e-05, + "loss": 0.0427, + "step": 19030 + }, + { + "epoch": 0.04199076820610763, + "grad_norm": 0.13485033810138702, + "learning_rate": 2.9985135604436756e-05, + "loss": 0.0446, + "step": 19040 + }, + { + "epoch": 0.04201282218100579, + "grad_norm": 0.15286582708358765, + "learning_rate": 2.9985061921218487e-05, + "loss": 0.0408, + "step": 19050 + }, + { + "epoch": 0.04203487615590396, + "grad_norm": 0.15771183371543884, + "learning_rate": 2.9984988055917567e-05, + "loss": 0.042, + "step": 19060 + }, + { + "epoch": 0.042056930130802125, + "grad_norm": 0.17356999218463898, + "learning_rate": 2.9984914008534894e-05, + "loss": 0.0418, + "step": 19070 + }, + { + "epoch": 0.04207898410570029, + "grad_norm": 0.16708654165267944, + "learning_rate": 2.9984839779071365e-05, + "loss": 0.0452, + "step": 19080 + }, + { + "epoch": 0.04210103808059846, + "grad_norm": 0.14194022119045258, + "learning_rate": 2.9984765367527883e-05, + "loss": 0.0447, + "step": 19090 + }, + { + "epoch": 0.04212309205549662, + "grad_norm": 0.15442846715450287, + "learning_rate": 2.9984690773905358e-05, + "loss": 0.0458, + "step": 19100 + }, + { + "epoch": 0.042145146030394785, + "grad_norm": 0.1893140822649002, + "learning_rate": 2.9984615998204695e-05, + "loss": 0.0449, + "step": 19110 + }, + { + "epoch": 0.042167200005292955, + "grad_norm": 0.12554919719696045, + "learning_rate": 2.9984541040426795e-05, + "loss": 0.0431, + "step": 19120 + }, + { + "epoch": 0.04218925398019112, + "grad_norm": 0.12231724709272385, + "learning_rate": 2.9984465900572577e-05, + "loss": 0.0441, + "step": 19130 + }, + { + "epoch": 0.04221130795508929, + "grad_norm": 0.17733630537986755, + "learning_rate": 2.9984390578642948e-05, + "loss": 0.0449, + "step": 19140 + }, + { + "epoch": 0.04223336192998745, + "grad_norm": 0.16800613701343536, + "learning_rate": 2.9984315074638827e-05, + "loss": 0.0439, + "step": 19150 + }, + { + "epoch": 0.042255415904885615, + "grad_norm": 0.16977491974830627, + "learning_rate": 2.998423938856113e-05, + "loss": 0.0434, + "step": 19160 + }, + { + "epoch": 0.042277469879783786, + "grad_norm": 0.1619211733341217, + "learning_rate": 2.9984163520410774e-05, + "loss": 0.0437, + "step": 19170 + }, + { + "epoch": 0.04229952385468195, + "grad_norm": 0.1427748203277588, + "learning_rate": 2.9984087470188688e-05, + "loss": 0.0434, + "step": 19180 + }, + { + "epoch": 0.04232157782958011, + "grad_norm": 0.1481148898601532, + "learning_rate": 2.998401123789579e-05, + "loss": 0.0413, + "step": 19190 + }, + { + "epoch": 0.04234363180447828, + "grad_norm": 0.1286570429801941, + "learning_rate": 2.998393482353301e-05, + "loss": 0.0469, + "step": 19200 + }, + { + "epoch": 0.042365685779376445, + "grad_norm": 0.17417976260185242, + "learning_rate": 2.9983858227101274e-05, + "loss": 0.0443, + "step": 19210 + }, + { + "epoch": 0.04238773975427461, + "grad_norm": 0.17744658887386322, + "learning_rate": 2.998378144860151e-05, + "loss": 0.0431, + "step": 19220 + }, + { + "epoch": 0.04240979372917278, + "grad_norm": 0.1714107245206833, + "learning_rate": 2.9983704488034658e-05, + "loss": 0.0452, + "step": 19230 + }, + { + "epoch": 0.04243184770407094, + "grad_norm": 0.13039565086364746, + "learning_rate": 2.998362734540165e-05, + "loss": 0.0431, + "step": 19240 + }, + { + "epoch": 0.04245390167896911, + "grad_norm": 0.13011685013771057, + "learning_rate": 2.9983550020703422e-05, + "loss": 0.0422, + "step": 19250 + }, + { + "epoch": 0.042475955653867276, + "grad_norm": 0.19763515889644623, + "learning_rate": 2.9983472513940915e-05, + "loss": 0.0441, + "step": 19260 + }, + { + "epoch": 0.04249800962876544, + "grad_norm": 0.13163815438747406, + "learning_rate": 2.9983394825115067e-05, + "loss": 0.045, + "step": 19270 + }, + { + "epoch": 0.04252006360366361, + "grad_norm": 0.14451389014720917, + "learning_rate": 2.998331695422683e-05, + "loss": 0.0434, + "step": 19280 + }, + { + "epoch": 0.04254211757856177, + "grad_norm": 0.18729060888290405, + "learning_rate": 2.998323890127714e-05, + "loss": 0.0435, + "step": 19290 + }, + { + "epoch": 0.042564171553459935, + "grad_norm": 0.15636026859283447, + "learning_rate": 2.9983160666266953e-05, + "loss": 0.0458, + "step": 19300 + }, + { + "epoch": 0.042586225528358106, + "grad_norm": 0.13767939805984497, + "learning_rate": 2.998308224919722e-05, + "loss": 0.0441, + "step": 19310 + }, + { + "epoch": 0.04260827950325627, + "grad_norm": 0.1394219994544983, + "learning_rate": 2.998300365006889e-05, + "loss": 0.0412, + "step": 19320 + }, + { + "epoch": 0.04263033347815443, + "grad_norm": 0.1158580482006073, + "learning_rate": 2.9982924868882917e-05, + "loss": 0.0421, + "step": 19330 + }, + { + "epoch": 0.0426523874530526, + "grad_norm": 0.12299320101737976, + "learning_rate": 2.9982845905640265e-05, + "loss": 0.0448, + "step": 19340 + }, + { + "epoch": 0.042674441427950766, + "grad_norm": 0.15439295768737793, + "learning_rate": 2.9982766760341884e-05, + "loss": 0.0429, + "step": 19350 + }, + { + "epoch": 0.042696495402848936, + "grad_norm": 0.14623256027698517, + "learning_rate": 2.9982687432988743e-05, + "loss": 0.0403, + "step": 19360 + }, + { + "epoch": 0.0427185493777471, + "grad_norm": 0.12869924306869507, + "learning_rate": 2.9982607923581804e-05, + "loss": 0.0421, + "step": 19370 + }, + { + "epoch": 0.04274060335264526, + "grad_norm": 0.13645216822624207, + "learning_rate": 2.9982528232122033e-05, + "loss": 0.0434, + "step": 19380 + }, + { + "epoch": 0.04276265732754343, + "grad_norm": 0.14318528771400452, + "learning_rate": 2.99824483586104e-05, + "loss": 0.0434, + "step": 19390 + }, + { + "epoch": 0.042784711302441596, + "grad_norm": 0.12748458981513977, + "learning_rate": 2.998236830304787e-05, + "loss": 0.0413, + "step": 19400 + }, + { + "epoch": 0.04280676527733976, + "grad_norm": 0.13826815783977509, + "learning_rate": 2.9982288065435423e-05, + "loss": 0.0416, + "step": 19410 + }, + { + "epoch": 0.04282881925223793, + "grad_norm": 0.13154248893260956, + "learning_rate": 2.9982207645774024e-05, + "loss": 0.0438, + "step": 19420 + }, + { + "epoch": 0.04285087322713609, + "grad_norm": 0.16108721494674683, + "learning_rate": 2.9982127044064666e-05, + "loss": 0.0432, + "step": 19430 + }, + { + "epoch": 0.042872927202034256, + "grad_norm": 0.1316419541835785, + "learning_rate": 2.9982046260308313e-05, + "loss": 0.0452, + "step": 19440 + }, + { + "epoch": 0.042894981176932426, + "grad_norm": 0.13004270195960999, + "learning_rate": 2.9981965294505952e-05, + "loss": 0.0439, + "step": 19450 + }, + { + "epoch": 0.04291703515183059, + "grad_norm": 0.15175391733646393, + "learning_rate": 2.998188414665857e-05, + "loss": 0.0421, + "step": 19460 + }, + { + "epoch": 0.04293908912672876, + "grad_norm": 0.15216891467571259, + "learning_rate": 2.998180281676715e-05, + "loss": 0.043, + "step": 19470 + }, + { + "epoch": 0.04296114310162692, + "grad_norm": 0.21029189229011536, + "learning_rate": 2.998172130483268e-05, + "loss": 0.0423, + "step": 19480 + }, + { + "epoch": 0.042983197076525086, + "grad_norm": 0.1789425015449524, + "learning_rate": 2.9981639610856148e-05, + "loss": 0.0404, + "step": 19490 + }, + { + "epoch": 0.043005251051423256, + "grad_norm": 0.1793346107006073, + "learning_rate": 2.998155773483855e-05, + "loss": 0.0449, + "step": 19500 + }, + { + "epoch": 0.04302730502632142, + "grad_norm": 0.14469006657600403, + "learning_rate": 2.9981475676780886e-05, + "loss": 0.0441, + "step": 19510 + }, + { + "epoch": 0.04304935900121958, + "grad_norm": 0.14730432629585266, + "learning_rate": 2.9981393436684144e-05, + "loss": 0.0432, + "step": 19520 + }, + { + "epoch": 0.04307141297611775, + "grad_norm": 0.17436708509922028, + "learning_rate": 2.9981311014549326e-05, + "loss": 0.0426, + "step": 19530 + }, + { + "epoch": 0.043093466951015916, + "grad_norm": 0.17409755289554596, + "learning_rate": 2.9981228410377438e-05, + "loss": 0.0437, + "step": 19540 + }, + { + "epoch": 0.04311552092591408, + "grad_norm": 0.11343152076005936, + "learning_rate": 2.9981145624169478e-05, + "loss": 0.0413, + "step": 19550 + }, + { + "epoch": 0.04313757490081225, + "grad_norm": 0.16219741106033325, + "learning_rate": 2.9981062655926457e-05, + "loss": 0.0434, + "step": 19560 + }, + { + "epoch": 0.04315962887571041, + "grad_norm": 0.11884623020887375, + "learning_rate": 2.9980979505649375e-05, + "loss": 0.042, + "step": 19570 + }, + { + "epoch": 0.04318168285060858, + "grad_norm": 0.11829676479101181, + "learning_rate": 2.9980896173339254e-05, + "loss": 0.0427, + "step": 19580 + }, + { + "epoch": 0.043203736825506746, + "grad_norm": 0.13537636399269104, + "learning_rate": 2.9980812658997093e-05, + "loss": 0.0428, + "step": 19590 + }, + { + "epoch": 0.04322579080040491, + "grad_norm": 0.13103951513767242, + "learning_rate": 2.998072896262392e-05, + "loss": 0.0404, + "step": 19600 + }, + { + "epoch": 0.04324784477530308, + "grad_norm": 0.11609891057014465, + "learning_rate": 2.9980645084220742e-05, + "loss": 0.0464, + "step": 19610 + }, + { + "epoch": 0.04326989875020124, + "grad_norm": 0.11381754279136658, + "learning_rate": 2.998056102378858e-05, + "loss": 0.0445, + "step": 19620 + }, + { + "epoch": 0.043291952725099406, + "grad_norm": 0.1635146588087082, + "learning_rate": 2.998047678132847e-05, + "loss": 0.0432, + "step": 19630 + }, + { + "epoch": 0.043314006699997576, + "grad_norm": 0.12164878100156784, + "learning_rate": 2.998039235684141e-05, + "loss": 0.0432, + "step": 19640 + }, + { + "epoch": 0.04333606067489574, + "grad_norm": 0.13882160186767578, + "learning_rate": 2.9980307750328443e-05, + "loss": 0.0408, + "step": 19650 + }, + { + "epoch": 0.0433581146497939, + "grad_norm": 0.15201398730278015, + "learning_rate": 2.9980222961790592e-05, + "loss": 0.0463, + "step": 19660 + }, + { + "epoch": 0.04338016862469207, + "grad_norm": 0.13896174728870392, + "learning_rate": 2.9980137991228894e-05, + "loss": 0.0441, + "step": 19670 + }, + { + "epoch": 0.043402222599590236, + "grad_norm": 0.13332046568393707, + "learning_rate": 2.998005283864437e-05, + "loss": 0.047, + "step": 19680 + }, + { + "epoch": 0.043424276574488406, + "grad_norm": 0.14694546163082123, + "learning_rate": 2.9979967504038062e-05, + "loss": 0.0453, + "step": 19690 + }, + { + "epoch": 0.04344633054938657, + "grad_norm": 0.132303848862648, + "learning_rate": 2.9979881987411005e-05, + "loss": 0.0429, + "step": 19700 + }, + { + "epoch": 0.04346838452428473, + "grad_norm": 0.14235623180866241, + "learning_rate": 2.9979796288764238e-05, + "loss": 0.0445, + "step": 19710 + }, + { + "epoch": 0.0434904384991829, + "grad_norm": 0.19236382842063904, + "learning_rate": 2.9979710408098805e-05, + "loss": 0.0442, + "step": 19720 + }, + { + "epoch": 0.043512492474081066, + "grad_norm": 0.15556108951568604, + "learning_rate": 2.9979624345415742e-05, + "loss": 0.0446, + "step": 19730 + }, + { + "epoch": 0.04353454644897923, + "grad_norm": 0.14704492688179016, + "learning_rate": 2.9979538100716107e-05, + "loss": 0.044, + "step": 19740 + }, + { + "epoch": 0.0435566004238774, + "grad_norm": 0.13818378746509552, + "learning_rate": 2.9979451674000937e-05, + "loss": 0.0429, + "step": 19750 + }, + { + "epoch": 0.04357865439877556, + "grad_norm": 0.13610661029815674, + "learning_rate": 2.997936506527129e-05, + "loss": 0.0432, + "step": 19760 + }, + { + "epoch": 0.043600708373673726, + "grad_norm": 0.15360033512115479, + "learning_rate": 2.997927827452821e-05, + "loss": 0.0424, + "step": 19770 + }, + { + "epoch": 0.043622762348571896, + "grad_norm": 0.17868396639823914, + "learning_rate": 2.9979191301772758e-05, + "loss": 0.0442, + "step": 19780 + }, + { + "epoch": 0.04364481632347006, + "grad_norm": 0.13049829006195068, + "learning_rate": 2.9979104147005993e-05, + "loss": 0.0451, + "step": 19790 + }, + { + "epoch": 0.04366687029836823, + "grad_norm": 0.1408914029598236, + "learning_rate": 2.9979016810228965e-05, + "loss": 0.0433, + "step": 19800 + }, + { + "epoch": 0.04368892427326639, + "grad_norm": 0.1267915666103363, + "learning_rate": 2.997892929144274e-05, + "loss": 0.0456, + "step": 19810 + }, + { + "epoch": 0.043710978248164556, + "grad_norm": 0.13770537078380585, + "learning_rate": 2.9978841590648387e-05, + "loss": 0.0436, + "step": 19820 + }, + { + "epoch": 0.043733032223062726, + "grad_norm": 0.13656949996948242, + "learning_rate": 2.9978753707846962e-05, + "loss": 0.0448, + "step": 19830 + }, + { + "epoch": 0.04375508619796089, + "grad_norm": 0.11661051958799362, + "learning_rate": 2.997866564303954e-05, + "loss": 0.046, + "step": 19840 + }, + { + "epoch": 0.04377714017285905, + "grad_norm": 0.18717007339000702, + "learning_rate": 2.997857739622718e-05, + "loss": 0.0456, + "step": 19850 + }, + { + "epoch": 0.04379919414775722, + "grad_norm": 0.17736312747001648, + "learning_rate": 2.997848896741097e-05, + "loss": 0.0441, + "step": 19860 + }, + { + "epoch": 0.043821248122655386, + "grad_norm": 0.1793155074119568, + "learning_rate": 2.9978400356591974e-05, + "loss": 0.0442, + "step": 19870 + }, + { + "epoch": 0.04384330209755355, + "grad_norm": 0.14842727780342102, + "learning_rate": 2.9978311563771276e-05, + "loss": 0.0417, + "step": 19880 + }, + { + "epoch": 0.04386535607245172, + "grad_norm": 0.1279231309890747, + "learning_rate": 2.9978222588949947e-05, + "loss": 0.0421, + "step": 19890 + }, + { + "epoch": 0.04388741004734988, + "grad_norm": 0.14728137850761414, + "learning_rate": 2.997813343212907e-05, + "loss": 0.0441, + "step": 19900 + }, + { + "epoch": 0.04390946402224805, + "grad_norm": 0.18991468846797943, + "learning_rate": 2.9978044093309733e-05, + "loss": 0.046, + "step": 19910 + }, + { + "epoch": 0.043931517997146216, + "grad_norm": 0.20107515156269073, + "learning_rate": 2.997795457249302e-05, + "loss": 0.0431, + "step": 19920 + }, + { + "epoch": 0.04395357197204438, + "grad_norm": 0.14799199998378754, + "learning_rate": 2.9977864869680018e-05, + "loss": 0.041, + "step": 19930 + }, + { + "epoch": 0.04397562594694255, + "grad_norm": 0.19245512783527374, + "learning_rate": 2.9977774984871813e-05, + "loss": 0.0447, + "step": 19940 + }, + { + "epoch": 0.04399767992184071, + "grad_norm": 0.1696229875087738, + "learning_rate": 2.9977684918069503e-05, + "loss": 0.0431, + "step": 19950 + }, + { + "epoch": 0.044019733896738876, + "grad_norm": 0.12282171845436096, + "learning_rate": 2.9977594669274175e-05, + "loss": 0.0417, + "step": 19960 + }, + { + "epoch": 0.044041787871637046, + "grad_norm": 0.12652531266212463, + "learning_rate": 2.9977504238486934e-05, + "loss": 0.0436, + "step": 19970 + }, + { + "epoch": 0.04406384184653521, + "grad_norm": 0.16366034746170044, + "learning_rate": 2.9977413625708877e-05, + "loss": 0.0437, + "step": 19980 + }, + { + "epoch": 0.04408589582143337, + "grad_norm": 0.1519293636083603, + "learning_rate": 2.9977322830941102e-05, + "loss": 0.0442, + "step": 19990 + }, + { + "epoch": 0.04410794979633154, + "grad_norm": 0.1600172221660614, + "learning_rate": 2.9977231854184714e-05, + "loss": 0.0464, + "step": 20000 + }, + { + "epoch": 0.044130003771229706, + "grad_norm": 0.14366956055164337, + "learning_rate": 2.9977140695440817e-05, + "loss": 0.0435, + "step": 20010 + }, + { + "epoch": 0.044152057746127876, + "grad_norm": 0.18282394111156464, + "learning_rate": 2.9977049354710522e-05, + "loss": 0.0443, + "step": 20020 + }, + { + "epoch": 0.04417411172102604, + "grad_norm": 0.17940562963485718, + "learning_rate": 2.9976957831994935e-05, + "loss": 0.044, + "step": 20030 + }, + { + "epoch": 0.0441961656959242, + "grad_norm": 0.1271345317363739, + "learning_rate": 2.997686612729517e-05, + "loss": 0.0446, + "step": 20040 + }, + { + "epoch": 0.04421821967082237, + "grad_norm": 0.18522068858146667, + "learning_rate": 2.997677424061234e-05, + "loss": 0.0434, + "step": 20050 + }, + { + "epoch": 0.044240273645720536, + "grad_norm": 0.15287689864635468, + "learning_rate": 2.9976682171947568e-05, + "loss": 0.0419, + "step": 20060 + }, + { + "epoch": 0.0442623276206187, + "grad_norm": 0.14157342910766602, + "learning_rate": 2.997658992130196e-05, + "loss": 0.0436, + "step": 20070 + }, + { + "epoch": 0.04428438159551687, + "grad_norm": 0.12138845026493073, + "learning_rate": 2.9976497488676643e-05, + "loss": 0.0433, + "step": 20080 + }, + { + "epoch": 0.04430643557041503, + "grad_norm": 0.12475436180830002, + "learning_rate": 2.997640487407275e-05, + "loss": 0.0414, + "step": 20090 + }, + { + "epoch": 0.044328489545313196, + "grad_norm": 0.14733603596687317, + "learning_rate": 2.997631207749139e-05, + "loss": 0.0436, + "step": 20100 + }, + { + "epoch": 0.044350543520211366, + "grad_norm": 0.10603823512792587, + "learning_rate": 2.9976219098933707e-05, + "loss": 0.0421, + "step": 20110 + }, + { + "epoch": 0.04437259749510953, + "grad_norm": 0.14472636580467224, + "learning_rate": 2.997612593840081e-05, + "loss": 0.044, + "step": 20120 + }, + { + "epoch": 0.0443946514700077, + "grad_norm": 0.15501326322555542, + "learning_rate": 2.9976032595893855e-05, + "loss": 0.0436, + "step": 20130 + }, + { + "epoch": 0.04441670544490586, + "grad_norm": 0.11792830377817154, + "learning_rate": 2.9975939071413956e-05, + "loss": 0.0426, + "step": 20140 + }, + { + "epoch": 0.044438759419804026, + "grad_norm": 0.17960397899150848, + "learning_rate": 2.9975845364962263e-05, + "loss": 0.0464, + "step": 20150 + }, + { + "epoch": 0.044460813394702196, + "grad_norm": 0.11049877852201462, + "learning_rate": 2.9975751476539904e-05, + "loss": 0.0427, + "step": 20160 + }, + { + "epoch": 0.04448286736960036, + "grad_norm": 0.1420356035232544, + "learning_rate": 2.997565740614803e-05, + "loss": 0.0429, + "step": 20170 + }, + { + "epoch": 0.04450492134449852, + "grad_norm": 0.16277483105659485, + "learning_rate": 2.9975563153787776e-05, + "loss": 0.0448, + "step": 20180 + }, + { + "epoch": 0.04452697531939669, + "grad_norm": 0.14834684133529663, + "learning_rate": 2.997546871946029e-05, + "loss": 0.0447, + "step": 20190 + }, + { + "epoch": 0.044549029294294856, + "grad_norm": 0.16876570880413055, + "learning_rate": 2.9975374103166722e-05, + "loss": 0.0421, + "step": 20200 + }, + { + "epoch": 0.044571083269193026, + "grad_norm": 0.1381208598613739, + "learning_rate": 2.997527930490822e-05, + "loss": 0.0417, + "step": 20210 + }, + { + "epoch": 0.04459313724409119, + "grad_norm": 0.11942976713180542, + "learning_rate": 2.997518432468593e-05, + "loss": 0.0436, + "step": 20220 + }, + { + "epoch": 0.04461519121898935, + "grad_norm": 0.1262224167585373, + "learning_rate": 2.9975089162501018e-05, + "loss": 0.0439, + "step": 20230 + }, + { + "epoch": 0.04463724519388752, + "grad_norm": 0.15905871987342834, + "learning_rate": 2.9974993818354635e-05, + "loss": 0.0439, + "step": 20240 + }, + { + "epoch": 0.044659299168785686, + "grad_norm": 0.1460305154323578, + "learning_rate": 2.9974898292247933e-05, + "loss": 0.0409, + "step": 20250 + }, + { + "epoch": 0.04468135314368385, + "grad_norm": 0.12591460347175598, + "learning_rate": 2.9974802584182078e-05, + "loss": 0.043, + "step": 20260 + }, + { + "epoch": 0.04470340711858202, + "grad_norm": 0.1232740581035614, + "learning_rate": 2.9974706694158234e-05, + "loss": 0.0428, + "step": 20270 + }, + { + "epoch": 0.04472546109348018, + "grad_norm": 0.16056889295578003, + "learning_rate": 2.997461062217757e-05, + "loss": 0.0431, + "step": 20280 + }, + { + "epoch": 0.044747515068378346, + "grad_norm": 0.17870131134986877, + "learning_rate": 2.9974514368241242e-05, + "loss": 0.0425, + "step": 20290 + }, + { + "epoch": 0.044769569043276516, + "grad_norm": 0.12946054339408875, + "learning_rate": 2.997441793235043e-05, + "loss": 0.0417, + "step": 20300 + }, + { + "epoch": 0.04479162301817468, + "grad_norm": 0.1098407730460167, + "learning_rate": 2.99743213145063e-05, + "loss": 0.0433, + "step": 20310 + }, + { + "epoch": 0.04481367699307285, + "grad_norm": 0.13645118474960327, + "learning_rate": 2.997422451471003e-05, + "loss": 0.0428, + "step": 20320 + }, + { + "epoch": 0.04483573096797101, + "grad_norm": 0.143929123878479, + "learning_rate": 2.9974127532962795e-05, + "loss": 0.0418, + "step": 20330 + }, + { + "epoch": 0.044857784942869176, + "grad_norm": 0.18579508364200592, + "learning_rate": 2.997403036926577e-05, + "loss": 0.0409, + "step": 20340 + }, + { + "epoch": 0.044879838917767347, + "grad_norm": 0.15862756967544556, + "learning_rate": 2.997393302362014e-05, + "loss": 0.0433, + "step": 20350 + }, + { + "epoch": 0.04490189289266551, + "grad_norm": 0.12923507392406464, + "learning_rate": 2.9973835496027085e-05, + "loss": 0.0443, + "step": 20360 + }, + { + "epoch": 0.04492394686756367, + "grad_norm": 0.22871994972229004, + "learning_rate": 2.9973737786487786e-05, + "loss": 0.0444, + "step": 20370 + }, + { + "epoch": 0.04494600084246184, + "grad_norm": 0.16231651604175568, + "learning_rate": 2.9973639895003442e-05, + "loss": 0.0441, + "step": 20380 + }, + { + "epoch": 0.044968054817360006, + "grad_norm": 0.14822860062122345, + "learning_rate": 2.9973541821575236e-05, + "loss": 0.0443, + "step": 20390 + }, + { + "epoch": 0.04499010879225817, + "grad_norm": 0.13495469093322754, + "learning_rate": 2.997344356620436e-05, + "loss": 0.0422, + "step": 20400 + }, + { + "epoch": 0.04501216276715634, + "grad_norm": 0.15102426707744598, + "learning_rate": 2.9973345128892004e-05, + "loss": 0.0443, + "step": 20410 + }, + { + "epoch": 0.0450342167420545, + "grad_norm": 0.12696033716201782, + "learning_rate": 2.9973246509639364e-05, + "loss": 0.0436, + "step": 20420 + }, + { + "epoch": 0.04505627071695267, + "grad_norm": 0.08966541290283203, + "learning_rate": 2.997314770844765e-05, + "loss": 0.0404, + "step": 20430 + }, + { + "epoch": 0.045078324691850837, + "grad_norm": 0.15302281081676483, + "learning_rate": 2.997304872531805e-05, + "loss": 0.0421, + "step": 20440 + }, + { + "epoch": 0.045100378666749, + "grad_norm": 0.13351021707057953, + "learning_rate": 2.9972949560251772e-05, + "loss": 0.0428, + "step": 20450 + }, + { + "epoch": 0.04512243264164717, + "grad_norm": 0.14557643234729767, + "learning_rate": 2.997285021325002e-05, + "loss": 0.042, + "step": 20460 + }, + { + "epoch": 0.04514448661654533, + "grad_norm": 0.15550057590007782, + "learning_rate": 2.9972750684314e-05, + "loss": 0.0436, + "step": 20470 + }, + { + "epoch": 0.045166540591443496, + "grad_norm": 0.17808754742145538, + "learning_rate": 2.9972650973444922e-05, + "loss": 0.0455, + "step": 20480 + }, + { + "epoch": 0.04518859456634167, + "grad_norm": 0.1526651829481125, + "learning_rate": 2.9972551080644002e-05, + "loss": 0.0439, + "step": 20490 + }, + { + "epoch": 0.04521064854123983, + "grad_norm": 0.1242154985666275, + "learning_rate": 2.9972451005912447e-05, + "loss": 0.0432, + "step": 20500 + }, + { + "epoch": 0.04523270251613799, + "grad_norm": 0.12677104771137238, + "learning_rate": 2.9972350749251475e-05, + "loss": 0.0422, + "step": 20510 + }, + { + "epoch": 0.04525475649103616, + "grad_norm": 0.1521439552307129, + "learning_rate": 2.9972250310662307e-05, + "loss": 0.0411, + "step": 20520 + }, + { + "epoch": 0.045276810465934327, + "grad_norm": 0.13022375106811523, + "learning_rate": 2.9972149690146163e-05, + "loss": 0.0425, + "step": 20530 + }, + { + "epoch": 0.0452988644408325, + "grad_norm": 0.16075612604618073, + "learning_rate": 2.9972048887704262e-05, + "loss": 0.0433, + "step": 20540 + }, + { + "epoch": 0.04532091841573066, + "grad_norm": 0.1436825543642044, + "learning_rate": 2.997194790333783e-05, + "loss": 0.042, + "step": 20550 + }, + { + "epoch": 0.04534297239062882, + "grad_norm": 0.14494070410728455, + "learning_rate": 2.9971846737048097e-05, + "loss": 0.044, + "step": 20560 + }, + { + "epoch": 0.04536502636552699, + "grad_norm": 0.17264434695243835, + "learning_rate": 2.997174538883629e-05, + "loss": 0.0403, + "step": 20570 + }, + { + "epoch": 0.04538708034042516, + "grad_norm": 0.11638203263282776, + "learning_rate": 2.9971643858703642e-05, + "loss": 0.0406, + "step": 20580 + }, + { + "epoch": 0.04540913431532332, + "grad_norm": 0.15860828757286072, + "learning_rate": 2.9971542146651382e-05, + "loss": 0.0436, + "step": 20590 + }, + { + "epoch": 0.04543118829022149, + "grad_norm": 0.12115710228681564, + "learning_rate": 2.9971440252680755e-05, + "loss": 0.0433, + "step": 20600 + }, + { + "epoch": 0.04545324226511965, + "grad_norm": 0.14604617655277252, + "learning_rate": 2.997133817679299e-05, + "loss": 0.0422, + "step": 20610 + }, + { + "epoch": 0.045475296240017817, + "grad_norm": 0.17774544656276703, + "learning_rate": 2.997123591898933e-05, + "loss": 0.0411, + "step": 20620 + }, + { + "epoch": 0.04549735021491599, + "grad_norm": 0.14298853278160095, + "learning_rate": 2.997113347927102e-05, + "loss": 0.0444, + "step": 20630 + }, + { + "epoch": 0.04551940418981415, + "grad_norm": 0.11875593662261963, + "learning_rate": 2.99710308576393e-05, + "loss": 0.0435, + "step": 20640 + }, + { + "epoch": 0.04554145816471232, + "grad_norm": 0.1251024305820465, + "learning_rate": 2.9970928054095425e-05, + "loss": 0.0457, + "step": 20650 + }, + { + "epoch": 0.04556351213961048, + "grad_norm": 0.14302514493465424, + "learning_rate": 2.997082506864064e-05, + "loss": 0.043, + "step": 20660 + }, + { + "epoch": 0.04558556611450865, + "grad_norm": 0.12006643414497375, + "learning_rate": 2.997072190127619e-05, + "loss": 0.0441, + "step": 20670 + }, + { + "epoch": 0.04560762008940682, + "grad_norm": 0.14690133929252625, + "learning_rate": 2.997061855200334e-05, + "loss": 0.0432, + "step": 20680 + }, + { + "epoch": 0.04562967406430498, + "grad_norm": 0.13978731632232666, + "learning_rate": 2.9970515020823334e-05, + "loss": 0.0447, + "step": 20690 + }, + { + "epoch": 0.04565172803920314, + "grad_norm": 0.10030906647443771, + "learning_rate": 2.997041130773744e-05, + "loss": 0.045, + "step": 20700 + }, + { + "epoch": 0.04567378201410131, + "grad_norm": 0.1712356060743332, + "learning_rate": 2.9970307412746913e-05, + "loss": 0.0429, + "step": 20710 + }, + { + "epoch": 0.04569583598899948, + "grad_norm": 0.16126559674739838, + "learning_rate": 2.9970203335853017e-05, + "loss": 0.0437, + "step": 20720 + }, + { + "epoch": 0.04571788996389764, + "grad_norm": 0.13907277584075928, + "learning_rate": 2.9970099077057017e-05, + "loss": 0.042, + "step": 20730 + }, + { + "epoch": 0.04573994393879581, + "grad_norm": 0.1465783268213272, + "learning_rate": 2.996999463636018e-05, + "loss": 0.0459, + "step": 20740 + }, + { + "epoch": 0.04576199791369397, + "grad_norm": 0.13508039712905884, + "learning_rate": 2.996989001376377e-05, + "loss": 0.0405, + "step": 20750 + }, + { + "epoch": 0.045784051888592144, + "grad_norm": 0.11511833965778351, + "learning_rate": 2.9969785209269062e-05, + "loss": 0.0415, + "step": 20760 + }, + { + "epoch": 0.04580610586349031, + "grad_norm": 0.1257779598236084, + "learning_rate": 2.9969680222877337e-05, + "loss": 0.0427, + "step": 20770 + }, + { + "epoch": 0.04582815983838847, + "grad_norm": 0.11834251135587692, + "learning_rate": 2.9969575054589857e-05, + "loss": 0.0436, + "step": 20780 + }, + { + "epoch": 0.04585021381328664, + "grad_norm": 0.12085633724927902, + "learning_rate": 2.9969469704407907e-05, + "loss": 0.0416, + "step": 20790 + }, + { + "epoch": 0.0458722677881848, + "grad_norm": 0.1257585883140564, + "learning_rate": 2.9969364172332766e-05, + "loss": 0.0424, + "step": 20800 + }, + { + "epoch": 0.04589432176308297, + "grad_norm": 0.11298589408397675, + "learning_rate": 2.996925845836572e-05, + "loss": 0.043, + "step": 20810 + }, + { + "epoch": 0.04591637573798114, + "grad_norm": 0.1981487274169922, + "learning_rate": 2.996915256250805e-05, + "loss": 0.0408, + "step": 20820 + }, + { + "epoch": 0.0459384297128793, + "grad_norm": 0.12500938773155212, + "learning_rate": 2.9969046484761045e-05, + "loss": 0.0415, + "step": 20830 + }, + { + "epoch": 0.04596048368777746, + "grad_norm": 0.17059968411922455, + "learning_rate": 2.996894022512599e-05, + "loss": 0.0444, + "step": 20840 + }, + { + "epoch": 0.045982537662675634, + "grad_norm": 0.12951520085334778, + "learning_rate": 2.9968833783604176e-05, + "loss": 0.0446, + "step": 20850 + }, + { + "epoch": 0.0460045916375738, + "grad_norm": 0.13549785315990448, + "learning_rate": 2.99687271601969e-05, + "loss": 0.0423, + "step": 20860 + }, + { + "epoch": 0.04602664561247197, + "grad_norm": 0.12184930592775345, + "learning_rate": 2.9968620354905457e-05, + "loss": 0.0436, + "step": 20870 + }, + { + "epoch": 0.04604869958737013, + "grad_norm": 0.16633081436157227, + "learning_rate": 2.9968513367731143e-05, + "loss": 0.0435, + "step": 20880 + }, + { + "epoch": 0.04607075356226829, + "grad_norm": 0.14614787697792053, + "learning_rate": 2.996840619867526e-05, + "loss": 0.043, + "step": 20890 + }, + { + "epoch": 0.046092807537166464, + "grad_norm": 0.12369143962860107, + "learning_rate": 2.996829884773911e-05, + "loss": 0.0437, + "step": 20900 + }, + { + "epoch": 0.04611486151206463, + "grad_norm": 0.17227476835250854, + "learning_rate": 2.9968191314923994e-05, + "loss": 0.0412, + "step": 20910 + }, + { + "epoch": 0.04613691548696279, + "grad_norm": 0.1443036049604416, + "learning_rate": 2.996808360023122e-05, + "loss": 0.0439, + "step": 20920 + }, + { + "epoch": 0.04615896946186096, + "grad_norm": 0.12360411137342453, + "learning_rate": 2.9967975703662104e-05, + "loss": 0.043, + "step": 20930 + }, + { + "epoch": 0.046181023436759124, + "grad_norm": 0.12656155228614807, + "learning_rate": 2.9967867625217947e-05, + "loss": 0.0423, + "step": 20940 + }, + { + "epoch": 0.04620307741165729, + "grad_norm": 0.14145821332931519, + "learning_rate": 2.9967759364900068e-05, + "loss": 0.0439, + "step": 20950 + }, + { + "epoch": 0.04622513138655546, + "grad_norm": 0.1398652344942093, + "learning_rate": 2.9967650922709782e-05, + "loss": 0.0411, + "step": 20960 + }, + { + "epoch": 0.04624718536145362, + "grad_norm": 0.12732119858264923, + "learning_rate": 2.99675422986484e-05, + "loss": 0.0447, + "step": 20970 + }, + { + "epoch": 0.04626923933635179, + "grad_norm": 0.1451430767774582, + "learning_rate": 2.9967433492717253e-05, + "loss": 0.0436, + "step": 20980 + }, + { + "epoch": 0.046291293311249954, + "grad_norm": 0.15832935273647308, + "learning_rate": 2.9967324504917654e-05, + "loss": 0.0444, + "step": 20990 + }, + { + "epoch": 0.04631334728614812, + "grad_norm": 0.12228241562843323, + "learning_rate": 2.9967215335250932e-05, + "loss": 0.0415, + "step": 21000 + }, + { + "epoch": 0.04633540126104629, + "grad_norm": 0.137429341673851, + "learning_rate": 2.9967105983718414e-05, + "loss": 0.0435, + "step": 21010 + }, + { + "epoch": 0.04635745523594445, + "grad_norm": 0.19436702132225037, + "learning_rate": 2.9966996450321423e-05, + "loss": 0.0438, + "step": 21020 + }, + { + "epoch": 0.046379509210842614, + "grad_norm": 0.11534681171178818, + "learning_rate": 2.9966886735061295e-05, + "loss": 0.0429, + "step": 21030 + }, + { + "epoch": 0.046401563185740784, + "grad_norm": 0.14042814075946808, + "learning_rate": 2.9966776837939368e-05, + "loss": 0.0468, + "step": 21040 + }, + { + "epoch": 0.04642361716063895, + "grad_norm": 0.15301576256752014, + "learning_rate": 2.9966666758956963e-05, + "loss": 0.0424, + "step": 21050 + }, + { + "epoch": 0.04644567113553711, + "grad_norm": 0.1491350382566452, + "learning_rate": 2.996655649811543e-05, + "loss": 0.0436, + "step": 21060 + }, + { + "epoch": 0.04646772511043528, + "grad_norm": 0.19200018048286438, + "learning_rate": 2.9966446055416105e-05, + "loss": 0.0408, + "step": 21070 + }, + { + "epoch": 0.046489779085333444, + "grad_norm": 0.187462717294693, + "learning_rate": 2.9966335430860328e-05, + "loss": 0.0426, + "step": 21080 + }, + { + "epoch": 0.046511833060231614, + "grad_norm": 0.15237389504909515, + "learning_rate": 2.9966224624449447e-05, + "loss": 0.0442, + "step": 21090 + }, + { + "epoch": 0.04653388703512978, + "grad_norm": 0.18835294246673584, + "learning_rate": 2.9966113636184807e-05, + "loss": 0.042, + "step": 21100 + }, + { + "epoch": 0.04655594101002794, + "grad_norm": 0.17557862401008606, + "learning_rate": 2.9966002466067756e-05, + "loss": 0.0426, + "step": 21110 + }, + { + "epoch": 0.04657799498492611, + "grad_norm": 0.14137472212314606, + "learning_rate": 2.996589111409964e-05, + "loss": 0.0416, + "step": 21120 + }, + { + "epoch": 0.046600048959824274, + "grad_norm": 0.14732234179973602, + "learning_rate": 2.996577958028182e-05, + "loss": 0.0419, + "step": 21130 + }, + { + "epoch": 0.04662210293472244, + "grad_norm": 0.15051515400409698, + "learning_rate": 2.996566786461565e-05, + "loss": 0.0455, + "step": 21140 + }, + { + "epoch": 0.04664415690962061, + "grad_norm": 0.19998498260974884, + "learning_rate": 2.9965555967102483e-05, + "loss": 0.042, + "step": 21150 + }, + { + "epoch": 0.04666621088451877, + "grad_norm": 0.14046545326709747, + "learning_rate": 2.996544388774368e-05, + "loss": 0.0429, + "step": 21160 + }, + { + "epoch": 0.046688264859416934, + "grad_norm": 0.14038948714733124, + "learning_rate": 2.9965331626540605e-05, + "loss": 0.0441, + "step": 21170 + }, + { + "epoch": 0.046710318834315104, + "grad_norm": 0.13328216969966888, + "learning_rate": 2.9965219183494625e-05, + "loss": 0.0419, + "step": 21180 + }, + { + "epoch": 0.04673237280921327, + "grad_norm": 0.1497262865304947, + "learning_rate": 2.99651065586071e-05, + "loss": 0.0457, + "step": 21190 + }, + { + "epoch": 0.04675442678411144, + "grad_norm": 0.11040347814559937, + "learning_rate": 2.99649937518794e-05, + "loss": 0.0441, + "step": 21200 + }, + { + "epoch": 0.0467764807590096, + "grad_norm": 0.15628871321678162, + "learning_rate": 2.99648807633129e-05, + "loss": 0.044, + "step": 21210 + }, + { + "epoch": 0.046798534733907764, + "grad_norm": 0.1482667475938797, + "learning_rate": 2.9964767592908966e-05, + "loss": 0.0454, + "step": 21220 + }, + { + "epoch": 0.046820588708805934, + "grad_norm": 0.13580459356307983, + "learning_rate": 2.9964654240668974e-05, + "loss": 0.0401, + "step": 21230 + }, + { + "epoch": 0.0468426426837041, + "grad_norm": 0.151071235537529, + "learning_rate": 2.996454070659431e-05, + "loss": 0.0423, + "step": 21240 + }, + { + "epoch": 0.04686469665860226, + "grad_norm": 0.1413988471031189, + "learning_rate": 2.9964426990686342e-05, + "loss": 0.0434, + "step": 21250 + }, + { + "epoch": 0.04688675063350043, + "grad_norm": 0.13886862993240356, + "learning_rate": 2.996431309294646e-05, + "loss": 0.0412, + "step": 21260 + }, + { + "epoch": 0.046908804608398594, + "grad_norm": 0.17949049174785614, + "learning_rate": 2.9964199013376047e-05, + "loss": 0.044, + "step": 21270 + }, + { + "epoch": 0.04693085858329676, + "grad_norm": 0.14112485945224762, + "learning_rate": 2.9964084751976484e-05, + "loss": 0.0443, + "step": 21280 + }, + { + "epoch": 0.04695291255819493, + "grad_norm": 0.14255787432193756, + "learning_rate": 2.9963970308749165e-05, + "loss": 0.0426, + "step": 21290 + }, + { + "epoch": 0.04697496653309309, + "grad_norm": 0.13443201780319214, + "learning_rate": 2.9963855683695477e-05, + "loss": 0.0436, + "step": 21300 + }, + { + "epoch": 0.04699702050799126, + "grad_norm": 0.16046026349067688, + "learning_rate": 2.9963740876816816e-05, + "loss": 0.0423, + "step": 21310 + }, + { + "epoch": 0.047019074482889424, + "grad_norm": 0.13246962428092957, + "learning_rate": 2.9963625888114574e-05, + "loss": 0.043, + "step": 21320 + }, + { + "epoch": 0.04704112845778759, + "grad_norm": 0.16072885692119598, + "learning_rate": 2.996351071759015e-05, + "loss": 0.0407, + "step": 21330 + }, + { + "epoch": 0.04706318243268576, + "grad_norm": 0.10935262590646744, + "learning_rate": 2.9963395365244945e-05, + "loss": 0.0427, + "step": 21340 + }, + { + "epoch": 0.04708523640758392, + "grad_norm": 0.13252322375774384, + "learning_rate": 2.9963279831080356e-05, + "loss": 0.0448, + "step": 21350 + }, + { + "epoch": 0.047107290382482084, + "grad_norm": 0.13276393711566925, + "learning_rate": 2.996316411509779e-05, + "loss": 0.0429, + "step": 21360 + }, + { + "epoch": 0.047129344357380254, + "grad_norm": 0.1456340104341507, + "learning_rate": 2.996304821729865e-05, + "loss": 0.0415, + "step": 21370 + }, + { + "epoch": 0.04715139833227842, + "grad_norm": 0.16287919878959656, + "learning_rate": 2.996293213768435e-05, + "loss": 0.0407, + "step": 21380 + }, + { + "epoch": 0.04717345230717659, + "grad_norm": 0.13740728795528412, + "learning_rate": 2.9962815876256297e-05, + "loss": 0.0417, + "step": 21390 + }, + { + "epoch": 0.04719550628207475, + "grad_norm": 0.13025020062923431, + "learning_rate": 2.99626994330159e-05, + "loss": 0.0412, + "step": 21400 + }, + { + "epoch": 0.047217560256972914, + "grad_norm": 0.17397242784500122, + "learning_rate": 2.9962582807964583e-05, + "loss": 0.0406, + "step": 21410 + }, + { + "epoch": 0.047239614231871084, + "grad_norm": 0.14669479429721832, + "learning_rate": 2.9962466001103755e-05, + "loss": 0.0424, + "step": 21420 + }, + { + "epoch": 0.04726166820676925, + "grad_norm": 0.1622041016817093, + "learning_rate": 2.9962349012434837e-05, + "loss": 0.0425, + "step": 21430 + }, + { + "epoch": 0.04728372218166741, + "grad_norm": 0.12409317493438721, + "learning_rate": 2.9962231841959254e-05, + "loss": 0.044, + "step": 21440 + }, + { + "epoch": 0.04730577615656558, + "grad_norm": 0.1454387605190277, + "learning_rate": 2.9962114489678428e-05, + "loss": 0.0438, + "step": 21450 + }, + { + "epoch": 0.047327830131463744, + "grad_norm": 0.14390145242214203, + "learning_rate": 2.9961996955593783e-05, + "loss": 0.0426, + "step": 21460 + }, + { + "epoch": 0.04734988410636191, + "grad_norm": 0.15632064640522003, + "learning_rate": 2.996187923970675e-05, + "loss": 0.0402, + "step": 21470 + }, + { + "epoch": 0.04737193808126008, + "grad_norm": 0.13554294407367706, + "learning_rate": 2.9961761342018753e-05, + "loss": 0.041, + "step": 21480 + }, + { + "epoch": 0.04739399205615824, + "grad_norm": 0.1485297828912735, + "learning_rate": 2.9961643262531237e-05, + "loss": 0.0431, + "step": 21490 + }, + { + "epoch": 0.04741604603105641, + "grad_norm": 0.11790084838867188, + "learning_rate": 2.9961525001245622e-05, + "loss": 0.0409, + "step": 21500 + }, + { + "epoch": 0.047438100005954574, + "grad_norm": 0.117509625852108, + "learning_rate": 2.996140655816336e-05, + "loss": 0.0433, + "step": 21510 + }, + { + "epoch": 0.04746015398085274, + "grad_norm": 0.174348846077919, + "learning_rate": 2.9961287933285877e-05, + "loss": 0.0447, + "step": 21520 + }, + { + "epoch": 0.04748220795575091, + "grad_norm": 0.1848611682653427, + "learning_rate": 2.996116912661462e-05, + "loss": 0.042, + "step": 21530 + }, + { + "epoch": 0.04750426193064907, + "grad_norm": 0.16451938450336456, + "learning_rate": 2.9961050138151033e-05, + "loss": 0.043, + "step": 21540 + }, + { + "epoch": 0.047526315905547234, + "grad_norm": 0.13390159606933594, + "learning_rate": 2.996093096789656e-05, + "loss": 0.0448, + "step": 21550 + }, + { + "epoch": 0.047548369880445404, + "grad_norm": 0.17484168708324432, + "learning_rate": 2.996081161585265e-05, + "loss": 0.0416, + "step": 21560 + }, + { + "epoch": 0.04757042385534357, + "grad_norm": 0.12214018404483795, + "learning_rate": 2.996069208202076e-05, + "loss": 0.0424, + "step": 21570 + }, + { + "epoch": 0.04759247783024173, + "grad_norm": 0.1496545970439911, + "learning_rate": 2.996057236640233e-05, + "loss": 0.0439, + "step": 21580 + }, + { + "epoch": 0.0476145318051399, + "grad_norm": 0.17114603519439697, + "learning_rate": 2.996045246899882e-05, + "loss": 0.045, + "step": 21590 + }, + { + "epoch": 0.047636585780038064, + "grad_norm": 0.17237791419029236, + "learning_rate": 2.996033238981169e-05, + "loss": 0.043, + "step": 21600 + }, + { + "epoch": 0.047658639754936234, + "grad_norm": 0.13604344427585602, + "learning_rate": 2.9960212128842396e-05, + "loss": 0.0449, + "step": 21610 + }, + { + "epoch": 0.0476806937298344, + "grad_norm": 0.1759425550699234, + "learning_rate": 2.99600916860924e-05, + "loss": 0.0445, + "step": 21620 + }, + { + "epoch": 0.04770274770473256, + "grad_norm": 0.16672439873218536, + "learning_rate": 2.9959971061563166e-05, + "loss": 0.0421, + "step": 21630 + }, + { + "epoch": 0.04772480167963073, + "grad_norm": 0.15957282483577728, + "learning_rate": 2.9959850255256158e-05, + "loss": 0.0432, + "step": 21640 + }, + { + "epoch": 0.047746855654528894, + "grad_norm": 0.1440388560295105, + "learning_rate": 2.9959729267172843e-05, + "loss": 0.0413, + "step": 21650 + }, + { + "epoch": 0.04776890962942706, + "grad_norm": 0.15907977521419525, + "learning_rate": 2.9959608097314696e-05, + "loss": 0.0423, + "step": 21660 + }, + { + "epoch": 0.04779096360432523, + "grad_norm": 0.12124737352132797, + "learning_rate": 2.9959486745683183e-05, + "loss": 0.0428, + "step": 21670 + }, + { + "epoch": 0.04781301757922339, + "grad_norm": 0.13486742973327637, + "learning_rate": 2.9959365212279786e-05, + "loss": 0.0418, + "step": 21680 + }, + { + "epoch": 0.047835071554121554, + "grad_norm": 0.17528873682022095, + "learning_rate": 2.9959243497105978e-05, + "loss": 0.0424, + "step": 21690 + }, + { + "epoch": 0.047857125529019724, + "grad_norm": 0.16112983226776123, + "learning_rate": 2.9959121600163236e-05, + "loss": 0.0418, + "step": 21700 + }, + { + "epoch": 0.04787917950391789, + "grad_norm": 0.14254023134708405, + "learning_rate": 2.995899952145304e-05, + "loss": 0.0406, + "step": 21710 + }, + { + "epoch": 0.04790123347881606, + "grad_norm": 0.11074694246053696, + "learning_rate": 2.995887726097688e-05, + "loss": 0.0422, + "step": 21720 + }, + { + "epoch": 0.04792328745371422, + "grad_norm": 0.11590298265218735, + "learning_rate": 2.9958754818736236e-05, + "loss": 0.0411, + "step": 21730 + }, + { + "epoch": 0.047945341428612384, + "grad_norm": 0.1546044796705246, + "learning_rate": 2.99586321947326e-05, + "loss": 0.0436, + "step": 21740 + }, + { + "epoch": 0.047967395403510554, + "grad_norm": 0.1523285061120987, + "learning_rate": 2.995850938896746e-05, + "loss": 0.0404, + "step": 21750 + }, + { + "epoch": 0.04798944937840872, + "grad_norm": 0.13441191613674164, + "learning_rate": 2.9958386401442303e-05, + "loss": 0.0433, + "step": 21760 + }, + { + "epoch": 0.04801150335330688, + "grad_norm": 0.13555744290351868, + "learning_rate": 2.995826323215863e-05, + "loss": 0.0439, + "step": 21770 + }, + { + "epoch": 0.04803355732820505, + "grad_norm": 0.1467466652393341, + "learning_rate": 2.9958139881117933e-05, + "loss": 0.0437, + "step": 21780 + }, + { + "epoch": 0.048055611303103214, + "grad_norm": 0.14258699119091034, + "learning_rate": 2.995801634832172e-05, + "loss": 0.0409, + "step": 21790 + }, + { + "epoch": 0.04807766527800138, + "grad_norm": 0.1472284495830536, + "learning_rate": 2.9957892633771486e-05, + "loss": 0.0435, + "step": 21800 + }, + { + "epoch": 0.04809971925289955, + "grad_norm": 0.15894843637943268, + "learning_rate": 2.9957768737468726e-05, + "loss": 0.0457, + "step": 21810 + }, + { + "epoch": 0.04812177322779771, + "grad_norm": 0.16334398090839386, + "learning_rate": 2.995764465941496e-05, + "loss": 0.0427, + "step": 21820 + }, + { + "epoch": 0.04814382720269588, + "grad_norm": 0.1228649690747261, + "learning_rate": 2.995752039961169e-05, + "loss": 0.0428, + "step": 21830 + }, + { + "epoch": 0.048165881177594044, + "grad_norm": 0.1501438170671463, + "learning_rate": 2.995739595806042e-05, + "loss": 0.0451, + "step": 21840 + }, + { + "epoch": 0.04818793515249221, + "grad_norm": 0.14116428792476654, + "learning_rate": 2.9957271334762667e-05, + "loss": 0.0418, + "step": 21850 + }, + { + "epoch": 0.04820998912739038, + "grad_norm": 0.1606983095407486, + "learning_rate": 2.9957146529719945e-05, + "loss": 0.0434, + "step": 21860 + }, + { + "epoch": 0.04823204310228854, + "grad_norm": 0.17196157574653625, + "learning_rate": 2.9957021542933776e-05, + "loss": 0.0429, + "step": 21870 + }, + { + "epoch": 0.048254097077186704, + "grad_norm": 0.1592680662870407, + "learning_rate": 2.995689637440567e-05, + "loss": 0.0423, + "step": 21880 + }, + { + "epoch": 0.048276151052084874, + "grad_norm": 0.1084945797920227, + "learning_rate": 2.995677102413715e-05, + "loss": 0.0415, + "step": 21890 + }, + { + "epoch": 0.04829820502698304, + "grad_norm": 0.14037565886974335, + "learning_rate": 2.9956645492129744e-05, + "loss": 0.0425, + "step": 21900 + }, + { + "epoch": 0.0483202590018812, + "grad_norm": 0.10451938956975937, + "learning_rate": 2.995651977838497e-05, + "loss": 0.044, + "step": 21910 + }, + { + "epoch": 0.04834231297677937, + "grad_norm": 0.12845030426979065, + "learning_rate": 2.9956393882904367e-05, + "loss": 0.0422, + "step": 21920 + }, + { + "epoch": 0.048364366951677534, + "grad_norm": 0.17688781023025513, + "learning_rate": 2.995626780568945e-05, + "loss": 0.0436, + "step": 21930 + }, + { + "epoch": 0.048386420926575704, + "grad_norm": 0.14218977093696594, + "learning_rate": 2.995614154674176e-05, + "loss": 0.0428, + "step": 21940 + }, + { + "epoch": 0.04840847490147387, + "grad_norm": 0.14164653420448303, + "learning_rate": 2.9956015106062826e-05, + "loss": 0.0418, + "step": 21950 + }, + { + "epoch": 0.04843052887637203, + "grad_norm": 0.17650504410266876, + "learning_rate": 2.9955888483654193e-05, + "loss": 0.0432, + "step": 21960 + }, + { + "epoch": 0.0484525828512702, + "grad_norm": 0.12861120700836182, + "learning_rate": 2.995576167951739e-05, + "loss": 0.0429, + "step": 21970 + }, + { + "epoch": 0.048474636826168364, + "grad_norm": 0.13220003247261047, + "learning_rate": 2.9955634693653967e-05, + "loss": 0.0418, + "step": 21980 + }, + { + "epoch": 0.04849669080106653, + "grad_norm": 0.1597372442483902, + "learning_rate": 2.9955507526065455e-05, + "loss": 0.0433, + "step": 21990 + }, + { + "epoch": 0.0485187447759647, + "grad_norm": 0.1397099792957306, + "learning_rate": 2.995538017675341e-05, + "loss": 0.0438, + "step": 22000 + }, + { + "epoch": 0.04854079875086286, + "grad_norm": 0.13083161413669586, + "learning_rate": 2.995525264571938e-05, + "loss": 0.0393, + "step": 22010 + }, + { + "epoch": 0.048562852725761024, + "grad_norm": 0.11210645735263824, + "learning_rate": 2.9955124932964907e-05, + "loss": 0.0414, + "step": 22020 + }, + { + "epoch": 0.048584906700659194, + "grad_norm": 0.10957285016775131, + "learning_rate": 2.995499703849154e-05, + "loss": 0.0441, + "step": 22030 + }, + { + "epoch": 0.04860696067555736, + "grad_norm": 0.13177838921546936, + "learning_rate": 2.995486896230085e-05, + "loss": 0.0439, + "step": 22040 + }, + { + "epoch": 0.04862901465045553, + "grad_norm": 0.1240258663892746, + "learning_rate": 2.9954740704394373e-05, + "loss": 0.0417, + "step": 22050 + }, + { + "epoch": 0.04865106862535369, + "grad_norm": 0.12008364498615265, + "learning_rate": 2.995461226477368e-05, + "loss": 0.0446, + "step": 22060 + }, + { + "epoch": 0.048673122600251854, + "grad_norm": 0.12909749150276184, + "learning_rate": 2.9954483643440328e-05, + "loss": 0.0421, + "step": 22070 + }, + { + "epoch": 0.048695176575150025, + "grad_norm": 0.13692782819271088, + "learning_rate": 2.995435484039588e-05, + "loss": 0.0446, + "step": 22080 + }, + { + "epoch": 0.04871723055004819, + "grad_norm": 0.1799049973487854, + "learning_rate": 2.9954225855641906e-05, + "loss": 0.0414, + "step": 22090 + }, + { + "epoch": 0.04873928452494635, + "grad_norm": 0.15068428218364716, + "learning_rate": 2.9954096689179966e-05, + "loss": 0.0411, + "step": 22100 + }, + { + "epoch": 0.04876133849984452, + "grad_norm": 0.14732873439788818, + "learning_rate": 2.995396734101163e-05, + "loss": 0.0438, + "step": 22110 + }, + { + "epoch": 0.048783392474742684, + "grad_norm": 0.174880251288414, + "learning_rate": 2.9953837811138475e-05, + "loss": 0.0421, + "step": 22120 + }, + { + "epoch": 0.04880544644964085, + "grad_norm": 0.1580851674079895, + "learning_rate": 2.995370809956207e-05, + "loss": 0.0416, + "step": 22130 + }, + { + "epoch": 0.04882750042453902, + "grad_norm": 0.12817804515361786, + "learning_rate": 2.9953578206283992e-05, + "loss": 0.0428, + "step": 22140 + }, + { + "epoch": 0.04884955439943718, + "grad_norm": 0.11546886712312698, + "learning_rate": 2.9953448131305823e-05, + "loss": 0.04, + "step": 22150 + }, + { + "epoch": 0.04887160837433535, + "grad_norm": 0.14491645991802216, + "learning_rate": 2.9953317874629138e-05, + "loss": 0.0417, + "step": 22160 + }, + { + "epoch": 0.048893662349233515, + "grad_norm": 0.12666036188602448, + "learning_rate": 2.9953187436255525e-05, + "loss": 0.0413, + "step": 22170 + }, + { + "epoch": 0.04891571632413168, + "grad_norm": 0.14805074036121368, + "learning_rate": 2.9953056816186568e-05, + "loss": 0.0431, + "step": 22180 + }, + { + "epoch": 0.04893777029902985, + "grad_norm": 0.09210848808288574, + "learning_rate": 2.9952926014423847e-05, + "loss": 0.042, + "step": 22190 + }, + { + "epoch": 0.04895982427392801, + "grad_norm": 0.17460767924785614, + "learning_rate": 2.995279503096896e-05, + "loss": 0.0429, + "step": 22200 + }, + { + "epoch": 0.048981878248826174, + "grad_norm": 0.1602095067501068, + "learning_rate": 2.99526638658235e-05, + "loss": 0.0429, + "step": 22210 + }, + { + "epoch": 0.049003932223724345, + "grad_norm": 0.11819919943809509, + "learning_rate": 2.995253251898905e-05, + "loss": 0.0429, + "step": 22220 + }, + { + "epoch": 0.04902598619862251, + "grad_norm": 0.16083577275276184, + "learning_rate": 2.9952400990467214e-05, + "loss": 0.0437, + "step": 22230 + }, + { + "epoch": 0.04904804017352067, + "grad_norm": 0.18914183974266052, + "learning_rate": 2.995226928025959e-05, + "loss": 0.0435, + "step": 22240 + }, + { + "epoch": 0.04907009414841884, + "grad_norm": 0.17560313642024994, + "learning_rate": 2.9952137388367776e-05, + "loss": 0.0456, + "step": 22250 + }, + { + "epoch": 0.049092148123317005, + "grad_norm": 0.1562008559703827, + "learning_rate": 2.9952005314793375e-05, + "loss": 0.0425, + "step": 22260 + }, + { + "epoch": 0.049114202098215175, + "grad_norm": 0.144831582903862, + "learning_rate": 2.995187305953799e-05, + "loss": 0.0441, + "step": 22270 + }, + { + "epoch": 0.04913625607311334, + "grad_norm": 0.14084863662719727, + "learning_rate": 2.995174062260324e-05, + "loss": 0.044, + "step": 22280 + }, + { + "epoch": 0.0491583100480115, + "grad_norm": 0.1354473978281021, + "learning_rate": 2.9951608003990713e-05, + "loss": 0.044, + "step": 22290 + }, + { + "epoch": 0.04918036402290967, + "grad_norm": 0.12381875514984131, + "learning_rate": 2.995147520370204e-05, + "loss": 0.0417, + "step": 22300 + }, + { + "epoch": 0.049202417997807835, + "grad_norm": 0.10689845681190491, + "learning_rate": 2.9951342221738823e-05, + "loss": 0.0415, + "step": 22310 + }, + { + "epoch": 0.049224471972706, + "grad_norm": 0.13093264400959015, + "learning_rate": 2.9951209058102683e-05, + "loss": 0.0405, + "step": 22320 + }, + { + "epoch": 0.04924652594760417, + "grad_norm": 0.12151782959699631, + "learning_rate": 2.9951075712795237e-05, + "loss": 0.0425, + "step": 22330 + }, + { + "epoch": 0.04926857992250233, + "grad_norm": 0.18296140432357788, + "learning_rate": 2.9950942185818103e-05, + "loss": 0.0442, + "step": 22340 + }, + { + "epoch": 0.049290633897400495, + "grad_norm": 0.11715980619192123, + "learning_rate": 2.9950808477172907e-05, + "loss": 0.0457, + "step": 22350 + }, + { + "epoch": 0.049312687872298665, + "grad_norm": 0.16063465178012848, + "learning_rate": 2.9950674586861272e-05, + "loss": 0.0439, + "step": 22360 + }, + { + "epoch": 0.04933474184719683, + "grad_norm": 0.14025969803333282, + "learning_rate": 2.9950540514884823e-05, + "loss": 0.0414, + "step": 22370 + }, + { + "epoch": 0.049356795822095, + "grad_norm": 0.1591300219297409, + "learning_rate": 2.995040626124519e-05, + "loss": 0.0403, + "step": 22380 + }, + { + "epoch": 0.04937884979699316, + "grad_norm": 0.1282469481229782, + "learning_rate": 2.9950271825944013e-05, + "loss": 0.042, + "step": 22390 + }, + { + "epoch": 0.049400903771891325, + "grad_norm": 0.09732916206121445, + "learning_rate": 2.995013720898291e-05, + "loss": 0.0408, + "step": 22400 + }, + { + "epoch": 0.049422957746789495, + "grad_norm": 0.11959724873304367, + "learning_rate": 2.9950002410363528e-05, + "loss": 0.0421, + "step": 22410 + }, + { + "epoch": 0.04944501172168766, + "grad_norm": 0.1503666788339615, + "learning_rate": 2.9949867430087507e-05, + "loss": 0.041, + "step": 22420 + }, + { + "epoch": 0.04946706569658582, + "grad_norm": 0.18706700205802917, + "learning_rate": 2.9949732268156472e-05, + "loss": 0.0422, + "step": 22430 + }, + { + "epoch": 0.04948911967148399, + "grad_norm": 0.14063787460327148, + "learning_rate": 2.994959692457208e-05, + "loss": 0.0411, + "step": 22440 + }, + { + "epoch": 0.049511173646382155, + "grad_norm": 0.13804534077644348, + "learning_rate": 2.9949461399335972e-05, + "loss": 0.0412, + "step": 22450 + }, + { + "epoch": 0.049533227621280325, + "grad_norm": 0.11892229318618774, + "learning_rate": 2.994932569244979e-05, + "loss": 0.0429, + "step": 22460 + }, + { + "epoch": 0.04955528159617849, + "grad_norm": 0.16957467794418335, + "learning_rate": 2.9949189803915183e-05, + "loss": 0.0451, + "step": 22470 + }, + { + "epoch": 0.04957733557107665, + "grad_norm": 0.12654295563697815, + "learning_rate": 2.9949053733733812e-05, + "loss": 0.0423, + "step": 22480 + }, + { + "epoch": 0.04959938954597482, + "grad_norm": 0.17955835163593292, + "learning_rate": 2.994891748190732e-05, + "loss": 0.0411, + "step": 22490 + }, + { + "epoch": 0.049621443520872985, + "grad_norm": 0.16373340785503387, + "learning_rate": 2.9948781048437374e-05, + "loss": 0.0439, + "step": 22500 + }, + { + "epoch": 0.04964349749577115, + "grad_norm": 0.159379243850708, + "learning_rate": 2.9948644433325617e-05, + "loss": 0.0438, + "step": 22510 + }, + { + "epoch": 0.04966555147066932, + "grad_norm": 0.1350419819355011, + "learning_rate": 2.9948507636573716e-05, + "loss": 0.043, + "step": 22520 + }, + { + "epoch": 0.04968760544556748, + "grad_norm": 0.14838343858718872, + "learning_rate": 2.9948370658183334e-05, + "loss": 0.0425, + "step": 22530 + }, + { + "epoch": 0.049709659420465645, + "grad_norm": 0.1620209813117981, + "learning_rate": 2.9948233498156135e-05, + "loss": 0.0444, + "step": 22540 + }, + { + "epoch": 0.049731713395363815, + "grad_norm": 0.12243489921092987, + "learning_rate": 2.9948096156493787e-05, + "loss": 0.0419, + "step": 22550 + }, + { + "epoch": 0.04975376737026198, + "grad_norm": 0.15873511135578156, + "learning_rate": 2.994795863319796e-05, + "loss": 0.042, + "step": 22560 + }, + { + "epoch": 0.04977582134516015, + "grad_norm": 0.1673242151737213, + "learning_rate": 2.9947820928270318e-05, + "loss": 0.041, + "step": 22570 + }, + { + "epoch": 0.04979787532005831, + "grad_norm": 0.09793081879615784, + "learning_rate": 2.994768304171254e-05, + "loss": 0.0415, + "step": 22580 + }, + { + "epoch": 0.049819929294956475, + "grad_norm": 0.11757230758666992, + "learning_rate": 2.99475449735263e-05, + "loss": 0.0397, + "step": 22590 + }, + { + "epoch": 0.049841983269854645, + "grad_norm": 0.11009015887975693, + "learning_rate": 2.9947406723713275e-05, + "loss": 0.0428, + "step": 22600 + }, + { + "epoch": 0.04986403724475281, + "grad_norm": 0.14466041326522827, + "learning_rate": 2.994726829227515e-05, + "loss": 0.0424, + "step": 22610 + }, + { + "epoch": 0.04988609121965097, + "grad_norm": 0.11674920469522476, + "learning_rate": 2.99471296792136e-05, + "loss": 0.041, + "step": 22620 + }, + { + "epoch": 0.04990814519454914, + "grad_norm": 0.14951030910015106, + "learning_rate": 2.994699088453031e-05, + "loss": 0.0451, + "step": 22630 + }, + { + "epoch": 0.049930199169447305, + "grad_norm": 0.17494474351406097, + "learning_rate": 2.994685190822697e-05, + "loss": 0.0441, + "step": 22640 + }, + { + "epoch": 0.04995225314434547, + "grad_norm": 0.14391674101352692, + "learning_rate": 2.9946712750305263e-05, + "loss": 0.0444, + "step": 22650 + }, + { + "epoch": 0.04997430711924364, + "grad_norm": 0.1367325335741043, + "learning_rate": 2.9946573410766888e-05, + "loss": 0.0429, + "step": 22660 + }, + { + "epoch": 0.0499963610941418, + "grad_norm": 0.14415951073169708, + "learning_rate": 2.994643388961354e-05, + "loss": 0.0415, + "step": 22670 + }, + { + "epoch": 0.05001841506903997, + "grad_norm": 0.17022652924060822, + "learning_rate": 2.9946294186846898e-05, + "loss": 0.0408, + "step": 22680 + }, + { + "epoch": 0.050040469043938135, + "grad_norm": 0.11354662477970123, + "learning_rate": 2.9946154302468674e-05, + "loss": 0.0428, + "step": 22690 + }, + { + "epoch": 0.0500625230188363, + "grad_norm": 0.1538243293762207, + "learning_rate": 2.9946014236480566e-05, + "loss": 0.0422, + "step": 22700 + }, + { + "epoch": 0.05008457699373447, + "grad_norm": 0.14507417380809784, + "learning_rate": 2.994587398888427e-05, + "loss": 0.0426, + "step": 22710 + }, + { + "epoch": 0.05010663096863263, + "grad_norm": 0.10444077104330063, + "learning_rate": 2.99457335596815e-05, + "loss": 0.0409, + "step": 22720 + }, + { + "epoch": 0.050128684943530795, + "grad_norm": 0.11746329814195633, + "learning_rate": 2.9945592948873946e-05, + "loss": 0.0445, + "step": 22730 + }, + { + "epoch": 0.050150738918428965, + "grad_norm": 0.11998794972896576, + "learning_rate": 2.994545215646334e-05, + "loss": 0.0422, + "step": 22740 + }, + { + "epoch": 0.05017279289332713, + "grad_norm": 0.15573115646839142, + "learning_rate": 2.9945311182451367e-05, + "loss": 0.0417, + "step": 22750 + }, + { + "epoch": 0.05019484686822529, + "grad_norm": 0.13476359844207764, + "learning_rate": 2.9945170026839757e-05, + "loss": 0.0439, + "step": 22760 + }, + { + "epoch": 0.05021690084312346, + "grad_norm": 0.1260036826133728, + "learning_rate": 2.9945028689630223e-05, + "loss": 0.0423, + "step": 22770 + }, + { + "epoch": 0.050238954818021625, + "grad_norm": 0.14663146436214447, + "learning_rate": 2.9944887170824478e-05, + "loss": 0.0405, + "step": 22780 + }, + { + "epoch": 0.050261008792919795, + "grad_norm": 0.1277671903371811, + "learning_rate": 2.9944745470424245e-05, + "loss": 0.041, + "step": 22790 + }, + { + "epoch": 0.05028306276781796, + "grad_norm": 0.1565127968788147, + "learning_rate": 2.994460358843124e-05, + "loss": 0.0439, + "step": 22800 + }, + { + "epoch": 0.05030511674271612, + "grad_norm": 0.12089205533266068, + "learning_rate": 2.9944461524847192e-05, + "loss": 0.0405, + "step": 22810 + }, + { + "epoch": 0.05032717071761429, + "grad_norm": 0.16387970745563507, + "learning_rate": 2.9944319279673832e-05, + "loss": 0.0454, + "step": 22820 + }, + { + "epoch": 0.050349224692512455, + "grad_norm": 0.17081953585147858, + "learning_rate": 2.994417685291288e-05, + "loss": 0.0413, + "step": 22830 + }, + { + "epoch": 0.05037127866741062, + "grad_norm": 0.11592712253332138, + "learning_rate": 2.994403424456607e-05, + "loss": 0.043, + "step": 22840 + }, + { + "epoch": 0.05039333264230879, + "grad_norm": 0.12341602146625519, + "learning_rate": 2.994389145463513e-05, + "loss": 0.0415, + "step": 22850 + }, + { + "epoch": 0.05041538661720695, + "grad_norm": 0.11924461275339127, + "learning_rate": 2.9943748483121807e-05, + "loss": 0.0411, + "step": 22860 + }, + { + "epoch": 0.050437440592105115, + "grad_norm": 0.15106190741062164, + "learning_rate": 2.9943605330027824e-05, + "loss": 0.0411, + "step": 22870 + }, + { + "epoch": 0.050459494567003285, + "grad_norm": 0.12309566885232925, + "learning_rate": 2.9943461995354934e-05, + "loss": 0.0433, + "step": 22880 + }, + { + "epoch": 0.05048154854190145, + "grad_norm": 0.15705856680870056, + "learning_rate": 2.9943318479104865e-05, + "loss": 0.0433, + "step": 22890 + }, + { + "epoch": 0.05050360251679962, + "grad_norm": 0.13241936266422272, + "learning_rate": 2.994317478127937e-05, + "loss": 0.0413, + "step": 22900 + }, + { + "epoch": 0.05052565649169778, + "grad_norm": 0.13198506832122803, + "learning_rate": 2.9943030901880194e-05, + "loss": 0.0417, + "step": 22910 + }, + { + "epoch": 0.050547710466595945, + "grad_norm": 0.1561354249715805, + "learning_rate": 2.994288684090908e-05, + "loss": 0.0428, + "step": 22920 + }, + { + "epoch": 0.050569764441494115, + "grad_norm": 0.1337384134531021, + "learning_rate": 2.994274259836779e-05, + "loss": 0.0405, + "step": 22930 + }, + { + "epoch": 0.05059181841639228, + "grad_norm": 0.12211067229509354, + "learning_rate": 2.9942598174258064e-05, + "loss": 0.0431, + "step": 22940 + }, + { + "epoch": 0.05061387239129044, + "grad_norm": 0.14032891392707825, + "learning_rate": 2.9942453568581662e-05, + "loss": 0.0418, + "step": 22950 + }, + { + "epoch": 0.05063592636618861, + "grad_norm": 0.1150805726647377, + "learning_rate": 2.9942308781340343e-05, + "loss": 0.043, + "step": 22960 + }, + { + "epoch": 0.050657980341086775, + "grad_norm": 0.1340208202600479, + "learning_rate": 2.994216381253586e-05, + "loss": 0.0419, + "step": 22970 + }, + { + "epoch": 0.05068003431598494, + "grad_norm": 0.12458059191703796, + "learning_rate": 2.9942018662169983e-05, + "loss": 0.0442, + "step": 22980 + }, + { + "epoch": 0.05070208829088311, + "grad_norm": 0.14945991337299347, + "learning_rate": 2.9941873330244473e-05, + "loss": 0.0437, + "step": 22990 + }, + { + "epoch": 0.05072414226578127, + "grad_norm": 0.1508241891860962, + "learning_rate": 2.9941727816761093e-05, + "loss": 0.0431, + "step": 23000 + }, + { + "epoch": 0.05074619624067944, + "grad_norm": 0.1401892751455307, + "learning_rate": 2.9941582121721615e-05, + "loss": 0.0434, + "step": 23010 + }, + { + "epoch": 0.050768250215577605, + "grad_norm": 0.1890978068113327, + "learning_rate": 2.9941436245127804e-05, + "loss": 0.0427, + "step": 23020 + }, + { + "epoch": 0.05079030419047577, + "grad_norm": 0.14443816244602203, + "learning_rate": 2.9941290186981432e-05, + "loss": 0.0447, + "step": 23030 + }, + { + "epoch": 0.05081235816537394, + "grad_norm": 0.18702252209186554, + "learning_rate": 2.994114394728428e-05, + "loss": 0.0418, + "step": 23040 + }, + { + "epoch": 0.0508344121402721, + "grad_norm": 0.15623219311237335, + "learning_rate": 2.9940997526038125e-05, + "loss": 0.0419, + "step": 23050 + }, + { + "epoch": 0.050856466115170265, + "grad_norm": 0.13844870030879974, + "learning_rate": 2.994085092324474e-05, + "loss": 0.0433, + "step": 23060 + }, + { + "epoch": 0.050878520090068435, + "grad_norm": 0.12908348441123962, + "learning_rate": 2.9940704138905913e-05, + "loss": 0.0407, + "step": 23070 + }, + { + "epoch": 0.0509005740649666, + "grad_norm": 0.10402347892522812, + "learning_rate": 2.9940557173023422e-05, + "loss": 0.0422, + "step": 23080 + }, + { + "epoch": 0.05092262803986476, + "grad_norm": 0.13119827210903168, + "learning_rate": 2.9940410025599056e-05, + "loss": 0.0415, + "step": 23090 + }, + { + "epoch": 0.05094468201476293, + "grad_norm": 0.14919696748256683, + "learning_rate": 2.9940262696634605e-05, + "loss": 0.0401, + "step": 23100 + }, + { + "epoch": 0.050966735989661095, + "grad_norm": 0.1375122219324112, + "learning_rate": 2.9940115186131854e-05, + "loss": 0.0408, + "step": 23110 + }, + { + "epoch": 0.050988789964559265, + "grad_norm": 0.20320138335227966, + "learning_rate": 2.9939967494092595e-05, + "loss": 0.0436, + "step": 23120 + }, + { + "epoch": 0.05101084393945743, + "grad_norm": 0.14121226966381073, + "learning_rate": 2.993981962051863e-05, + "loss": 0.0403, + "step": 23130 + }, + { + "epoch": 0.05103289791435559, + "grad_norm": 0.15528464317321777, + "learning_rate": 2.993967156541175e-05, + "loss": 0.0419, + "step": 23140 + }, + { + "epoch": 0.05105495188925376, + "grad_norm": 0.13403652608394623, + "learning_rate": 2.993952332877375e-05, + "loss": 0.0418, + "step": 23150 + }, + { + "epoch": 0.051077005864151925, + "grad_norm": 0.12228585034608841, + "learning_rate": 2.993937491060644e-05, + "loss": 0.0421, + "step": 23160 + }, + { + "epoch": 0.05109905983905009, + "grad_norm": 0.14834384620189667, + "learning_rate": 2.993922631091162e-05, + "loss": 0.042, + "step": 23170 + }, + { + "epoch": 0.05112111381394826, + "grad_norm": 0.1276550143957138, + "learning_rate": 2.9939077529691095e-05, + "loss": 0.0405, + "step": 23180 + }, + { + "epoch": 0.05114316778884642, + "grad_norm": 0.13796289265155792, + "learning_rate": 2.9938928566946676e-05, + "loss": 0.0425, + "step": 23190 + }, + { + "epoch": 0.051165221763744585, + "grad_norm": 0.11887761950492859, + "learning_rate": 2.9938779422680165e-05, + "loss": 0.0409, + "step": 23200 + }, + { + "epoch": 0.051187275738642755, + "grad_norm": 0.14167042076587677, + "learning_rate": 2.9938630096893383e-05, + "loss": 0.0404, + "step": 23210 + }, + { + "epoch": 0.05120932971354092, + "grad_norm": 0.13020817935466766, + "learning_rate": 2.993848058958814e-05, + "loss": 0.0438, + "step": 23220 + }, + { + "epoch": 0.05123138368843909, + "grad_norm": 0.106519915163517, + "learning_rate": 2.9938330900766247e-05, + "loss": 0.0394, + "step": 23230 + }, + { + "epoch": 0.05125343766333725, + "grad_norm": 0.14789395034313202, + "learning_rate": 2.9938181030429542e-05, + "loss": 0.0453, + "step": 23240 + }, + { + "epoch": 0.051275491638235415, + "grad_norm": 0.14831489324569702, + "learning_rate": 2.9938030978579826e-05, + "loss": 0.0398, + "step": 23250 + }, + { + "epoch": 0.051297545613133586, + "grad_norm": 0.12475091964006424, + "learning_rate": 2.9937880745218932e-05, + "loss": 0.0412, + "step": 23260 + }, + { + "epoch": 0.05131959958803175, + "grad_norm": 0.16461269557476044, + "learning_rate": 2.993773033034868e-05, + "loss": 0.0437, + "step": 23270 + }, + { + "epoch": 0.05134165356292991, + "grad_norm": 0.151713028550148, + "learning_rate": 2.9937579733970903e-05, + "loss": 0.0437, + "step": 23280 + }, + { + "epoch": 0.05136370753782808, + "grad_norm": 0.12388259917497635, + "learning_rate": 2.9937428956087432e-05, + "loss": 0.0397, + "step": 23290 + }, + { + "epoch": 0.051385761512726245, + "grad_norm": 0.12035589665174484, + "learning_rate": 2.993727799670009e-05, + "loss": 0.0413, + "step": 23300 + }, + { + "epoch": 0.05140781548762441, + "grad_norm": 0.1030055433511734, + "learning_rate": 2.993712685581072e-05, + "loss": 0.043, + "step": 23310 + }, + { + "epoch": 0.05142986946252258, + "grad_norm": 0.1516592800617218, + "learning_rate": 2.9936975533421157e-05, + "loss": 0.0447, + "step": 23320 + }, + { + "epoch": 0.05145192343742074, + "grad_norm": 0.128151997923851, + "learning_rate": 2.9936824029533237e-05, + "loss": 0.041, + "step": 23330 + }, + { + "epoch": 0.05147397741231891, + "grad_norm": 0.13739103078842163, + "learning_rate": 2.9936672344148803e-05, + "loss": 0.0432, + "step": 23340 + }, + { + "epoch": 0.051496031387217076, + "grad_norm": 0.12007404863834381, + "learning_rate": 2.9936520477269698e-05, + "loss": 0.0398, + "step": 23350 + }, + { + "epoch": 0.05151808536211524, + "grad_norm": 0.17489752173423767, + "learning_rate": 2.9936368428897765e-05, + "loss": 0.0406, + "step": 23360 + }, + { + "epoch": 0.05154013933701341, + "grad_norm": 0.10179442167282104, + "learning_rate": 2.9936216199034857e-05, + "loss": 0.0412, + "step": 23370 + }, + { + "epoch": 0.05156219331191157, + "grad_norm": 0.15539510548114777, + "learning_rate": 2.9936063787682817e-05, + "loss": 0.042, + "step": 23380 + }, + { + "epoch": 0.051584247286809735, + "grad_norm": 0.16805028915405273, + "learning_rate": 2.99359111948435e-05, + "loss": 0.0425, + "step": 23390 + }, + { + "epoch": 0.051606301261707906, + "grad_norm": 0.14502501487731934, + "learning_rate": 2.9935758420518757e-05, + "loss": 0.0395, + "step": 23400 + }, + { + "epoch": 0.05162835523660607, + "grad_norm": 0.13896988332271576, + "learning_rate": 2.9935605464710453e-05, + "loss": 0.0416, + "step": 23410 + }, + { + "epoch": 0.05165040921150423, + "grad_norm": 0.15243935585021973, + "learning_rate": 2.993545232742044e-05, + "loss": 0.039, + "step": 23420 + }, + { + "epoch": 0.0516724631864024, + "grad_norm": 0.12485843896865845, + "learning_rate": 2.9935299008650576e-05, + "loss": 0.0417, + "step": 23430 + }, + { + "epoch": 0.051694517161300566, + "grad_norm": 0.17391960322856903, + "learning_rate": 2.993514550840273e-05, + "loss": 0.0416, + "step": 23440 + }, + { + "epoch": 0.051716571136198736, + "grad_norm": 0.15577512979507446, + "learning_rate": 2.993499182667877e-05, + "loss": 0.0422, + "step": 23450 + }, + { + "epoch": 0.0517386251110969, + "grad_norm": 0.10336781293153763, + "learning_rate": 2.993483796348055e-05, + "loss": 0.0422, + "step": 23460 + }, + { + "epoch": 0.05176067908599506, + "grad_norm": 0.13335151970386505, + "learning_rate": 2.9934683918809953e-05, + "loss": 0.0404, + "step": 23470 + }, + { + "epoch": 0.05178273306089323, + "grad_norm": 0.1428961604833603, + "learning_rate": 2.9934529692668845e-05, + "loss": 0.0417, + "step": 23480 + }, + { + "epoch": 0.051804787035791396, + "grad_norm": 0.1310087889432907, + "learning_rate": 2.99343752850591e-05, + "loss": 0.0423, + "step": 23490 + }, + { + "epoch": 0.05182684101068956, + "grad_norm": 0.1489526629447937, + "learning_rate": 2.99342206959826e-05, + "loss": 0.0428, + "step": 23500 + }, + { + "epoch": 0.05184889498558773, + "grad_norm": 0.11231059581041336, + "learning_rate": 2.9934065925441208e-05, + "loss": 0.0431, + "step": 23510 + }, + { + "epoch": 0.05187094896048589, + "grad_norm": 0.1488802284002304, + "learning_rate": 2.993391097343682e-05, + "loss": 0.0412, + "step": 23520 + }, + { + "epoch": 0.051893002935384056, + "grad_norm": 0.11853601038455963, + "learning_rate": 2.9933755839971313e-05, + "loss": 0.0419, + "step": 23530 + }, + { + "epoch": 0.051915056910282226, + "grad_norm": 0.12536324560642242, + "learning_rate": 2.9933600525046577e-05, + "loss": 0.0425, + "step": 23540 + }, + { + "epoch": 0.05193711088518039, + "grad_norm": 0.12783338129520416, + "learning_rate": 2.993344502866449e-05, + "loss": 0.0408, + "step": 23550 + }, + { + "epoch": 0.05195916486007856, + "grad_norm": 0.13907895982265472, + "learning_rate": 2.9933289350826945e-05, + "loss": 0.0405, + "step": 23560 + }, + { + "epoch": 0.05198121883497672, + "grad_norm": 0.1432880461215973, + "learning_rate": 2.9933133491535842e-05, + "loss": 0.0417, + "step": 23570 + }, + { + "epoch": 0.052003272809874886, + "grad_norm": 0.11625749617815018, + "learning_rate": 2.9932977450793066e-05, + "loss": 0.0413, + "step": 23580 + }, + { + "epoch": 0.052025326784773056, + "grad_norm": 0.12741006910800934, + "learning_rate": 2.993282122860051e-05, + "loss": 0.0429, + "step": 23590 + }, + { + "epoch": 0.05204738075967122, + "grad_norm": 0.1707274317741394, + "learning_rate": 2.9932664824960082e-05, + "loss": 0.0407, + "step": 23600 + }, + { + "epoch": 0.05206943473456938, + "grad_norm": 0.11112320423126221, + "learning_rate": 2.9932508239873676e-05, + "loss": 0.0443, + "step": 23610 + }, + { + "epoch": 0.05209148870946755, + "grad_norm": 0.11852963268756866, + "learning_rate": 2.9932351473343194e-05, + "loss": 0.0413, + "step": 23620 + }, + { + "epoch": 0.052113542684365716, + "grad_norm": 0.15210191905498505, + "learning_rate": 2.9932194525370547e-05, + "loss": 0.0395, + "step": 23630 + }, + { + "epoch": 0.052135596659263886, + "grad_norm": 0.16185426712036133, + "learning_rate": 2.9932037395957635e-05, + "loss": 0.0417, + "step": 23640 + }, + { + "epoch": 0.05215765063416205, + "grad_norm": 0.11459892243146896, + "learning_rate": 2.9931880085106373e-05, + "loss": 0.0414, + "step": 23650 + }, + { + "epoch": 0.05217970460906021, + "grad_norm": 0.1262151300907135, + "learning_rate": 2.9931722592818668e-05, + "loss": 0.0411, + "step": 23660 + }, + { + "epoch": 0.05220175858395838, + "grad_norm": 0.16527463495731354, + "learning_rate": 2.9931564919096438e-05, + "loss": 0.0406, + "step": 23670 + }, + { + "epoch": 0.052223812558856546, + "grad_norm": 0.13346748054027557, + "learning_rate": 2.9931407063941594e-05, + "loss": 0.0422, + "step": 23680 + }, + { + "epoch": 0.05224586653375471, + "grad_norm": 0.08395450562238693, + "learning_rate": 2.9931249027356057e-05, + "loss": 0.0422, + "step": 23690 + }, + { + "epoch": 0.05226792050865288, + "grad_norm": 0.12392585724592209, + "learning_rate": 2.9931090809341748e-05, + "loss": 0.0421, + "step": 23700 + }, + { + "epoch": 0.05228997448355104, + "grad_norm": 0.1316421627998352, + "learning_rate": 2.9930932409900584e-05, + "loss": 0.0399, + "step": 23710 + }, + { + "epoch": 0.052312028458449206, + "grad_norm": 0.11366080492734909, + "learning_rate": 2.9930773829034498e-05, + "loss": 0.0445, + "step": 23720 + }, + { + "epoch": 0.052334082433347376, + "grad_norm": 0.18145114183425903, + "learning_rate": 2.9930615066745416e-05, + "loss": 0.0403, + "step": 23730 + }, + { + "epoch": 0.05235613640824554, + "grad_norm": 0.1514090746641159, + "learning_rate": 2.993045612303526e-05, + "loss": 0.0403, + "step": 23740 + }, + { + "epoch": 0.05237819038314371, + "grad_norm": 0.13554221391677856, + "learning_rate": 2.9930296997905964e-05, + "loss": 0.0412, + "step": 23750 + }, + { + "epoch": 0.05240024435804187, + "grad_norm": 0.10258446633815765, + "learning_rate": 2.993013769135947e-05, + "loss": 0.0423, + "step": 23760 + }, + { + "epoch": 0.052422298332940036, + "grad_norm": 0.12460599094629288, + "learning_rate": 2.99299782033977e-05, + "loss": 0.0396, + "step": 23770 + }, + { + "epoch": 0.052444352307838206, + "grad_norm": 0.1379214972257614, + "learning_rate": 2.9929818534022596e-05, + "loss": 0.0424, + "step": 23780 + }, + { + "epoch": 0.05246640628273637, + "grad_norm": 0.14221374690532684, + "learning_rate": 2.992965868323611e-05, + "loss": 0.0419, + "step": 23790 + }, + { + "epoch": 0.05248846025763453, + "grad_norm": 0.15181368589401245, + "learning_rate": 2.992949865104017e-05, + "loss": 0.0438, + "step": 23800 + }, + { + "epoch": 0.0525105142325327, + "grad_norm": 0.1365519016981125, + "learning_rate": 2.9929338437436724e-05, + "loss": 0.0426, + "step": 23810 + }, + { + "epoch": 0.052532568207430866, + "grad_norm": 0.17761455476284027, + "learning_rate": 2.9929178042427722e-05, + "loss": 0.0424, + "step": 23820 + }, + { + "epoch": 0.05255462218232903, + "grad_norm": 0.12987060844898224, + "learning_rate": 2.9929017466015112e-05, + "loss": 0.04, + "step": 23830 + }, + { + "epoch": 0.0525766761572272, + "grad_norm": 0.11838418990373611, + "learning_rate": 2.9928856708200842e-05, + "loss": 0.0412, + "step": 23840 + }, + { + "epoch": 0.05259873013212536, + "grad_norm": 0.13231320679187775, + "learning_rate": 2.9928695768986874e-05, + "loss": 0.0412, + "step": 23850 + }, + { + "epoch": 0.05262078410702353, + "grad_norm": 0.10654434561729431, + "learning_rate": 2.9928534648375152e-05, + "loss": 0.0418, + "step": 23860 + }, + { + "epoch": 0.052642838081921696, + "grad_norm": 0.1416098028421402, + "learning_rate": 2.992837334636764e-05, + "loss": 0.0422, + "step": 23870 + }, + { + "epoch": 0.05266489205681986, + "grad_norm": 0.1333264857530594, + "learning_rate": 2.99282118629663e-05, + "loss": 0.0396, + "step": 23880 + }, + { + "epoch": 0.05268694603171803, + "grad_norm": 0.1376463770866394, + "learning_rate": 2.9928050198173088e-05, + "loss": 0.0432, + "step": 23890 + }, + { + "epoch": 0.05270900000661619, + "grad_norm": 0.15053653717041016, + "learning_rate": 2.992788835198997e-05, + "loss": 0.0435, + "step": 23900 + }, + { + "epoch": 0.052731053981514356, + "grad_norm": 0.1589229553937912, + "learning_rate": 2.9927726324418917e-05, + "loss": 0.0424, + "step": 23910 + }, + { + "epoch": 0.052753107956412526, + "grad_norm": 0.1518530696630478, + "learning_rate": 2.9927564115461896e-05, + "loss": 0.0436, + "step": 23920 + }, + { + "epoch": 0.05277516193131069, + "grad_norm": 0.10790357738733292, + "learning_rate": 2.9927401725120874e-05, + "loss": 0.0406, + "step": 23930 + }, + { + "epoch": 0.05279721590620885, + "grad_norm": 0.16978172957897186, + "learning_rate": 2.9927239153397826e-05, + "loss": 0.0432, + "step": 23940 + }, + { + "epoch": 0.05281926988110702, + "grad_norm": 0.17391090095043182, + "learning_rate": 2.9927076400294735e-05, + "loss": 0.0428, + "step": 23950 + }, + { + "epoch": 0.052841323856005186, + "grad_norm": 0.1144176796078682, + "learning_rate": 2.992691346581357e-05, + "loss": 0.0421, + "step": 23960 + }, + { + "epoch": 0.052863377830903356, + "grad_norm": 0.11158004403114319, + "learning_rate": 2.992675034995631e-05, + "loss": 0.0435, + "step": 23970 + }, + { + "epoch": 0.05288543180580152, + "grad_norm": 0.147115096449852, + "learning_rate": 2.992658705272494e-05, + "loss": 0.0418, + "step": 23980 + }, + { + "epoch": 0.05290748578069968, + "grad_norm": 0.13118895888328552, + "learning_rate": 2.992642357412145e-05, + "loss": 0.0418, + "step": 23990 + }, + { + "epoch": 0.05292953975559785, + "grad_norm": 0.15070614218711853, + "learning_rate": 2.9926259914147816e-05, + "loss": 0.0411, + "step": 24000 + }, + { + "epoch": 0.052951593730496016, + "grad_norm": 0.11179095506668091, + "learning_rate": 2.9926096072806033e-05, + "loss": 0.04, + "step": 24010 + }, + { + "epoch": 0.05297364770539418, + "grad_norm": 0.12296900898218155, + "learning_rate": 2.992593205009809e-05, + "loss": 0.0418, + "step": 24020 + }, + { + "epoch": 0.05299570168029235, + "grad_norm": 0.15066580474376678, + "learning_rate": 2.9925767846025978e-05, + "loss": 0.04, + "step": 24030 + }, + { + "epoch": 0.05301775565519051, + "grad_norm": 0.11157210171222687, + "learning_rate": 2.9925603460591697e-05, + "loss": 0.0386, + "step": 24040 + }, + { + "epoch": 0.053039809630088676, + "grad_norm": 0.13178569078445435, + "learning_rate": 2.992543889379724e-05, + "loss": 0.0428, + "step": 24050 + }, + { + "epoch": 0.053061863604986846, + "grad_norm": 0.13692647218704224, + "learning_rate": 2.9925274145644607e-05, + "loss": 0.0426, + "step": 24060 + }, + { + "epoch": 0.05308391757988501, + "grad_norm": 0.12480829656124115, + "learning_rate": 2.9925109216135805e-05, + "loss": 0.0418, + "step": 24070 + }, + { + "epoch": 0.05310597155478318, + "grad_norm": 0.11567129194736481, + "learning_rate": 2.9924944105272835e-05, + "loss": 0.0408, + "step": 24080 + }, + { + "epoch": 0.05312802552968134, + "grad_norm": 0.13385634124279022, + "learning_rate": 2.9924778813057702e-05, + "loss": 0.0422, + "step": 24090 + }, + { + "epoch": 0.053150079504579506, + "grad_norm": 0.14794638752937317, + "learning_rate": 2.992461333949241e-05, + "loss": 0.0419, + "step": 24100 + }, + { + "epoch": 0.053172133479477676, + "grad_norm": 0.16185362637043, + "learning_rate": 2.992444768457898e-05, + "loss": 0.0423, + "step": 24110 + }, + { + "epoch": 0.05319418745437584, + "grad_norm": 0.152851864695549, + "learning_rate": 2.992428184831942e-05, + "loss": 0.0422, + "step": 24120 + }, + { + "epoch": 0.053216241429274, + "grad_norm": 0.14738082885742188, + "learning_rate": 2.992411583071574e-05, + "loss": 0.0431, + "step": 24130 + }, + { + "epoch": 0.05323829540417217, + "grad_norm": 0.18007071316242218, + "learning_rate": 2.992394963176997e-05, + "loss": 0.0423, + "step": 24140 + }, + { + "epoch": 0.053260349379070336, + "grad_norm": 0.16056975722312927, + "learning_rate": 2.9923783251484112e-05, + "loss": 0.0418, + "step": 24150 + }, + { + "epoch": 0.0532824033539685, + "grad_norm": 0.114621102809906, + "learning_rate": 2.99236166898602e-05, + "loss": 0.0412, + "step": 24160 + }, + { + "epoch": 0.05330445732886667, + "grad_norm": 0.11338641494512558, + "learning_rate": 2.9923449946900262e-05, + "loss": 0.0403, + "step": 24170 + }, + { + "epoch": 0.05332651130376483, + "grad_norm": 0.15765075385570526, + "learning_rate": 2.992328302260631e-05, + "loss": 0.0388, + "step": 24180 + }, + { + "epoch": 0.053348565278663, + "grad_norm": 0.13307985663414001, + "learning_rate": 2.992311591698038e-05, + "loss": 0.0411, + "step": 24190 + }, + { + "epoch": 0.053370619253561166, + "grad_norm": 0.10737631469964981, + "learning_rate": 2.99229486300245e-05, + "loss": 0.042, + "step": 24200 + }, + { + "epoch": 0.05339267322845933, + "grad_norm": 0.11456667631864548, + "learning_rate": 2.992278116174071e-05, + "loss": 0.0414, + "step": 24210 + }, + { + "epoch": 0.0534147272033575, + "grad_norm": 0.17663072049617767, + "learning_rate": 2.9922613512131034e-05, + "loss": 0.0418, + "step": 24220 + }, + { + "epoch": 0.05343678117825566, + "grad_norm": 0.15866999328136444, + "learning_rate": 2.9922445681197516e-05, + "loss": 0.0421, + "step": 24230 + }, + { + "epoch": 0.053458835153153826, + "grad_norm": 0.12055014818906784, + "learning_rate": 2.9922277668942193e-05, + "loss": 0.0426, + "step": 24240 + }, + { + "epoch": 0.053480889128051996, + "grad_norm": 0.11739565432071686, + "learning_rate": 2.9922109475367115e-05, + "loss": 0.0419, + "step": 24250 + }, + { + "epoch": 0.05350294310295016, + "grad_norm": 0.17282234132289886, + "learning_rate": 2.9921941100474314e-05, + "loss": 0.0409, + "step": 24260 + }, + { + "epoch": 0.05352499707784832, + "grad_norm": 0.12809181213378906, + "learning_rate": 2.9921772544265834e-05, + "loss": 0.0421, + "step": 24270 + }, + { + "epoch": 0.05354705105274649, + "grad_norm": 0.12855587899684906, + "learning_rate": 2.9921603806743733e-05, + "loss": 0.0439, + "step": 24280 + }, + { + "epoch": 0.053569105027644656, + "grad_norm": 0.19812685251235962, + "learning_rate": 2.992143488791006e-05, + "loss": 0.041, + "step": 24290 + }, + { + "epoch": 0.053591159002542826, + "grad_norm": 0.13563448190689087, + "learning_rate": 2.9921265787766864e-05, + "loss": 0.0425, + "step": 24300 + }, + { + "epoch": 0.05361321297744099, + "grad_norm": 0.15332671999931335, + "learning_rate": 2.9921096506316197e-05, + "loss": 0.0436, + "step": 24310 + }, + { + "epoch": 0.05363526695233915, + "grad_norm": 0.15332987904548645, + "learning_rate": 2.9920927043560123e-05, + "loss": 0.0428, + "step": 24320 + }, + { + "epoch": 0.05365732092723732, + "grad_norm": 0.12960077822208405, + "learning_rate": 2.99207573995007e-05, + "loss": 0.0439, + "step": 24330 + }, + { + "epoch": 0.053679374902135486, + "grad_norm": 0.17675729095935822, + "learning_rate": 2.992058757413998e-05, + "loss": 0.0434, + "step": 24340 + }, + { + "epoch": 0.05370142887703365, + "grad_norm": 0.11769378185272217, + "learning_rate": 2.992041756748004e-05, + "loss": 0.042, + "step": 24350 + }, + { + "epoch": 0.05372348285193182, + "grad_norm": 0.13720740377902985, + "learning_rate": 2.9920247379522935e-05, + "loss": 0.0424, + "step": 24360 + }, + { + "epoch": 0.05374553682682998, + "grad_norm": 0.14776237308979034, + "learning_rate": 2.992007701027074e-05, + "loss": 0.0427, + "step": 24370 + }, + { + "epoch": 0.053767590801728146, + "grad_norm": 0.14638595283031464, + "learning_rate": 2.9919906459725524e-05, + "loss": 0.0413, + "step": 24380 + }, + { + "epoch": 0.053789644776626316, + "grad_norm": 0.181232750415802, + "learning_rate": 2.9919735727889354e-05, + "loss": 0.0422, + "step": 24390 + }, + { + "epoch": 0.05381169875152448, + "grad_norm": 0.12736734747886658, + "learning_rate": 2.9919564814764308e-05, + "loss": 0.0444, + "step": 24400 + }, + { + "epoch": 0.05383375272642265, + "grad_norm": 0.16505394876003265, + "learning_rate": 2.9919393720352463e-05, + "loss": 0.0413, + "step": 24410 + }, + { + "epoch": 0.05385580670132081, + "grad_norm": 0.12456220388412476, + "learning_rate": 2.9919222444655905e-05, + "loss": 0.0427, + "step": 24420 + }, + { + "epoch": 0.053877860676218976, + "grad_norm": 0.12485910207033157, + "learning_rate": 2.99190509876767e-05, + "loss": 0.0423, + "step": 24430 + }, + { + "epoch": 0.053899914651117146, + "grad_norm": 0.12422651052474976, + "learning_rate": 2.9918879349416946e-05, + "loss": 0.0395, + "step": 24440 + }, + { + "epoch": 0.05392196862601531, + "grad_norm": 0.12228687107563019, + "learning_rate": 2.9918707529878714e-05, + "loss": 0.0426, + "step": 24450 + }, + { + "epoch": 0.05394402260091347, + "grad_norm": 0.12211237102746964, + "learning_rate": 2.9918535529064108e-05, + "loss": 0.0425, + "step": 24460 + }, + { + "epoch": 0.05396607657581164, + "grad_norm": 0.11523541063070297, + "learning_rate": 2.9918363346975205e-05, + "loss": 0.0443, + "step": 24470 + }, + { + "epoch": 0.053988130550709806, + "grad_norm": 0.1380365639925003, + "learning_rate": 2.9918190983614107e-05, + "loss": 0.0414, + "step": 24480 + }, + { + "epoch": 0.05401018452560797, + "grad_norm": 0.13465999066829681, + "learning_rate": 2.99180184389829e-05, + "loss": 0.0412, + "step": 24490 + }, + { + "epoch": 0.05403223850050614, + "grad_norm": 0.16599848866462708, + "learning_rate": 2.991784571308368e-05, + "loss": 0.0408, + "step": 24500 + }, + { + "epoch": 0.0540542924754043, + "grad_norm": 0.15669070184230804, + "learning_rate": 2.9917672805918554e-05, + "loss": 0.0419, + "step": 24510 + }, + { + "epoch": 0.05407634645030247, + "grad_norm": 0.10704990476369858, + "learning_rate": 2.9917499717489622e-05, + "loss": 0.0446, + "step": 24520 + }, + { + "epoch": 0.054098400425200636, + "grad_norm": 0.1817997246980667, + "learning_rate": 2.991732644779898e-05, + "loss": 0.0412, + "step": 24530 + }, + { + "epoch": 0.0541204544000988, + "grad_norm": 0.1346602737903595, + "learning_rate": 2.991715299684874e-05, + "loss": 0.0402, + "step": 24540 + }, + { + "epoch": 0.05414250837499697, + "grad_norm": 0.1272028088569641, + "learning_rate": 2.9916979364641003e-05, + "loss": 0.042, + "step": 24550 + }, + { + "epoch": 0.05416456234989513, + "grad_norm": 0.12560951709747314, + "learning_rate": 2.9916805551177886e-05, + "loss": 0.0415, + "step": 24560 + }, + { + "epoch": 0.054186616324793296, + "grad_norm": 0.154292494058609, + "learning_rate": 2.99166315564615e-05, + "loss": 0.0413, + "step": 24570 + }, + { + "epoch": 0.05420867029969147, + "grad_norm": 0.18566656112670898, + "learning_rate": 2.991645738049395e-05, + "loss": 0.0427, + "step": 24580 + }, + { + "epoch": 0.05423072427458963, + "grad_norm": 0.16655240952968597, + "learning_rate": 2.9916283023277364e-05, + "loss": 0.0431, + "step": 24590 + }, + { + "epoch": 0.05425277824948779, + "grad_norm": 0.1352268010377884, + "learning_rate": 2.991610848481385e-05, + "loss": 0.0426, + "step": 24600 + }, + { + "epoch": 0.05427483222438596, + "grad_norm": 0.12885455787181854, + "learning_rate": 2.9915933765105542e-05, + "loss": 0.0417, + "step": 24610 + }, + { + "epoch": 0.054296886199284126, + "grad_norm": 0.1159503236413002, + "learning_rate": 2.9915758864154554e-05, + "loss": 0.0422, + "step": 24620 + }, + { + "epoch": 0.0543189401741823, + "grad_norm": 0.12800952792167664, + "learning_rate": 2.9915583781963014e-05, + "loss": 0.0399, + "step": 24630 + }, + { + "epoch": 0.05434099414908046, + "grad_norm": 0.13429008424282074, + "learning_rate": 2.9915408518533045e-05, + "loss": 0.0424, + "step": 24640 + }, + { + "epoch": 0.05436304812397862, + "grad_norm": 0.13649609684944153, + "learning_rate": 2.9915233073866784e-05, + "loss": 0.0423, + "step": 24650 + }, + { + "epoch": 0.05438510209887679, + "grad_norm": 0.14614984393119812, + "learning_rate": 2.9915057447966354e-05, + "loss": 0.0392, + "step": 24660 + }, + { + "epoch": 0.05440715607377496, + "grad_norm": 0.11340288072824478, + "learning_rate": 2.9914881640833895e-05, + "loss": 0.0417, + "step": 24670 + }, + { + "epoch": 0.05442921004867312, + "grad_norm": 0.08313354104757309, + "learning_rate": 2.9914705652471546e-05, + "loss": 0.0409, + "step": 24680 + }, + { + "epoch": 0.05445126402357129, + "grad_norm": 0.15209491550922394, + "learning_rate": 2.991452948288144e-05, + "loss": 0.0392, + "step": 24690 + }, + { + "epoch": 0.05447331799846945, + "grad_norm": 0.15531809628009796, + "learning_rate": 2.991435313206572e-05, + "loss": 0.0424, + "step": 24700 + }, + { + "epoch": 0.054495371973367616, + "grad_norm": 0.20275679230690002, + "learning_rate": 2.9914176600026525e-05, + "loss": 0.0429, + "step": 24710 + }, + { + "epoch": 0.05451742594826579, + "grad_norm": 0.1226024478673935, + "learning_rate": 2.9913999886766e-05, + "loss": 0.0415, + "step": 24720 + }, + { + "epoch": 0.05453947992316395, + "grad_norm": 0.1750810742378235, + "learning_rate": 2.9913822992286302e-05, + "loss": 0.0424, + "step": 24730 + }, + { + "epoch": 0.05456153389806212, + "grad_norm": 0.12680916488170624, + "learning_rate": 2.9913645916589573e-05, + "loss": 0.0425, + "step": 24740 + }, + { + "epoch": 0.05458358787296028, + "grad_norm": 0.1317356824874878, + "learning_rate": 2.9913468659677963e-05, + "loss": 0.0408, + "step": 24750 + }, + { + "epoch": 0.05460564184785845, + "grad_norm": 0.16593900322914124, + "learning_rate": 2.991329122155363e-05, + "loss": 0.0407, + "step": 24760 + }, + { + "epoch": 0.05462769582275662, + "grad_norm": 0.10689384490251541, + "learning_rate": 2.9913113602218723e-05, + "loss": 0.0413, + "step": 24770 + }, + { + "epoch": 0.05464974979765478, + "grad_norm": 0.13418486714363098, + "learning_rate": 2.9912935801675412e-05, + "loss": 0.042, + "step": 24780 + }, + { + "epoch": 0.05467180377255294, + "grad_norm": 0.16026991605758667, + "learning_rate": 2.9912757819925845e-05, + "loss": 0.0407, + "step": 24790 + }, + { + "epoch": 0.05469385774745111, + "grad_norm": 0.11327914148569107, + "learning_rate": 2.9912579656972196e-05, + "loss": 0.0413, + "step": 24800 + }, + { + "epoch": 0.05471591172234928, + "grad_norm": 0.14159950613975525, + "learning_rate": 2.991240131281662e-05, + "loss": 0.0409, + "step": 24810 + }, + { + "epoch": 0.05473796569724745, + "grad_norm": 0.1384396106004715, + "learning_rate": 2.991222278746129e-05, + "loss": 0.0412, + "step": 24820 + }, + { + "epoch": 0.05476001967214561, + "grad_norm": 0.14367267489433289, + "learning_rate": 2.9912044080908375e-05, + "loss": 0.0406, + "step": 24830 + }, + { + "epoch": 0.05478207364704377, + "grad_norm": 0.14774708449840546, + "learning_rate": 2.9911865193160048e-05, + "loss": 0.041, + "step": 24840 + }, + { + "epoch": 0.054804127621941943, + "grad_norm": 0.11753451824188232, + "learning_rate": 2.991168612421847e-05, + "loss": 0.0416, + "step": 24850 + }, + { + "epoch": 0.05482618159684011, + "grad_norm": 0.11987815052270889, + "learning_rate": 2.9911506874085834e-05, + "loss": 0.0398, + "step": 24860 + }, + { + "epoch": 0.05484823557173827, + "grad_norm": 0.15195980668067932, + "learning_rate": 2.9911327442764308e-05, + "loss": 0.0421, + "step": 24870 + }, + { + "epoch": 0.05487028954663644, + "grad_norm": 0.1487438678741455, + "learning_rate": 2.9911147830256078e-05, + "loss": 0.043, + "step": 24880 + }, + { + "epoch": 0.0548923435215346, + "grad_norm": 0.13262683153152466, + "learning_rate": 2.991096803656332e-05, + "loss": 0.0403, + "step": 24890 + }, + { + "epoch": 0.05491439749643277, + "grad_norm": 0.10267778486013412, + "learning_rate": 2.9910788061688223e-05, + "loss": 0.0424, + "step": 24900 + }, + { + "epoch": 0.05493645147133094, + "grad_norm": 0.15423765778541565, + "learning_rate": 2.9910607905632977e-05, + "loss": 0.0397, + "step": 24910 + }, + { + "epoch": 0.0549585054462291, + "grad_norm": 0.13493800163269043, + "learning_rate": 2.9910427568399763e-05, + "loss": 0.0414, + "step": 24920 + }, + { + "epoch": 0.05498055942112727, + "grad_norm": 0.12048714607954025, + "learning_rate": 2.9910247049990776e-05, + "loss": 0.0417, + "step": 24930 + }, + { + "epoch": 0.055002613396025433, + "grad_norm": 0.13394801318645477, + "learning_rate": 2.991006635040821e-05, + "loss": 0.0437, + "step": 24940 + }, + { + "epoch": 0.0550246673709236, + "grad_norm": 0.12563388049602509, + "learning_rate": 2.990988546965426e-05, + "loss": 0.041, + "step": 24950 + }, + { + "epoch": 0.05504672134582177, + "grad_norm": 0.15721862018108368, + "learning_rate": 2.9909704407731124e-05, + "loss": 0.0443, + "step": 24960 + }, + { + "epoch": 0.05506877532071993, + "grad_norm": 0.16958512365818024, + "learning_rate": 2.9909523164641008e-05, + "loss": 0.0402, + "step": 24970 + }, + { + "epoch": 0.05509082929561809, + "grad_norm": 0.11886856704950333, + "learning_rate": 2.99093417403861e-05, + "loss": 0.0402, + "step": 24980 + }, + { + "epoch": 0.055112883270516264, + "grad_norm": 0.12432379275560379, + "learning_rate": 2.990916013496862e-05, + "loss": 0.0416, + "step": 24990 + }, + { + "epoch": 0.05513493724541443, + "grad_norm": 0.1340593844652176, + "learning_rate": 2.9908978348390764e-05, + "loss": 0.0397, + "step": 25000 + }, + { + "epoch": 0.05515699122031259, + "grad_norm": 0.12875141203403473, + "learning_rate": 2.9908796380654746e-05, + "loss": 0.0414, + "step": 25010 + }, + { + "epoch": 0.05517904519521076, + "grad_norm": 0.13128383457660675, + "learning_rate": 2.9908614231762775e-05, + "loss": 0.041, + "step": 25020 + }, + { + "epoch": 0.055201099170108923, + "grad_norm": 0.12166864424943924, + "learning_rate": 2.9908431901717064e-05, + "loss": 0.0421, + "step": 25030 + }, + { + "epoch": 0.055223153145007094, + "grad_norm": 0.11975006759166718, + "learning_rate": 2.9908249390519834e-05, + "loss": 0.0432, + "step": 25040 + }, + { + "epoch": 0.05524520711990526, + "grad_norm": 0.13814975321292877, + "learning_rate": 2.9908066698173296e-05, + "loss": 0.0397, + "step": 25050 + }, + { + "epoch": 0.05526726109480342, + "grad_norm": 0.1249738410115242, + "learning_rate": 2.990788382467967e-05, + "loss": 0.0412, + "step": 25060 + }, + { + "epoch": 0.05528931506970159, + "grad_norm": 0.13462255895137787, + "learning_rate": 2.990770077004118e-05, + "loss": 0.0424, + "step": 25070 + }, + { + "epoch": 0.055311369044599754, + "grad_norm": 0.14742860198020935, + "learning_rate": 2.9907517534260052e-05, + "loss": 0.0406, + "step": 25080 + }, + { + "epoch": 0.05533342301949792, + "grad_norm": 0.17464645206928253, + "learning_rate": 2.990733411733851e-05, + "loss": 0.0429, + "step": 25090 + }, + { + "epoch": 0.05535547699439609, + "grad_norm": 0.13381949067115784, + "learning_rate": 2.9907150519278787e-05, + "loss": 0.0451, + "step": 25100 + }, + { + "epoch": 0.05537753096929425, + "grad_norm": 0.1345413774251938, + "learning_rate": 2.9906966740083105e-05, + "loss": 0.0417, + "step": 25110 + }, + { + "epoch": 0.055399584944192413, + "grad_norm": 0.15290716290473938, + "learning_rate": 2.9906782779753707e-05, + "loss": 0.0417, + "step": 25120 + }, + { + "epoch": 0.055421638919090584, + "grad_norm": 0.11692753434181213, + "learning_rate": 2.990659863829282e-05, + "loss": 0.04, + "step": 25130 + }, + { + "epoch": 0.05544369289398875, + "grad_norm": 0.10838860273361206, + "learning_rate": 2.9906414315702692e-05, + "loss": 0.0418, + "step": 25140 + }, + { + "epoch": 0.05546574686888692, + "grad_norm": 0.1553717404603958, + "learning_rate": 2.990622981198555e-05, + "loss": 0.0422, + "step": 25150 + }, + { + "epoch": 0.05548780084378508, + "grad_norm": 0.12451763451099396, + "learning_rate": 2.9906045127143644e-05, + "loss": 0.0405, + "step": 25160 + }, + { + "epoch": 0.055509854818683244, + "grad_norm": 0.1218048557639122, + "learning_rate": 2.9905860261179216e-05, + "loss": 0.0385, + "step": 25170 + }, + { + "epoch": 0.055531908793581414, + "grad_norm": 0.1338917464017868, + "learning_rate": 2.9905675214094512e-05, + "loss": 0.04, + "step": 25180 + }, + { + "epoch": 0.05555396276847958, + "grad_norm": 0.18307524919509888, + "learning_rate": 2.9905489985891778e-05, + "loss": 0.0421, + "step": 25190 + }, + { + "epoch": 0.05557601674337774, + "grad_norm": 0.12770357728004456, + "learning_rate": 2.9905304576573275e-05, + "loss": 0.0412, + "step": 25200 + }, + { + "epoch": 0.05559807071827591, + "grad_norm": 0.11931005865335464, + "learning_rate": 2.9905118986141245e-05, + "loss": 0.0421, + "step": 25210 + }, + { + "epoch": 0.055620124693174074, + "grad_norm": 0.136357843875885, + "learning_rate": 2.990493321459794e-05, + "loss": 0.041, + "step": 25220 + }, + { + "epoch": 0.05564217866807224, + "grad_norm": 0.1153496727347374, + "learning_rate": 2.9904747261945634e-05, + "loss": 0.041, + "step": 25230 + }, + { + "epoch": 0.05566423264297041, + "grad_norm": 0.13733138144016266, + "learning_rate": 2.990456112818657e-05, + "loss": 0.0433, + "step": 25240 + }, + { + "epoch": 0.05568628661786857, + "grad_norm": 0.15342067182064056, + "learning_rate": 2.9904374813323018e-05, + "loss": 0.0387, + "step": 25250 + }, + { + "epoch": 0.05570834059276674, + "grad_norm": 0.10551542043685913, + "learning_rate": 2.990418831735724e-05, + "loss": 0.0415, + "step": 25260 + }, + { + "epoch": 0.055730394567664904, + "grad_norm": 0.10636767745018005, + "learning_rate": 2.99040016402915e-05, + "loss": 0.0417, + "step": 25270 + }, + { + "epoch": 0.05575244854256307, + "grad_norm": 0.1406567096710205, + "learning_rate": 2.990381478212807e-05, + "loss": 0.0405, + "step": 25280 + }, + { + "epoch": 0.05577450251746124, + "grad_norm": 0.21232056617736816, + "learning_rate": 2.990362774286922e-05, + "loss": 0.0391, + "step": 25290 + }, + { + "epoch": 0.0557965564923594, + "grad_norm": 0.1328824758529663, + "learning_rate": 2.9903440522517222e-05, + "loss": 0.0427, + "step": 25300 + }, + { + "epoch": 0.055818610467257564, + "grad_norm": 0.1430632770061493, + "learning_rate": 2.9903253121074348e-05, + "loss": 0.0417, + "step": 25310 + }, + { + "epoch": 0.055840664442155734, + "grad_norm": 0.1174650713801384, + "learning_rate": 2.9903065538542876e-05, + "loss": 0.0412, + "step": 25320 + }, + { + "epoch": 0.0558627184170539, + "grad_norm": 0.11168243736028671, + "learning_rate": 2.9902877774925095e-05, + "loss": 0.0382, + "step": 25330 + }, + { + "epoch": 0.05588477239195206, + "grad_norm": 0.14632421731948853, + "learning_rate": 2.990268983022327e-05, + "loss": 0.0396, + "step": 25340 + }, + { + "epoch": 0.05590682636685023, + "grad_norm": 0.13734875619411469, + "learning_rate": 2.9902501704439697e-05, + "loss": 0.0399, + "step": 25350 + }, + { + "epoch": 0.055928880341748394, + "grad_norm": 0.13471569120883942, + "learning_rate": 2.9902313397576658e-05, + "loss": 0.0441, + "step": 25360 + }, + { + "epoch": 0.055950934316646564, + "grad_norm": 0.14403408765792847, + "learning_rate": 2.990212490963644e-05, + "loss": 0.0417, + "step": 25370 + }, + { + "epoch": 0.05597298829154473, + "grad_norm": 0.1501179188489914, + "learning_rate": 2.9901936240621336e-05, + "loss": 0.0426, + "step": 25380 + }, + { + "epoch": 0.05599504226644289, + "grad_norm": 0.13382482528686523, + "learning_rate": 2.9901747390533635e-05, + "loss": 0.0422, + "step": 25390 + }, + { + "epoch": 0.05601709624134106, + "grad_norm": 0.11907656490802765, + "learning_rate": 2.9901558359375632e-05, + "loss": 0.0432, + "step": 25400 + }, + { + "epoch": 0.056039150216239224, + "grad_norm": 0.11567626148462296, + "learning_rate": 2.990136914714963e-05, + "loss": 0.0391, + "step": 25410 + }, + { + "epoch": 0.05606120419113739, + "grad_norm": 0.14386819303035736, + "learning_rate": 2.990117975385792e-05, + "loss": 0.0404, + "step": 25420 + }, + { + "epoch": 0.05608325816603556, + "grad_norm": 0.14762665331363678, + "learning_rate": 2.990099017950281e-05, + "loss": 0.042, + "step": 25430 + }, + { + "epoch": 0.05610531214093372, + "grad_norm": 0.14531563222408295, + "learning_rate": 2.9900800424086595e-05, + "loss": 0.0406, + "step": 25440 + }, + { + "epoch": 0.056127366115831884, + "grad_norm": 0.11847081780433655, + "learning_rate": 2.9900610487611596e-05, + "loss": 0.0396, + "step": 25450 + }, + { + "epoch": 0.056149420090730054, + "grad_norm": 0.1163814440369606, + "learning_rate": 2.9900420370080103e-05, + "loss": 0.0415, + "step": 25460 + }, + { + "epoch": 0.05617147406562822, + "grad_norm": 0.1631232649087906, + "learning_rate": 2.9900230071494433e-05, + "loss": 0.0431, + "step": 25470 + }, + { + "epoch": 0.05619352804052639, + "grad_norm": 0.15309719741344452, + "learning_rate": 2.9900039591856904e-05, + "loss": 0.0413, + "step": 25480 + }, + { + "epoch": 0.05621558201542455, + "grad_norm": 0.13892816007137299, + "learning_rate": 2.9899848931169827e-05, + "loss": 0.0422, + "step": 25490 + }, + { + "epoch": 0.056237635990322714, + "grad_norm": 0.18218286335468292, + "learning_rate": 2.989965808943551e-05, + "loss": 0.0455, + "step": 25500 + }, + { + "epoch": 0.056259689965220884, + "grad_norm": 0.1610560119152069, + "learning_rate": 2.989946706665629e-05, + "loss": 0.0375, + "step": 25510 + }, + { + "epoch": 0.05628174394011905, + "grad_norm": 0.16535060107707977, + "learning_rate": 2.989927586283447e-05, + "loss": 0.0433, + "step": 25520 + }, + { + "epoch": 0.05630379791501721, + "grad_norm": 0.1556999534368515, + "learning_rate": 2.9899084477972384e-05, + "loss": 0.0424, + "step": 25530 + }, + { + "epoch": 0.05632585188991538, + "grad_norm": 0.09766054153442383, + "learning_rate": 2.9898892912072353e-05, + "loss": 0.0403, + "step": 25540 + }, + { + "epoch": 0.056347905864813544, + "grad_norm": 0.10352345556020737, + "learning_rate": 2.989870116513671e-05, + "loss": 0.0411, + "step": 25550 + }, + { + "epoch": 0.05636995983971171, + "grad_norm": 0.13649868965148926, + "learning_rate": 2.9898509237167778e-05, + "loss": 0.0419, + "step": 25560 + }, + { + "epoch": 0.05639201381460988, + "grad_norm": 0.11902231723070145, + "learning_rate": 2.989831712816789e-05, + "loss": 0.0406, + "step": 25570 + }, + { + "epoch": 0.05641406778950804, + "grad_norm": 0.10191845148801804, + "learning_rate": 2.989812483813939e-05, + "loss": 0.0429, + "step": 25580 + }, + { + "epoch": 0.05643612176440621, + "grad_norm": 0.11394559592008591, + "learning_rate": 2.98979323670846e-05, + "loss": 0.042, + "step": 25590 + }, + { + "epoch": 0.056458175739304374, + "grad_norm": 0.139059379696846, + "learning_rate": 2.9897739715005868e-05, + "loss": 0.0395, + "step": 25600 + }, + { + "epoch": 0.05648022971420254, + "grad_norm": 0.11982370913028717, + "learning_rate": 2.9897546881905533e-05, + "loss": 0.0417, + "step": 25610 + }, + { + "epoch": 0.05650228368910071, + "grad_norm": 0.14986155927181244, + "learning_rate": 2.989735386778594e-05, + "loss": 0.0416, + "step": 25620 + }, + { + "epoch": 0.05652433766399887, + "grad_norm": 0.13259361684322357, + "learning_rate": 2.989716067264943e-05, + "loss": 0.0411, + "step": 25630 + }, + { + "epoch": 0.056546391638897034, + "grad_norm": 0.12338006496429443, + "learning_rate": 2.989696729649835e-05, + "loss": 0.0406, + "step": 25640 + }, + { + "epoch": 0.056568445613795204, + "grad_norm": 0.14602358639240265, + "learning_rate": 2.9896773739335054e-05, + "loss": 0.0406, + "step": 25650 + }, + { + "epoch": 0.05659049958869337, + "grad_norm": 0.14163389801979065, + "learning_rate": 2.989658000116189e-05, + "loss": 0.0404, + "step": 25660 + }, + { + "epoch": 0.05661255356359153, + "grad_norm": 0.13409319519996643, + "learning_rate": 2.9896386081981218e-05, + "loss": 0.0416, + "step": 25670 + }, + { + "epoch": 0.0566346075384897, + "grad_norm": 0.11894315481185913, + "learning_rate": 2.9896191981795393e-05, + "loss": 0.0403, + "step": 25680 + }, + { + "epoch": 0.056656661513387864, + "grad_norm": 0.14531800150871277, + "learning_rate": 2.9895997700606766e-05, + "loss": 0.04, + "step": 25690 + }, + { + "epoch": 0.056678715488286034, + "grad_norm": 0.1296304166316986, + "learning_rate": 2.9895803238417705e-05, + "loss": 0.0409, + "step": 25700 + }, + { + "epoch": 0.0567007694631842, + "grad_norm": 0.13509240746498108, + "learning_rate": 2.989560859523057e-05, + "loss": 0.0416, + "step": 25710 + }, + { + "epoch": 0.05672282343808236, + "grad_norm": 0.14688830077648163, + "learning_rate": 2.9895413771047726e-05, + "loss": 0.0411, + "step": 25720 + }, + { + "epoch": 0.05674487741298053, + "grad_norm": 0.13343459367752075, + "learning_rate": 2.9895218765871545e-05, + "loss": 0.0394, + "step": 25730 + }, + { + "epoch": 0.056766931387878694, + "grad_norm": 0.11831588298082352, + "learning_rate": 2.989502357970439e-05, + "loss": 0.0417, + "step": 25740 + }, + { + "epoch": 0.05678898536277686, + "grad_norm": 0.1440436989068985, + "learning_rate": 2.989482821254863e-05, + "loss": 0.0423, + "step": 25750 + }, + { + "epoch": 0.05681103933767503, + "grad_norm": 0.14756718277931213, + "learning_rate": 2.9894632664406648e-05, + "loss": 0.0397, + "step": 25760 + }, + { + "epoch": 0.05683309331257319, + "grad_norm": 0.15170073509216309, + "learning_rate": 2.9894436935280822e-05, + "loss": 0.0414, + "step": 25770 + }, + { + "epoch": 0.056855147287471354, + "grad_norm": 0.13788962364196777, + "learning_rate": 2.989424102517352e-05, + "loss": 0.0404, + "step": 25780 + }, + { + "epoch": 0.056877201262369524, + "grad_norm": 0.1380155235528946, + "learning_rate": 2.9894044934087123e-05, + "loss": 0.0417, + "step": 25790 + }, + { + "epoch": 0.05689925523726769, + "grad_norm": 0.13157109916210175, + "learning_rate": 2.989384866202402e-05, + "loss": 0.0413, + "step": 25800 + }, + { + "epoch": 0.05692130921216586, + "grad_norm": 0.1218644455075264, + "learning_rate": 2.9893652208986596e-05, + "loss": 0.0425, + "step": 25810 + }, + { + "epoch": 0.05694336318706402, + "grad_norm": 0.11532926559448242, + "learning_rate": 2.9893455574977235e-05, + "loss": 0.0437, + "step": 25820 + }, + { + "epoch": 0.056965417161962184, + "grad_norm": 0.17828695476055145, + "learning_rate": 2.989325875999833e-05, + "loss": 0.0437, + "step": 25830 + }, + { + "epoch": 0.056987471136860354, + "grad_norm": 0.15860208868980408, + "learning_rate": 2.9893061764052264e-05, + "loss": 0.0405, + "step": 25840 + }, + { + "epoch": 0.05700952511175852, + "grad_norm": 0.12825804948806763, + "learning_rate": 2.989286458714144e-05, + "loss": 0.0409, + "step": 25850 + }, + { + "epoch": 0.05703157908665668, + "grad_norm": 0.0952877327799797, + "learning_rate": 2.989266722926825e-05, + "loss": 0.0405, + "step": 25860 + }, + { + "epoch": 0.05705363306155485, + "grad_norm": 0.14285479485988617, + "learning_rate": 2.9892469690435092e-05, + "loss": 0.0409, + "step": 25870 + }, + { + "epoch": 0.057075687036453014, + "grad_norm": 0.13320821523666382, + "learning_rate": 2.9892271970644364e-05, + "loss": 0.0409, + "step": 25880 + }, + { + "epoch": 0.057097741011351184, + "grad_norm": 0.1803089678287506, + "learning_rate": 2.9892074069898474e-05, + "loss": 0.0403, + "step": 25890 + }, + { + "epoch": 0.05711979498624935, + "grad_norm": 0.10214867442846298, + "learning_rate": 2.989187598819982e-05, + "loss": 0.041, + "step": 25900 + }, + { + "epoch": 0.05714184896114751, + "grad_norm": 0.14007967710494995, + "learning_rate": 2.9891677725550812e-05, + "loss": 0.0406, + "step": 25910 + }, + { + "epoch": 0.05716390293604568, + "grad_norm": 0.12093830108642578, + "learning_rate": 2.9891479281953866e-05, + "loss": 0.0436, + "step": 25920 + }, + { + "epoch": 0.057185956910943844, + "grad_norm": 0.1697174310684204, + "learning_rate": 2.989128065741138e-05, + "loss": 0.0411, + "step": 25930 + }, + { + "epoch": 0.05720801088584201, + "grad_norm": 0.15494808554649353, + "learning_rate": 2.9891081851925776e-05, + "loss": 0.0436, + "step": 25940 + }, + { + "epoch": 0.05723006486074018, + "grad_norm": 0.16910478472709656, + "learning_rate": 2.989088286549947e-05, + "loss": 0.0428, + "step": 25950 + }, + { + "epoch": 0.05725211883563834, + "grad_norm": 0.11996647715568542, + "learning_rate": 2.989068369813488e-05, + "loss": 0.0412, + "step": 25960 + }, + { + "epoch": 0.057274172810536504, + "grad_norm": 0.1404450684785843, + "learning_rate": 2.989048434983442e-05, + "loss": 0.0442, + "step": 25970 + }, + { + "epoch": 0.057296226785434674, + "grad_norm": 0.15079793334007263, + "learning_rate": 2.989028482060052e-05, + "loss": 0.044, + "step": 25980 + }, + { + "epoch": 0.05731828076033284, + "grad_norm": 0.1548641324043274, + "learning_rate": 2.9890085110435593e-05, + "loss": 0.0403, + "step": 25990 + }, + { + "epoch": 0.05734033473523101, + "grad_norm": 0.17291542887687683, + "learning_rate": 2.9889885219342077e-05, + "loss": 0.0424, + "step": 26000 + }, + { + "epoch": 0.05736238871012917, + "grad_norm": 0.11859870702028275, + "learning_rate": 2.98896851473224e-05, + "loss": 0.0401, + "step": 26010 + }, + { + "epoch": 0.057384442685027334, + "grad_norm": 0.12797221541404724, + "learning_rate": 2.9889484894378987e-05, + "loss": 0.0405, + "step": 26020 + }, + { + "epoch": 0.057406496659925504, + "grad_norm": 0.16732501983642578, + "learning_rate": 2.9889284460514278e-05, + "loss": 0.0422, + "step": 26030 + }, + { + "epoch": 0.05742855063482367, + "grad_norm": 0.12476964294910431, + "learning_rate": 2.9889083845730704e-05, + "loss": 0.0395, + "step": 26040 + }, + { + "epoch": 0.05745060460972183, + "grad_norm": 0.10633287578821182, + "learning_rate": 2.98888830500307e-05, + "loss": 0.0402, + "step": 26050 + }, + { + "epoch": 0.05747265858462, + "grad_norm": 0.11220492422580719, + "learning_rate": 2.9888682073416714e-05, + "loss": 0.0411, + "step": 26060 + }, + { + "epoch": 0.057494712559518164, + "grad_norm": 0.1310124397277832, + "learning_rate": 2.9888480915891182e-05, + "loss": 0.0422, + "step": 26070 + }, + { + "epoch": 0.05751676653441633, + "grad_norm": 0.12427566200494766, + "learning_rate": 2.988827957745655e-05, + "loss": 0.0404, + "step": 26080 + }, + { + "epoch": 0.0575388205093145, + "grad_norm": 0.14590409398078918, + "learning_rate": 2.9888078058115265e-05, + "loss": 0.0403, + "step": 26090 + }, + { + "epoch": 0.05756087448421266, + "grad_norm": 0.0901922881603241, + "learning_rate": 2.9887876357869774e-05, + "loss": 0.039, + "step": 26100 + }, + { + "epoch": 0.05758292845911083, + "grad_norm": 0.12351728975772858, + "learning_rate": 2.9887674476722527e-05, + "loss": 0.0412, + "step": 26110 + }, + { + "epoch": 0.057604982434008994, + "grad_norm": 0.10906308144330978, + "learning_rate": 2.988747241467598e-05, + "loss": 0.0416, + "step": 26120 + }, + { + "epoch": 0.05762703640890716, + "grad_norm": 0.16973362863063812, + "learning_rate": 2.9887270171732585e-05, + "loss": 0.0411, + "step": 26130 + }, + { + "epoch": 0.05764909038380533, + "grad_norm": 0.14753492176532745, + "learning_rate": 2.988706774789481e-05, + "loss": 0.0405, + "step": 26140 + }, + { + "epoch": 0.05767114435870349, + "grad_norm": 0.23904231190681458, + "learning_rate": 2.9886865143165093e-05, + "loss": 0.0421, + "step": 26150 + }, + { + "epoch": 0.057693198333601654, + "grad_norm": 0.14369241893291473, + "learning_rate": 2.9886662357545918e-05, + "loss": 0.0413, + "step": 26160 + }, + { + "epoch": 0.057715252308499825, + "grad_norm": 0.13422265648841858, + "learning_rate": 2.988645939103974e-05, + "loss": 0.0394, + "step": 26170 + }, + { + "epoch": 0.05773730628339799, + "grad_norm": 0.09970229864120483, + "learning_rate": 2.988625624364902e-05, + "loss": 0.0412, + "step": 26180 + }, + { + "epoch": 0.05775936025829615, + "grad_norm": 0.13660478591918945, + "learning_rate": 2.9886052915376238e-05, + "loss": 0.0417, + "step": 26190 + }, + { + "epoch": 0.05778141423319432, + "grad_norm": 0.19541829824447632, + "learning_rate": 2.9885849406223854e-05, + "loss": 0.0424, + "step": 26200 + }, + { + "epoch": 0.057803468208092484, + "grad_norm": 0.14562739431858063, + "learning_rate": 2.9885645716194347e-05, + "loss": 0.0428, + "step": 26210 + }, + { + "epoch": 0.057825522182990655, + "grad_norm": 0.1605752408504486, + "learning_rate": 2.988544184529019e-05, + "loss": 0.0429, + "step": 26220 + }, + { + "epoch": 0.05784757615788882, + "grad_norm": 0.14193572103977203, + "learning_rate": 2.988523779351386e-05, + "loss": 0.0409, + "step": 26230 + }, + { + "epoch": 0.05786963013278698, + "grad_norm": 0.11842799186706543, + "learning_rate": 2.988503356086784e-05, + "loss": 0.0397, + "step": 26240 + }, + { + "epoch": 0.05789168410768515, + "grad_norm": 0.1506536453962326, + "learning_rate": 2.9884829147354603e-05, + "loss": 0.0413, + "step": 26250 + }, + { + "epoch": 0.057913738082583315, + "grad_norm": 0.16807208955287933, + "learning_rate": 2.9884624552976638e-05, + "loss": 0.0403, + "step": 26260 + }, + { + "epoch": 0.05793579205748148, + "grad_norm": 0.1403999775648117, + "learning_rate": 2.9884419777736433e-05, + "loss": 0.0406, + "step": 26270 + }, + { + "epoch": 0.05795784603237965, + "grad_norm": 0.15767262876033783, + "learning_rate": 2.9884214821636476e-05, + "loss": 0.0391, + "step": 26280 + }, + { + "epoch": 0.05797990000727781, + "grad_norm": 0.1489197164773941, + "learning_rate": 2.9884009684679254e-05, + "loss": 0.0413, + "step": 26290 + }, + { + "epoch": 0.058001953982175974, + "grad_norm": 0.19224485754966736, + "learning_rate": 2.9883804366867267e-05, + "loss": 0.0422, + "step": 26300 + }, + { + "epoch": 0.058024007957074145, + "grad_norm": 0.14268513023853302, + "learning_rate": 2.9883598868203e-05, + "loss": 0.0412, + "step": 26310 + }, + { + "epoch": 0.05804606193197231, + "grad_norm": 0.1518898755311966, + "learning_rate": 2.9883393188688952e-05, + "loss": 0.0414, + "step": 26320 + }, + { + "epoch": 0.05806811590687048, + "grad_norm": 0.14315539598464966, + "learning_rate": 2.9883187328327626e-05, + "loss": 0.0434, + "step": 26330 + }, + { + "epoch": 0.05809016988176864, + "grad_norm": 0.10372531414031982, + "learning_rate": 2.9882981287121523e-05, + "loss": 0.046, + "step": 26340 + }, + { + "epoch": 0.058112223856666805, + "grad_norm": 0.10388166457414627, + "learning_rate": 2.9882775065073146e-05, + "loss": 0.0435, + "step": 26350 + }, + { + "epoch": 0.058134277831564975, + "grad_norm": 0.1287616789340973, + "learning_rate": 2.9882568662184998e-05, + "loss": 0.0423, + "step": 26360 + }, + { + "epoch": 0.05815633180646314, + "grad_norm": 0.12142083793878555, + "learning_rate": 2.988236207845959e-05, + "loss": 0.0415, + "step": 26370 + }, + { + "epoch": 0.0581783857813613, + "grad_norm": 0.10860642790794373, + "learning_rate": 2.9882155313899436e-05, + "loss": 0.0405, + "step": 26380 + }, + { + "epoch": 0.05820043975625947, + "grad_norm": 0.1422036588191986, + "learning_rate": 2.988194836850704e-05, + "loss": 0.0402, + "step": 26390 + }, + { + "epoch": 0.058222493731157635, + "grad_norm": 0.17354069650173187, + "learning_rate": 2.988174124228492e-05, + "loss": 0.0407, + "step": 26400 + }, + { + "epoch": 0.0582445477060558, + "grad_norm": 0.15918338298797607, + "learning_rate": 2.988153393523559e-05, + "loss": 0.0409, + "step": 26410 + }, + { + "epoch": 0.05826660168095397, + "grad_norm": 0.1390332728624344, + "learning_rate": 2.9881326447361575e-05, + "loss": 0.0409, + "step": 26420 + }, + { + "epoch": 0.05828865565585213, + "grad_norm": 0.11822380870580673, + "learning_rate": 2.9881118778665398e-05, + "loss": 0.0399, + "step": 26430 + }, + { + "epoch": 0.0583107096307503, + "grad_norm": 0.14430733025074005, + "learning_rate": 2.988091092914957e-05, + "loss": 0.0406, + "step": 26440 + }, + { + "epoch": 0.058332763605648465, + "grad_norm": 0.16098544001579285, + "learning_rate": 2.988070289881663e-05, + "loss": 0.0415, + "step": 26450 + }, + { + "epoch": 0.05835481758054663, + "grad_norm": 0.11095870286226273, + "learning_rate": 2.98804946876691e-05, + "loss": 0.04, + "step": 26460 + }, + { + "epoch": 0.0583768715554448, + "grad_norm": 0.15605595707893372, + "learning_rate": 2.9880286295709506e-05, + "loss": 0.0422, + "step": 26470 + }, + { + "epoch": 0.05839892553034296, + "grad_norm": 0.1462002545595169, + "learning_rate": 2.988007772294039e-05, + "loss": 0.0418, + "step": 26480 + }, + { + "epoch": 0.058420979505241125, + "grad_norm": 0.15688292682170868, + "learning_rate": 2.9879868969364275e-05, + "loss": 0.041, + "step": 26490 + }, + { + "epoch": 0.058443033480139295, + "grad_norm": 0.12990860641002655, + "learning_rate": 2.9879660034983706e-05, + "loss": 0.0395, + "step": 26500 + }, + { + "epoch": 0.05846508745503746, + "grad_norm": 0.14522969722747803, + "learning_rate": 2.987945091980122e-05, + "loss": 0.0431, + "step": 26510 + }, + { + "epoch": 0.05848714142993562, + "grad_norm": 0.13230520486831665, + "learning_rate": 2.9879241623819355e-05, + "loss": 0.0411, + "step": 26520 + }, + { + "epoch": 0.05850919540483379, + "grad_norm": 0.11985062807798386, + "learning_rate": 2.9879032147040655e-05, + "loss": 0.042, + "step": 26530 + }, + { + "epoch": 0.058531249379731955, + "grad_norm": 0.11342626810073853, + "learning_rate": 2.9878822489467668e-05, + "loss": 0.0408, + "step": 26540 + }, + { + "epoch": 0.058553303354630125, + "grad_norm": 0.14327961206436157, + "learning_rate": 2.9878612651102936e-05, + "loss": 0.0377, + "step": 26550 + }, + { + "epoch": 0.05857535732952829, + "grad_norm": 0.12413133680820465, + "learning_rate": 2.9878402631949016e-05, + "loss": 0.0427, + "step": 26560 + }, + { + "epoch": 0.05859741130442645, + "grad_norm": 0.11555052548646927, + "learning_rate": 2.9878192432008456e-05, + "loss": 0.0401, + "step": 26570 + }, + { + "epoch": 0.05861946527932462, + "grad_norm": 0.12320105731487274, + "learning_rate": 2.987798205128381e-05, + "loss": 0.0397, + "step": 26580 + }, + { + "epoch": 0.058641519254222785, + "grad_norm": 0.1389295756816864, + "learning_rate": 2.9877771489777636e-05, + "loss": 0.0399, + "step": 26590 + }, + { + "epoch": 0.05866357322912095, + "grad_norm": 0.17997707426548004, + "learning_rate": 2.9877560747492492e-05, + "loss": 0.0402, + "step": 26600 + }, + { + "epoch": 0.05868562720401912, + "grad_norm": 0.10528534650802612, + "learning_rate": 2.9877349824430937e-05, + "loss": 0.041, + "step": 26610 + }, + { + "epoch": 0.05870768117891728, + "grad_norm": 0.15602608025074005, + "learning_rate": 2.9877138720595532e-05, + "loss": 0.0421, + "step": 26620 + }, + { + "epoch": 0.058729735153815445, + "grad_norm": 0.13450275361537933, + "learning_rate": 2.987692743598885e-05, + "loss": 0.0409, + "step": 26630 + }, + { + "epoch": 0.058751789128713615, + "grad_norm": 0.1255316585302353, + "learning_rate": 2.987671597061345e-05, + "loss": 0.0405, + "step": 26640 + }, + { + "epoch": 0.05877384310361178, + "grad_norm": 0.13597525656223297, + "learning_rate": 2.9876504324471907e-05, + "loss": 0.0398, + "step": 26650 + }, + { + "epoch": 0.05879589707850995, + "grad_norm": 0.1578090339899063, + "learning_rate": 2.9876292497566796e-05, + "loss": 0.0427, + "step": 26660 + }, + { + "epoch": 0.05881795105340811, + "grad_norm": 0.11544512957334518, + "learning_rate": 2.987608048990068e-05, + "loss": 0.0406, + "step": 26670 + }, + { + "epoch": 0.058840005028306275, + "grad_norm": 0.15891769528388977, + "learning_rate": 2.9875868301476138e-05, + "loss": 0.0419, + "step": 26680 + }, + { + "epoch": 0.058862059003204445, + "grad_norm": 0.11127921938896179, + "learning_rate": 2.9875655932295754e-05, + "loss": 0.0392, + "step": 26690 + }, + { + "epoch": 0.05888411297810261, + "grad_norm": 0.10832587629556656, + "learning_rate": 2.9875443382362107e-05, + "loss": 0.041, + "step": 26700 + }, + { + "epoch": 0.05890616695300077, + "grad_norm": 0.1262553334236145, + "learning_rate": 2.9875230651677775e-05, + "loss": 0.0396, + "step": 26710 + }, + { + "epoch": 0.05892822092789894, + "grad_norm": 0.13042216002941132, + "learning_rate": 2.9875017740245346e-05, + "loss": 0.0398, + "step": 26720 + }, + { + "epoch": 0.058950274902797105, + "grad_norm": 0.10920734703540802, + "learning_rate": 2.987480464806741e-05, + "loss": 0.0397, + "step": 26730 + }, + { + "epoch": 0.05897232887769527, + "grad_norm": 0.10912954807281494, + "learning_rate": 2.9874591375146554e-05, + "loss": 0.0408, + "step": 26740 + }, + { + "epoch": 0.05899438285259344, + "grad_norm": 0.12474779039621353, + "learning_rate": 2.9874377921485365e-05, + "loss": 0.0393, + "step": 26750 + }, + { + "epoch": 0.0590164368274916, + "grad_norm": 0.15549425780773163, + "learning_rate": 2.9874164287086443e-05, + "loss": 0.041, + "step": 26760 + }, + { + "epoch": 0.05903849080238977, + "grad_norm": 0.1472315639257431, + "learning_rate": 2.9873950471952378e-05, + "loss": 0.0395, + "step": 26770 + }, + { + "epoch": 0.059060544777287935, + "grad_norm": 0.11477434635162354, + "learning_rate": 2.9873736476085778e-05, + "loss": 0.0403, + "step": 26780 + }, + { + "epoch": 0.0590825987521861, + "grad_norm": 0.14426305890083313, + "learning_rate": 2.9873522299489232e-05, + "loss": 0.0426, + "step": 26790 + }, + { + "epoch": 0.05910465272708427, + "grad_norm": 0.12886333465576172, + "learning_rate": 2.987330794216535e-05, + "loss": 0.0416, + "step": 26800 + }, + { + "epoch": 0.05912670670198243, + "grad_norm": 0.17773790657520294, + "learning_rate": 2.987309340411673e-05, + "loss": 0.042, + "step": 26810 + }, + { + "epoch": 0.059148760676880595, + "grad_norm": 0.14048101007938385, + "learning_rate": 2.9872878685345985e-05, + "loss": 0.0426, + "step": 26820 + }, + { + "epoch": 0.059170814651778765, + "grad_norm": 0.11601050943136215, + "learning_rate": 2.987266378585572e-05, + "loss": 0.0408, + "step": 26830 + }, + { + "epoch": 0.05919286862667693, + "grad_norm": 0.08874809741973877, + "learning_rate": 2.987244870564855e-05, + "loss": 0.0428, + "step": 26840 + }, + { + "epoch": 0.05921492260157509, + "grad_norm": 0.12928380072116852, + "learning_rate": 2.9872233444727085e-05, + "loss": 0.039, + "step": 26850 + }, + { + "epoch": 0.05923697657647326, + "grad_norm": 0.13318952918052673, + "learning_rate": 2.987201800309394e-05, + "loss": 0.0404, + "step": 26860 + }, + { + "epoch": 0.059259030551371425, + "grad_norm": 0.10955564677715302, + "learning_rate": 2.9871802380751735e-05, + "loss": 0.0417, + "step": 26870 + }, + { + "epoch": 0.059281084526269595, + "grad_norm": 0.14069132506847382, + "learning_rate": 2.9871586577703096e-05, + "loss": 0.0419, + "step": 26880 + }, + { + "epoch": 0.05930313850116776, + "grad_norm": 0.1151774451136589, + "learning_rate": 2.9871370593950634e-05, + "loss": 0.04, + "step": 26890 + }, + { + "epoch": 0.05932519247606592, + "grad_norm": 0.12355891615152359, + "learning_rate": 2.9871154429496977e-05, + "loss": 0.0407, + "step": 26900 + }, + { + "epoch": 0.05934724645096409, + "grad_norm": 0.12852632999420166, + "learning_rate": 2.987093808434476e-05, + "loss": 0.0405, + "step": 26910 + }, + { + "epoch": 0.059369300425862255, + "grad_norm": 0.09877955168485641, + "learning_rate": 2.9870721558496594e-05, + "loss": 0.0403, + "step": 26920 + }, + { + "epoch": 0.05939135440076042, + "grad_norm": 0.10744911432266235, + "learning_rate": 2.987050485195513e-05, + "loss": 0.0409, + "step": 26930 + }, + { + "epoch": 0.05941340837565859, + "grad_norm": 0.12010210007429123, + "learning_rate": 2.987028796472299e-05, + "loss": 0.0426, + "step": 26940 + }, + { + "epoch": 0.05943546235055675, + "grad_norm": 0.2059929221868515, + "learning_rate": 2.9870070896802806e-05, + "loss": 0.0419, + "step": 26950 + }, + { + "epoch": 0.059457516325454915, + "grad_norm": 0.1526949256658554, + "learning_rate": 2.9869853648197226e-05, + "loss": 0.0405, + "step": 26960 + }, + { + "epoch": 0.059479570300353085, + "grad_norm": 0.136229008436203, + "learning_rate": 2.9869636218908883e-05, + "loss": 0.0387, + "step": 26970 + }, + { + "epoch": 0.05950162427525125, + "grad_norm": 0.09610627591609955, + "learning_rate": 2.986941860894042e-05, + "loss": 0.0394, + "step": 26980 + }, + { + "epoch": 0.05952367825014942, + "grad_norm": 0.1505943089723587, + "learning_rate": 2.9869200818294487e-05, + "loss": 0.0414, + "step": 26990 + }, + { + "epoch": 0.05954573222504758, + "grad_norm": 0.14940571784973145, + "learning_rate": 2.9868982846973718e-05, + "loss": 0.0416, + "step": 27000 + }, + { + "epoch": 0.059567786199945745, + "grad_norm": 0.13416624069213867, + "learning_rate": 2.986876469498077e-05, + "loss": 0.041, + "step": 27010 + }, + { + "epoch": 0.059589840174843915, + "grad_norm": 0.10202611237764359, + "learning_rate": 2.9868546362318295e-05, + "loss": 0.0426, + "step": 27020 + }, + { + "epoch": 0.05961189414974208, + "grad_norm": 0.10458333045244217, + "learning_rate": 2.9868327848988947e-05, + "loss": 0.0423, + "step": 27030 + }, + { + "epoch": 0.05963394812464024, + "grad_norm": 0.1145956739783287, + "learning_rate": 2.986810915499537e-05, + "loss": 0.0394, + "step": 27040 + }, + { + "epoch": 0.05965600209953841, + "grad_norm": 0.1016496792435646, + "learning_rate": 2.9867890280340236e-05, + "loss": 0.041, + "step": 27050 + }, + { + "epoch": 0.059678056074436575, + "grad_norm": 0.12530624866485596, + "learning_rate": 2.9867671225026194e-05, + "loss": 0.0427, + "step": 27060 + }, + { + "epoch": 0.059700110049334745, + "grad_norm": 0.13447244465351105, + "learning_rate": 2.9867451989055908e-05, + "loss": 0.042, + "step": 27070 + }, + { + "epoch": 0.05972216402423291, + "grad_norm": 0.11104889959096909, + "learning_rate": 2.9867232572432044e-05, + "loss": 0.0385, + "step": 27080 + }, + { + "epoch": 0.05974421799913107, + "grad_norm": 0.12748046219348907, + "learning_rate": 2.9867012975157272e-05, + "loss": 0.0397, + "step": 27090 + }, + { + "epoch": 0.05976627197402924, + "grad_norm": 0.07917668670415878, + "learning_rate": 2.986679319723425e-05, + "loss": 0.0409, + "step": 27100 + }, + { + "epoch": 0.059788325948927405, + "grad_norm": 0.12099507451057434, + "learning_rate": 2.986657323866565e-05, + "loss": 0.0418, + "step": 27110 + }, + { + "epoch": 0.05981037992382557, + "grad_norm": 0.15399223566055298, + "learning_rate": 2.9866353099454157e-05, + "loss": 0.0416, + "step": 27120 + }, + { + "epoch": 0.05983243389872374, + "grad_norm": 0.1321702003479004, + "learning_rate": 2.9866132779602436e-05, + "loss": 0.0412, + "step": 27130 + }, + { + "epoch": 0.0598544878736219, + "grad_norm": 0.100630983710289, + "learning_rate": 2.9865912279113163e-05, + "loss": 0.0405, + "step": 27140 + }, + { + "epoch": 0.059876541848520065, + "grad_norm": 0.13283200562000275, + "learning_rate": 2.986569159798902e-05, + "loss": 0.0412, + "step": 27150 + }, + { + "epoch": 0.059898595823418235, + "grad_norm": 0.17611736059188843, + "learning_rate": 2.986547073623269e-05, + "loss": 0.0421, + "step": 27160 + }, + { + "epoch": 0.0599206497983164, + "grad_norm": 0.12965168058872223, + "learning_rate": 2.9865249693846856e-05, + "loss": 0.0413, + "step": 27170 + }, + { + "epoch": 0.05994270377321457, + "grad_norm": 0.14599281549453735, + "learning_rate": 2.9865028470834207e-05, + "loss": 0.0418, + "step": 27180 + }, + { + "epoch": 0.05996475774811273, + "grad_norm": 0.15825165808200836, + "learning_rate": 2.9864807067197422e-05, + "loss": 0.0406, + "step": 27190 + }, + { + "epoch": 0.059986811723010895, + "grad_norm": 0.1114754006266594, + "learning_rate": 2.9864585482939193e-05, + "loss": 0.0403, + "step": 27200 + }, + { + "epoch": 0.060008865697909065, + "grad_norm": 0.12690481543540955, + "learning_rate": 2.986436371806222e-05, + "loss": 0.0404, + "step": 27210 + }, + { + "epoch": 0.06003091967280723, + "grad_norm": 0.11872388422489166, + "learning_rate": 2.9864141772569193e-05, + "loss": 0.0411, + "step": 27220 + }, + { + "epoch": 0.06005297364770539, + "grad_norm": 0.10728232562541962, + "learning_rate": 2.9863919646462812e-05, + "loss": 0.0419, + "step": 27230 + }, + { + "epoch": 0.06007502762260356, + "grad_norm": 0.14394530653953552, + "learning_rate": 2.986369733974577e-05, + "loss": 0.0403, + "step": 27240 + }, + { + "epoch": 0.060097081597501725, + "grad_norm": 0.11729294061660767, + "learning_rate": 2.9863474852420773e-05, + "loss": 0.0401, + "step": 27250 + }, + { + "epoch": 0.06011913557239989, + "grad_norm": 0.15052500367164612, + "learning_rate": 2.9863252184490526e-05, + "loss": 0.0411, + "step": 27260 + }, + { + "epoch": 0.06014118954729806, + "grad_norm": 0.1415242999792099, + "learning_rate": 2.9863029335957727e-05, + "loss": 0.039, + "step": 27270 + }, + { + "epoch": 0.06016324352219622, + "grad_norm": 0.1449899673461914, + "learning_rate": 2.986280630682509e-05, + "loss": 0.0418, + "step": 27280 + }, + { + "epoch": 0.06018529749709439, + "grad_norm": 0.11455963551998138, + "learning_rate": 2.9862583097095324e-05, + "loss": 0.0408, + "step": 27290 + }, + { + "epoch": 0.060207351471992555, + "grad_norm": 0.138144388794899, + "learning_rate": 2.9862359706771137e-05, + "loss": 0.0396, + "step": 27300 + }, + { + "epoch": 0.06022940544689072, + "grad_norm": 0.11457172781229019, + "learning_rate": 2.9862136135855254e-05, + "loss": 0.0424, + "step": 27310 + }, + { + "epoch": 0.06025145942178889, + "grad_norm": 0.11892331391572952, + "learning_rate": 2.986191238435038e-05, + "loss": 0.042, + "step": 27320 + }, + { + "epoch": 0.06027351339668705, + "grad_norm": 0.1583673655986786, + "learning_rate": 2.9861688452259245e-05, + "loss": 0.0414, + "step": 27330 + }, + { + "epoch": 0.060295567371585215, + "grad_norm": 0.13613945245742798, + "learning_rate": 2.9861464339584555e-05, + "loss": 0.0409, + "step": 27340 + }, + { + "epoch": 0.060317621346483385, + "grad_norm": 0.12623284757137299, + "learning_rate": 2.9861240046329047e-05, + "loss": 0.0396, + "step": 27350 + }, + { + "epoch": 0.06033967532138155, + "grad_norm": 0.11654625833034515, + "learning_rate": 2.986101557249544e-05, + "loss": 0.0377, + "step": 27360 + }, + { + "epoch": 0.06036172929627971, + "grad_norm": 0.1456069052219391, + "learning_rate": 2.986079091808646e-05, + "loss": 0.0422, + "step": 27370 + }, + { + "epoch": 0.06038378327117788, + "grad_norm": 0.1196906790137291, + "learning_rate": 2.9860566083104842e-05, + "loss": 0.0411, + "step": 27380 + }, + { + "epoch": 0.060405837246076045, + "grad_norm": 0.14997151494026184, + "learning_rate": 2.9860341067553315e-05, + "loss": 0.0389, + "step": 27390 + }, + { + "epoch": 0.060427891220974216, + "grad_norm": 0.13210055232048035, + "learning_rate": 2.9860115871434615e-05, + "loss": 0.0409, + "step": 27400 + }, + { + "epoch": 0.06044994519587238, + "grad_norm": 0.12531565129756927, + "learning_rate": 2.9859890494751475e-05, + "loss": 0.0423, + "step": 27410 + }, + { + "epoch": 0.06047199917077054, + "grad_norm": 0.16788212954998016, + "learning_rate": 2.9859664937506637e-05, + "loss": 0.0408, + "step": 27420 + }, + { + "epoch": 0.06049405314566871, + "grad_norm": 0.14849580824375153, + "learning_rate": 2.9859439199702834e-05, + "loss": 0.0415, + "step": 27430 + }, + { + "epoch": 0.060516107120566875, + "grad_norm": 0.1455472856760025, + "learning_rate": 2.9859213281342823e-05, + "loss": 0.0409, + "step": 27440 + }, + { + "epoch": 0.06053816109546504, + "grad_norm": 0.1451481133699417, + "learning_rate": 2.985898718242934e-05, + "loss": 0.0413, + "step": 27450 + }, + { + "epoch": 0.06056021507036321, + "grad_norm": 0.1169634535908699, + "learning_rate": 2.985876090296513e-05, + "loss": 0.0412, + "step": 27460 + }, + { + "epoch": 0.06058226904526137, + "grad_norm": 0.121064193546772, + "learning_rate": 2.985853444295295e-05, + "loss": 0.0396, + "step": 27470 + }, + { + "epoch": 0.060604323020159535, + "grad_norm": 0.12723946571350098, + "learning_rate": 2.9858307802395545e-05, + "loss": 0.0405, + "step": 27480 + }, + { + "epoch": 0.060626376995057706, + "grad_norm": 0.11694926023483276, + "learning_rate": 2.9858080981295672e-05, + "loss": 0.0388, + "step": 27490 + }, + { + "epoch": 0.06064843096995587, + "grad_norm": 0.15681228041648865, + "learning_rate": 2.9857853979656086e-05, + "loss": 0.0393, + "step": 27500 + }, + { + "epoch": 0.06067048494485404, + "grad_norm": 0.16649499535560608, + "learning_rate": 2.985762679747955e-05, + "loss": 0.0398, + "step": 27510 + }, + { + "epoch": 0.0606925389197522, + "grad_norm": 0.1271468549966812, + "learning_rate": 2.985739943476882e-05, + "loss": 0.0413, + "step": 27520 + }, + { + "epoch": 0.060714592894650365, + "grad_norm": 0.10596849024295807, + "learning_rate": 2.9857171891526653e-05, + "loss": 0.0417, + "step": 27530 + }, + { + "epoch": 0.060736646869548536, + "grad_norm": 0.12532824277877808, + "learning_rate": 2.9856944167755823e-05, + "loss": 0.0419, + "step": 27540 + }, + { + "epoch": 0.0607587008444467, + "grad_norm": 0.19138681888580322, + "learning_rate": 2.9856716263459094e-05, + "loss": 0.0398, + "step": 27550 + }, + { + "epoch": 0.06078075481934486, + "grad_norm": 0.1161988228559494, + "learning_rate": 2.9856488178639237e-05, + "loss": 0.042, + "step": 27560 + }, + { + "epoch": 0.06080280879424303, + "grad_norm": 0.11985455453395844, + "learning_rate": 2.985625991329902e-05, + "loss": 0.038, + "step": 27570 + }, + { + "epoch": 0.060824862769141196, + "grad_norm": 0.12654651701450348, + "learning_rate": 2.985603146744122e-05, + "loss": 0.0395, + "step": 27580 + }, + { + "epoch": 0.06084691674403936, + "grad_norm": 0.15599696338176727, + "learning_rate": 2.985580284106861e-05, + "loss": 0.0408, + "step": 27590 + }, + { + "epoch": 0.06086897071893753, + "grad_norm": 0.11874181777238846, + "learning_rate": 2.985557403418397e-05, + "loss": 0.0438, + "step": 27600 + }, + { + "epoch": 0.06089102469383569, + "grad_norm": 0.11185019463300705, + "learning_rate": 2.9855345046790078e-05, + "loss": 0.0396, + "step": 27610 + }, + { + "epoch": 0.06091307866873386, + "grad_norm": 0.12230867892503738, + "learning_rate": 2.985511587888972e-05, + "loss": 0.0399, + "step": 27620 + }, + { + "epoch": 0.060935132643632026, + "grad_norm": 0.09117672592401505, + "learning_rate": 2.985488653048568e-05, + "loss": 0.0405, + "step": 27630 + }, + { + "epoch": 0.06095718661853019, + "grad_norm": 0.13617689907550812, + "learning_rate": 2.9854657001580736e-05, + "loss": 0.0408, + "step": 27640 + }, + { + "epoch": 0.06097924059342836, + "grad_norm": 0.1227908656001091, + "learning_rate": 2.985442729217769e-05, + "loss": 0.0425, + "step": 27650 + }, + { + "epoch": 0.06100129456832652, + "grad_norm": 0.13349805772304535, + "learning_rate": 2.985419740227933e-05, + "loss": 0.0414, + "step": 27660 + }, + { + "epoch": 0.061023348543224686, + "grad_norm": 0.13358959555625916, + "learning_rate": 2.985396733188844e-05, + "loss": 0.0381, + "step": 27670 + }, + { + "epoch": 0.061045402518122856, + "grad_norm": 0.11964410543441772, + "learning_rate": 2.985373708100783e-05, + "loss": 0.0422, + "step": 27680 + }, + { + "epoch": 0.06106745649302102, + "grad_norm": 0.1844208985567093, + "learning_rate": 2.9853506649640285e-05, + "loss": 0.0408, + "step": 27690 + }, + { + "epoch": 0.06108951046791918, + "grad_norm": 0.11621459573507309, + "learning_rate": 2.9853276037788612e-05, + "loss": 0.0423, + "step": 27700 + }, + { + "epoch": 0.06111156444281735, + "grad_norm": 0.15627427399158478, + "learning_rate": 2.9853045245455613e-05, + "loss": 0.041, + "step": 27710 + }, + { + "epoch": 0.061133618417715516, + "grad_norm": 0.13563159108161926, + "learning_rate": 2.9852814272644085e-05, + "loss": 0.0388, + "step": 27720 + }, + { + "epoch": 0.061155672392613686, + "grad_norm": 0.13995076715946198, + "learning_rate": 2.9852583119356845e-05, + "loss": 0.0398, + "step": 27730 + }, + { + "epoch": 0.06117772636751185, + "grad_norm": 0.12702986598014832, + "learning_rate": 2.9852351785596697e-05, + "loss": 0.0406, + "step": 27740 + }, + { + "epoch": 0.06119978034241001, + "grad_norm": 0.1327405720949173, + "learning_rate": 2.985212027136645e-05, + "loss": 0.0431, + "step": 27750 + }, + { + "epoch": 0.06122183431730818, + "grad_norm": 0.11716236919164658, + "learning_rate": 2.985188857666892e-05, + "loss": 0.0411, + "step": 27760 + }, + { + "epoch": 0.061243888292206346, + "grad_norm": 0.15392060577869415, + "learning_rate": 2.9851656701506917e-05, + "loss": 0.0411, + "step": 27770 + }, + { + "epoch": 0.06126594226710451, + "grad_norm": 0.15711696445941925, + "learning_rate": 2.985142464588327e-05, + "loss": 0.0392, + "step": 27780 + }, + { + "epoch": 0.06128799624200268, + "grad_norm": 0.12670299410820007, + "learning_rate": 2.985119240980079e-05, + "loss": 0.042, + "step": 27790 + }, + { + "epoch": 0.06131005021690084, + "grad_norm": 0.11741341650485992, + "learning_rate": 2.9850959993262293e-05, + "loss": 0.0392, + "step": 27800 + }, + { + "epoch": 0.061332104191799006, + "grad_norm": 0.12973445653915405, + "learning_rate": 2.985072739627062e-05, + "loss": 0.0411, + "step": 27810 + }, + { + "epoch": 0.061354158166697176, + "grad_norm": 0.1422763168811798, + "learning_rate": 2.985049461882858e-05, + "loss": 0.0408, + "step": 27820 + }, + { + "epoch": 0.06137621214159534, + "grad_norm": 0.13685883581638336, + "learning_rate": 2.9850261660939014e-05, + "loss": 0.0404, + "step": 27830 + }, + { + "epoch": 0.06139826611649351, + "grad_norm": 0.14105063676834106, + "learning_rate": 2.985002852260475e-05, + "loss": 0.0429, + "step": 27840 + }, + { + "epoch": 0.06142032009139167, + "grad_norm": 0.10450851172208786, + "learning_rate": 2.9849795203828612e-05, + "loss": 0.0415, + "step": 27850 + }, + { + "epoch": 0.061442374066289836, + "grad_norm": 0.10296732932329178, + "learning_rate": 2.9849561704613447e-05, + "loss": 0.0394, + "step": 27860 + }, + { + "epoch": 0.061464428041188006, + "grad_norm": 0.13777801394462585, + "learning_rate": 2.9849328024962084e-05, + "loss": 0.0427, + "step": 27870 + }, + { + "epoch": 0.06148648201608617, + "grad_norm": 0.11568380147218704, + "learning_rate": 2.9849094164877368e-05, + "loss": 0.0404, + "step": 27880 + }, + { + "epoch": 0.06150853599098433, + "grad_norm": 0.11776283383369446, + "learning_rate": 2.9848860124362135e-05, + "loss": 0.0408, + "step": 27890 + }, + { + "epoch": 0.0615305899658825, + "grad_norm": 0.15262870490550995, + "learning_rate": 2.9848625903419232e-05, + "loss": 0.0407, + "step": 27900 + }, + { + "epoch": 0.061552643940780666, + "grad_norm": 0.16332118213176727, + "learning_rate": 2.9848391502051507e-05, + "loss": 0.0399, + "step": 27910 + }, + { + "epoch": 0.06157469791567883, + "grad_norm": 0.1507340967655182, + "learning_rate": 2.9848156920261805e-05, + "loss": 0.0409, + "step": 27920 + }, + { + "epoch": 0.061596751890577, + "grad_norm": 0.10166743397712708, + "learning_rate": 2.984792215805298e-05, + "loss": 0.0386, + "step": 27930 + }, + { + "epoch": 0.06161880586547516, + "grad_norm": 0.165790393948555, + "learning_rate": 2.9847687215427878e-05, + "loss": 0.0413, + "step": 27940 + }, + { + "epoch": 0.06164085984037333, + "grad_norm": 0.13441869616508484, + "learning_rate": 2.9847452092389364e-05, + "loss": 0.0391, + "step": 27950 + }, + { + "epoch": 0.061662913815271496, + "grad_norm": 0.11099735647439957, + "learning_rate": 2.9847216788940282e-05, + "loss": 0.0415, + "step": 27960 + }, + { + "epoch": 0.06168496779016966, + "grad_norm": 0.12002493441104889, + "learning_rate": 2.9846981305083496e-05, + "loss": 0.0402, + "step": 27970 + }, + { + "epoch": 0.06170702176506783, + "grad_norm": 0.1263377070426941, + "learning_rate": 2.9846745640821878e-05, + "loss": 0.0417, + "step": 27980 + }, + { + "epoch": 0.06172907573996599, + "grad_norm": 0.14758382737636566, + "learning_rate": 2.9846509796158276e-05, + "loss": 0.0431, + "step": 27990 + }, + { + "epoch": 0.061751129714864156, + "grad_norm": 0.14385785162448883, + "learning_rate": 2.9846273771095565e-05, + "loss": 0.0413, + "step": 28000 + }, + { + "epoch": 0.061773183689762326, + "grad_norm": 0.1286284327507019, + "learning_rate": 2.984603756563661e-05, + "loss": 0.0403, + "step": 28010 + }, + { + "epoch": 0.06179523766466049, + "grad_norm": 0.12099786102771759, + "learning_rate": 2.984580117978428e-05, + "loss": 0.0422, + "step": 28020 + }, + { + "epoch": 0.06181729163955865, + "grad_norm": 0.1325257122516632, + "learning_rate": 2.984556461354145e-05, + "loss": 0.0412, + "step": 28030 + }, + { + "epoch": 0.06183934561445682, + "grad_norm": 0.10857488960027695, + "learning_rate": 2.9845327866911e-05, + "loss": 0.0405, + "step": 28040 + }, + { + "epoch": 0.061861399589354986, + "grad_norm": 0.14572861790657043, + "learning_rate": 2.984509093989579e-05, + "loss": 0.0426, + "step": 28050 + }, + { + "epoch": 0.061883453564253156, + "grad_norm": 0.14093418419361115, + "learning_rate": 2.9844853832498717e-05, + "loss": 0.0428, + "step": 28060 + }, + { + "epoch": 0.06190550753915132, + "grad_norm": 0.1312548667192459, + "learning_rate": 2.9844616544722646e-05, + "loss": 0.0411, + "step": 28070 + }, + { + "epoch": 0.06192756151404948, + "grad_norm": 0.14259293675422668, + "learning_rate": 2.9844379076570475e-05, + "loss": 0.0409, + "step": 28080 + }, + { + "epoch": 0.06194961548894765, + "grad_norm": 0.13704118132591248, + "learning_rate": 2.984414142804508e-05, + "loss": 0.0414, + "step": 28090 + }, + { + "epoch": 0.061971669463845816, + "grad_norm": 0.1205056756734848, + "learning_rate": 2.9843903599149354e-05, + "loss": 0.0403, + "step": 28100 + }, + { + "epoch": 0.06199372343874398, + "grad_norm": 0.1084013506770134, + "learning_rate": 2.984366558988618e-05, + "loss": 0.0418, + "step": 28110 + }, + { + "epoch": 0.06201577741364215, + "grad_norm": 0.12983199954032898, + "learning_rate": 2.9843427400258457e-05, + "loss": 0.04, + "step": 28120 + }, + { + "epoch": 0.06203783138854031, + "grad_norm": 0.16473355889320374, + "learning_rate": 2.9843189030269077e-05, + "loss": 0.0415, + "step": 28130 + }, + { + "epoch": 0.062059885363438476, + "grad_norm": 0.12608835101127625, + "learning_rate": 2.9842950479920935e-05, + "loss": 0.0406, + "step": 28140 + }, + { + "epoch": 0.062081939338336646, + "grad_norm": 0.1389031857252121, + "learning_rate": 2.984271174921693e-05, + "loss": 0.0401, + "step": 28150 + }, + { + "epoch": 0.06210399331323481, + "grad_norm": 0.14721952378749847, + "learning_rate": 2.9842472838159963e-05, + "loss": 0.0423, + "step": 28160 + }, + { + "epoch": 0.06212604728813298, + "grad_norm": 0.13370509445667267, + "learning_rate": 2.9842233746752937e-05, + "loss": 0.0412, + "step": 28170 + }, + { + "epoch": 0.06214810126303114, + "grad_norm": 0.1314840167760849, + "learning_rate": 2.984199447499876e-05, + "loss": 0.0421, + "step": 28180 + }, + { + "epoch": 0.062170155237929306, + "grad_norm": 0.16470953822135925, + "learning_rate": 2.984175502290034e-05, + "loss": 0.039, + "step": 28190 + }, + { + "epoch": 0.062192209212827476, + "grad_norm": 0.10551577061414719, + "learning_rate": 2.984151539046058e-05, + "loss": 0.0413, + "step": 28200 + }, + { + "epoch": 0.06221426318772564, + "grad_norm": 0.12417865544557571, + "learning_rate": 2.9841275577682392e-05, + "loss": 0.04, + "step": 28210 + }, + { + "epoch": 0.0622363171626238, + "grad_norm": 0.15489305555820465, + "learning_rate": 2.9841035584568696e-05, + "loss": 0.0416, + "step": 28220 + }, + { + "epoch": 0.06225837113752197, + "grad_norm": 0.19123101234436035, + "learning_rate": 2.9840795411122407e-05, + "loss": 0.0386, + "step": 28230 + }, + { + "epoch": 0.062280425112420136, + "grad_norm": 0.11892042309045792, + "learning_rate": 2.9840555057346442e-05, + "loss": 0.0368, + "step": 28240 + }, + { + "epoch": 0.062302479087318306, + "grad_norm": 0.1517825871706009, + "learning_rate": 2.984031452324372e-05, + "loss": 0.0395, + "step": 28250 + }, + { + "epoch": 0.06232453306221647, + "grad_norm": 0.14457541704177856, + "learning_rate": 2.9840073808817164e-05, + "loss": 0.0412, + "step": 28260 + }, + { + "epoch": 0.06234658703711463, + "grad_norm": 0.13908982276916504, + "learning_rate": 2.98398329140697e-05, + "loss": 0.0417, + "step": 28270 + }, + { + "epoch": 0.0623686410120128, + "grad_norm": 0.13023187220096588, + "learning_rate": 2.9839591839004254e-05, + "loss": 0.0396, + "step": 28280 + }, + { + "epoch": 0.062390694986910966, + "grad_norm": 0.12559610605239868, + "learning_rate": 2.9839350583623757e-05, + "loss": 0.0392, + "step": 28290 + }, + { + "epoch": 0.06241274896180913, + "grad_norm": 0.20101377367973328, + "learning_rate": 2.983910914793114e-05, + "loss": 0.0415, + "step": 28300 + }, + { + "epoch": 0.0624348029367073, + "grad_norm": 0.18965542316436768, + "learning_rate": 2.9838867531929338e-05, + "loss": 0.0406, + "step": 28310 + }, + { + "epoch": 0.06245685691160546, + "grad_norm": 0.13652074337005615, + "learning_rate": 2.983862573562128e-05, + "loss": 0.04, + "step": 28320 + }, + { + "epoch": 0.062478910886503626, + "grad_norm": 0.13631722331047058, + "learning_rate": 2.9838383759009916e-05, + "loss": 0.0412, + "step": 28330 + }, + { + "epoch": 0.06250096486140179, + "grad_norm": 0.1248856708407402, + "learning_rate": 2.983814160209818e-05, + "loss": 0.041, + "step": 28340 + }, + { + "epoch": 0.06252301883629996, + "grad_norm": 0.09849143773317337, + "learning_rate": 2.983789926488901e-05, + "loss": 0.0378, + "step": 28350 + }, + { + "epoch": 0.06254507281119813, + "grad_norm": 0.12151169031858444, + "learning_rate": 2.9837656747385352e-05, + "loss": 0.0417, + "step": 28360 + }, + { + "epoch": 0.06256712678609629, + "grad_norm": 0.13984981179237366, + "learning_rate": 2.983741404959016e-05, + "loss": 0.0394, + "step": 28370 + }, + { + "epoch": 0.06258918076099446, + "grad_norm": 0.1111181303858757, + "learning_rate": 2.983717117150638e-05, + "loss": 0.039, + "step": 28380 + }, + { + "epoch": 0.06261123473589263, + "grad_norm": 0.11796972900629044, + "learning_rate": 2.9836928113136956e-05, + "loss": 0.0415, + "step": 28390 + }, + { + "epoch": 0.06263328871079078, + "grad_norm": 0.16689155995845795, + "learning_rate": 2.9836684874484853e-05, + "loss": 0.0428, + "step": 28400 + }, + { + "epoch": 0.06265534268568895, + "grad_norm": 0.1369558870792389, + "learning_rate": 2.9836441455553018e-05, + "loss": 0.04, + "step": 28410 + }, + { + "epoch": 0.06267739666058712, + "grad_norm": 0.14887802302837372, + "learning_rate": 2.983619785634441e-05, + "loss": 0.0411, + "step": 28420 + }, + { + "epoch": 0.0626994506354853, + "grad_norm": 0.13945947587490082, + "learning_rate": 2.9835954076861993e-05, + "loss": 0.0407, + "step": 28430 + }, + { + "epoch": 0.06272150461038345, + "grad_norm": 0.13228070735931396, + "learning_rate": 2.9835710117108726e-05, + "loss": 0.0391, + "step": 28440 + }, + { + "epoch": 0.06274355858528162, + "grad_norm": 0.12293678522109985, + "learning_rate": 2.9835465977087572e-05, + "loss": 0.039, + "step": 28450 + }, + { + "epoch": 0.06276561256017979, + "grad_norm": 0.12638597190380096, + "learning_rate": 2.98352216568015e-05, + "loss": 0.0408, + "step": 28460 + }, + { + "epoch": 0.06278766653507795, + "grad_norm": 0.15698882937431335, + "learning_rate": 2.9834977156253478e-05, + "loss": 0.0389, + "step": 28470 + }, + { + "epoch": 0.06280972050997612, + "grad_norm": 0.12362299114465714, + "learning_rate": 2.9834732475446476e-05, + "loss": 0.0421, + "step": 28480 + }, + { + "epoch": 0.06283177448487429, + "grad_norm": 0.09829814732074738, + "learning_rate": 2.983448761438347e-05, + "loss": 0.0409, + "step": 28490 + }, + { + "epoch": 0.06285382845977244, + "grad_norm": 0.15199100971221924, + "learning_rate": 2.9834242573067433e-05, + "loss": 0.0404, + "step": 28500 + }, + { + "epoch": 0.06287588243467061, + "grad_norm": 0.13295698165893555, + "learning_rate": 2.9833997351501344e-05, + "loss": 0.0402, + "step": 28510 + }, + { + "epoch": 0.06289793640956878, + "grad_norm": 0.13573603332042694, + "learning_rate": 2.9833751949688182e-05, + "loss": 0.0422, + "step": 28520 + }, + { + "epoch": 0.06291999038446694, + "grad_norm": 0.1126512810587883, + "learning_rate": 2.9833506367630925e-05, + "loss": 0.0388, + "step": 28530 + }, + { + "epoch": 0.06294204435936511, + "grad_norm": 0.13524660468101501, + "learning_rate": 2.9833260605332558e-05, + "loss": 0.0396, + "step": 28540 + }, + { + "epoch": 0.06296409833426328, + "grad_norm": 0.1300046145915985, + "learning_rate": 2.9833014662796077e-05, + "loss": 0.0434, + "step": 28550 + }, + { + "epoch": 0.06298615230916144, + "grad_norm": 0.1582038253545761, + "learning_rate": 2.983276854002446e-05, + "loss": 0.0398, + "step": 28560 + }, + { + "epoch": 0.0630082062840596, + "grad_norm": 0.1335708349943161, + "learning_rate": 2.98325222370207e-05, + "loss": 0.0401, + "step": 28570 + }, + { + "epoch": 0.06303026025895778, + "grad_norm": 0.12730418145656586, + "learning_rate": 2.9832275753787793e-05, + "loss": 0.0415, + "step": 28580 + }, + { + "epoch": 0.06305231423385593, + "grad_norm": 0.11680078506469727, + "learning_rate": 2.9832029090328733e-05, + "loss": 0.0405, + "step": 28590 + }, + { + "epoch": 0.0630743682087541, + "grad_norm": 0.11939533799886703, + "learning_rate": 2.983178224664651e-05, + "loss": 0.0398, + "step": 28600 + }, + { + "epoch": 0.06309642218365227, + "grad_norm": 0.15486682951450348, + "learning_rate": 2.9831535222744132e-05, + "loss": 0.0406, + "step": 28610 + }, + { + "epoch": 0.06311847615855043, + "grad_norm": 0.13319309055805206, + "learning_rate": 2.9831288018624595e-05, + "loss": 0.0406, + "step": 28620 + }, + { + "epoch": 0.0631405301334486, + "grad_norm": 0.12807314097881317, + "learning_rate": 2.983104063429091e-05, + "loss": 0.0416, + "step": 28630 + }, + { + "epoch": 0.06316258410834677, + "grad_norm": 0.11048393696546555, + "learning_rate": 2.9830793069746082e-05, + "loss": 0.0398, + "step": 28640 + }, + { + "epoch": 0.06318463808324494, + "grad_norm": 0.12030120193958282, + "learning_rate": 2.983054532499311e-05, + "loss": 0.0365, + "step": 28650 + }, + { + "epoch": 0.0632066920581431, + "grad_norm": 0.11986866593360901, + "learning_rate": 2.983029740003501e-05, + "loss": 0.0408, + "step": 28660 + }, + { + "epoch": 0.06322874603304127, + "grad_norm": 0.14062248170375824, + "learning_rate": 2.9830049294874796e-05, + "loss": 0.0411, + "step": 28670 + }, + { + "epoch": 0.06325080000793944, + "grad_norm": 0.1541295349597931, + "learning_rate": 2.9829801009515476e-05, + "loss": 0.0427, + "step": 28680 + }, + { + "epoch": 0.06327285398283759, + "grad_norm": 0.14001597464084625, + "learning_rate": 2.9829552543960078e-05, + "loss": 0.043, + "step": 28690 + }, + { + "epoch": 0.06329490795773576, + "grad_norm": 0.15598875284194946, + "learning_rate": 2.982930389821161e-05, + "loss": 0.041, + "step": 28700 + }, + { + "epoch": 0.06331696193263393, + "grad_norm": 0.15309137105941772, + "learning_rate": 2.9829055072273106e-05, + "loss": 0.0412, + "step": 28710 + }, + { + "epoch": 0.06333901590753209, + "grad_norm": 0.148104727268219, + "learning_rate": 2.9828806066147576e-05, + "loss": 0.0422, + "step": 28720 + }, + { + "epoch": 0.06336106988243026, + "grad_norm": 0.14189821481704712, + "learning_rate": 2.9828556879838053e-05, + "loss": 0.0424, + "step": 28730 + }, + { + "epoch": 0.06338312385732843, + "grad_norm": 0.0927300676703453, + "learning_rate": 2.9828307513347566e-05, + "loss": 0.0407, + "step": 28740 + }, + { + "epoch": 0.06340517783222659, + "grad_norm": 0.10725734382867813, + "learning_rate": 2.9828057966679135e-05, + "loss": 0.037, + "step": 28750 + }, + { + "epoch": 0.06342723180712476, + "grad_norm": 0.14240576326847076, + "learning_rate": 2.9827808239835808e-05, + "loss": 0.0398, + "step": 28760 + }, + { + "epoch": 0.06344928578202293, + "grad_norm": 0.15038944780826569, + "learning_rate": 2.982755833282061e-05, + "loss": 0.0415, + "step": 28770 + }, + { + "epoch": 0.06347133975692108, + "grad_norm": 0.11572056263685226, + "learning_rate": 2.9827308245636577e-05, + "loss": 0.039, + "step": 28780 + }, + { + "epoch": 0.06349339373181925, + "grad_norm": 0.19777025282382965, + "learning_rate": 2.9827057978286744e-05, + "loss": 0.0395, + "step": 28790 + }, + { + "epoch": 0.06351544770671742, + "grad_norm": 0.1492968052625656, + "learning_rate": 2.9826807530774165e-05, + "loss": 0.0398, + "step": 28800 + }, + { + "epoch": 0.06353750168161558, + "grad_norm": 0.11698348075151443, + "learning_rate": 2.9826556903101873e-05, + "loss": 0.0395, + "step": 28810 + }, + { + "epoch": 0.06355955565651375, + "grad_norm": 0.12689203023910522, + "learning_rate": 2.982630609527292e-05, + "loss": 0.0414, + "step": 28820 + }, + { + "epoch": 0.06358160963141192, + "grad_norm": 0.14039523899555206, + "learning_rate": 2.9826055107290346e-05, + "loss": 0.0403, + "step": 28830 + }, + { + "epoch": 0.06360366360631009, + "grad_norm": 0.15255874395370483, + "learning_rate": 2.9825803939157202e-05, + "loss": 0.0412, + "step": 28840 + }, + { + "epoch": 0.06362571758120825, + "grad_norm": 0.13519003987312317, + "learning_rate": 2.9825552590876543e-05, + "loss": 0.0396, + "step": 28850 + }, + { + "epoch": 0.06364777155610642, + "grad_norm": 0.17637300491333008, + "learning_rate": 2.9825301062451424e-05, + "loss": 0.0381, + "step": 28860 + }, + { + "epoch": 0.06366982553100459, + "grad_norm": 0.12743064761161804, + "learning_rate": 2.98250493538849e-05, + "loss": 0.0405, + "step": 28870 + }, + { + "epoch": 0.06369187950590274, + "grad_norm": 0.10864395648241043, + "learning_rate": 2.982479746518003e-05, + "loss": 0.0401, + "step": 28880 + }, + { + "epoch": 0.06371393348080091, + "grad_norm": 0.12656210362911224, + "learning_rate": 2.9824545396339874e-05, + "loss": 0.0404, + "step": 28890 + }, + { + "epoch": 0.06373598745569908, + "grad_norm": 0.12474436312913895, + "learning_rate": 2.9824293147367493e-05, + "loss": 0.0419, + "step": 28900 + }, + { + "epoch": 0.06375804143059724, + "grad_norm": 0.14293929934501648, + "learning_rate": 2.9824040718265952e-05, + "loss": 0.0411, + "step": 28910 + }, + { + "epoch": 0.06378009540549541, + "grad_norm": 0.11703123152256012, + "learning_rate": 2.9823788109038322e-05, + "loss": 0.0411, + "step": 28920 + }, + { + "epoch": 0.06380214938039358, + "grad_norm": 0.10437506437301636, + "learning_rate": 2.9823535319687672e-05, + "loss": 0.0392, + "step": 28930 + }, + { + "epoch": 0.06382420335529174, + "grad_norm": 0.12639015913009644, + "learning_rate": 2.9823282350217075e-05, + "loss": 0.0385, + "step": 28940 + }, + { + "epoch": 0.0638462573301899, + "grad_norm": 0.1191832572221756, + "learning_rate": 2.9823029200629594e-05, + "loss": 0.0423, + "step": 28950 + }, + { + "epoch": 0.06386831130508808, + "grad_norm": 0.10383743792772293, + "learning_rate": 2.982277587092832e-05, + "loss": 0.0397, + "step": 28960 + }, + { + "epoch": 0.06389036527998623, + "grad_norm": 0.11195558309555054, + "learning_rate": 2.982252236111632e-05, + "loss": 0.0396, + "step": 28970 + }, + { + "epoch": 0.0639124192548844, + "grad_norm": 0.1048847883939743, + "learning_rate": 2.9822268671196676e-05, + "loss": 0.0388, + "step": 28980 + }, + { + "epoch": 0.06393447322978257, + "grad_norm": 0.11704430729150772, + "learning_rate": 2.982201480117248e-05, + "loss": 0.0397, + "step": 28990 + }, + { + "epoch": 0.06395652720468073, + "grad_norm": 0.1320601850748062, + "learning_rate": 2.9821760751046805e-05, + "loss": 0.0398, + "step": 29000 + }, + { + "epoch": 0.0639785811795789, + "grad_norm": 0.1253252774477005, + "learning_rate": 2.9821506520822747e-05, + "loss": 0.0391, + "step": 29010 + }, + { + "epoch": 0.06400063515447707, + "grad_norm": 0.10984361916780472, + "learning_rate": 2.9821252110503387e-05, + "loss": 0.0396, + "step": 29020 + }, + { + "epoch": 0.06402268912937523, + "grad_norm": 0.1304280310869217, + "learning_rate": 2.9820997520091823e-05, + "loss": 0.0407, + "step": 29030 + }, + { + "epoch": 0.0640447431042734, + "grad_norm": 0.14009453356266022, + "learning_rate": 2.9820742749591143e-05, + "loss": 0.042, + "step": 29040 + }, + { + "epoch": 0.06406679707917157, + "grad_norm": 0.13550172746181488, + "learning_rate": 2.9820487799004446e-05, + "loss": 0.0407, + "step": 29050 + }, + { + "epoch": 0.06408885105406974, + "grad_norm": 0.0953826978802681, + "learning_rate": 2.9820232668334834e-05, + "loss": 0.0389, + "step": 29060 + }, + { + "epoch": 0.0641109050289679, + "grad_norm": 0.12425992637872696, + "learning_rate": 2.9819977357585397e-05, + "loss": 0.0402, + "step": 29070 + }, + { + "epoch": 0.06413295900386606, + "grad_norm": 0.1612384021282196, + "learning_rate": 2.9819721866759243e-05, + "loss": 0.0409, + "step": 29080 + }, + { + "epoch": 0.06415501297876423, + "grad_norm": 0.14097708463668823, + "learning_rate": 2.9819466195859478e-05, + "loss": 0.0395, + "step": 29090 + }, + { + "epoch": 0.06417706695366239, + "grad_norm": 0.20708678662776947, + "learning_rate": 2.981921034488921e-05, + "loss": 0.0418, + "step": 29100 + }, + { + "epoch": 0.06419912092856056, + "grad_norm": 0.11592835187911987, + "learning_rate": 2.9818954313851536e-05, + "loss": 0.041, + "step": 29110 + }, + { + "epoch": 0.06422117490345873, + "grad_norm": 0.12266883254051208, + "learning_rate": 2.9818698102749583e-05, + "loss": 0.0403, + "step": 29120 + }, + { + "epoch": 0.06424322887835689, + "grad_norm": 0.12726183235645294, + "learning_rate": 2.9818441711586454e-05, + "loss": 0.041, + "step": 29130 + }, + { + "epoch": 0.06426528285325506, + "grad_norm": 0.1358252912759781, + "learning_rate": 2.9818185140365266e-05, + "loss": 0.0383, + "step": 29140 + }, + { + "epoch": 0.06428733682815323, + "grad_norm": 0.13237200677394867, + "learning_rate": 2.981792838908914e-05, + "loss": 0.0396, + "step": 29150 + }, + { + "epoch": 0.06430939080305138, + "grad_norm": 0.16595998406410217, + "learning_rate": 2.9817671457761194e-05, + "loss": 0.0409, + "step": 29160 + }, + { + "epoch": 0.06433144477794955, + "grad_norm": 0.1611325591802597, + "learning_rate": 2.981741434638455e-05, + "loss": 0.0406, + "step": 29170 + }, + { + "epoch": 0.06435349875284772, + "grad_norm": 0.13774175941944122, + "learning_rate": 2.981715705496233e-05, + "loss": 0.0415, + "step": 29180 + }, + { + "epoch": 0.06437555272774588, + "grad_norm": 0.14828743040561676, + "learning_rate": 2.981689958349766e-05, + "loss": 0.0404, + "step": 29190 + }, + { + "epoch": 0.06439760670264405, + "grad_norm": 0.10282278060913086, + "learning_rate": 2.9816641931993676e-05, + "loss": 0.038, + "step": 29200 + }, + { + "epoch": 0.06441966067754222, + "grad_norm": 0.1082158163189888, + "learning_rate": 2.98163841004535e-05, + "loss": 0.04, + "step": 29210 + }, + { + "epoch": 0.06444171465244038, + "grad_norm": 0.12066298723220825, + "learning_rate": 2.981612608888027e-05, + "loss": 0.04, + "step": 29220 + }, + { + "epoch": 0.06446376862733855, + "grad_norm": 0.12560507655143738, + "learning_rate": 2.981586789727712e-05, + "loss": 0.0415, + "step": 29230 + }, + { + "epoch": 0.06448582260223672, + "grad_norm": 0.12430407851934433, + "learning_rate": 2.9815609525647184e-05, + "loss": 0.0403, + "step": 29240 + }, + { + "epoch": 0.06450787657713487, + "grad_norm": 0.14373734593391418, + "learning_rate": 2.9815350973993603e-05, + "loss": 0.0401, + "step": 29250 + }, + { + "epoch": 0.06452993055203304, + "grad_norm": 0.12916037440299988, + "learning_rate": 2.9815092242319524e-05, + "loss": 0.0403, + "step": 29260 + }, + { + "epoch": 0.06455198452693121, + "grad_norm": 0.1374761462211609, + "learning_rate": 2.9814833330628084e-05, + "loss": 0.0409, + "step": 29270 + }, + { + "epoch": 0.06457403850182938, + "grad_norm": 0.1350041776895523, + "learning_rate": 2.9814574238922427e-05, + "loss": 0.0387, + "step": 29280 + }, + { + "epoch": 0.06459609247672754, + "grad_norm": 0.09562457352876663, + "learning_rate": 2.981431496720571e-05, + "loss": 0.041, + "step": 29290 + }, + { + "epoch": 0.06461814645162571, + "grad_norm": 0.12032316625118256, + "learning_rate": 2.981405551548108e-05, + "loss": 0.0395, + "step": 29300 + }, + { + "epoch": 0.06464020042652388, + "grad_norm": 0.13082563877105713, + "learning_rate": 2.9813795883751692e-05, + "loss": 0.0404, + "step": 29310 + }, + { + "epoch": 0.06466225440142204, + "grad_norm": 0.1376572996377945, + "learning_rate": 2.981353607202069e-05, + "loss": 0.0387, + "step": 29320 + }, + { + "epoch": 0.06468430837632021, + "grad_norm": 0.12573091685771942, + "learning_rate": 2.981327608029124e-05, + "loss": 0.0426, + "step": 29330 + }, + { + "epoch": 0.06470636235121838, + "grad_norm": 0.10843193531036377, + "learning_rate": 2.9813015908566504e-05, + "loss": 0.0395, + "step": 29340 + }, + { + "epoch": 0.06472841632611653, + "grad_norm": 0.13021788001060486, + "learning_rate": 2.9812755556849638e-05, + "loss": 0.0389, + "step": 29350 + }, + { + "epoch": 0.0647504703010147, + "grad_norm": 0.11427266895771027, + "learning_rate": 2.9812495025143803e-05, + "loss": 0.0404, + "step": 29360 + }, + { + "epoch": 0.06477252427591287, + "grad_norm": 0.11986482888460159, + "learning_rate": 2.9812234313452167e-05, + "loss": 0.0408, + "step": 29370 + }, + { + "epoch": 0.06479457825081103, + "grad_norm": 0.12409500777721405, + "learning_rate": 2.9811973421777906e-05, + "loss": 0.0407, + "step": 29380 + }, + { + "epoch": 0.0648166322257092, + "grad_norm": 0.15253669023513794, + "learning_rate": 2.9811712350124176e-05, + "loss": 0.0414, + "step": 29390 + }, + { + "epoch": 0.06483868620060737, + "grad_norm": 0.1097455620765686, + "learning_rate": 2.981145109849416e-05, + "loss": 0.0397, + "step": 29400 + }, + { + "epoch": 0.06486074017550553, + "grad_norm": 0.10907910019159317, + "learning_rate": 2.981118966689103e-05, + "loss": 0.0425, + "step": 29410 + }, + { + "epoch": 0.0648827941504037, + "grad_norm": 0.11674346029758453, + "learning_rate": 2.981092805531796e-05, + "loss": 0.0393, + "step": 29420 + }, + { + "epoch": 0.06490484812530187, + "grad_norm": 0.11539749056100845, + "learning_rate": 2.981066626377813e-05, + "loss": 0.0408, + "step": 29430 + }, + { + "epoch": 0.06492690210020002, + "grad_norm": 0.15977300703525543, + "learning_rate": 2.981040429227472e-05, + "loss": 0.0379, + "step": 29440 + }, + { + "epoch": 0.0649489560750982, + "grad_norm": 0.18630193173885345, + "learning_rate": 2.9810142140810916e-05, + "loss": 0.0383, + "step": 29450 + }, + { + "epoch": 0.06497101004999636, + "grad_norm": 0.16694355010986328, + "learning_rate": 2.9809879809389905e-05, + "loss": 0.0411, + "step": 29460 + }, + { + "epoch": 0.06499306402489452, + "grad_norm": 0.1303153932094574, + "learning_rate": 2.9809617298014868e-05, + "loss": 0.0423, + "step": 29470 + }, + { + "epoch": 0.06501511799979269, + "grad_norm": 0.11647637188434601, + "learning_rate": 2.9809354606688996e-05, + "loss": 0.0389, + "step": 29480 + }, + { + "epoch": 0.06503717197469086, + "grad_norm": 0.11119136959314346, + "learning_rate": 2.9809091735415487e-05, + "loss": 0.0386, + "step": 29490 + }, + { + "epoch": 0.06505922594958903, + "grad_norm": 0.12538425624370575, + "learning_rate": 2.980882868419753e-05, + "loss": 0.0402, + "step": 29500 + }, + { + "epoch": 0.06508127992448719, + "grad_norm": 0.15386462211608887, + "learning_rate": 2.9808565453038322e-05, + "loss": 0.0404, + "step": 29510 + }, + { + "epoch": 0.06510333389938536, + "grad_norm": 0.12189751118421555, + "learning_rate": 2.980830204194106e-05, + "loss": 0.0402, + "step": 29520 + }, + { + "epoch": 0.06512538787428353, + "grad_norm": 0.10675938427448273, + "learning_rate": 2.980803845090895e-05, + "loss": 0.0405, + "step": 29530 + }, + { + "epoch": 0.06514744184918168, + "grad_norm": 0.12235459685325623, + "learning_rate": 2.9807774679945194e-05, + "loss": 0.0414, + "step": 29540 + }, + { + "epoch": 0.06516949582407985, + "grad_norm": 0.10876954346895218, + "learning_rate": 2.980751072905299e-05, + "loss": 0.0382, + "step": 29550 + }, + { + "epoch": 0.06519154979897802, + "grad_norm": 0.1347784548997879, + "learning_rate": 2.9807246598235553e-05, + "loss": 0.0386, + "step": 29560 + }, + { + "epoch": 0.06521360377387618, + "grad_norm": 0.1598389595746994, + "learning_rate": 2.980698228749609e-05, + "loss": 0.0404, + "step": 29570 + }, + { + "epoch": 0.06523565774877435, + "grad_norm": 0.14734049141407013, + "learning_rate": 2.980671779683781e-05, + "loss": 0.0396, + "step": 29580 + }, + { + "epoch": 0.06525771172367252, + "grad_norm": 0.12914089858531952, + "learning_rate": 2.9806453126263927e-05, + "loss": 0.0412, + "step": 29590 + }, + { + "epoch": 0.06527976569857068, + "grad_norm": 0.1372224986553192, + "learning_rate": 2.9806188275777664e-05, + "loss": 0.0437, + "step": 29600 + }, + { + "epoch": 0.06530181967346885, + "grad_norm": 0.13284055888652802, + "learning_rate": 2.980592324538223e-05, + "loss": 0.0414, + "step": 29610 + }, + { + "epoch": 0.06532387364836702, + "grad_norm": 0.1486995965242386, + "learning_rate": 2.9805658035080852e-05, + "loss": 0.0396, + "step": 29620 + }, + { + "epoch": 0.06534592762326517, + "grad_norm": 0.12372855097055435, + "learning_rate": 2.9805392644876746e-05, + "loss": 0.0387, + "step": 29630 + }, + { + "epoch": 0.06536798159816334, + "grad_norm": 0.1485145092010498, + "learning_rate": 2.9805127074773145e-05, + "loss": 0.039, + "step": 29640 + }, + { + "epoch": 0.06539003557306151, + "grad_norm": 0.147007018327713, + "learning_rate": 2.980486132477327e-05, + "loss": 0.0394, + "step": 29650 + }, + { + "epoch": 0.06541208954795967, + "grad_norm": 0.12929485738277435, + "learning_rate": 2.9804595394880355e-05, + "loss": 0.0408, + "step": 29660 + }, + { + "epoch": 0.06543414352285784, + "grad_norm": 0.11426272988319397, + "learning_rate": 2.9804329285097624e-05, + "loss": 0.04, + "step": 29670 + }, + { + "epoch": 0.06545619749775601, + "grad_norm": 0.17959894239902496, + "learning_rate": 2.9804062995428316e-05, + "loss": 0.0404, + "step": 29680 + }, + { + "epoch": 0.06547825147265417, + "grad_norm": 0.12334001809358597, + "learning_rate": 2.9803796525875665e-05, + "loss": 0.0394, + "step": 29690 + }, + { + "epoch": 0.06550030544755234, + "grad_norm": 0.18012398481369019, + "learning_rate": 2.980352987644291e-05, + "loss": 0.0411, + "step": 29700 + }, + { + "epoch": 0.06552235942245051, + "grad_norm": 0.1538141965866089, + "learning_rate": 2.980326304713329e-05, + "loss": 0.0406, + "step": 29710 + }, + { + "epoch": 0.06554441339734868, + "grad_norm": 0.09087937325239182, + "learning_rate": 2.980299603795005e-05, + "loss": 0.0387, + "step": 29720 + }, + { + "epoch": 0.06556646737224683, + "grad_norm": 0.1366283893585205, + "learning_rate": 2.9802728848896425e-05, + "loss": 0.0395, + "step": 29730 + }, + { + "epoch": 0.065588521347145, + "grad_norm": 0.13679134845733643, + "learning_rate": 2.980246147997567e-05, + "loss": 0.0408, + "step": 29740 + }, + { + "epoch": 0.06561057532204317, + "grad_norm": 0.1313103437423706, + "learning_rate": 2.9802193931191036e-05, + "loss": 0.0413, + "step": 29750 + }, + { + "epoch": 0.06563262929694133, + "grad_norm": 0.1117313951253891, + "learning_rate": 2.980192620254577e-05, + "loss": 0.042, + "step": 29760 + }, + { + "epoch": 0.0656546832718395, + "grad_norm": 0.17818160355091095, + "learning_rate": 2.980165829404312e-05, + "loss": 0.0409, + "step": 29770 + }, + { + "epoch": 0.06567673724673767, + "grad_norm": 0.11743265390396118, + "learning_rate": 2.980139020568635e-05, + "loss": 0.0414, + "step": 29780 + }, + { + "epoch": 0.06569879122163583, + "grad_norm": 0.16897018253803253, + "learning_rate": 2.9801121937478718e-05, + "loss": 0.0416, + "step": 29790 + }, + { + "epoch": 0.065720845196534, + "grad_norm": 0.12565957009792328, + "learning_rate": 2.9800853489423473e-05, + "loss": 0.0419, + "step": 29800 + }, + { + "epoch": 0.06574289917143217, + "grad_norm": 0.1392817199230194, + "learning_rate": 2.980058486152389e-05, + "loss": 0.0421, + "step": 29810 + }, + { + "epoch": 0.06576495314633032, + "grad_norm": 0.17833423614501953, + "learning_rate": 2.980031605378322e-05, + "loss": 0.0419, + "step": 29820 + }, + { + "epoch": 0.0657870071212285, + "grad_norm": 0.11140471696853638, + "learning_rate": 2.9800047066204744e-05, + "loss": 0.0404, + "step": 29830 + }, + { + "epoch": 0.06580906109612666, + "grad_norm": 0.104794442653656, + "learning_rate": 2.9799777898791718e-05, + "loss": 0.0399, + "step": 29840 + }, + { + "epoch": 0.06583111507102482, + "grad_norm": 0.12216175347566605, + "learning_rate": 2.9799508551547418e-05, + "loss": 0.0392, + "step": 29850 + }, + { + "epoch": 0.06585316904592299, + "grad_norm": 0.1598903238773346, + "learning_rate": 2.9799239024475116e-05, + "loss": 0.0389, + "step": 29860 + }, + { + "epoch": 0.06587522302082116, + "grad_norm": 0.17677423357963562, + "learning_rate": 2.9798969317578086e-05, + "loss": 0.038, + "step": 29870 + }, + { + "epoch": 0.06589727699571932, + "grad_norm": 0.1716369241476059, + "learning_rate": 2.9798699430859606e-05, + "loss": 0.039, + "step": 29880 + }, + { + "epoch": 0.06591933097061749, + "grad_norm": 0.14377371966838837, + "learning_rate": 2.979842936432296e-05, + "loss": 0.041, + "step": 29890 + }, + { + "epoch": 0.06594138494551566, + "grad_norm": 0.1500401347875595, + "learning_rate": 2.979815911797142e-05, + "loss": 0.0417, + "step": 29900 + }, + { + "epoch": 0.06596343892041381, + "grad_norm": 0.1452188789844513, + "learning_rate": 2.979788869180828e-05, + "loss": 0.0412, + "step": 29910 + }, + { + "epoch": 0.06598549289531198, + "grad_norm": 0.15907490253448486, + "learning_rate": 2.9797618085836815e-05, + "loss": 0.0422, + "step": 29920 + }, + { + "epoch": 0.06600754687021015, + "grad_norm": 0.11991933733224869, + "learning_rate": 2.979734730006032e-05, + "loss": 0.0384, + "step": 29930 + }, + { + "epoch": 0.06602960084510832, + "grad_norm": 0.1391531229019165, + "learning_rate": 2.979707633448209e-05, + "loss": 0.0413, + "step": 29940 + }, + { + "epoch": 0.06605165482000648, + "grad_norm": 0.11917388439178467, + "learning_rate": 2.9796805189105407e-05, + "loss": 0.0387, + "step": 29950 + }, + { + "epoch": 0.06607370879490465, + "grad_norm": 0.13686223328113556, + "learning_rate": 2.979653386393357e-05, + "loss": 0.0408, + "step": 29960 + }, + { + "epoch": 0.06609576276980282, + "grad_norm": 0.10034787654876709, + "learning_rate": 2.979626235896988e-05, + "loss": 0.039, + "step": 29970 + }, + { + "epoch": 0.06611781674470098, + "grad_norm": 0.11349918693304062, + "learning_rate": 2.979599067421763e-05, + "loss": 0.0394, + "step": 29980 + }, + { + "epoch": 0.06613987071959915, + "grad_norm": 0.14458352327346802, + "learning_rate": 2.979571880968012e-05, + "loss": 0.0417, + "step": 29990 + }, + { + "epoch": 0.06616192469449732, + "grad_norm": 0.15609341859817505, + "learning_rate": 2.9795446765360664e-05, + "loss": 0.0418, + "step": 30000 + }, + { + "epoch": 0.06618397866939547, + "grad_norm": 0.17652079463005066, + "learning_rate": 2.9795174541262555e-05, + "loss": 0.0405, + "step": 30010 + }, + { + "epoch": 0.06620603264429364, + "grad_norm": 0.11293519288301468, + "learning_rate": 2.9794902137389107e-05, + "loss": 0.0393, + "step": 30020 + }, + { + "epoch": 0.06622808661919181, + "grad_norm": 0.13044051826000214, + "learning_rate": 2.9794629553743632e-05, + "loss": 0.0388, + "step": 30030 + }, + { + "epoch": 0.06625014059408997, + "grad_norm": 0.128260999917984, + "learning_rate": 2.9794356790329438e-05, + "loss": 0.0414, + "step": 30040 + }, + { + "epoch": 0.06627219456898814, + "grad_norm": 0.11506866663694382, + "learning_rate": 2.9794083847149845e-05, + "loss": 0.0394, + "step": 30050 + }, + { + "epoch": 0.06629424854388631, + "grad_norm": 0.15550026297569275, + "learning_rate": 2.9793810724208155e-05, + "loss": 0.0412, + "step": 30060 + }, + { + "epoch": 0.06631630251878447, + "grad_norm": 0.12046444416046143, + "learning_rate": 2.9793537421507708e-05, + "loss": 0.0394, + "step": 30070 + }, + { + "epoch": 0.06633835649368264, + "grad_norm": 0.11725793778896332, + "learning_rate": 2.9793263939051812e-05, + "loss": 0.0405, + "step": 30080 + }, + { + "epoch": 0.06636041046858081, + "grad_norm": 0.12327754497528076, + "learning_rate": 2.9792990276843788e-05, + "loss": 0.0398, + "step": 30090 + }, + { + "epoch": 0.06638246444347896, + "grad_norm": 0.1318185031414032, + "learning_rate": 2.9792716434886967e-05, + "loss": 0.038, + "step": 30100 + }, + { + "epoch": 0.06640451841837713, + "grad_norm": 0.12889058887958527, + "learning_rate": 2.9792442413184672e-05, + "loss": 0.0414, + "step": 30110 + }, + { + "epoch": 0.0664265723932753, + "grad_norm": 0.13015608489513397, + "learning_rate": 2.979216821174024e-05, + "loss": 0.0396, + "step": 30120 + }, + { + "epoch": 0.06644862636817347, + "grad_norm": 0.12018008530139923, + "learning_rate": 2.9791893830556994e-05, + "loss": 0.0392, + "step": 30130 + }, + { + "epoch": 0.06647068034307163, + "grad_norm": 0.11844159662723541, + "learning_rate": 2.979161926963827e-05, + "loss": 0.0409, + "step": 30140 + }, + { + "epoch": 0.0664927343179698, + "grad_norm": 0.12970486283302307, + "learning_rate": 2.9791344528987418e-05, + "loss": 0.0375, + "step": 30150 + }, + { + "epoch": 0.06651478829286797, + "grad_norm": 0.11248492449522018, + "learning_rate": 2.9791069608607754e-05, + "loss": 0.039, + "step": 30160 + }, + { + "epoch": 0.06653684226776613, + "grad_norm": 0.10681004077196121, + "learning_rate": 2.9790794508502633e-05, + "loss": 0.0397, + "step": 30170 + }, + { + "epoch": 0.0665588962426643, + "grad_norm": 0.12600012123584747, + "learning_rate": 2.9790519228675393e-05, + "loss": 0.0408, + "step": 30180 + }, + { + "epoch": 0.06658095021756247, + "grad_norm": 0.13735541701316833, + "learning_rate": 2.9790243769129378e-05, + "loss": 0.0425, + "step": 30190 + }, + { + "epoch": 0.06660300419246062, + "grad_norm": 0.12109561264514923, + "learning_rate": 2.978996812986794e-05, + "loss": 0.0404, + "step": 30200 + }, + { + "epoch": 0.0666250581673588, + "grad_norm": 0.0923973023891449, + "learning_rate": 2.9789692310894423e-05, + "loss": 0.0384, + "step": 30210 + }, + { + "epoch": 0.06664711214225696, + "grad_norm": 0.11136017739772797, + "learning_rate": 2.978941631221218e-05, + "loss": 0.0417, + "step": 30220 + }, + { + "epoch": 0.06666916611715512, + "grad_norm": 0.10469993203878403, + "learning_rate": 2.9789140133824567e-05, + "loss": 0.0393, + "step": 30230 + }, + { + "epoch": 0.06669122009205329, + "grad_norm": 0.11621010303497314, + "learning_rate": 2.9788863775734936e-05, + "loss": 0.0403, + "step": 30240 + }, + { + "epoch": 0.06671327406695146, + "grad_norm": 0.18802563846111298, + "learning_rate": 2.978858723794665e-05, + "loss": 0.0364, + "step": 30250 + }, + { + "epoch": 0.06673532804184962, + "grad_norm": 0.14950257539749146, + "learning_rate": 2.9788310520463065e-05, + "loss": 0.0418, + "step": 30260 + }, + { + "epoch": 0.06675738201674779, + "grad_norm": 0.11713253706693649, + "learning_rate": 2.9788033623287545e-05, + "loss": 0.0389, + "step": 30270 + }, + { + "epoch": 0.06677943599164596, + "grad_norm": 0.13906161487102509, + "learning_rate": 2.9787756546423454e-05, + "loss": 0.0374, + "step": 30280 + }, + { + "epoch": 0.06680148996654411, + "grad_norm": 0.10800255089998245, + "learning_rate": 2.978747928987416e-05, + "loss": 0.0408, + "step": 30290 + }, + { + "epoch": 0.06682354394144228, + "grad_norm": 0.13103044033050537, + "learning_rate": 2.978720185364303e-05, + "loss": 0.04, + "step": 30300 + }, + { + "epoch": 0.06684559791634045, + "grad_norm": 0.10968884080648422, + "learning_rate": 2.9786924237733437e-05, + "loss": 0.04, + "step": 30310 + }, + { + "epoch": 0.06686765189123861, + "grad_norm": 0.14909179508686066, + "learning_rate": 2.9786646442148752e-05, + "loss": 0.04, + "step": 30320 + }, + { + "epoch": 0.06688970586613678, + "grad_norm": 0.1593582183122635, + "learning_rate": 2.9786368466892352e-05, + "loss": 0.0389, + "step": 30330 + }, + { + "epoch": 0.06691175984103495, + "grad_norm": 0.12190907448530197, + "learning_rate": 2.9786090311967617e-05, + "loss": 0.0402, + "step": 30340 + }, + { + "epoch": 0.06693381381593312, + "grad_norm": 0.09649386256933212, + "learning_rate": 2.978581197737792e-05, + "loss": 0.0404, + "step": 30350 + }, + { + "epoch": 0.06695586779083128, + "grad_norm": 0.10187391191720963, + "learning_rate": 2.978553346312665e-05, + "loss": 0.0429, + "step": 30360 + }, + { + "epoch": 0.06697792176572945, + "grad_norm": 0.13368907570838928, + "learning_rate": 2.9785254769217188e-05, + "loss": 0.0409, + "step": 30370 + }, + { + "epoch": 0.06699997574062762, + "grad_norm": 0.1541265994310379, + "learning_rate": 2.9784975895652918e-05, + "loss": 0.0406, + "step": 30380 + }, + { + "epoch": 0.06702202971552577, + "grad_norm": 0.12427712231874466, + "learning_rate": 2.9784696842437237e-05, + "loss": 0.0411, + "step": 30390 + }, + { + "epoch": 0.06704408369042394, + "grad_norm": 0.09333135932683945, + "learning_rate": 2.9784417609573527e-05, + "loss": 0.0418, + "step": 30400 + }, + { + "epoch": 0.06706613766532211, + "grad_norm": 0.17011511325836182, + "learning_rate": 2.9784138197065185e-05, + "loss": 0.0413, + "step": 30410 + }, + { + "epoch": 0.06708819164022027, + "grad_norm": 0.11820335686206818, + "learning_rate": 2.9783858604915606e-05, + "loss": 0.0422, + "step": 30420 + }, + { + "epoch": 0.06711024561511844, + "grad_norm": 0.1267610788345337, + "learning_rate": 2.9783578833128187e-05, + "loss": 0.0403, + "step": 30430 + }, + { + "epoch": 0.06713229959001661, + "grad_norm": 0.1355985701084137, + "learning_rate": 2.9783298881706327e-05, + "loss": 0.04, + "step": 30440 + }, + { + "epoch": 0.06715435356491477, + "grad_norm": 0.1416977047920227, + "learning_rate": 2.9783018750653427e-05, + "loss": 0.0424, + "step": 30450 + }, + { + "epoch": 0.06717640753981294, + "grad_norm": 0.15150977671146393, + "learning_rate": 2.978273843997289e-05, + "loss": 0.0392, + "step": 30460 + }, + { + "epoch": 0.06719846151471111, + "grad_norm": 0.11643201112747192, + "learning_rate": 2.9782457949668123e-05, + "loss": 0.0394, + "step": 30470 + }, + { + "epoch": 0.06722051548960926, + "grad_norm": 0.10673320293426514, + "learning_rate": 2.978217727974254e-05, + "loss": 0.0403, + "step": 30480 + }, + { + "epoch": 0.06724256946450743, + "grad_norm": 0.16142292320728302, + "learning_rate": 2.9781896430199543e-05, + "loss": 0.0436, + "step": 30490 + }, + { + "epoch": 0.0672646234394056, + "grad_norm": 0.1633729189634323, + "learning_rate": 2.978161540104255e-05, + "loss": 0.041, + "step": 30500 + }, + { + "epoch": 0.06728667741430376, + "grad_norm": 0.18396155536174774, + "learning_rate": 2.978133419227497e-05, + "loss": 0.0399, + "step": 30510 + }, + { + "epoch": 0.06730873138920193, + "grad_norm": 0.16392552852630615, + "learning_rate": 2.978105280390023e-05, + "loss": 0.0416, + "step": 30520 + }, + { + "epoch": 0.0673307853641001, + "grad_norm": 0.13241876661777496, + "learning_rate": 2.9780771235921737e-05, + "loss": 0.0384, + "step": 30530 + }, + { + "epoch": 0.06735283933899826, + "grad_norm": 0.10916918516159058, + "learning_rate": 2.9780489488342924e-05, + "loss": 0.0398, + "step": 30540 + }, + { + "epoch": 0.06737489331389643, + "grad_norm": 0.13554580509662628, + "learning_rate": 2.97802075611672e-05, + "loss": 0.0413, + "step": 30550 + }, + { + "epoch": 0.0673969472887946, + "grad_norm": 0.11833742260932922, + "learning_rate": 2.977992545439801e-05, + "loss": 0.0391, + "step": 30560 + }, + { + "epoch": 0.06741900126369277, + "grad_norm": 0.14905168116092682, + "learning_rate": 2.977964316803876e-05, + "loss": 0.0419, + "step": 30570 + }, + { + "epoch": 0.06744105523859092, + "grad_norm": 0.16209271550178528, + "learning_rate": 2.97793607020929e-05, + "loss": 0.04, + "step": 30580 + }, + { + "epoch": 0.0674631092134891, + "grad_norm": 0.17134982347488403, + "learning_rate": 2.9779078056563848e-05, + "loss": 0.0421, + "step": 30590 + }, + { + "epoch": 0.06748516318838726, + "grad_norm": 0.09427741169929504, + "learning_rate": 2.9778795231455046e-05, + "loss": 0.0383, + "step": 30600 + }, + { + "epoch": 0.06750721716328542, + "grad_norm": 0.16056765615940094, + "learning_rate": 2.977851222676993e-05, + "loss": 0.0431, + "step": 30610 + }, + { + "epoch": 0.06752927113818359, + "grad_norm": 0.11441341042518616, + "learning_rate": 2.9778229042511935e-05, + "loss": 0.0407, + "step": 30620 + }, + { + "epoch": 0.06755132511308176, + "grad_norm": 0.1012493371963501, + "learning_rate": 2.9777945678684503e-05, + "loss": 0.0392, + "step": 30630 + }, + { + "epoch": 0.06757337908797992, + "grad_norm": 0.11728101968765259, + "learning_rate": 2.977766213529108e-05, + "loss": 0.041, + "step": 30640 + }, + { + "epoch": 0.06759543306287809, + "grad_norm": 0.15129739046096802, + "learning_rate": 2.977737841233511e-05, + "loss": 0.0404, + "step": 30650 + }, + { + "epoch": 0.06761748703777626, + "grad_norm": 0.1354491263628006, + "learning_rate": 2.9777094509820037e-05, + "loss": 0.04, + "step": 30660 + }, + { + "epoch": 0.06763954101267441, + "grad_norm": 0.13938486576080322, + "learning_rate": 2.9776810427749317e-05, + "loss": 0.0386, + "step": 30670 + }, + { + "epoch": 0.06766159498757258, + "grad_norm": 0.13627830147743225, + "learning_rate": 2.97765261661264e-05, + "loss": 0.042, + "step": 30680 + }, + { + "epoch": 0.06768364896247075, + "grad_norm": 0.15527701377868652, + "learning_rate": 2.9776241724954737e-05, + "loss": 0.0392, + "step": 30690 + }, + { + "epoch": 0.06770570293736891, + "grad_norm": 0.1313861459493637, + "learning_rate": 2.9775957104237782e-05, + "loss": 0.0395, + "step": 30700 + }, + { + "epoch": 0.06772775691226708, + "grad_norm": 0.12266062200069427, + "learning_rate": 2.9775672303979003e-05, + "loss": 0.0401, + "step": 30710 + }, + { + "epoch": 0.06774981088716525, + "grad_norm": 0.10669951885938644, + "learning_rate": 2.9775387324181854e-05, + "loss": 0.0384, + "step": 30720 + }, + { + "epoch": 0.06777186486206341, + "grad_norm": 0.16709467768669128, + "learning_rate": 2.97751021648498e-05, + "loss": 0.0399, + "step": 30730 + }, + { + "epoch": 0.06779391883696158, + "grad_norm": 0.1389031708240509, + "learning_rate": 2.97748168259863e-05, + "loss": 0.0387, + "step": 30740 + }, + { + "epoch": 0.06781597281185975, + "grad_norm": 0.11187252402305603, + "learning_rate": 2.977453130759483e-05, + "loss": 0.0385, + "step": 30750 + }, + { + "epoch": 0.0678380267867579, + "grad_norm": 0.17283064126968384, + "learning_rate": 2.9774245609678855e-05, + "loss": 0.0384, + "step": 30760 + }, + { + "epoch": 0.06786008076165607, + "grad_norm": 0.16437791287899017, + "learning_rate": 2.9773959732241844e-05, + "loss": 0.0388, + "step": 30770 + }, + { + "epoch": 0.06788213473655424, + "grad_norm": 0.11031846702098846, + "learning_rate": 2.9773673675287275e-05, + "loss": 0.0381, + "step": 30780 + }, + { + "epoch": 0.06790418871145242, + "grad_norm": 0.14545446634292603, + "learning_rate": 2.977338743881862e-05, + "loss": 0.0393, + "step": 30790 + }, + { + "epoch": 0.06792624268635057, + "grad_norm": 0.12555955350399017, + "learning_rate": 2.977310102283936e-05, + "loss": 0.0395, + "step": 30800 + }, + { + "epoch": 0.06794829666124874, + "grad_norm": 0.12961484491825104, + "learning_rate": 2.9772814427352976e-05, + "loss": 0.0372, + "step": 30810 + }, + { + "epoch": 0.06797035063614691, + "grad_norm": 0.113949716091156, + "learning_rate": 2.977252765236295e-05, + "loss": 0.0414, + "step": 30820 + }, + { + "epoch": 0.06799240461104507, + "grad_norm": 0.14576837420463562, + "learning_rate": 2.977224069787276e-05, + "loss": 0.0386, + "step": 30830 + }, + { + "epoch": 0.06801445858594324, + "grad_norm": 0.12634752690792084, + "learning_rate": 2.9771953563885907e-05, + "loss": 0.0399, + "step": 30840 + }, + { + "epoch": 0.06803651256084141, + "grad_norm": 0.11150336265563965, + "learning_rate": 2.9771666250405865e-05, + "loss": 0.0388, + "step": 30850 + }, + { + "epoch": 0.06805856653573956, + "grad_norm": 0.1214752271771431, + "learning_rate": 2.977137875743613e-05, + "loss": 0.0407, + "step": 30860 + }, + { + "epoch": 0.06808062051063773, + "grad_norm": 0.16798104345798492, + "learning_rate": 2.97710910849802e-05, + "loss": 0.0383, + "step": 30870 + }, + { + "epoch": 0.0681026744855359, + "grad_norm": 0.09814213961362839, + "learning_rate": 2.9770803233041567e-05, + "loss": 0.0412, + "step": 30880 + }, + { + "epoch": 0.06812472846043406, + "grad_norm": 0.13375315070152283, + "learning_rate": 2.9770515201623725e-05, + "loss": 0.039, + "step": 30890 + }, + { + "epoch": 0.06814678243533223, + "grad_norm": 0.12314844876527786, + "learning_rate": 2.9770226990730182e-05, + "loss": 0.0407, + "step": 30900 + }, + { + "epoch": 0.0681688364102304, + "grad_norm": 0.1263534426689148, + "learning_rate": 2.9769938600364432e-05, + "loss": 0.0413, + "step": 30910 + }, + { + "epoch": 0.06819089038512856, + "grad_norm": 0.15785053372383118, + "learning_rate": 2.976965003052999e-05, + "loss": 0.0443, + "step": 30920 + }, + { + "epoch": 0.06821294436002673, + "grad_norm": 0.12130202353000641, + "learning_rate": 2.9769361281230344e-05, + "loss": 0.0402, + "step": 30930 + }, + { + "epoch": 0.0682349983349249, + "grad_norm": 0.1293007731437683, + "learning_rate": 2.9769072352469023e-05, + "loss": 0.0404, + "step": 30940 + }, + { + "epoch": 0.06825705230982305, + "grad_norm": 0.14406660199165344, + "learning_rate": 2.9768783244249525e-05, + "loss": 0.0391, + "step": 30950 + }, + { + "epoch": 0.06827910628472122, + "grad_norm": 0.11910350620746613, + "learning_rate": 2.976849395657537e-05, + "loss": 0.0391, + "step": 30960 + }, + { + "epoch": 0.0683011602596194, + "grad_norm": 0.1200883612036705, + "learning_rate": 2.9768204489450065e-05, + "loss": 0.0404, + "step": 30970 + }, + { + "epoch": 0.06832321423451755, + "grad_norm": 0.12489360570907593, + "learning_rate": 2.9767914842877134e-05, + "loss": 0.0415, + "step": 30980 + }, + { + "epoch": 0.06834526820941572, + "grad_norm": 0.133257195353508, + "learning_rate": 2.9767625016860094e-05, + "loss": 0.0395, + "step": 30990 + }, + { + "epoch": 0.06836732218431389, + "grad_norm": 0.1702185869216919, + "learning_rate": 2.9767335011402464e-05, + "loss": 0.0414, + "step": 31000 + }, + { + "epoch": 0.06838937615921206, + "grad_norm": 0.10801480710506439, + "learning_rate": 2.976704482650778e-05, + "loss": 0.0396, + "step": 31010 + }, + { + "epoch": 0.06841143013411022, + "grad_norm": 0.12357419729232788, + "learning_rate": 2.9766754462179552e-05, + "loss": 0.0371, + "step": 31020 + }, + { + "epoch": 0.06843348410900839, + "grad_norm": 0.09164285659790039, + "learning_rate": 2.9766463918421313e-05, + "loss": 0.0397, + "step": 31030 + }, + { + "epoch": 0.06845553808390656, + "grad_norm": 0.16154545545578003, + "learning_rate": 2.97661731952366e-05, + "loss": 0.0392, + "step": 31040 + }, + { + "epoch": 0.06847759205880471, + "grad_norm": 0.13811306655406952, + "learning_rate": 2.9765882292628944e-05, + "loss": 0.0395, + "step": 31050 + }, + { + "epoch": 0.06849964603370289, + "grad_norm": 0.14839686453342438, + "learning_rate": 2.976559121060187e-05, + "loss": 0.0377, + "step": 31060 + }, + { + "epoch": 0.06852170000860106, + "grad_norm": 0.17014658451080322, + "learning_rate": 2.9765299949158925e-05, + "loss": 0.0403, + "step": 31070 + }, + { + "epoch": 0.06854375398349921, + "grad_norm": 0.16494213044643402, + "learning_rate": 2.9765008508303644e-05, + "loss": 0.041, + "step": 31080 + }, + { + "epoch": 0.06856580795839738, + "grad_norm": 0.11602681875228882, + "learning_rate": 2.976471688803957e-05, + "loss": 0.0376, + "step": 31090 + }, + { + "epoch": 0.06858786193329555, + "grad_norm": 0.12676042318344116, + "learning_rate": 2.9764425088370244e-05, + "loss": 0.0414, + "step": 31100 + }, + { + "epoch": 0.06860991590819371, + "grad_norm": 0.12522436678409576, + "learning_rate": 2.9764133109299216e-05, + "loss": 0.0399, + "step": 31110 + }, + { + "epoch": 0.06863196988309188, + "grad_norm": 0.12213841080665588, + "learning_rate": 2.9763840950830032e-05, + "loss": 0.0402, + "step": 31120 + }, + { + "epoch": 0.06865402385799005, + "grad_norm": 0.15640558302402496, + "learning_rate": 2.9763548612966236e-05, + "loss": 0.0395, + "step": 31130 + }, + { + "epoch": 0.0686760778328882, + "grad_norm": 0.12329690903425217, + "learning_rate": 2.9763256095711392e-05, + "loss": 0.0412, + "step": 31140 + }, + { + "epoch": 0.06869813180778638, + "grad_norm": 0.12536317110061646, + "learning_rate": 2.976296339906904e-05, + "loss": 0.0415, + "step": 31150 + }, + { + "epoch": 0.06872018578268455, + "grad_norm": 0.09837694466114044, + "learning_rate": 2.976267052304275e-05, + "loss": 0.0406, + "step": 31160 + }, + { + "epoch": 0.0687422397575827, + "grad_norm": 0.11231237649917603, + "learning_rate": 2.9762377467636067e-05, + "loss": 0.0422, + "step": 31170 + }, + { + "epoch": 0.06876429373248087, + "grad_norm": 0.12371707707643509, + "learning_rate": 2.9762084232852566e-05, + "loss": 0.04, + "step": 31180 + }, + { + "epoch": 0.06878634770737904, + "grad_norm": 0.12943044304847717, + "learning_rate": 2.9761790818695805e-05, + "loss": 0.0389, + "step": 31190 + }, + { + "epoch": 0.06880840168227721, + "grad_norm": 0.13333889842033386, + "learning_rate": 2.9761497225169345e-05, + "loss": 0.0401, + "step": 31200 + }, + { + "epoch": 0.06883045565717537, + "grad_norm": 0.14035840332508087, + "learning_rate": 2.976120345227676e-05, + "loss": 0.039, + "step": 31210 + }, + { + "epoch": 0.06885250963207354, + "grad_norm": 0.11710336804389954, + "learning_rate": 2.9760909500021617e-05, + "loss": 0.041, + "step": 31220 + }, + { + "epoch": 0.06887456360697171, + "grad_norm": 0.12435390800237656, + "learning_rate": 2.9760615368407482e-05, + "loss": 0.0408, + "step": 31230 + }, + { + "epoch": 0.06889661758186987, + "grad_norm": 0.1679300218820572, + "learning_rate": 2.9760321057437935e-05, + "loss": 0.039, + "step": 31240 + }, + { + "epoch": 0.06891867155676804, + "grad_norm": 0.11185582727193832, + "learning_rate": 2.9760026567116555e-05, + "loss": 0.042, + "step": 31250 + }, + { + "epoch": 0.0689407255316662, + "grad_norm": 0.10576479882001877, + "learning_rate": 2.975973189744691e-05, + "loss": 0.0391, + "step": 31260 + }, + { + "epoch": 0.06896277950656436, + "grad_norm": 0.12568922340869904, + "learning_rate": 2.975943704843259e-05, + "loss": 0.0388, + "step": 31270 + }, + { + "epoch": 0.06898483348146253, + "grad_norm": 0.15211091935634613, + "learning_rate": 2.9759142020077177e-05, + "loss": 0.0419, + "step": 31280 + }, + { + "epoch": 0.0690068874563607, + "grad_norm": 0.13705866038799286, + "learning_rate": 2.975884681238425e-05, + "loss": 0.0399, + "step": 31290 + }, + { + "epoch": 0.06902894143125886, + "grad_norm": 0.12038538604974747, + "learning_rate": 2.9758551425357404e-05, + "loss": 0.04, + "step": 31300 + }, + { + "epoch": 0.06905099540615703, + "grad_norm": 0.13527387380599976, + "learning_rate": 2.975825585900022e-05, + "loss": 0.0395, + "step": 31310 + }, + { + "epoch": 0.0690730493810552, + "grad_norm": 0.12782028317451477, + "learning_rate": 2.975796011331629e-05, + "loss": 0.0403, + "step": 31320 + }, + { + "epoch": 0.06909510335595336, + "grad_norm": 0.11039614677429199, + "learning_rate": 2.975766418830922e-05, + "loss": 0.0373, + "step": 31330 + }, + { + "epoch": 0.06911715733085153, + "grad_norm": 0.1392749696969986, + "learning_rate": 2.9757368083982588e-05, + "loss": 0.0402, + "step": 31340 + }, + { + "epoch": 0.0691392113057497, + "grad_norm": 0.10987646132707596, + "learning_rate": 2.9757071800340006e-05, + "loss": 0.0389, + "step": 31350 + }, + { + "epoch": 0.06916126528064785, + "grad_norm": 0.11782520264387131, + "learning_rate": 2.9756775337385065e-05, + "loss": 0.0424, + "step": 31360 + }, + { + "epoch": 0.06918331925554602, + "grad_norm": 0.14623479545116425, + "learning_rate": 2.9756478695121375e-05, + "loss": 0.0389, + "step": 31370 + }, + { + "epoch": 0.06920537323044419, + "grad_norm": 0.12158558517694473, + "learning_rate": 2.9756181873552532e-05, + "loss": 0.0419, + "step": 31380 + }, + { + "epoch": 0.06922742720534235, + "grad_norm": 0.14335772395133972, + "learning_rate": 2.975588487268215e-05, + "loss": 0.0415, + "step": 31390 + }, + { + "epoch": 0.06924948118024052, + "grad_norm": 0.1318081021308899, + "learning_rate": 2.9755587692513834e-05, + "loss": 0.0414, + "step": 31400 + }, + { + "epoch": 0.06927153515513869, + "grad_norm": 0.16460323333740234, + "learning_rate": 2.9755290333051195e-05, + "loss": 0.0385, + "step": 31410 + }, + { + "epoch": 0.06929358913003686, + "grad_norm": 0.12364394217729568, + "learning_rate": 2.975499279429785e-05, + "loss": 0.0381, + "step": 31420 + }, + { + "epoch": 0.06931564310493502, + "grad_norm": 0.17804530262947083, + "learning_rate": 2.975469507625741e-05, + "loss": 0.0394, + "step": 31430 + }, + { + "epoch": 0.06933769707983319, + "grad_norm": 0.1496461033821106, + "learning_rate": 2.9754397178933494e-05, + "loss": 0.0392, + "step": 31440 + }, + { + "epoch": 0.06935975105473136, + "grad_norm": 0.13442964851856232, + "learning_rate": 2.975409910232972e-05, + "loss": 0.0376, + "step": 31450 + }, + { + "epoch": 0.06938180502962951, + "grad_norm": 0.17381612956523895, + "learning_rate": 2.9753800846449714e-05, + "loss": 0.0405, + "step": 31460 + }, + { + "epoch": 0.06940385900452768, + "grad_norm": 0.1226225197315216, + "learning_rate": 2.9753502411297096e-05, + "loss": 0.0393, + "step": 31470 + }, + { + "epoch": 0.06942591297942585, + "grad_norm": 0.16826418042182922, + "learning_rate": 2.9753203796875498e-05, + "loss": 0.0408, + "step": 31480 + }, + { + "epoch": 0.06944796695432401, + "grad_norm": 0.10882978141307831, + "learning_rate": 2.9752905003188542e-05, + "loss": 0.0396, + "step": 31490 + }, + { + "epoch": 0.06947002092922218, + "grad_norm": 0.10388171672821045, + "learning_rate": 2.975260603023986e-05, + "loss": 0.0407, + "step": 31500 + }, + { + "epoch": 0.06949207490412035, + "grad_norm": 0.11474250257015228, + "learning_rate": 2.9752306878033085e-05, + "loss": 0.0399, + "step": 31510 + }, + { + "epoch": 0.0695141288790185, + "grad_norm": 0.13210387527942657, + "learning_rate": 2.9752007546571856e-05, + "loss": 0.0398, + "step": 31520 + }, + { + "epoch": 0.06953618285391668, + "grad_norm": 0.14991407096385956, + "learning_rate": 2.9751708035859804e-05, + "loss": 0.0399, + "step": 31530 + }, + { + "epoch": 0.06955823682881485, + "grad_norm": 0.10414779186248779, + "learning_rate": 2.9751408345900574e-05, + "loss": 0.0385, + "step": 31540 + }, + { + "epoch": 0.069580290803713, + "grad_norm": 0.09959672391414642, + "learning_rate": 2.9751108476697806e-05, + "loss": 0.0395, + "step": 31550 + }, + { + "epoch": 0.06960234477861117, + "grad_norm": 0.12994113564491272, + "learning_rate": 2.975080842825514e-05, + "loss": 0.0389, + "step": 31560 + }, + { + "epoch": 0.06962439875350934, + "grad_norm": 0.14260141551494598, + "learning_rate": 2.9750508200576227e-05, + "loss": 0.0405, + "step": 31570 + }, + { + "epoch": 0.0696464527284075, + "grad_norm": 0.12062584608793259, + "learning_rate": 2.975020779366471e-05, + "loss": 0.0398, + "step": 31580 + }, + { + "epoch": 0.06966850670330567, + "grad_norm": 0.14911897480487823, + "learning_rate": 2.9749907207524245e-05, + "loss": 0.0402, + "step": 31590 + }, + { + "epoch": 0.06969056067820384, + "grad_norm": 0.14993271231651306, + "learning_rate": 2.974960644215848e-05, + "loss": 0.041, + "step": 31600 + }, + { + "epoch": 0.069712614653102, + "grad_norm": 0.11207881569862366, + "learning_rate": 2.974930549757107e-05, + "loss": 0.0375, + "step": 31610 + }, + { + "epoch": 0.06973466862800017, + "grad_norm": 0.12828174233436584, + "learning_rate": 2.9749004373765672e-05, + "loss": 0.0402, + "step": 31620 + }, + { + "epoch": 0.06975672260289834, + "grad_norm": 0.11366057395935059, + "learning_rate": 2.9748703070745944e-05, + "loss": 0.0393, + "step": 31630 + }, + { + "epoch": 0.0697787765777965, + "grad_norm": 0.11042144149541855, + "learning_rate": 2.974840158851555e-05, + "loss": 0.0384, + "step": 31640 + }, + { + "epoch": 0.06980083055269466, + "grad_norm": 0.09291733056306839, + "learning_rate": 2.9748099927078155e-05, + "loss": 0.0402, + "step": 31650 + }, + { + "epoch": 0.06982288452759283, + "grad_norm": 0.1249893382191658, + "learning_rate": 2.974779808643742e-05, + "loss": 0.0391, + "step": 31660 + }, + { + "epoch": 0.069844938502491, + "grad_norm": 0.11629047244787216, + "learning_rate": 2.974749606659701e-05, + "loss": 0.0368, + "step": 31670 + }, + { + "epoch": 0.06986699247738916, + "grad_norm": 0.15348775684833527, + "learning_rate": 2.97471938675606e-05, + "loss": 0.0415, + "step": 31680 + }, + { + "epoch": 0.06988904645228733, + "grad_norm": 0.11244700849056244, + "learning_rate": 2.9746891489331865e-05, + "loss": 0.04, + "step": 31690 + }, + { + "epoch": 0.0699111004271855, + "grad_norm": 0.14054222404956818, + "learning_rate": 2.9746588931914473e-05, + "loss": 0.0411, + "step": 31700 + }, + { + "epoch": 0.06993315440208366, + "grad_norm": 0.1336311250925064, + "learning_rate": 2.97462861953121e-05, + "loss": 0.0387, + "step": 31710 + }, + { + "epoch": 0.06995520837698183, + "grad_norm": 0.12294085323810577, + "learning_rate": 2.9745983279528428e-05, + "loss": 0.0399, + "step": 31720 + }, + { + "epoch": 0.06997726235188, + "grad_norm": 0.14283297955989838, + "learning_rate": 2.9745680184567138e-05, + "loss": 0.0394, + "step": 31730 + }, + { + "epoch": 0.06999931632677815, + "grad_norm": 0.18980836868286133, + "learning_rate": 2.9745376910431912e-05, + "loss": 0.0391, + "step": 31740 + }, + { + "epoch": 0.07002137030167632, + "grad_norm": 0.11333638429641724, + "learning_rate": 2.9745073457126433e-05, + "loss": 0.0388, + "step": 31750 + }, + { + "epoch": 0.07004342427657449, + "grad_norm": 0.11405391991138458, + "learning_rate": 2.9744769824654393e-05, + "loss": 0.0391, + "step": 31760 + }, + { + "epoch": 0.07006547825147265, + "grad_norm": 0.14727716147899628, + "learning_rate": 2.9744466013019476e-05, + "loss": 0.0414, + "step": 31770 + }, + { + "epoch": 0.07008753222637082, + "grad_norm": 0.13629095256328583, + "learning_rate": 2.974416202222537e-05, + "loss": 0.0393, + "step": 31780 + }, + { + "epoch": 0.07010958620126899, + "grad_norm": 0.09996520727872849, + "learning_rate": 2.9743857852275784e-05, + "loss": 0.038, + "step": 31790 + }, + { + "epoch": 0.07013164017616715, + "grad_norm": 0.15567700564861298, + "learning_rate": 2.9743553503174402e-05, + "loss": 0.0402, + "step": 31800 + }, + { + "epoch": 0.07015369415106532, + "grad_norm": 0.10208208858966827, + "learning_rate": 2.9743248974924922e-05, + "loss": 0.0404, + "step": 31810 + }, + { + "epoch": 0.07017574812596349, + "grad_norm": 0.11796292662620544, + "learning_rate": 2.974294426753105e-05, + "loss": 0.0397, + "step": 31820 + }, + { + "epoch": 0.07019780210086164, + "grad_norm": 0.12518438696861267, + "learning_rate": 2.9742639380996482e-05, + "loss": 0.0413, + "step": 31830 + }, + { + "epoch": 0.07021985607575981, + "grad_norm": 0.10887070745229721, + "learning_rate": 2.974233431532493e-05, + "loss": 0.0391, + "step": 31840 + }, + { + "epoch": 0.07024191005065798, + "grad_norm": 0.12282193452119827, + "learning_rate": 2.9742029070520094e-05, + "loss": 0.0428, + "step": 31850 + }, + { + "epoch": 0.07026396402555615, + "grad_norm": 0.11005564779043198, + "learning_rate": 2.974172364658569e-05, + "loss": 0.0409, + "step": 31860 + }, + { + "epoch": 0.07028601800045431, + "grad_norm": 0.1567414551973343, + "learning_rate": 2.974141804352542e-05, + "loss": 0.0423, + "step": 31870 + }, + { + "epoch": 0.07030807197535248, + "grad_norm": 0.1534113883972168, + "learning_rate": 2.9741112261343005e-05, + "loss": 0.0418, + "step": 31880 + }, + { + "epoch": 0.07033012595025065, + "grad_norm": 0.11632145196199417, + "learning_rate": 2.9740806300042158e-05, + "loss": 0.0399, + "step": 31890 + }, + { + "epoch": 0.0703521799251488, + "grad_norm": 0.1472872793674469, + "learning_rate": 2.97405001596266e-05, + "loss": 0.0382, + "step": 31900 + }, + { + "epoch": 0.07037423390004698, + "grad_norm": 0.16195593774318695, + "learning_rate": 2.974019384010004e-05, + "loss": 0.0418, + "step": 31910 + }, + { + "epoch": 0.07039628787494515, + "grad_norm": 0.13439802825450897, + "learning_rate": 2.9739887341466216e-05, + "loss": 0.0374, + "step": 31920 + }, + { + "epoch": 0.0704183418498433, + "grad_norm": 0.11344728618860245, + "learning_rate": 2.973958066372884e-05, + "loss": 0.0384, + "step": 31930 + }, + { + "epoch": 0.07044039582474147, + "grad_norm": 0.10691297054290771, + "learning_rate": 2.9739273806891644e-05, + "loss": 0.0403, + "step": 31940 + }, + { + "epoch": 0.07046244979963964, + "grad_norm": 0.13919858634471893, + "learning_rate": 2.973896677095835e-05, + "loss": 0.0395, + "step": 31950 + }, + { + "epoch": 0.0704845037745378, + "grad_norm": 0.13464435935020447, + "learning_rate": 2.97386595559327e-05, + "loss": 0.0422, + "step": 31960 + }, + { + "epoch": 0.07050655774943597, + "grad_norm": 0.09811936318874359, + "learning_rate": 2.973835216181842e-05, + "loss": 0.037, + "step": 31970 + }, + { + "epoch": 0.07052861172433414, + "grad_norm": 0.10981761664152145, + "learning_rate": 2.9738044588619244e-05, + "loss": 0.0378, + "step": 31980 + }, + { + "epoch": 0.0705506656992323, + "grad_norm": 0.10773103684186935, + "learning_rate": 2.9737736836338912e-05, + "loss": 0.041, + "step": 31990 + }, + { + "epoch": 0.07057271967413047, + "grad_norm": 0.13661523163318634, + "learning_rate": 2.9737428904981162e-05, + "loss": 0.0376, + "step": 32000 + }, + { + "epoch": 0.07059477364902864, + "grad_norm": 0.14576107263565063, + "learning_rate": 2.973712079454974e-05, + "loss": 0.0379, + "step": 32010 + }, + { + "epoch": 0.07061682762392679, + "grad_norm": 0.12173331528902054, + "learning_rate": 2.9736812505048378e-05, + "loss": 0.04, + "step": 32020 + }, + { + "epoch": 0.07063888159882496, + "grad_norm": 0.1243881806731224, + "learning_rate": 2.9736504036480835e-05, + "loss": 0.0397, + "step": 32030 + }, + { + "epoch": 0.07066093557372313, + "grad_norm": 0.13886478543281555, + "learning_rate": 2.9736195388850855e-05, + "loss": 0.0404, + "step": 32040 + }, + { + "epoch": 0.07068298954862129, + "grad_norm": 0.11620663106441498, + "learning_rate": 2.973588656216219e-05, + "loss": 0.0391, + "step": 32050 + }, + { + "epoch": 0.07070504352351946, + "grad_norm": 0.11165710538625717, + "learning_rate": 2.9735577556418587e-05, + "loss": 0.0403, + "step": 32060 + }, + { + "epoch": 0.07072709749841763, + "grad_norm": 0.12163511663675308, + "learning_rate": 2.973526837162381e-05, + "loss": 0.0413, + "step": 32070 + }, + { + "epoch": 0.0707491514733158, + "grad_norm": 0.09699960798025131, + "learning_rate": 2.97349590077816e-05, + "loss": 0.0422, + "step": 32080 + }, + { + "epoch": 0.07077120544821396, + "grad_norm": 0.14071735739707947, + "learning_rate": 2.973464946489573e-05, + "loss": 0.0411, + "step": 32090 + }, + { + "epoch": 0.07079325942311213, + "grad_norm": 0.16062027215957642, + "learning_rate": 2.973433974296996e-05, + "loss": 0.0394, + "step": 32100 + }, + { + "epoch": 0.0708153133980103, + "grad_norm": 0.10037723183631897, + "learning_rate": 2.973402984200805e-05, + "loss": 0.0395, + "step": 32110 + }, + { + "epoch": 0.07083736737290845, + "grad_norm": 0.11685647815465927, + "learning_rate": 2.973371976201376e-05, + "loss": 0.0382, + "step": 32120 + }, + { + "epoch": 0.07085942134780662, + "grad_norm": 0.11724628508090973, + "learning_rate": 2.9733409502990868e-05, + "loss": 0.0401, + "step": 32130 + }, + { + "epoch": 0.07088147532270479, + "grad_norm": 0.1262110322713852, + "learning_rate": 2.9733099064943143e-05, + "loss": 0.0395, + "step": 32140 + }, + { + "epoch": 0.07090352929760295, + "grad_norm": 0.10844334214925766, + "learning_rate": 2.973278844787435e-05, + "loss": 0.0397, + "step": 32150 + }, + { + "epoch": 0.07092558327250112, + "grad_norm": 0.09907805174589157, + "learning_rate": 2.973247765178827e-05, + "loss": 0.0407, + "step": 32160 + }, + { + "epoch": 0.07094763724739929, + "grad_norm": 0.1079014465212822, + "learning_rate": 2.9732166676688675e-05, + "loss": 0.0381, + "step": 32170 + }, + { + "epoch": 0.07096969122229745, + "grad_norm": 0.14213871955871582, + "learning_rate": 2.9731855522579344e-05, + "loss": 0.0406, + "step": 32180 + }, + { + "epoch": 0.07099174519719562, + "grad_norm": 0.11668550968170166, + "learning_rate": 2.973154418946406e-05, + "loss": 0.0396, + "step": 32190 + }, + { + "epoch": 0.07101379917209379, + "grad_norm": 0.1394353210926056, + "learning_rate": 2.973123267734661e-05, + "loss": 0.0382, + "step": 32200 + }, + { + "epoch": 0.07103585314699194, + "grad_norm": 0.12148043513298035, + "learning_rate": 2.973092098623077e-05, + "loss": 0.0382, + "step": 32210 + }, + { + "epoch": 0.07105790712189011, + "grad_norm": 0.115531325340271, + "learning_rate": 2.9730609116120334e-05, + "loss": 0.0391, + "step": 32220 + }, + { + "epoch": 0.07107996109678828, + "grad_norm": 0.1322699785232544, + "learning_rate": 2.9730297067019087e-05, + "loss": 0.0379, + "step": 32230 + }, + { + "epoch": 0.07110201507168644, + "grad_norm": 0.10686573386192322, + "learning_rate": 2.9729984838930824e-05, + "loss": 0.0389, + "step": 32240 + }, + { + "epoch": 0.07112406904658461, + "grad_norm": 0.12596282362937927, + "learning_rate": 2.9729672431859336e-05, + "loss": 0.0399, + "step": 32250 + }, + { + "epoch": 0.07114612302148278, + "grad_norm": 0.13349129259586334, + "learning_rate": 2.972935984580842e-05, + "loss": 0.038, + "step": 32260 + }, + { + "epoch": 0.07116817699638095, + "grad_norm": 0.10382160544395447, + "learning_rate": 2.9729047080781876e-05, + "loss": 0.0382, + "step": 32270 + }, + { + "epoch": 0.0711902309712791, + "grad_norm": 0.11482575535774231, + "learning_rate": 2.9728734136783507e-05, + "loss": 0.0396, + "step": 32280 + }, + { + "epoch": 0.07121228494617728, + "grad_norm": 0.11446287482976913, + "learning_rate": 2.9728421013817112e-05, + "loss": 0.0403, + "step": 32290 + }, + { + "epoch": 0.07123433892107545, + "grad_norm": 0.12304499000310898, + "learning_rate": 2.9728107711886493e-05, + "loss": 0.0384, + "step": 32300 + }, + { + "epoch": 0.0712563928959736, + "grad_norm": 0.11574438959360123, + "learning_rate": 2.972779423099546e-05, + "loss": 0.0393, + "step": 32310 + }, + { + "epoch": 0.07127844687087177, + "grad_norm": 0.13344407081604004, + "learning_rate": 2.972748057114782e-05, + "loss": 0.0388, + "step": 32320 + }, + { + "epoch": 0.07130050084576994, + "grad_norm": 0.1153254359960556, + "learning_rate": 2.972716673234739e-05, + "loss": 0.0391, + "step": 32330 + }, + { + "epoch": 0.0713225548206681, + "grad_norm": 0.12002410739660263, + "learning_rate": 2.9726852714597977e-05, + "loss": 0.0392, + "step": 32340 + }, + { + "epoch": 0.07134460879556627, + "grad_norm": 0.11877157539129257, + "learning_rate": 2.97265385179034e-05, + "loss": 0.0398, + "step": 32350 + }, + { + "epoch": 0.07136666277046444, + "grad_norm": 0.12831726670265198, + "learning_rate": 2.9726224142267473e-05, + "loss": 0.0385, + "step": 32360 + }, + { + "epoch": 0.0713887167453626, + "grad_norm": 0.1764262169599533, + "learning_rate": 2.972590958769402e-05, + "loss": 0.0411, + "step": 32370 + }, + { + "epoch": 0.07141077072026077, + "grad_norm": 0.137841135263443, + "learning_rate": 2.9725594854186865e-05, + "loss": 0.0408, + "step": 32380 + }, + { + "epoch": 0.07143282469515894, + "grad_norm": 0.15861499309539795, + "learning_rate": 2.9725279941749826e-05, + "loss": 0.0429, + "step": 32390 + }, + { + "epoch": 0.07145487867005709, + "grad_norm": 0.1492658108472824, + "learning_rate": 2.972496485038674e-05, + "loss": 0.04, + "step": 32400 + }, + { + "epoch": 0.07147693264495526, + "grad_norm": 0.13014976680278778, + "learning_rate": 2.972464958010142e-05, + "loss": 0.0414, + "step": 32410 + }, + { + "epoch": 0.07149898661985343, + "grad_norm": 0.13691984117031097, + "learning_rate": 2.972433413089771e-05, + "loss": 0.042, + "step": 32420 + }, + { + "epoch": 0.07152104059475159, + "grad_norm": 0.10611078143119812, + "learning_rate": 2.9724018502779433e-05, + "loss": 0.0406, + "step": 32430 + }, + { + "epoch": 0.07154309456964976, + "grad_norm": 0.1418273001909256, + "learning_rate": 2.9723702695750434e-05, + "loss": 0.038, + "step": 32440 + }, + { + "epoch": 0.07156514854454793, + "grad_norm": 0.10070324689149857, + "learning_rate": 2.972338670981454e-05, + "loss": 0.0388, + "step": 32450 + }, + { + "epoch": 0.07158720251944609, + "grad_norm": 0.1391269564628601, + "learning_rate": 2.9723070544975602e-05, + "loss": 0.0362, + "step": 32460 + }, + { + "epoch": 0.07160925649434426, + "grad_norm": 0.10996181517839432, + "learning_rate": 2.9722754201237454e-05, + "loss": 0.0384, + "step": 32470 + }, + { + "epoch": 0.07163131046924243, + "grad_norm": 0.14429883658885956, + "learning_rate": 2.9722437678603945e-05, + "loss": 0.0391, + "step": 32480 + }, + { + "epoch": 0.0716533644441406, + "grad_norm": 0.1396510750055313, + "learning_rate": 2.972212097707891e-05, + "loss": 0.0412, + "step": 32490 + }, + { + "epoch": 0.07167541841903875, + "grad_norm": 0.12826266884803772, + "learning_rate": 2.9721804096666214e-05, + "loss": 0.0404, + "step": 32500 + }, + { + "epoch": 0.07169747239393692, + "grad_norm": 0.1234617605805397, + "learning_rate": 2.9721487037369692e-05, + "loss": 0.0421, + "step": 32510 + }, + { + "epoch": 0.0717195263688351, + "grad_norm": 0.09272156655788422, + "learning_rate": 2.972116979919321e-05, + "loss": 0.0404, + "step": 32520 + }, + { + "epoch": 0.07174158034373325, + "grad_norm": 0.11710046231746674, + "learning_rate": 2.972085238214061e-05, + "loss": 0.0379, + "step": 32530 + }, + { + "epoch": 0.07176363431863142, + "grad_norm": 0.12001381069421768, + "learning_rate": 2.9720534786215755e-05, + "loss": 0.0401, + "step": 32540 + }, + { + "epoch": 0.07178568829352959, + "grad_norm": 0.1412166804075241, + "learning_rate": 2.9720217011422507e-05, + "loss": 0.0394, + "step": 32550 + }, + { + "epoch": 0.07180774226842775, + "grad_norm": 0.13436108827590942, + "learning_rate": 2.971989905776472e-05, + "loss": 0.0372, + "step": 32560 + }, + { + "epoch": 0.07182979624332592, + "grad_norm": 0.13092024624347687, + "learning_rate": 2.9719580925246267e-05, + "loss": 0.0372, + "step": 32570 + }, + { + "epoch": 0.07185185021822409, + "grad_norm": 0.09925015270709991, + "learning_rate": 2.9719262613871006e-05, + "loss": 0.0386, + "step": 32580 + }, + { + "epoch": 0.07187390419312224, + "grad_norm": 0.12031036615371704, + "learning_rate": 2.9718944123642803e-05, + "loss": 0.0389, + "step": 32590 + }, + { + "epoch": 0.07189595816802041, + "grad_norm": 0.14841657876968384, + "learning_rate": 2.971862545456554e-05, + "loss": 0.0412, + "step": 32600 + }, + { + "epoch": 0.07191801214291858, + "grad_norm": 0.10248025506734848, + "learning_rate": 2.9718306606643072e-05, + "loss": 0.0407, + "step": 32610 + }, + { + "epoch": 0.07194006611781674, + "grad_norm": 0.11261311173439026, + "learning_rate": 2.9717987579879288e-05, + "loss": 0.0409, + "step": 32620 + }, + { + "epoch": 0.07196212009271491, + "grad_norm": 0.10548077523708344, + "learning_rate": 2.9717668374278056e-05, + "loss": 0.0408, + "step": 32630 + }, + { + "epoch": 0.07198417406761308, + "grad_norm": 0.12139672040939331, + "learning_rate": 2.971734898984326e-05, + "loss": 0.0399, + "step": 32640 + }, + { + "epoch": 0.07200622804251124, + "grad_norm": 0.12610778212547302, + "learning_rate": 2.971702942657878e-05, + "loss": 0.0414, + "step": 32650 + }, + { + "epoch": 0.0720282820174094, + "grad_norm": 0.10855897516012192, + "learning_rate": 2.971670968448849e-05, + "loss": 0.0409, + "step": 32660 + }, + { + "epoch": 0.07205033599230758, + "grad_norm": 0.10342229157686234, + "learning_rate": 2.9716389763576287e-05, + "loss": 0.0388, + "step": 32670 + }, + { + "epoch": 0.07207238996720573, + "grad_norm": 0.12267552316188812, + "learning_rate": 2.9716069663846057e-05, + "loss": 0.0397, + "step": 32680 + }, + { + "epoch": 0.0720944439421039, + "grad_norm": 0.09150432795286179, + "learning_rate": 2.9715749385301682e-05, + "loss": 0.0395, + "step": 32690 + }, + { + "epoch": 0.07211649791700207, + "grad_norm": 0.1070980653166771, + "learning_rate": 2.971542892794706e-05, + "loss": 0.0368, + "step": 32700 + }, + { + "epoch": 0.07213855189190024, + "grad_norm": 0.1333264708518982, + "learning_rate": 2.9715108291786077e-05, + "loss": 0.042, + "step": 32710 + }, + { + "epoch": 0.0721606058667984, + "grad_norm": 0.12623271346092224, + "learning_rate": 2.9714787476822642e-05, + "loss": 0.0389, + "step": 32720 + }, + { + "epoch": 0.07218265984169657, + "grad_norm": 0.18516457080841064, + "learning_rate": 2.9714466483060645e-05, + "loss": 0.0419, + "step": 32730 + }, + { + "epoch": 0.07220471381659474, + "grad_norm": 0.1559920310974121, + "learning_rate": 2.9714145310503986e-05, + "loss": 0.0393, + "step": 32740 + }, + { + "epoch": 0.0722267677914929, + "grad_norm": 0.14576344192028046, + "learning_rate": 2.971382395915657e-05, + "loss": 0.0393, + "step": 32750 + }, + { + "epoch": 0.07224882176639107, + "grad_norm": 0.11235616356134415, + "learning_rate": 2.9713502429022298e-05, + "loss": 0.0387, + "step": 32760 + }, + { + "epoch": 0.07227087574128924, + "grad_norm": 0.14962108433246613, + "learning_rate": 2.9713180720105085e-05, + "loss": 0.0401, + "step": 32770 + }, + { + "epoch": 0.07229292971618739, + "grad_norm": 0.1518261432647705, + "learning_rate": 2.971285883240883e-05, + "loss": 0.0365, + "step": 32780 + }, + { + "epoch": 0.07231498369108556, + "grad_norm": 0.10574536770582199, + "learning_rate": 2.971253676593745e-05, + "loss": 0.0382, + "step": 32790 + }, + { + "epoch": 0.07233703766598373, + "grad_norm": 0.11082934588193893, + "learning_rate": 2.9712214520694858e-05, + "loss": 0.0387, + "step": 32800 + }, + { + "epoch": 0.07235909164088189, + "grad_norm": 0.11166096478700638, + "learning_rate": 2.971189209668497e-05, + "loss": 0.0386, + "step": 32810 + }, + { + "epoch": 0.07238114561578006, + "grad_norm": 0.15608637034893036, + "learning_rate": 2.9711569493911702e-05, + "loss": 0.0416, + "step": 32820 + }, + { + "epoch": 0.07240319959067823, + "grad_norm": 0.13830561935901642, + "learning_rate": 2.971124671237897e-05, + "loss": 0.0372, + "step": 32830 + }, + { + "epoch": 0.07242525356557639, + "grad_norm": 0.11233378201723099, + "learning_rate": 2.9710923752090705e-05, + "loss": 0.0391, + "step": 32840 + }, + { + "epoch": 0.07244730754047456, + "grad_norm": 0.11852199584245682, + "learning_rate": 2.9710600613050826e-05, + "loss": 0.037, + "step": 32850 + }, + { + "epoch": 0.07246936151537273, + "grad_norm": 0.1319926530122757, + "learning_rate": 2.9710277295263264e-05, + "loss": 0.0398, + "step": 32860 + }, + { + "epoch": 0.07249141549027088, + "grad_norm": 0.13762064278125763, + "learning_rate": 2.970995379873194e-05, + "loss": 0.0398, + "step": 32870 + }, + { + "epoch": 0.07251346946516905, + "grad_norm": 0.1315036565065384, + "learning_rate": 2.9709630123460788e-05, + "loss": 0.04, + "step": 32880 + }, + { + "epoch": 0.07253552344006722, + "grad_norm": 0.12268552929162979, + "learning_rate": 2.9709306269453744e-05, + "loss": 0.0397, + "step": 32890 + }, + { + "epoch": 0.07255757741496538, + "grad_norm": 0.09557361155748367, + "learning_rate": 2.9708982236714745e-05, + "loss": 0.0384, + "step": 32900 + }, + { + "epoch": 0.07257963138986355, + "grad_norm": 0.14803294837474823, + "learning_rate": 2.970865802524772e-05, + "loss": 0.0376, + "step": 32910 + }, + { + "epoch": 0.07260168536476172, + "grad_norm": 0.16795916855335236, + "learning_rate": 2.9708333635056608e-05, + "loss": 0.0398, + "step": 32920 + }, + { + "epoch": 0.07262373933965989, + "grad_norm": 0.11695367097854614, + "learning_rate": 2.970800906614536e-05, + "loss": 0.0369, + "step": 32930 + }, + { + "epoch": 0.07264579331455805, + "grad_norm": 0.10699296742677689, + "learning_rate": 2.9707684318517917e-05, + "loss": 0.0395, + "step": 32940 + }, + { + "epoch": 0.07266784728945622, + "grad_norm": 0.12688133120536804, + "learning_rate": 2.9707359392178224e-05, + "loss": 0.0406, + "step": 32950 + }, + { + "epoch": 0.07268990126435439, + "grad_norm": 0.13540039956569672, + "learning_rate": 2.9707034287130225e-05, + "loss": 0.0415, + "step": 32960 + }, + { + "epoch": 0.07271195523925254, + "grad_norm": 0.12903477251529694, + "learning_rate": 2.9706709003377875e-05, + "loss": 0.0387, + "step": 32970 + }, + { + "epoch": 0.07273400921415071, + "grad_norm": 0.1152138039469719, + "learning_rate": 2.9706383540925124e-05, + "loss": 0.0378, + "step": 32980 + }, + { + "epoch": 0.07275606318904888, + "grad_norm": 0.11113075911998749, + "learning_rate": 2.9706057899775926e-05, + "loss": 0.0391, + "step": 32990 + }, + { + "epoch": 0.07277811716394704, + "grad_norm": 0.1132456511259079, + "learning_rate": 2.9705732079934245e-05, + "loss": 0.0398, + "step": 33000 + }, + { + "epoch": 0.07280017113884521, + "grad_norm": 0.11721281707286835, + "learning_rate": 2.970540608140403e-05, + "loss": 0.0395, + "step": 33010 + }, + { + "epoch": 0.07282222511374338, + "grad_norm": 0.1298617571592331, + "learning_rate": 2.970507990418925e-05, + "loss": 0.0409, + "step": 33020 + }, + { + "epoch": 0.07284427908864154, + "grad_norm": 0.104324109852314, + "learning_rate": 2.9704753548293865e-05, + "loss": 0.0379, + "step": 33030 + }, + { + "epoch": 0.0728663330635397, + "grad_norm": 0.14046324789524078, + "learning_rate": 2.970442701372184e-05, + "loss": 0.0396, + "step": 33040 + }, + { + "epoch": 0.07288838703843788, + "grad_norm": 0.11257212609052658, + "learning_rate": 2.9704100300477144e-05, + "loss": 0.0366, + "step": 33050 + }, + { + "epoch": 0.07291044101333603, + "grad_norm": 0.12257816642522812, + "learning_rate": 2.9703773408563745e-05, + "loss": 0.0399, + "step": 33060 + }, + { + "epoch": 0.0729324949882342, + "grad_norm": 0.12276660650968552, + "learning_rate": 2.9703446337985618e-05, + "loss": 0.0407, + "step": 33070 + }, + { + "epoch": 0.07295454896313237, + "grad_norm": 0.13131879270076752, + "learning_rate": 2.970311908874673e-05, + "loss": 0.0372, + "step": 33080 + }, + { + "epoch": 0.07297660293803053, + "grad_norm": 0.12692247331142426, + "learning_rate": 2.9702791660851072e-05, + "loss": 0.0412, + "step": 33090 + }, + { + "epoch": 0.0729986569129287, + "grad_norm": 0.0775655061006546, + "learning_rate": 2.970246405430261e-05, + "loss": 0.0389, + "step": 33100 + }, + { + "epoch": 0.07302071088782687, + "grad_norm": 0.15934430062770844, + "learning_rate": 2.9702136269105325e-05, + "loss": 0.0399, + "step": 33110 + }, + { + "epoch": 0.07304276486272503, + "grad_norm": 0.1204560175538063, + "learning_rate": 2.970180830526321e-05, + "loss": 0.0401, + "step": 33120 + }, + { + "epoch": 0.0730648188376232, + "grad_norm": 0.12351813912391663, + "learning_rate": 2.9701480162780236e-05, + "loss": 0.0389, + "step": 33130 + }, + { + "epoch": 0.07308687281252137, + "grad_norm": 0.1288272887468338, + "learning_rate": 2.97011518416604e-05, + "loss": 0.0399, + "step": 33140 + }, + { + "epoch": 0.07310892678741954, + "grad_norm": 0.1320098638534546, + "learning_rate": 2.9700823341907693e-05, + "loss": 0.0409, + "step": 33150 + }, + { + "epoch": 0.0731309807623177, + "grad_norm": 0.09014056622982025, + "learning_rate": 2.9700494663526097e-05, + "loss": 0.0387, + "step": 33160 + }, + { + "epoch": 0.07315303473721586, + "grad_norm": 0.12352600693702698, + "learning_rate": 2.9700165806519614e-05, + "loss": 0.0405, + "step": 33170 + }, + { + "epoch": 0.07317508871211403, + "grad_norm": 0.10576041042804718, + "learning_rate": 2.9699836770892236e-05, + "loss": 0.04, + "step": 33180 + }, + { + "epoch": 0.07319714268701219, + "grad_norm": 0.1332453340291977, + "learning_rate": 2.9699507556647964e-05, + "loss": 0.0387, + "step": 33190 + }, + { + "epoch": 0.07321919666191036, + "grad_norm": 0.1529841274023056, + "learning_rate": 2.9699178163790793e-05, + "loss": 0.042, + "step": 33200 + }, + { + "epoch": 0.07324125063680853, + "grad_norm": 0.14023855328559875, + "learning_rate": 2.9698848592324735e-05, + "loss": 0.0393, + "step": 33210 + }, + { + "epoch": 0.07326330461170669, + "grad_norm": 0.13656708598136902, + "learning_rate": 2.969851884225378e-05, + "loss": 0.0372, + "step": 33220 + }, + { + "epoch": 0.07328535858660486, + "grad_norm": 0.1468542516231537, + "learning_rate": 2.969818891358195e-05, + "loss": 0.0402, + "step": 33230 + }, + { + "epoch": 0.07330741256150303, + "grad_norm": 0.09555530548095703, + "learning_rate": 2.969785880631325e-05, + "loss": 0.0402, + "step": 33240 + }, + { + "epoch": 0.07332946653640118, + "grad_norm": 0.1209111362695694, + "learning_rate": 2.9697528520451685e-05, + "loss": 0.039, + "step": 33250 + }, + { + "epoch": 0.07335152051129935, + "grad_norm": 0.10545884072780609, + "learning_rate": 2.9697198056001267e-05, + "loss": 0.0409, + "step": 33260 + }, + { + "epoch": 0.07337357448619752, + "grad_norm": 0.12293323874473572, + "learning_rate": 2.969686741296602e-05, + "loss": 0.039, + "step": 33270 + }, + { + "epoch": 0.07339562846109568, + "grad_norm": 0.16621579229831696, + "learning_rate": 2.969653659134996e-05, + "loss": 0.0418, + "step": 33280 + }, + { + "epoch": 0.07341768243599385, + "grad_norm": 0.14332464337348938, + "learning_rate": 2.96962055911571e-05, + "loss": 0.0398, + "step": 33290 + }, + { + "epoch": 0.07343973641089202, + "grad_norm": 0.12149569392204285, + "learning_rate": 2.969587441239147e-05, + "loss": 0.0354, + "step": 33300 + }, + { + "epoch": 0.07346179038579018, + "grad_norm": 0.13621100783348083, + "learning_rate": 2.969554305505709e-05, + "loss": 0.0401, + "step": 33310 + }, + { + "epoch": 0.07348384436068835, + "grad_norm": 0.1318879872560501, + "learning_rate": 2.9695211519157986e-05, + "loss": 0.0397, + "step": 33320 + }, + { + "epoch": 0.07350589833558652, + "grad_norm": 0.15951064229011536, + "learning_rate": 2.9694879804698188e-05, + "loss": 0.0399, + "step": 33330 + }, + { + "epoch": 0.07352795231048467, + "grad_norm": 0.15135028958320618, + "learning_rate": 2.9694547911681728e-05, + "loss": 0.0384, + "step": 33340 + }, + { + "epoch": 0.07355000628538284, + "grad_norm": 0.12446792423725128, + "learning_rate": 2.9694215840112633e-05, + "loss": 0.0402, + "step": 33350 + }, + { + "epoch": 0.07357206026028101, + "grad_norm": 0.15430758893489838, + "learning_rate": 2.9693883589994945e-05, + "loss": 0.0419, + "step": 33360 + }, + { + "epoch": 0.07359411423517918, + "grad_norm": 0.09677799046039581, + "learning_rate": 2.9693551161332698e-05, + "loss": 0.0406, + "step": 33370 + }, + { + "epoch": 0.07361616821007734, + "grad_norm": 0.11681084334850311, + "learning_rate": 2.9693218554129928e-05, + "loss": 0.0404, + "step": 33380 + }, + { + "epoch": 0.07363822218497551, + "grad_norm": 0.09235836565494537, + "learning_rate": 2.9692885768390682e-05, + "loss": 0.0389, + "step": 33390 + }, + { + "epoch": 0.07366027615987368, + "grad_norm": 0.11269718408584595, + "learning_rate": 2.9692552804119005e-05, + "loss": 0.04, + "step": 33400 + }, + { + "epoch": 0.07368233013477184, + "grad_norm": 0.12898239493370056, + "learning_rate": 2.9692219661318936e-05, + "loss": 0.04, + "step": 33410 + }, + { + "epoch": 0.07370438410967001, + "grad_norm": 0.10012651979923248, + "learning_rate": 2.9691886339994527e-05, + "loss": 0.0394, + "step": 33420 + }, + { + "epoch": 0.07372643808456818, + "grad_norm": 0.1177351176738739, + "learning_rate": 2.9691552840149826e-05, + "loss": 0.0411, + "step": 33430 + }, + { + "epoch": 0.07374849205946633, + "grad_norm": 0.12069936841726303, + "learning_rate": 2.969121916178889e-05, + "loss": 0.041, + "step": 33440 + }, + { + "epoch": 0.0737705460343645, + "grad_norm": 0.14534145593643188, + "learning_rate": 2.9690885304915764e-05, + "loss": 0.0369, + "step": 33450 + }, + { + "epoch": 0.07379260000926267, + "grad_norm": 0.09803425520658493, + "learning_rate": 2.969055126953452e-05, + "loss": 0.04, + "step": 33460 + }, + { + "epoch": 0.07381465398416083, + "grad_norm": 0.1524192988872528, + "learning_rate": 2.9690217055649203e-05, + "loss": 0.0394, + "step": 33470 + }, + { + "epoch": 0.073836707959059, + "grad_norm": 0.13775880634784698, + "learning_rate": 2.9689882663263877e-05, + "loss": 0.0392, + "step": 33480 + }, + { + "epoch": 0.07385876193395717, + "grad_norm": 0.11945164948701859, + "learning_rate": 2.9689548092382613e-05, + "loss": 0.0388, + "step": 33490 + }, + { + "epoch": 0.07388081590885533, + "grad_norm": 0.1492708921432495, + "learning_rate": 2.9689213343009467e-05, + "loss": 0.0387, + "step": 33500 + }, + { + "epoch": 0.0739028698837535, + "grad_norm": 0.1519739180803299, + "learning_rate": 2.968887841514851e-05, + "loss": 0.0382, + "step": 33510 + }, + { + "epoch": 0.07392492385865167, + "grad_norm": 0.12197979539632797, + "learning_rate": 2.9688543308803808e-05, + "loss": 0.0398, + "step": 33520 + }, + { + "epoch": 0.07394697783354982, + "grad_norm": 0.12217320501804352, + "learning_rate": 2.9688208023979442e-05, + "loss": 0.038, + "step": 33530 + }, + { + "epoch": 0.073969031808448, + "grad_norm": 0.1174515038728714, + "learning_rate": 2.9687872560679478e-05, + "loss": 0.0416, + "step": 33540 + }, + { + "epoch": 0.07399108578334616, + "grad_norm": 0.11915493756532669, + "learning_rate": 2.9687536918907998e-05, + "loss": 0.0388, + "step": 33550 + }, + { + "epoch": 0.07401313975824433, + "grad_norm": 0.12593869864940643, + "learning_rate": 2.968720109866907e-05, + "loss": 0.0392, + "step": 33560 + }, + { + "epoch": 0.07403519373314249, + "grad_norm": 0.16303890943527222, + "learning_rate": 2.9686865099966784e-05, + "loss": 0.0428, + "step": 33570 + }, + { + "epoch": 0.07405724770804066, + "grad_norm": 0.13043926656246185, + "learning_rate": 2.9686528922805223e-05, + "loss": 0.0387, + "step": 33580 + }, + { + "epoch": 0.07407930168293883, + "grad_norm": 0.12104841321706772, + "learning_rate": 2.968619256718847e-05, + "loss": 0.0393, + "step": 33590 + }, + { + "epoch": 0.07410135565783699, + "grad_norm": 0.11580824106931686, + "learning_rate": 2.968585603312061e-05, + "loss": 0.0404, + "step": 33600 + }, + { + "epoch": 0.07412340963273516, + "grad_norm": 0.1186559721827507, + "learning_rate": 2.9685519320605736e-05, + "loss": 0.04, + "step": 33610 + }, + { + "epoch": 0.07414546360763333, + "grad_norm": 0.1492510885000229, + "learning_rate": 2.968518242964793e-05, + "loss": 0.041, + "step": 33620 + }, + { + "epoch": 0.07416751758253148, + "grad_norm": 0.14484427869319916, + "learning_rate": 2.9684845360251295e-05, + "loss": 0.0388, + "step": 33630 + }, + { + "epoch": 0.07418957155742965, + "grad_norm": 0.14049406349658966, + "learning_rate": 2.968450811241993e-05, + "loss": 0.0394, + "step": 33640 + }, + { + "epoch": 0.07421162553232782, + "grad_norm": 0.12042245268821716, + "learning_rate": 2.9684170686157922e-05, + "loss": 0.0407, + "step": 33650 + }, + { + "epoch": 0.07423367950722598, + "grad_norm": 0.15577581524848938, + "learning_rate": 2.9683833081469376e-05, + "loss": 0.0383, + "step": 33660 + }, + { + "epoch": 0.07425573348212415, + "grad_norm": 0.1315208226442337, + "learning_rate": 2.9683495298358395e-05, + "loss": 0.0391, + "step": 33670 + }, + { + "epoch": 0.07427778745702232, + "grad_norm": 0.1104339137673378, + "learning_rate": 2.9683157336829082e-05, + "loss": 0.0382, + "step": 33680 + }, + { + "epoch": 0.07429984143192048, + "grad_norm": 0.1150406077504158, + "learning_rate": 2.9682819196885543e-05, + "loss": 0.038, + "step": 33690 + }, + { + "epoch": 0.07432189540681865, + "grad_norm": 0.11574581265449524, + "learning_rate": 2.968248087853189e-05, + "loss": 0.0377, + "step": 33700 + }, + { + "epoch": 0.07434394938171682, + "grad_norm": 0.12300170212984085, + "learning_rate": 2.968214238177223e-05, + "loss": 0.0392, + "step": 33710 + }, + { + "epoch": 0.07436600335661497, + "grad_norm": 0.14337636530399323, + "learning_rate": 2.968180370661068e-05, + "loss": 0.042, + "step": 33720 + }, + { + "epoch": 0.07438805733151314, + "grad_norm": 0.14155159890651703, + "learning_rate": 2.9681464853051355e-05, + "loss": 0.0409, + "step": 33730 + }, + { + "epoch": 0.07441011130641131, + "grad_norm": 0.14888279139995575, + "learning_rate": 2.968112582109836e-05, + "loss": 0.0394, + "step": 33740 + }, + { + "epoch": 0.07443216528130947, + "grad_norm": 0.09924691915512085, + "learning_rate": 2.9680786610755835e-05, + "loss": 0.0395, + "step": 33750 + }, + { + "epoch": 0.07445421925620764, + "grad_norm": 0.13690027594566345, + "learning_rate": 2.9680447222027895e-05, + "loss": 0.0391, + "step": 33760 + }, + { + "epoch": 0.07447627323110581, + "grad_norm": 0.10647164285182953, + "learning_rate": 2.9680107654918648e-05, + "loss": 0.0405, + "step": 33770 + }, + { + "epoch": 0.07449832720600398, + "grad_norm": 0.1876927763223648, + "learning_rate": 2.967976790943224e-05, + "loss": 0.0386, + "step": 33780 + }, + { + "epoch": 0.07452038118090214, + "grad_norm": 0.11277896165847778, + "learning_rate": 2.9679427985572792e-05, + "loss": 0.0389, + "step": 33790 + }, + { + "epoch": 0.07454243515580031, + "grad_norm": 0.12401893734931946, + "learning_rate": 2.9679087883344436e-05, + "loss": 0.0374, + "step": 33800 + }, + { + "epoch": 0.07456448913069848, + "grad_norm": 0.13201646506786346, + "learning_rate": 2.96787476027513e-05, + "loss": 0.0408, + "step": 33810 + }, + { + "epoch": 0.07458654310559663, + "grad_norm": 0.11791165173053741, + "learning_rate": 2.967840714379752e-05, + "loss": 0.0366, + "step": 33820 + }, + { + "epoch": 0.0746085970804948, + "grad_norm": 0.14863739907741547, + "learning_rate": 2.967806650648724e-05, + "loss": 0.0406, + "step": 33830 + }, + { + "epoch": 0.07463065105539297, + "grad_norm": 0.12816445529460907, + "learning_rate": 2.967772569082459e-05, + "loss": 0.0388, + "step": 33840 + }, + { + "epoch": 0.07465270503029113, + "grad_norm": 0.13334254920482635, + "learning_rate": 2.9677384696813715e-05, + "loss": 0.039, + "step": 33850 + }, + { + "epoch": 0.0746747590051893, + "grad_norm": 0.11200153827667236, + "learning_rate": 2.9677043524458757e-05, + "loss": 0.0397, + "step": 33860 + }, + { + "epoch": 0.07469681298008747, + "grad_norm": 0.18028226494789124, + "learning_rate": 2.9676702173763865e-05, + "loss": 0.0421, + "step": 33870 + }, + { + "epoch": 0.07471886695498563, + "grad_norm": 0.08440309762954712, + "learning_rate": 2.967636064473318e-05, + "loss": 0.0405, + "step": 33880 + }, + { + "epoch": 0.0747409209298838, + "grad_norm": 0.1487039029598236, + "learning_rate": 2.9676018937370862e-05, + "loss": 0.0406, + "step": 33890 + }, + { + "epoch": 0.07476297490478197, + "grad_norm": 0.12291497737169266, + "learning_rate": 2.9675677051681056e-05, + "loss": 0.0413, + "step": 33900 + }, + { + "epoch": 0.07478502887968012, + "grad_norm": 0.1626415103673935, + "learning_rate": 2.9675334987667915e-05, + "loss": 0.0407, + "step": 33910 + }, + { + "epoch": 0.0748070828545783, + "grad_norm": 0.12788917124271393, + "learning_rate": 2.96749927453356e-05, + "loss": 0.0377, + "step": 33920 + }, + { + "epoch": 0.07482913682947646, + "grad_norm": 0.10427878051996231, + "learning_rate": 2.9674650324688266e-05, + "loss": 0.037, + "step": 33930 + }, + { + "epoch": 0.07485119080437462, + "grad_norm": 0.13428637385368347, + "learning_rate": 2.9674307725730078e-05, + "loss": 0.0403, + "step": 33940 + }, + { + "epoch": 0.07487324477927279, + "grad_norm": 0.12670661509037018, + "learning_rate": 2.9673964948465196e-05, + "loss": 0.0394, + "step": 33950 + }, + { + "epoch": 0.07489529875417096, + "grad_norm": 0.10395866632461548, + "learning_rate": 2.9673621992897785e-05, + "loss": 0.0402, + "step": 33960 + }, + { + "epoch": 0.07491735272906912, + "grad_norm": 0.10855694860219955, + "learning_rate": 2.967327885903201e-05, + "loss": 0.0419, + "step": 33970 + }, + { + "epoch": 0.07493940670396729, + "grad_norm": 0.1313048154115677, + "learning_rate": 2.9672935546872048e-05, + "loss": 0.0404, + "step": 33980 + }, + { + "epoch": 0.07496146067886546, + "grad_norm": 0.12220221757888794, + "learning_rate": 2.9672592056422063e-05, + "loss": 0.0388, + "step": 33990 + }, + { + "epoch": 0.07498351465376363, + "grad_norm": 0.10052171349525452, + "learning_rate": 2.967224838768623e-05, + "loss": 0.0397, + "step": 34000 + }, + { + "epoch": 0.07500556862866178, + "grad_norm": 0.13339562714099884, + "learning_rate": 2.9671904540668727e-05, + "loss": 0.0388, + "step": 34010 + }, + { + "epoch": 0.07502762260355995, + "grad_norm": 0.1337834894657135, + "learning_rate": 2.9671560515373734e-05, + "loss": 0.0379, + "step": 34020 + }, + { + "epoch": 0.07504967657845812, + "grad_norm": 0.1325923353433609, + "learning_rate": 2.9671216311805423e-05, + "loss": 0.0391, + "step": 34030 + }, + { + "epoch": 0.07507173055335628, + "grad_norm": 0.11486165970563889, + "learning_rate": 2.9670871929967984e-05, + "loss": 0.0395, + "step": 34040 + }, + { + "epoch": 0.07509378452825445, + "grad_norm": 0.20707623660564423, + "learning_rate": 2.96705273698656e-05, + "loss": 0.0427, + "step": 34050 + }, + { + "epoch": 0.07511583850315262, + "grad_norm": 0.11154941469430923, + "learning_rate": 2.967018263150246e-05, + "loss": 0.0393, + "step": 34060 + }, + { + "epoch": 0.07513789247805078, + "grad_norm": 0.11958188563585281, + "learning_rate": 2.9669837714882746e-05, + "loss": 0.0378, + "step": 34070 + }, + { + "epoch": 0.07515994645294895, + "grad_norm": 0.16058747470378876, + "learning_rate": 2.9669492620010654e-05, + "loss": 0.0405, + "step": 34080 + }, + { + "epoch": 0.07518200042784712, + "grad_norm": 0.17350651323795319, + "learning_rate": 2.966914734689038e-05, + "loss": 0.0416, + "step": 34090 + }, + { + "epoch": 0.07520405440274527, + "grad_norm": 0.12243831902742386, + "learning_rate": 2.966880189552611e-05, + "loss": 0.0401, + "step": 34100 + }, + { + "epoch": 0.07522610837764344, + "grad_norm": 0.12507973611354828, + "learning_rate": 2.9668456265922052e-05, + "loss": 0.0373, + "step": 34110 + }, + { + "epoch": 0.07524816235254161, + "grad_norm": 0.13748164474964142, + "learning_rate": 2.96681104580824e-05, + "loss": 0.0373, + "step": 34120 + }, + { + "epoch": 0.07527021632743977, + "grad_norm": 0.1151563972234726, + "learning_rate": 2.9667764472011355e-05, + "loss": 0.038, + "step": 34130 + }, + { + "epoch": 0.07529227030233794, + "grad_norm": 0.11738225817680359, + "learning_rate": 2.9667418307713127e-05, + "loss": 0.0392, + "step": 34140 + }, + { + "epoch": 0.07531432427723611, + "grad_norm": 0.13267357647418976, + "learning_rate": 2.966707196519191e-05, + "loss": 0.0376, + "step": 34150 + }, + { + "epoch": 0.07533637825213427, + "grad_norm": 0.15119466185569763, + "learning_rate": 2.9666725444451926e-05, + "loss": 0.0401, + "step": 34160 + }, + { + "epoch": 0.07535843222703244, + "grad_norm": 0.14913593232631683, + "learning_rate": 2.966637874549738e-05, + "loss": 0.0398, + "step": 34170 + }, + { + "epoch": 0.07538048620193061, + "grad_norm": 0.12931033968925476, + "learning_rate": 2.9666031868332487e-05, + "loss": 0.0386, + "step": 34180 + }, + { + "epoch": 0.07540254017682876, + "grad_norm": 0.11111156642436981, + "learning_rate": 2.966568481296146e-05, + "loss": 0.0368, + "step": 34190 + }, + { + "epoch": 0.07542459415172693, + "grad_norm": 0.1391536295413971, + "learning_rate": 2.9665337579388513e-05, + "loss": 0.0389, + "step": 34200 + }, + { + "epoch": 0.0754466481266251, + "grad_norm": 0.12271915376186371, + "learning_rate": 2.966499016761787e-05, + "loss": 0.04, + "step": 34210 + }, + { + "epoch": 0.07546870210152327, + "grad_norm": 0.13494238257408142, + "learning_rate": 2.9664642577653745e-05, + "loss": 0.041, + "step": 34220 + }, + { + "epoch": 0.07549075607642143, + "grad_norm": 0.11786313354969025, + "learning_rate": 2.9664294809500372e-05, + "loss": 0.0412, + "step": 34230 + }, + { + "epoch": 0.0755128100513196, + "grad_norm": 0.12932391464710236, + "learning_rate": 2.966394686316197e-05, + "loss": 0.0374, + "step": 34240 + }, + { + "epoch": 0.07553486402621777, + "grad_norm": 0.12585647404193878, + "learning_rate": 2.966359873864277e-05, + "loss": 0.0376, + "step": 34250 + }, + { + "epoch": 0.07555691800111593, + "grad_norm": 0.14764082431793213, + "learning_rate": 2.9663250435947e-05, + "loss": 0.0386, + "step": 34260 + }, + { + "epoch": 0.0755789719760141, + "grad_norm": 0.1486954689025879, + "learning_rate": 2.9662901955078898e-05, + "loss": 0.039, + "step": 34270 + }, + { + "epoch": 0.07560102595091227, + "grad_norm": 0.11460227519273758, + "learning_rate": 2.9662553296042685e-05, + "loss": 0.039, + "step": 34280 + }, + { + "epoch": 0.07562307992581042, + "grad_norm": 0.11923359334468842, + "learning_rate": 2.966220445884261e-05, + "loss": 0.0392, + "step": 34290 + }, + { + "epoch": 0.0756451339007086, + "grad_norm": 0.11709167063236237, + "learning_rate": 2.9661855443482902e-05, + "loss": 0.0401, + "step": 34300 + }, + { + "epoch": 0.07566718787560676, + "grad_norm": 0.14707599580287933, + "learning_rate": 2.9661506249967812e-05, + "loss": 0.0411, + "step": 34310 + }, + { + "epoch": 0.07568924185050492, + "grad_norm": 0.15280620753765106, + "learning_rate": 2.9661156878301578e-05, + "loss": 0.0383, + "step": 34320 + }, + { + "epoch": 0.07571129582540309, + "grad_norm": 0.12296684831380844, + "learning_rate": 2.9660807328488445e-05, + "loss": 0.0396, + "step": 34330 + }, + { + "epoch": 0.07573334980030126, + "grad_norm": 0.11918414384126663, + "learning_rate": 2.9660457600532662e-05, + "loss": 0.0386, + "step": 34340 + }, + { + "epoch": 0.07575540377519942, + "grad_norm": 0.1351708620786667, + "learning_rate": 2.9660107694438474e-05, + "loss": 0.0393, + "step": 34350 + }, + { + "epoch": 0.07577745775009759, + "grad_norm": 0.1715468019247055, + "learning_rate": 2.9659757610210137e-05, + "loss": 0.0392, + "step": 34360 + }, + { + "epoch": 0.07579951172499576, + "grad_norm": 0.12091649323701859, + "learning_rate": 2.9659407347851907e-05, + "loss": 0.0394, + "step": 34370 + }, + { + "epoch": 0.07582156569989391, + "grad_norm": 0.11218990385532379, + "learning_rate": 2.9659056907368037e-05, + "loss": 0.039, + "step": 34380 + }, + { + "epoch": 0.07584361967479208, + "grad_norm": 0.14731772243976593, + "learning_rate": 2.965870628876278e-05, + "loss": 0.0397, + "step": 34390 + }, + { + "epoch": 0.07586567364969025, + "grad_norm": 0.10821586847305298, + "learning_rate": 2.965835549204041e-05, + "loss": 0.0411, + "step": 34400 + }, + { + "epoch": 0.07588772762458841, + "grad_norm": 0.13882407546043396, + "learning_rate": 2.9658004517205176e-05, + "loss": 0.0398, + "step": 34410 + }, + { + "epoch": 0.07590978159948658, + "grad_norm": 0.13267798721790314, + "learning_rate": 2.9657653364261346e-05, + "loss": 0.0389, + "step": 34420 + }, + { + "epoch": 0.07593183557438475, + "grad_norm": 0.14531119167804718, + "learning_rate": 2.965730203321319e-05, + "loss": 0.04, + "step": 34430 + }, + { + "epoch": 0.07595388954928292, + "grad_norm": 0.13040535151958466, + "learning_rate": 2.9656950524064977e-05, + "loss": 0.0396, + "step": 34440 + }, + { + "epoch": 0.07597594352418108, + "grad_norm": 0.13198861479759216, + "learning_rate": 2.9656598836820974e-05, + "loss": 0.0378, + "step": 34450 + }, + { + "epoch": 0.07599799749907925, + "grad_norm": 0.13809216022491455, + "learning_rate": 2.9656246971485453e-05, + "loss": 0.04, + "step": 34460 + }, + { + "epoch": 0.07602005147397742, + "grad_norm": 0.10840544104576111, + "learning_rate": 2.9655894928062703e-05, + "loss": 0.0421, + "step": 34470 + }, + { + "epoch": 0.07604210544887557, + "grad_norm": 0.1212439015507698, + "learning_rate": 2.9655542706556987e-05, + "loss": 0.0398, + "step": 34480 + }, + { + "epoch": 0.07606415942377374, + "grad_norm": 0.1095346137881279, + "learning_rate": 2.9655190306972593e-05, + "loss": 0.0388, + "step": 34490 + }, + { + "epoch": 0.07608621339867191, + "grad_norm": 0.10170942544937134, + "learning_rate": 2.9654837729313796e-05, + "loss": 0.0396, + "step": 34500 + }, + { + "epoch": 0.07610826737357007, + "grad_norm": 0.11855212599039078, + "learning_rate": 2.9654484973584886e-05, + "loss": 0.0409, + "step": 34510 + }, + { + "epoch": 0.07613032134846824, + "grad_norm": 0.14245952665805817, + "learning_rate": 2.9654132039790147e-05, + "loss": 0.0395, + "step": 34520 + }, + { + "epoch": 0.07615237532336641, + "grad_norm": 0.12531951069831848, + "learning_rate": 2.9653778927933867e-05, + "loss": 0.0411, + "step": 34530 + }, + { + "epoch": 0.07617442929826457, + "grad_norm": 0.13710762560367584, + "learning_rate": 2.9653425638020342e-05, + "loss": 0.0381, + "step": 34540 + }, + { + "epoch": 0.07619648327316274, + "grad_norm": 0.14012740552425385, + "learning_rate": 2.9653072170053857e-05, + "loss": 0.0384, + "step": 34550 + }, + { + "epoch": 0.07621853724806091, + "grad_norm": 0.11528822034597397, + "learning_rate": 2.965271852403871e-05, + "loss": 0.0404, + "step": 34560 + }, + { + "epoch": 0.07624059122295906, + "grad_norm": 0.12323247641324997, + "learning_rate": 2.9652364699979197e-05, + "loss": 0.0402, + "step": 34570 + }, + { + "epoch": 0.07626264519785723, + "grad_norm": 0.1088605597615242, + "learning_rate": 2.965201069787962e-05, + "loss": 0.0392, + "step": 34580 + }, + { + "epoch": 0.0762846991727554, + "grad_norm": 0.09940927475690842, + "learning_rate": 2.965165651774428e-05, + "loss": 0.0419, + "step": 34590 + }, + { + "epoch": 0.07630675314765356, + "grad_norm": 0.097262442111969, + "learning_rate": 2.9651302159577484e-05, + "loss": 0.0396, + "step": 34600 + }, + { + "epoch": 0.07632880712255173, + "grad_norm": 0.11173972487449646, + "learning_rate": 2.965094762338353e-05, + "loss": 0.0397, + "step": 34610 + }, + { + "epoch": 0.0763508610974499, + "grad_norm": 0.13563917577266693, + "learning_rate": 2.965059290916673e-05, + "loss": 0.0395, + "step": 34620 + }, + { + "epoch": 0.07637291507234807, + "grad_norm": 0.13459086418151855, + "learning_rate": 2.965023801693139e-05, + "loss": 0.0402, + "step": 34630 + }, + { + "epoch": 0.07639496904724623, + "grad_norm": 0.12330688536167145, + "learning_rate": 2.9649882946681826e-05, + "loss": 0.0388, + "step": 34640 + }, + { + "epoch": 0.0764170230221444, + "grad_norm": 0.08968804031610489, + "learning_rate": 2.9649527698422357e-05, + "loss": 0.0386, + "step": 34650 + }, + { + "epoch": 0.07643907699704257, + "grad_norm": 0.14034147560596466, + "learning_rate": 2.9649172272157294e-05, + "loss": 0.0394, + "step": 34660 + }, + { + "epoch": 0.07646113097194072, + "grad_norm": 0.12425975501537323, + "learning_rate": 2.9648816667890958e-05, + "loss": 0.0402, + "step": 34670 + }, + { + "epoch": 0.0764831849468389, + "grad_norm": 0.11330121010541916, + "learning_rate": 2.9648460885627668e-05, + "loss": 0.0379, + "step": 34680 + }, + { + "epoch": 0.07650523892173706, + "grad_norm": 0.124474436044693, + "learning_rate": 2.9648104925371744e-05, + "loss": 0.0405, + "step": 34690 + }, + { + "epoch": 0.07652729289663522, + "grad_norm": 0.13767459988594055, + "learning_rate": 2.964774878712752e-05, + "loss": 0.0389, + "step": 34700 + }, + { + "epoch": 0.07654934687153339, + "grad_norm": 0.09835672378540039, + "learning_rate": 2.9647392470899315e-05, + "loss": 0.0395, + "step": 34710 + }, + { + "epoch": 0.07657140084643156, + "grad_norm": 0.11913083493709564, + "learning_rate": 2.9647035976691467e-05, + "loss": 0.0384, + "step": 34720 + }, + { + "epoch": 0.07659345482132972, + "grad_norm": 0.09316457808017731, + "learning_rate": 2.9646679304508298e-05, + "loss": 0.0383, + "step": 34730 + }, + { + "epoch": 0.07661550879622789, + "grad_norm": 0.13225385546684265, + "learning_rate": 2.9646322454354146e-05, + "loss": 0.0385, + "step": 34740 + }, + { + "epoch": 0.07663756277112606, + "grad_norm": 0.12679719924926758, + "learning_rate": 2.964596542623335e-05, + "loss": 0.0378, + "step": 34750 + }, + { + "epoch": 0.07665961674602421, + "grad_norm": 0.18891087174415588, + "learning_rate": 2.9645608220150244e-05, + "loss": 0.0402, + "step": 34760 + }, + { + "epoch": 0.07668167072092238, + "grad_norm": 0.10519905388355255, + "learning_rate": 2.964525083610917e-05, + "loss": 0.0399, + "step": 34770 + }, + { + "epoch": 0.07670372469582055, + "grad_norm": 0.16745634377002716, + "learning_rate": 2.9644893274114474e-05, + "loss": 0.0406, + "step": 34780 + }, + { + "epoch": 0.07672577867071871, + "grad_norm": 0.13150185346603394, + "learning_rate": 2.9644535534170496e-05, + "loss": 0.0418, + "step": 34790 + }, + { + "epoch": 0.07674783264561688, + "grad_norm": 0.10365013033151627, + "learning_rate": 2.9644177616281586e-05, + "loss": 0.0403, + "step": 34800 + }, + { + "epoch": 0.07676988662051505, + "grad_norm": 0.09433913230895996, + "learning_rate": 2.964381952045209e-05, + "loss": 0.0405, + "step": 34810 + }, + { + "epoch": 0.07679194059541321, + "grad_norm": 0.11192040890455246, + "learning_rate": 2.964346124668636e-05, + "loss": 0.0405, + "step": 34820 + }, + { + "epoch": 0.07681399457031138, + "grad_norm": 0.11107583343982697, + "learning_rate": 2.9643102794988756e-05, + "loss": 0.039, + "step": 34830 + }, + { + "epoch": 0.07683604854520955, + "grad_norm": 0.10246419161558151, + "learning_rate": 2.9642744165363617e-05, + "loss": 0.0368, + "step": 34840 + }, + { + "epoch": 0.07685810252010772, + "grad_norm": 0.12343418598175049, + "learning_rate": 2.9642385357815318e-05, + "loss": 0.0385, + "step": 34850 + }, + { + "epoch": 0.07688015649500587, + "grad_norm": 0.13399092853069305, + "learning_rate": 2.9642026372348212e-05, + "loss": 0.0405, + "step": 34860 + }, + { + "epoch": 0.07690221046990404, + "grad_norm": 0.14695464074611664, + "learning_rate": 2.9641667208966664e-05, + "loss": 0.0418, + "step": 34870 + }, + { + "epoch": 0.07692426444480222, + "grad_norm": 0.19508543610572815, + "learning_rate": 2.964130786767503e-05, + "loss": 0.0398, + "step": 34880 + }, + { + "epoch": 0.07694631841970037, + "grad_norm": 0.11423234641551971, + "learning_rate": 2.9640948348477683e-05, + "loss": 0.0392, + "step": 34890 + }, + { + "epoch": 0.07696837239459854, + "grad_norm": 0.15471619367599487, + "learning_rate": 2.9640588651378995e-05, + "loss": 0.0379, + "step": 34900 + }, + { + "epoch": 0.07699042636949671, + "grad_norm": 0.1253165751695633, + "learning_rate": 2.9640228776383326e-05, + "loss": 0.0381, + "step": 34910 + }, + { + "epoch": 0.07701248034439487, + "grad_norm": 0.13133932650089264, + "learning_rate": 2.963986872349506e-05, + "loss": 0.039, + "step": 34920 + }, + { + "epoch": 0.07703453431929304, + "grad_norm": 0.1329023540019989, + "learning_rate": 2.963950849271856e-05, + "loss": 0.0418, + "step": 34930 + }, + { + "epoch": 0.07705658829419121, + "grad_norm": 0.12307779490947723, + "learning_rate": 2.963914808405821e-05, + "loss": 0.0404, + "step": 34940 + }, + { + "epoch": 0.07707864226908936, + "grad_norm": 0.13537263870239258, + "learning_rate": 2.9638787497518397e-05, + "loss": 0.0408, + "step": 34950 + }, + { + "epoch": 0.07710069624398753, + "grad_norm": 0.1501885950565338, + "learning_rate": 2.9638426733103487e-05, + "loss": 0.0414, + "step": 34960 + }, + { + "epoch": 0.0771227502188857, + "grad_norm": 0.09372271597385406, + "learning_rate": 2.9638065790817875e-05, + "loss": 0.0391, + "step": 34970 + }, + { + "epoch": 0.07714480419378386, + "grad_norm": 0.1487504243850708, + "learning_rate": 2.9637704670665943e-05, + "loss": 0.0388, + "step": 34980 + }, + { + "epoch": 0.07716685816868203, + "grad_norm": 0.15622319281101227, + "learning_rate": 2.963734337265208e-05, + "loss": 0.0377, + "step": 34990 + }, + { + "epoch": 0.0771889121435802, + "grad_norm": 0.12902051210403442, + "learning_rate": 2.9636981896780672e-05, + "loss": 0.0406, + "step": 35000 + }, + { + "epoch": 0.07721096611847836, + "grad_norm": 0.12089589983224869, + "learning_rate": 2.9636620243056116e-05, + "loss": 0.039, + "step": 35010 + }, + { + "epoch": 0.07723302009337653, + "grad_norm": 0.10448092967271805, + "learning_rate": 2.9636258411482803e-05, + "loss": 0.0388, + "step": 35020 + }, + { + "epoch": 0.0772550740682747, + "grad_norm": 0.11019435524940491, + "learning_rate": 2.9635896402065135e-05, + "loss": 0.0384, + "step": 35030 + }, + { + "epoch": 0.07727712804317285, + "grad_norm": 0.16358526051044464, + "learning_rate": 2.9635534214807505e-05, + "loss": 0.0395, + "step": 35040 + }, + { + "epoch": 0.07729918201807102, + "grad_norm": 0.1463945060968399, + "learning_rate": 2.9635171849714313e-05, + "loss": 0.0395, + "step": 35050 + }, + { + "epoch": 0.0773212359929692, + "grad_norm": 0.12829384207725525, + "learning_rate": 2.963480930678997e-05, + "loss": 0.039, + "step": 35060 + }, + { + "epoch": 0.07734328996786737, + "grad_norm": 0.12774677574634552, + "learning_rate": 2.9634446586038875e-05, + "loss": 0.0383, + "step": 35070 + }, + { + "epoch": 0.07736534394276552, + "grad_norm": 0.10591887682676315, + "learning_rate": 2.9634083687465437e-05, + "loss": 0.0407, + "step": 35080 + }, + { + "epoch": 0.07738739791766369, + "grad_norm": 0.10904046893119812, + "learning_rate": 2.9633720611074063e-05, + "loss": 0.0397, + "step": 35090 + }, + { + "epoch": 0.07740945189256186, + "grad_norm": 0.12443386018276215, + "learning_rate": 2.9633357356869166e-05, + "loss": 0.0372, + "step": 35100 + }, + { + "epoch": 0.07743150586746002, + "grad_norm": 0.08969524502754211, + "learning_rate": 2.9632993924855166e-05, + "loss": 0.0412, + "step": 35110 + }, + { + "epoch": 0.07745355984235819, + "grad_norm": 0.13763846457004547, + "learning_rate": 2.9632630315036473e-05, + "loss": 0.0402, + "step": 35120 + }, + { + "epoch": 0.07747561381725636, + "grad_norm": 0.15314196050167084, + "learning_rate": 2.9632266527417504e-05, + "loss": 0.0397, + "step": 35130 + }, + { + "epoch": 0.07749766779215451, + "grad_norm": 0.09378417581319809, + "learning_rate": 2.963190256200268e-05, + "loss": 0.0396, + "step": 35140 + }, + { + "epoch": 0.07751972176705269, + "grad_norm": 0.14453256130218506, + "learning_rate": 2.963153841879643e-05, + "loss": 0.0395, + "step": 35150 + }, + { + "epoch": 0.07754177574195086, + "grad_norm": 0.12942655384540558, + "learning_rate": 2.9631174097803168e-05, + "loss": 0.0376, + "step": 35160 + }, + { + "epoch": 0.07756382971684901, + "grad_norm": 0.14474254846572876, + "learning_rate": 2.9630809599027334e-05, + "loss": 0.0395, + "step": 35170 + }, + { + "epoch": 0.07758588369174718, + "grad_norm": 0.1328565776348114, + "learning_rate": 2.963044492247334e-05, + "loss": 0.0411, + "step": 35180 + }, + { + "epoch": 0.07760793766664535, + "grad_norm": 0.10475730150938034, + "learning_rate": 2.9630080068145635e-05, + "loss": 0.0398, + "step": 35190 + }, + { + "epoch": 0.07762999164154351, + "grad_norm": 0.0843178778886795, + "learning_rate": 2.9629715036048638e-05, + "loss": 0.0389, + "step": 35200 + }, + { + "epoch": 0.07765204561644168, + "grad_norm": 0.15329189598560333, + "learning_rate": 2.9629349826186797e-05, + "loss": 0.04, + "step": 35210 + }, + { + "epoch": 0.07767409959133985, + "grad_norm": 0.12928059697151184, + "learning_rate": 2.9628984438564542e-05, + "loss": 0.0414, + "step": 35220 + }, + { + "epoch": 0.077696153566238, + "grad_norm": 0.1546824872493744, + "learning_rate": 2.9628618873186308e-05, + "loss": 0.0403, + "step": 35230 + }, + { + "epoch": 0.07771820754113618, + "grad_norm": 0.1473262906074524, + "learning_rate": 2.962825313005655e-05, + "loss": 0.0375, + "step": 35240 + }, + { + "epoch": 0.07774026151603435, + "grad_norm": 0.1506279855966568, + "learning_rate": 2.9627887209179698e-05, + "loss": 0.0373, + "step": 35250 + }, + { + "epoch": 0.0777623154909325, + "grad_norm": 0.09998537600040436, + "learning_rate": 2.9627521110560213e-05, + "loss": 0.0373, + "step": 35260 + }, + { + "epoch": 0.07778436946583067, + "grad_norm": 0.10125086456537247, + "learning_rate": 2.962715483420253e-05, + "loss": 0.0406, + "step": 35270 + }, + { + "epoch": 0.07780642344072884, + "grad_norm": 0.11771436780691147, + "learning_rate": 2.962678838011111e-05, + "loss": 0.0401, + "step": 35280 + }, + { + "epoch": 0.07782847741562701, + "grad_norm": 0.11512845754623413, + "learning_rate": 2.96264217482904e-05, + "loss": 0.0388, + "step": 35290 + }, + { + "epoch": 0.07785053139052517, + "grad_norm": 0.09974043816328049, + "learning_rate": 2.9626054938744852e-05, + "loss": 0.039, + "step": 35300 + }, + { + "epoch": 0.07787258536542334, + "grad_norm": 0.11558813601732254, + "learning_rate": 2.962568795147893e-05, + "loss": 0.0398, + "step": 35310 + }, + { + "epoch": 0.07789463934032151, + "grad_norm": 0.1467701941728592, + "learning_rate": 2.9625320786497092e-05, + "loss": 0.0374, + "step": 35320 + }, + { + "epoch": 0.07791669331521967, + "grad_norm": 0.13150067627429962, + "learning_rate": 2.9624953443803797e-05, + "loss": 0.0396, + "step": 35330 + }, + { + "epoch": 0.07793874729011784, + "grad_norm": 0.11156397312879562, + "learning_rate": 2.9624585923403508e-05, + "loss": 0.04, + "step": 35340 + }, + { + "epoch": 0.077960801265016, + "grad_norm": 0.14888660609722137, + "learning_rate": 2.9624218225300694e-05, + "loss": 0.0384, + "step": 35350 + }, + { + "epoch": 0.07798285523991416, + "grad_norm": 0.10422154515981674, + "learning_rate": 2.9623850349499817e-05, + "loss": 0.0412, + "step": 35360 + }, + { + "epoch": 0.07800490921481233, + "grad_norm": 0.18254144489765167, + "learning_rate": 2.9623482296005356e-05, + "loss": 0.0395, + "step": 35370 + }, + { + "epoch": 0.0780269631897105, + "grad_norm": 0.11421360075473785, + "learning_rate": 2.9623114064821777e-05, + "loss": 0.0388, + "step": 35380 + }, + { + "epoch": 0.07804901716460866, + "grad_norm": 0.10177017003297806, + "learning_rate": 2.9622745655953556e-05, + "loss": 0.0369, + "step": 35390 + }, + { + "epoch": 0.07807107113950683, + "grad_norm": 0.10495181381702423, + "learning_rate": 2.9622377069405168e-05, + "loss": 0.0397, + "step": 35400 + }, + { + "epoch": 0.078093125114405, + "grad_norm": 0.1208435446023941, + "learning_rate": 2.9622008305181093e-05, + "loss": 0.0407, + "step": 35410 + }, + { + "epoch": 0.07811517908930316, + "grad_norm": 0.09898939728736877, + "learning_rate": 2.9621639363285812e-05, + "loss": 0.0416, + "step": 35420 + }, + { + "epoch": 0.07813723306420133, + "grad_norm": 0.16232112050056458, + "learning_rate": 2.9621270243723805e-05, + "loss": 0.0375, + "step": 35430 + }, + { + "epoch": 0.0781592870390995, + "grad_norm": 0.13022403419017792, + "learning_rate": 2.962090094649956e-05, + "loss": 0.0412, + "step": 35440 + }, + { + "epoch": 0.07818134101399765, + "grad_norm": 0.11706244200468063, + "learning_rate": 2.9620531471617567e-05, + "loss": 0.0396, + "step": 35450 + }, + { + "epoch": 0.07820339498889582, + "grad_norm": 0.101859450340271, + "learning_rate": 2.962016181908231e-05, + "loss": 0.0409, + "step": 35460 + }, + { + "epoch": 0.07822544896379399, + "grad_norm": 0.1361846774816513, + "learning_rate": 2.9619791988898287e-05, + "loss": 0.0402, + "step": 35470 + }, + { + "epoch": 0.07824750293869215, + "grad_norm": 0.1273210346698761, + "learning_rate": 2.9619421981069986e-05, + "loss": 0.0383, + "step": 35480 + }, + { + "epoch": 0.07826955691359032, + "grad_norm": 0.1571016013622284, + "learning_rate": 2.9619051795601905e-05, + "loss": 0.0405, + "step": 35490 + }, + { + "epoch": 0.07829161088848849, + "grad_norm": 0.1463393121957779, + "learning_rate": 2.9618681432498543e-05, + "loss": 0.0386, + "step": 35500 + }, + { + "epoch": 0.07831366486338666, + "grad_norm": 0.1270325779914856, + "learning_rate": 2.9618310891764396e-05, + "loss": 0.0407, + "step": 35510 + }, + { + "epoch": 0.07833571883828482, + "grad_norm": 0.12001682072877884, + "learning_rate": 2.961794017340397e-05, + "loss": 0.038, + "step": 35520 + }, + { + "epoch": 0.07835777281318299, + "grad_norm": 0.1332269161939621, + "learning_rate": 2.9617569277421774e-05, + "loss": 0.0391, + "step": 35530 + }, + { + "epoch": 0.07837982678808116, + "grad_norm": 0.1552206575870514, + "learning_rate": 2.9617198203822303e-05, + "loss": 0.0401, + "step": 35540 + }, + { + "epoch": 0.07840188076297931, + "grad_norm": 0.12769220769405365, + "learning_rate": 2.961682695261008e-05, + "loss": 0.0409, + "step": 35550 + }, + { + "epoch": 0.07842393473787748, + "grad_norm": 0.12175657600164413, + "learning_rate": 2.9616455523789606e-05, + "loss": 0.0396, + "step": 35560 + }, + { + "epoch": 0.07844598871277565, + "grad_norm": 0.1105186864733696, + "learning_rate": 2.9616083917365396e-05, + "loss": 0.0389, + "step": 35570 + }, + { + "epoch": 0.07846804268767381, + "grad_norm": 0.11571872979402542, + "learning_rate": 2.9615712133341964e-05, + "loss": 0.0406, + "step": 35580 + }, + { + "epoch": 0.07849009666257198, + "grad_norm": 0.1379610002040863, + "learning_rate": 2.9615340171723838e-05, + "loss": 0.0398, + "step": 35590 + }, + { + "epoch": 0.07851215063747015, + "grad_norm": 0.11261984705924988, + "learning_rate": 2.9614968032515524e-05, + "loss": 0.0395, + "step": 35600 + }, + { + "epoch": 0.0785342046123683, + "grad_norm": 0.14039848744869232, + "learning_rate": 2.9614595715721552e-05, + "loss": 0.0405, + "step": 35610 + }, + { + "epoch": 0.07855625858726648, + "grad_norm": 0.10045108944177628, + "learning_rate": 2.9614223221346445e-05, + "loss": 0.038, + "step": 35620 + }, + { + "epoch": 0.07857831256216465, + "grad_norm": 0.12000560760498047, + "learning_rate": 2.9613850549394722e-05, + "loss": 0.042, + "step": 35630 + }, + { + "epoch": 0.0786003665370628, + "grad_norm": 0.10390632599592209, + "learning_rate": 2.961347769987092e-05, + "loss": 0.0383, + "step": 35640 + }, + { + "epoch": 0.07862242051196097, + "grad_norm": 0.11755947023630142, + "learning_rate": 2.9613104672779568e-05, + "loss": 0.0364, + "step": 35650 + }, + { + "epoch": 0.07864447448685914, + "grad_norm": 0.10406028479337692, + "learning_rate": 2.9612731468125193e-05, + "loss": 0.0374, + "step": 35660 + }, + { + "epoch": 0.0786665284617573, + "grad_norm": 0.1561930626630783, + "learning_rate": 2.961235808591234e-05, + "loss": 0.0395, + "step": 35670 + }, + { + "epoch": 0.07868858243665547, + "grad_norm": 0.14725683629512787, + "learning_rate": 2.9611984526145537e-05, + "loss": 0.0375, + "step": 35680 + }, + { + "epoch": 0.07871063641155364, + "grad_norm": 0.13352851569652557, + "learning_rate": 2.9611610788829322e-05, + "loss": 0.0424, + "step": 35690 + }, + { + "epoch": 0.07873269038645181, + "grad_norm": 0.1403861790895462, + "learning_rate": 2.9611236873968246e-05, + "loss": 0.0376, + "step": 35700 + }, + { + "epoch": 0.07875474436134997, + "grad_norm": 0.11900701373815536, + "learning_rate": 2.961086278156685e-05, + "loss": 0.0402, + "step": 35710 + }, + { + "epoch": 0.07877679833624814, + "grad_norm": 0.13195739686489105, + "learning_rate": 2.9610488511629667e-05, + "loss": 0.0373, + "step": 35720 + }, + { + "epoch": 0.0787988523111463, + "grad_norm": 0.11521182954311371, + "learning_rate": 2.9610114064161256e-05, + "loss": 0.0406, + "step": 35730 + }, + { + "epoch": 0.07882090628604446, + "grad_norm": 0.13438110053539276, + "learning_rate": 2.960973943916617e-05, + "loss": 0.0374, + "step": 35740 + }, + { + "epoch": 0.07884296026094263, + "grad_norm": 0.13655702769756317, + "learning_rate": 2.9609364636648953e-05, + "loss": 0.0394, + "step": 35750 + }, + { + "epoch": 0.0788650142358408, + "grad_norm": 0.11055470257997513, + "learning_rate": 2.960898965661416e-05, + "loss": 0.0392, + "step": 35760 + }, + { + "epoch": 0.07888706821073896, + "grad_norm": 0.1094759926199913, + "learning_rate": 2.9608614499066352e-05, + "loss": 0.0389, + "step": 35770 + }, + { + "epoch": 0.07890912218563713, + "grad_norm": 0.11420971900224686, + "learning_rate": 2.9608239164010084e-05, + "loss": 0.0412, + "step": 35780 + }, + { + "epoch": 0.0789311761605353, + "grad_norm": 0.12407401204109192, + "learning_rate": 2.960786365144992e-05, + "loss": 0.0401, + "step": 35790 + }, + { + "epoch": 0.07895323013543346, + "grad_norm": 0.1520622819662094, + "learning_rate": 2.9607487961390417e-05, + "loss": 0.0399, + "step": 35800 + }, + { + "epoch": 0.07897528411033163, + "grad_norm": 0.11492110788822174, + "learning_rate": 2.9607112093836146e-05, + "loss": 0.0357, + "step": 35810 + }, + { + "epoch": 0.0789973380852298, + "grad_norm": 0.11269518733024597, + "learning_rate": 2.960673604879167e-05, + "loss": 0.0403, + "step": 35820 + }, + { + "epoch": 0.07901939206012795, + "grad_norm": 0.13642124831676483, + "learning_rate": 2.960635982626156e-05, + "loss": 0.0378, + "step": 35830 + }, + { + "epoch": 0.07904144603502612, + "grad_norm": 0.096964992582798, + "learning_rate": 2.9605983426250392e-05, + "loss": 0.0398, + "step": 35840 + }, + { + "epoch": 0.07906350000992429, + "grad_norm": 0.12457302212715149, + "learning_rate": 2.9605606848762727e-05, + "loss": 0.0359, + "step": 35850 + }, + { + "epoch": 0.07908555398482245, + "grad_norm": 0.12147419899702072, + "learning_rate": 2.9605230093803154e-05, + "loss": 0.0408, + "step": 35860 + }, + { + "epoch": 0.07910760795972062, + "grad_norm": 0.09180808812379837, + "learning_rate": 2.9604853161376243e-05, + "loss": 0.0413, + "step": 35870 + }, + { + "epoch": 0.07912966193461879, + "grad_norm": 0.1195477843284607, + "learning_rate": 2.960447605148658e-05, + "loss": 0.037, + "step": 35880 + }, + { + "epoch": 0.07915171590951695, + "grad_norm": 0.13114432990550995, + "learning_rate": 2.9604098764138747e-05, + "loss": 0.0388, + "step": 35890 + }, + { + "epoch": 0.07917376988441512, + "grad_norm": 0.15562450885772705, + "learning_rate": 2.9603721299337317e-05, + "loss": 0.0367, + "step": 35900 + }, + { + "epoch": 0.07919582385931329, + "grad_norm": 0.11813225597143173, + "learning_rate": 2.9603343657086892e-05, + "loss": 0.0401, + "step": 35910 + }, + { + "epoch": 0.07921787783421146, + "grad_norm": 0.14376239478588104, + "learning_rate": 2.9602965837392047e-05, + "loss": 0.0389, + "step": 35920 + }, + { + "epoch": 0.07923993180910961, + "grad_norm": 0.1261707991361618, + "learning_rate": 2.9602587840257386e-05, + "loss": 0.038, + "step": 35930 + }, + { + "epoch": 0.07926198578400778, + "grad_norm": 0.1337004154920578, + "learning_rate": 2.9602209665687493e-05, + "loss": 0.0409, + "step": 35940 + }, + { + "epoch": 0.07928403975890595, + "grad_norm": 0.16589310765266418, + "learning_rate": 2.9601831313686966e-05, + "loss": 0.0389, + "step": 35950 + }, + { + "epoch": 0.07930609373380411, + "grad_norm": 0.12705396115779877, + "learning_rate": 2.9601452784260403e-05, + "loss": 0.0392, + "step": 35960 + }, + { + "epoch": 0.07932814770870228, + "grad_norm": 0.14586299657821655, + "learning_rate": 2.9601074077412404e-05, + "loss": 0.0399, + "step": 35970 + }, + { + "epoch": 0.07935020168360045, + "grad_norm": 0.11118505895137787, + "learning_rate": 2.9600695193147563e-05, + "loss": 0.0406, + "step": 35980 + }, + { + "epoch": 0.0793722556584986, + "grad_norm": 0.10683304816484451, + "learning_rate": 2.9600316131470493e-05, + "loss": 0.0411, + "step": 35990 + }, + { + "epoch": 0.07939430963339678, + "grad_norm": 0.11974352598190308, + "learning_rate": 2.9599936892385798e-05, + "loss": 0.0391, + "step": 36000 + }, + { + "epoch": 0.07941636360829495, + "grad_norm": 0.13884086906909943, + "learning_rate": 2.9599557475898088e-05, + "loss": 0.042, + "step": 36010 + }, + { + "epoch": 0.0794384175831931, + "grad_norm": 0.11614231765270233, + "learning_rate": 2.9599177882011966e-05, + "loss": 0.0372, + "step": 36020 + }, + { + "epoch": 0.07946047155809127, + "grad_norm": 0.13656792044639587, + "learning_rate": 2.959879811073205e-05, + "loss": 0.0392, + "step": 36030 + }, + { + "epoch": 0.07948252553298944, + "grad_norm": 0.13358443975448608, + "learning_rate": 2.9598418162062953e-05, + "loss": 0.0409, + "step": 36040 + }, + { + "epoch": 0.0795045795078876, + "grad_norm": 0.09646546840667725, + "learning_rate": 2.9598038036009292e-05, + "loss": 0.0386, + "step": 36050 + }, + { + "epoch": 0.07952663348278577, + "grad_norm": 0.13937915861606598, + "learning_rate": 2.9597657732575688e-05, + "loss": 0.0397, + "step": 36060 + }, + { + "epoch": 0.07954868745768394, + "grad_norm": 0.148870050907135, + "learning_rate": 2.959727725176676e-05, + "loss": 0.0385, + "step": 36070 + }, + { + "epoch": 0.0795707414325821, + "grad_norm": 0.11841742694377899, + "learning_rate": 2.9596896593587134e-05, + "loss": 0.04, + "step": 36080 + }, + { + "epoch": 0.07959279540748027, + "grad_norm": 0.1381302922964096, + "learning_rate": 2.9596515758041428e-05, + "loss": 0.0403, + "step": 36090 + }, + { + "epoch": 0.07961484938237844, + "grad_norm": 0.1471579372882843, + "learning_rate": 2.9596134745134273e-05, + "loss": 0.0385, + "step": 36100 + }, + { + "epoch": 0.07963690335727659, + "grad_norm": 0.12403595447540283, + "learning_rate": 2.95957535548703e-05, + "loss": 0.0386, + "step": 36110 + }, + { + "epoch": 0.07965895733217476, + "grad_norm": 0.13615445792675018, + "learning_rate": 2.9595372187254144e-05, + "loss": 0.0376, + "step": 36120 + }, + { + "epoch": 0.07968101130707293, + "grad_norm": 0.12238048762083054, + "learning_rate": 2.9594990642290433e-05, + "loss": 0.0386, + "step": 36130 + }, + { + "epoch": 0.0797030652819711, + "grad_norm": 0.12123148143291473, + "learning_rate": 2.9594608919983807e-05, + "loss": 0.0383, + "step": 36140 + }, + { + "epoch": 0.07972511925686926, + "grad_norm": 0.12041972577571869, + "learning_rate": 2.9594227020338907e-05, + "loss": 0.0355, + "step": 36150 + }, + { + "epoch": 0.07974717323176743, + "grad_norm": 0.1324692964553833, + "learning_rate": 2.9593844943360365e-05, + "loss": 0.0394, + "step": 36160 + }, + { + "epoch": 0.0797692272066656, + "grad_norm": 0.11590767651796341, + "learning_rate": 2.9593462689052825e-05, + "loss": 0.0368, + "step": 36170 + }, + { + "epoch": 0.07979128118156376, + "grad_norm": 0.08619704842567444, + "learning_rate": 2.959308025742094e-05, + "loss": 0.0369, + "step": 36180 + }, + { + "epoch": 0.07981333515646193, + "grad_norm": 0.12962095439434052, + "learning_rate": 2.9592697648469346e-05, + "loss": 0.0381, + "step": 36190 + }, + { + "epoch": 0.0798353891313601, + "grad_norm": 0.10388755053281784, + "learning_rate": 2.9592314862202704e-05, + "loss": 0.0395, + "step": 36200 + }, + { + "epoch": 0.07985744310625825, + "grad_norm": 0.12814205884933472, + "learning_rate": 2.9591931898625658e-05, + "loss": 0.0389, + "step": 36210 + }, + { + "epoch": 0.07987949708115642, + "grad_norm": 0.12227099388837814, + "learning_rate": 2.9591548757742858e-05, + "loss": 0.0382, + "step": 36220 + }, + { + "epoch": 0.07990155105605459, + "grad_norm": 0.1635427176952362, + "learning_rate": 2.9591165439558973e-05, + "loss": 0.0413, + "step": 36230 + }, + { + "epoch": 0.07992360503095275, + "grad_norm": 0.12277448177337646, + "learning_rate": 2.9590781944078643e-05, + "loss": 0.0386, + "step": 36240 + }, + { + "epoch": 0.07994565900585092, + "grad_norm": 0.12838494777679443, + "learning_rate": 2.959039827130654e-05, + "loss": 0.0396, + "step": 36250 + }, + { + "epoch": 0.07996771298074909, + "grad_norm": 0.13887915015220642, + "learning_rate": 2.959001442124732e-05, + "loss": 0.0397, + "step": 36260 + }, + { + "epoch": 0.07998976695564725, + "grad_norm": 0.14069464802742004, + "learning_rate": 2.958963039390565e-05, + "loss": 0.0385, + "step": 36270 + }, + { + "epoch": 0.08001182093054542, + "grad_norm": 0.1241833046078682, + "learning_rate": 2.9589246189286196e-05, + "loss": 0.0373, + "step": 36280 + }, + { + "epoch": 0.08003387490544359, + "grad_norm": 0.14665751159191132, + "learning_rate": 2.958886180739363e-05, + "loss": 0.0375, + "step": 36290 + }, + { + "epoch": 0.08005592888034174, + "grad_norm": 0.13964048027992249, + "learning_rate": 2.9588477248232614e-05, + "loss": 0.0414, + "step": 36300 + }, + { + "epoch": 0.08007798285523991, + "grad_norm": 0.1138080358505249, + "learning_rate": 2.9588092511807827e-05, + "loss": 0.0393, + "step": 36310 + }, + { + "epoch": 0.08010003683013808, + "grad_norm": 0.11990674585103989, + "learning_rate": 2.9587707598123944e-05, + "loss": 0.0393, + "step": 36320 + }, + { + "epoch": 0.08012209080503624, + "grad_norm": 0.14270338416099548, + "learning_rate": 2.958732250718564e-05, + "loss": 0.0422, + "step": 36330 + }, + { + "epoch": 0.08014414477993441, + "grad_norm": 0.13684909045696259, + "learning_rate": 2.9586937238997594e-05, + "loss": 0.0407, + "step": 36340 + }, + { + "epoch": 0.08016619875483258, + "grad_norm": 0.11723807454109192, + "learning_rate": 2.9586551793564485e-05, + "loss": 0.0369, + "step": 36350 + }, + { + "epoch": 0.08018825272973075, + "grad_norm": 0.14191168546676636, + "learning_rate": 2.9586166170891006e-05, + "loss": 0.0402, + "step": 36360 + }, + { + "epoch": 0.0802103067046289, + "grad_norm": 0.14634014666080475, + "learning_rate": 2.9585780370981833e-05, + "loss": 0.0388, + "step": 36370 + }, + { + "epoch": 0.08023236067952708, + "grad_norm": 0.12393698841333389, + "learning_rate": 2.9585394393841657e-05, + "loss": 0.0403, + "step": 36380 + }, + { + "epoch": 0.08025441465442525, + "grad_norm": 0.12005170434713364, + "learning_rate": 2.958500823947517e-05, + "loss": 0.0408, + "step": 36390 + }, + { + "epoch": 0.0802764686293234, + "grad_norm": 0.1270863562822342, + "learning_rate": 2.9584621907887062e-05, + "loss": 0.0394, + "step": 36400 + }, + { + "epoch": 0.08029852260422157, + "grad_norm": 0.11914227902889252, + "learning_rate": 2.9584235399082026e-05, + "loss": 0.0374, + "step": 36410 + }, + { + "epoch": 0.08032057657911974, + "grad_norm": 0.1252089887857437, + "learning_rate": 2.9583848713064764e-05, + "loss": 0.0386, + "step": 36420 + }, + { + "epoch": 0.0803426305540179, + "grad_norm": 0.10916820913553238, + "learning_rate": 2.9583461849839966e-05, + "loss": 0.039, + "step": 36430 + }, + { + "epoch": 0.08036468452891607, + "grad_norm": 0.11232609301805496, + "learning_rate": 2.9583074809412342e-05, + "loss": 0.0383, + "step": 36440 + }, + { + "epoch": 0.08038673850381424, + "grad_norm": 0.16752274334430695, + "learning_rate": 2.9582687591786587e-05, + "loss": 0.0385, + "step": 36450 + }, + { + "epoch": 0.0804087924787124, + "grad_norm": 0.11112916469573975, + "learning_rate": 2.958230019696741e-05, + "loss": 0.0377, + "step": 36460 + }, + { + "epoch": 0.08043084645361057, + "grad_norm": 0.0822928175330162, + "learning_rate": 2.9581912624959518e-05, + "loss": 0.039, + "step": 36470 + }, + { + "epoch": 0.08045290042850874, + "grad_norm": 0.12292749434709549, + "learning_rate": 2.9581524875767622e-05, + "loss": 0.0389, + "step": 36480 + }, + { + "epoch": 0.08047495440340689, + "grad_norm": 0.14774343371391296, + "learning_rate": 2.958113694939643e-05, + "loss": 0.0399, + "step": 36490 + }, + { + "epoch": 0.08049700837830506, + "grad_norm": 0.13602666556835175, + "learning_rate": 2.9580748845850656e-05, + "loss": 0.0409, + "step": 36500 + }, + { + "epoch": 0.08051906235320323, + "grad_norm": 0.11557314544916153, + "learning_rate": 2.9580360565135015e-05, + "loss": 0.0377, + "step": 36510 + }, + { + "epoch": 0.08054111632810139, + "grad_norm": 0.1437848061323166, + "learning_rate": 2.957997210725423e-05, + "loss": 0.0388, + "step": 36520 + }, + { + "epoch": 0.08056317030299956, + "grad_norm": 0.12743164598941803, + "learning_rate": 2.957958347221302e-05, + "loss": 0.0399, + "step": 36530 + }, + { + "epoch": 0.08058522427789773, + "grad_norm": 0.13898783922195435, + "learning_rate": 2.95791946600161e-05, + "loss": 0.0374, + "step": 36540 + }, + { + "epoch": 0.08060727825279589, + "grad_norm": 0.11750765889883041, + "learning_rate": 2.9578805670668205e-05, + "loss": 0.0397, + "step": 36550 + }, + { + "epoch": 0.08062933222769406, + "grad_norm": 0.1263822764158249, + "learning_rate": 2.9578416504174053e-05, + "loss": 0.0428, + "step": 36560 + }, + { + "epoch": 0.08065138620259223, + "grad_norm": 0.13197694718837738, + "learning_rate": 2.957802716053838e-05, + "loss": 0.0393, + "step": 36570 + }, + { + "epoch": 0.0806734401774904, + "grad_norm": 0.1506931483745575, + "learning_rate": 2.957763763976591e-05, + "loss": 0.037, + "step": 36580 + }, + { + "epoch": 0.08069549415238855, + "grad_norm": 0.15648242831230164, + "learning_rate": 2.957724794186138e-05, + "loss": 0.0412, + "step": 36590 + }, + { + "epoch": 0.08071754812728672, + "grad_norm": 0.16599759459495544, + "learning_rate": 2.9576858066829524e-05, + "loss": 0.0398, + "step": 36600 + }, + { + "epoch": 0.0807396021021849, + "grad_norm": 0.15699835121631622, + "learning_rate": 2.957646801467508e-05, + "loss": 0.0416, + "step": 36610 + }, + { + "epoch": 0.08076165607708305, + "grad_norm": 0.0989176332950592, + "learning_rate": 2.9576077785402786e-05, + "loss": 0.0385, + "step": 36620 + }, + { + "epoch": 0.08078371005198122, + "grad_norm": 0.13241581618785858, + "learning_rate": 2.9575687379017388e-05, + "loss": 0.0371, + "step": 36630 + }, + { + "epoch": 0.08080576402687939, + "grad_norm": 0.12385234236717224, + "learning_rate": 2.957529679552362e-05, + "loss": 0.0401, + "step": 36640 + }, + { + "epoch": 0.08082781800177755, + "grad_norm": 0.0953792929649353, + "learning_rate": 2.9574906034926238e-05, + "loss": 0.0383, + "step": 36650 + }, + { + "epoch": 0.08084987197667572, + "grad_norm": 0.11744920164346695, + "learning_rate": 2.957451509722999e-05, + "loss": 0.0391, + "step": 36660 + }, + { + "epoch": 0.08087192595157389, + "grad_norm": 0.1314687430858612, + "learning_rate": 2.9574123982439622e-05, + "loss": 0.0388, + "step": 36670 + }, + { + "epoch": 0.08089397992647204, + "grad_norm": 0.0915519967675209, + "learning_rate": 2.9573732690559885e-05, + "loss": 0.0382, + "step": 36680 + }, + { + "epoch": 0.08091603390137021, + "grad_norm": 0.1376856118440628, + "learning_rate": 2.9573341221595534e-05, + "loss": 0.0387, + "step": 36690 + }, + { + "epoch": 0.08093808787626838, + "grad_norm": 0.09067053347826004, + "learning_rate": 2.9572949575551332e-05, + "loss": 0.0406, + "step": 36700 + }, + { + "epoch": 0.08096014185116654, + "grad_norm": 0.13055481016635895, + "learning_rate": 2.9572557752432033e-05, + "loss": 0.0383, + "step": 36710 + }, + { + "epoch": 0.08098219582606471, + "grad_norm": 0.10214599967002869, + "learning_rate": 2.9572165752242397e-05, + "loss": 0.0407, + "step": 36720 + }, + { + "epoch": 0.08100424980096288, + "grad_norm": 0.11147495359182358, + "learning_rate": 2.9571773574987188e-05, + "loss": 0.0388, + "step": 36730 + }, + { + "epoch": 0.08102630377586104, + "grad_norm": 0.11428994685411453, + "learning_rate": 2.9571381220671172e-05, + "loss": 0.0403, + "step": 36740 + }, + { + "epoch": 0.0810483577507592, + "grad_norm": 0.14291006326675415, + "learning_rate": 2.9570988689299118e-05, + "loss": 0.039, + "step": 36750 + }, + { + "epoch": 0.08107041172565738, + "grad_norm": 0.16836677491664886, + "learning_rate": 2.9570595980875795e-05, + "loss": 0.0386, + "step": 36760 + }, + { + "epoch": 0.08109246570055553, + "grad_norm": 0.09401247650384903, + "learning_rate": 2.9570203095405975e-05, + "loss": 0.038, + "step": 36770 + }, + { + "epoch": 0.0811145196754537, + "grad_norm": 0.18407461047172546, + "learning_rate": 2.956981003289443e-05, + "loss": 0.036, + "step": 36780 + }, + { + "epoch": 0.08113657365035187, + "grad_norm": 0.15309438109397888, + "learning_rate": 2.956941679334593e-05, + "loss": 0.0396, + "step": 36790 + }, + { + "epoch": 0.08115862762525004, + "grad_norm": 0.11384240537881851, + "learning_rate": 2.956902337676527e-05, + "loss": 0.0396, + "step": 36800 + }, + { + "epoch": 0.0811806816001482, + "grad_norm": 0.1273835450410843, + "learning_rate": 2.9568629783157213e-05, + "loss": 0.0396, + "step": 36810 + }, + { + "epoch": 0.08120273557504637, + "grad_norm": 0.14941494166851044, + "learning_rate": 2.9568236012526554e-05, + "loss": 0.0407, + "step": 36820 + }, + { + "epoch": 0.08122478954994454, + "grad_norm": 0.137144073843956, + "learning_rate": 2.9567842064878072e-05, + "loss": 0.0373, + "step": 36830 + }, + { + "epoch": 0.0812468435248427, + "grad_norm": 0.13848887383937836, + "learning_rate": 2.9567447940216556e-05, + "loss": 0.0384, + "step": 36840 + }, + { + "epoch": 0.08126889749974087, + "grad_norm": 0.13058383762836456, + "learning_rate": 2.9567053638546787e-05, + "loss": 0.0378, + "step": 36850 + }, + { + "epoch": 0.08129095147463904, + "grad_norm": 0.16734811663627625, + "learning_rate": 2.9566659159873566e-05, + "loss": 0.0386, + "step": 36860 + }, + { + "epoch": 0.08131300544953719, + "grad_norm": 0.2043652981519699, + "learning_rate": 2.9566264504201685e-05, + "loss": 0.0399, + "step": 36870 + }, + { + "epoch": 0.08133505942443536, + "grad_norm": 0.1273254156112671, + "learning_rate": 2.9565869671535933e-05, + "loss": 0.0377, + "step": 36880 + }, + { + "epoch": 0.08135711339933353, + "grad_norm": 0.15560279786586761, + "learning_rate": 2.956547466188112e-05, + "loss": 0.0399, + "step": 36890 + }, + { + "epoch": 0.08137916737423169, + "grad_norm": 0.1379849761724472, + "learning_rate": 2.956507947524203e-05, + "loss": 0.0393, + "step": 36900 + }, + { + "epoch": 0.08140122134912986, + "grad_norm": 0.10857958346605301, + "learning_rate": 2.956468411162348e-05, + "loss": 0.0379, + "step": 36910 + }, + { + "epoch": 0.08142327532402803, + "grad_norm": 0.09926591068506241, + "learning_rate": 2.956428857103026e-05, + "loss": 0.0363, + "step": 36920 + }, + { + "epoch": 0.08144532929892619, + "grad_norm": 0.1295299530029297, + "learning_rate": 2.9563892853467187e-05, + "loss": 0.0354, + "step": 36930 + }, + { + "epoch": 0.08146738327382436, + "grad_norm": 0.11936694383621216, + "learning_rate": 2.9563496958939062e-05, + "loss": 0.0365, + "step": 36940 + }, + { + "epoch": 0.08148943724872253, + "grad_norm": 0.13585390150547028, + "learning_rate": 2.95631008874507e-05, + "loss": 0.04, + "step": 36950 + }, + { + "epoch": 0.08151149122362068, + "grad_norm": 0.12457490712404251, + "learning_rate": 2.9562704639006912e-05, + "loss": 0.0382, + "step": 36960 + }, + { + "epoch": 0.08153354519851885, + "grad_norm": 0.08658220618963242, + "learning_rate": 2.9562308213612516e-05, + "loss": 0.0379, + "step": 36970 + }, + { + "epoch": 0.08155559917341702, + "grad_norm": 0.12349335104227066, + "learning_rate": 2.9561911611272327e-05, + "loss": 0.0385, + "step": 36980 + }, + { + "epoch": 0.0815776531483152, + "grad_norm": 0.11623448133468628, + "learning_rate": 2.9561514831991153e-05, + "loss": 0.0398, + "step": 36990 + }, + { + "epoch": 0.08159970712321335, + "grad_norm": 0.1169591173529625, + "learning_rate": 2.9561117875773836e-05, + "loss": 0.0386, + "step": 37000 + }, + { + "epoch": 0.08162176109811152, + "grad_norm": 0.12081551551818848, + "learning_rate": 2.9560720742625184e-05, + "loss": 0.0386, + "step": 37010 + }, + { + "epoch": 0.08164381507300969, + "grad_norm": 0.0838858038187027, + "learning_rate": 2.956032343255003e-05, + "loss": 0.0382, + "step": 37020 + }, + { + "epoch": 0.08166586904790785, + "grad_norm": 0.13569769263267517, + "learning_rate": 2.95599259455532e-05, + "loss": 0.0402, + "step": 37030 + }, + { + "epoch": 0.08168792302280602, + "grad_norm": 0.10443992912769318, + "learning_rate": 2.955952828163952e-05, + "loss": 0.0402, + "step": 37040 + }, + { + "epoch": 0.08170997699770419, + "grad_norm": 0.1849743127822876, + "learning_rate": 2.9559130440813827e-05, + "loss": 0.0378, + "step": 37050 + }, + { + "epoch": 0.08173203097260234, + "grad_norm": 0.1156870573759079, + "learning_rate": 2.9558732423080946e-05, + "loss": 0.0423, + "step": 37060 + }, + { + "epoch": 0.08175408494750051, + "grad_norm": 0.10406608134508133, + "learning_rate": 2.955833422844573e-05, + "loss": 0.0376, + "step": 37070 + }, + { + "epoch": 0.08177613892239868, + "grad_norm": 0.11251164972782135, + "learning_rate": 2.9557935856913003e-05, + "loss": 0.0383, + "step": 37080 + }, + { + "epoch": 0.08179819289729684, + "grad_norm": 0.09178601950407028, + "learning_rate": 2.955753730848761e-05, + "loss": 0.039, + "step": 37090 + }, + { + "epoch": 0.08182024687219501, + "grad_norm": 0.15289661288261414, + "learning_rate": 2.9557138583174397e-05, + "loss": 0.0395, + "step": 37100 + }, + { + "epoch": 0.08184230084709318, + "grad_norm": 0.13343080878257751, + "learning_rate": 2.9556739680978204e-05, + "loss": 0.0386, + "step": 37110 + }, + { + "epoch": 0.08186435482199134, + "grad_norm": 0.13864457607269287, + "learning_rate": 2.9556340601903882e-05, + "loss": 0.0398, + "step": 37120 + }, + { + "epoch": 0.0818864087968895, + "grad_norm": 0.13111400604248047, + "learning_rate": 2.9555941345956274e-05, + "loss": 0.0342, + "step": 37130 + }, + { + "epoch": 0.08190846277178768, + "grad_norm": 0.1251743584871292, + "learning_rate": 2.9555541913140237e-05, + "loss": 0.0396, + "step": 37140 + }, + { + "epoch": 0.08193051674668583, + "grad_norm": 0.14937762916088104, + "learning_rate": 2.9555142303460625e-05, + "loss": 0.0397, + "step": 37150 + }, + { + "epoch": 0.081952570721584, + "grad_norm": 0.12148106843233109, + "learning_rate": 2.955474251692229e-05, + "loss": 0.0394, + "step": 37160 + }, + { + "epoch": 0.08197462469648217, + "grad_norm": 0.10606327652931213, + "learning_rate": 2.9554342553530096e-05, + "loss": 0.0401, + "step": 37170 + }, + { + "epoch": 0.08199667867138033, + "grad_norm": 0.11425787210464478, + "learning_rate": 2.9553942413288893e-05, + "loss": 0.0396, + "step": 37180 + }, + { + "epoch": 0.0820187326462785, + "grad_norm": 0.09487966448068619, + "learning_rate": 2.9553542096203548e-05, + "loss": 0.0386, + "step": 37190 + }, + { + "epoch": 0.08204078662117667, + "grad_norm": 0.1364835649728775, + "learning_rate": 2.9553141602278934e-05, + "loss": 0.0398, + "step": 37200 + }, + { + "epoch": 0.08206284059607484, + "grad_norm": 0.11501912772655487, + "learning_rate": 2.95527409315199e-05, + "loss": 0.0396, + "step": 37210 + }, + { + "epoch": 0.082084894570973, + "grad_norm": 0.11248763650655746, + "learning_rate": 2.9552340083931327e-05, + "loss": 0.0397, + "step": 37220 + }, + { + "epoch": 0.08210694854587117, + "grad_norm": 0.13064712285995483, + "learning_rate": 2.9551939059518082e-05, + "loss": 0.0388, + "step": 37230 + }, + { + "epoch": 0.08212900252076934, + "grad_norm": 0.13479550182819366, + "learning_rate": 2.955153785828504e-05, + "loss": 0.0387, + "step": 37240 + }, + { + "epoch": 0.0821510564956675, + "grad_norm": 0.08194982260465622, + "learning_rate": 2.955113648023707e-05, + "loss": 0.0368, + "step": 37250 + }, + { + "epoch": 0.08217311047056566, + "grad_norm": 0.10287674516439438, + "learning_rate": 2.9550734925379058e-05, + "loss": 0.0395, + "step": 37260 + }, + { + "epoch": 0.08219516444546383, + "grad_norm": 0.11654510349035263, + "learning_rate": 2.9550333193715874e-05, + "loss": 0.0388, + "step": 37270 + }, + { + "epoch": 0.08221721842036199, + "grad_norm": 0.11773978173732758, + "learning_rate": 2.9549931285252404e-05, + "loss": 0.0398, + "step": 37280 + }, + { + "epoch": 0.08223927239526016, + "grad_norm": 0.12598517537117004, + "learning_rate": 2.9549529199993537e-05, + "loss": 0.0396, + "step": 37290 + }, + { + "epoch": 0.08226132637015833, + "grad_norm": 0.10289790481328964, + "learning_rate": 2.9549126937944146e-05, + "loss": 0.0385, + "step": 37300 + }, + { + "epoch": 0.08228338034505649, + "grad_norm": 0.10872403532266617, + "learning_rate": 2.954872449910913e-05, + "loss": 0.0398, + "step": 37310 + }, + { + "epoch": 0.08230543431995466, + "grad_norm": 0.13054120540618896, + "learning_rate": 2.9548321883493376e-05, + "loss": 0.0384, + "step": 37320 + }, + { + "epoch": 0.08232748829485283, + "grad_norm": 0.15173909068107605, + "learning_rate": 2.9547919091101772e-05, + "loss": 0.0411, + "step": 37330 + }, + { + "epoch": 0.08234954226975098, + "grad_norm": 0.12146206945180893, + "learning_rate": 2.9547516121939217e-05, + "loss": 0.0392, + "step": 37340 + }, + { + "epoch": 0.08237159624464915, + "grad_norm": 0.10991223156452179, + "learning_rate": 2.9547112976010607e-05, + "loss": 0.0373, + "step": 37350 + }, + { + "epoch": 0.08239365021954732, + "grad_norm": 0.11765729635953903, + "learning_rate": 2.954670965332084e-05, + "loss": 0.0369, + "step": 37360 + }, + { + "epoch": 0.08241570419444548, + "grad_norm": 0.1065465435385704, + "learning_rate": 2.9546306153874816e-05, + "loss": 0.0398, + "step": 37370 + }, + { + "epoch": 0.08243775816934365, + "grad_norm": 0.11690527200698853, + "learning_rate": 2.9545902477677437e-05, + "loss": 0.0409, + "step": 37380 + }, + { + "epoch": 0.08245981214424182, + "grad_norm": 0.10018567740917206, + "learning_rate": 2.954549862473361e-05, + "loss": 0.0393, + "step": 37390 + }, + { + "epoch": 0.08248186611913998, + "grad_norm": 0.14567068219184875, + "learning_rate": 2.9545094595048238e-05, + "loss": 0.0396, + "step": 37400 + }, + { + "epoch": 0.08250392009403815, + "grad_norm": 0.12909860908985138, + "learning_rate": 2.954469038862624e-05, + "loss": 0.0373, + "step": 37410 + }, + { + "epoch": 0.08252597406893632, + "grad_norm": 0.10640887171030045, + "learning_rate": 2.9544286005472517e-05, + "loss": 0.039, + "step": 37420 + }, + { + "epoch": 0.08254802804383449, + "grad_norm": 0.10533341020345688, + "learning_rate": 2.954388144559199e-05, + "loss": 0.0371, + "step": 37430 + }, + { + "epoch": 0.08257008201873264, + "grad_norm": 0.11768417060375214, + "learning_rate": 2.954347670898957e-05, + "loss": 0.0388, + "step": 37440 + }, + { + "epoch": 0.08259213599363081, + "grad_norm": 0.11099827289581299, + "learning_rate": 2.9543071795670176e-05, + "loss": 0.038, + "step": 37450 + }, + { + "epoch": 0.08261418996852898, + "grad_norm": 0.10637765377759933, + "learning_rate": 2.9542666705638728e-05, + "loss": 0.0381, + "step": 37460 + }, + { + "epoch": 0.08263624394342714, + "grad_norm": 0.1309927999973297, + "learning_rate": 2.9542261438900154e-05, + "loss": 0.0386, + "step": 37470 + }, + { + "epoch": 0.08265829791832531, + "grad_norm": 0.09687640517950058, + "learning_rate": 2.9541855995459367e-05, + "loss": 0.0379, + "step": 37480 + }, + { + "epoch": 0.08268035189322348, + "grad_norm": 0.11300229281187057, + "learning_rate": 2.95414503753213e-05, + "loss": 0.0375, + "step": 37490 + }, + { + "epoch": 0.08270240586812164, + "grad_norm": 0.11157863587141037, + "learning_rate": 2.954104457849089e-05, + "loss": 0.0378, + "step": 37500 + }, + { + "epoch": 0.0827244598430198, + "grad_norm": 0.1046452671289444, + "learning_rate": 2.9540638604973052e-05, + "loss": 0.0394, + "step": 37510 + }, + { + "epoch": 0.08274651381791798, + "grad_norm": 0.10248765349388123, + "learning_rate": 2.9540232454772728e-05, + "loss": 0.0376, + "step": 37520 + }, + { + "epoch": 0.08276856779281613, + "grad_norm": 0.11294706165790558, + "learning_rate": 2.9539826127894852e-05, + "loss": 0.0393, + "step": 37530 + }, + { + "epoch": 0.0827906217677143, + "grad_norm": 0.16839538514614105, + "learning_rate": 2.9539419624344356e-05, + "loss": 0.0388, + "step": 37540 + }, + { + "epoch": 0.08281267574261247, + "grad_norm": 0.11248447000980377, + "learning_rate": 2.953901294412619e-05, + "loss": 0.0396, + "step": 37550 + }, + { + "epoch": 0.08283472971751063, + "grad_norm": 0.1484765261411667, + "learning_rate": 2.953860608724529e-05, + "loss": 0.04, + "step": 37560 + }, + { + "epoch": 0.0828567836924088, + "grad_norm": 0.12465279549360275, + "learning_rate": 2.9538199053706597e-05, + "loss": 0.0397, + "step": 37570 + }, + { + "epoch": 0.08287883766730697, + "grad_norm": 0.11548402905464172, + "learning_rate": 2.9537791843515056e-05, + "loss": 0.0399, + "step": 37580 + }, + { + "epoch": 0.08290089164220513, + "grad_norm": 0.11616154760122299, + "learning_rate": 2.9537384456675624e-05, + "loss": 0.0373, + "step": 37590 + }, + { + "epoch": 0.0829229456171033, + "grad_norm": 0.10594876110553741, + "learning_rate": 2.9536976893193244e-05, + "loss": 0.0387, + "step": 37600 + }, + { + "epoch": 0.08294499959200147, + "grad_norm": 0.1492839902639389, + "learning_rate": 2.9536569153072866e-05, + "loss": 0.0384, + "step": 37610 + }, + { + "epoch": 0.08296705356689962, + "grad_norm": 0.12075408548116684, + "learning_rate": 2.9536161236319448e-05, + "loss": 0.0381, + "step": 37620 + }, + { + "epoch": 0.0829891075417978, + "grad_norm": 0.11304622888565063, + "learning_rate": 2.953575314293795e-05, + "loss": 0.04, + "step": 37630 + }, + { + "epoch": 0.08301116151669596, + "grad_norm": 0.08867565542459488, + "learning_rate": 2.9535344872933326e-05, + "loss": 0.0386, + "step": 37640 + }, + { + "epoch": 0.08303321549159413, + "grad_norm": 0.11927420645952225, + "learning_rate": 2.953493642631054e-05, + "loss": 0.0391, + "step": 37650 + }, + { + "epoch": 0.08305526946649229, + "grad_norm": 0.1355646699666977, + "learning_rate": 2.953452780307455e-05, + "loss": 0.0386, + "step": 37660 + }, + { + "epoch": 0.08307732344139046, + "grad_norm": 0.1303929090499878, + "learning_rate": 2.9534119003230324e-05, + "loss": 0.0369, + "step": 37670 + }, + { + "epoch": 0.08309937741628863, + "grad_norm": 0.13928987085819244, + "learning_rate": 2.9533710026782835e-05, + "loss": 0.0373, + "step": 37680 + }, + { + "epoch": 0.08312143139118679, + "grad_norm": 0.12566207349300385, + "learning_rate": 2.9533300873737044e-05, + "loss": 0.0397, + "step": 37690 + }, + { + "epoch": 0.08314348536608496, + "grad_norm": 0.14648032188415527, + "learning_rate": 2.9532891544097924e-05, + "loss": 0.0381, + "step": 37700 + }, + { + "epoch": 0.08316553934098313, + "grad_norm": 0.12238338589668274, + "learning_rate": 2.9532482037870453e-05, + "loss": 0.0375, + "step": 37710 + }, + { + "epoch": 0.08318759331588128, + "grad_norm": 0.0935581624507904, + "learning_rate": 2.95320723550596e-05, + "loss": 0.0399, + "step": 37720 + }, + { + "epoch": 0.08320964729077945, + "grad_norm": 0.11627745628356934, + "learning_rate": 2.953166249567035e-05, + "loss": 0.0388, + "step": 37730 + }, + { + "epoch": 0.08323170126567762, + "grad_norm": 0.10941681265830994, + "learning_rate": 2.9531252459707683e-05, + "loss": 0.0372, + "step": 37740 + }, + { + "epoch": 0.08325375524057578, + "grad_norm": 0.12668079137802124, + "learning_rate": 2.9530842247176574e-05, + "loss": 0.0401, + "step": 37750 + }, + { + "epoch": 0.08327580921547395, + "grad_norm": 0.11004883050918579, + "learning_rate": 2.9530431858082013e-05, + "loss": 0.0395, + "step": 37760 + }, + { + "epoch": 0.08329786319037212, + "grad_norm": 0.13595624268054962, + "learning_rate": 2.9530021292428988e-05, + "loss": 0.0388, + "step": 37770 + }, + { + "epoch": 0.08331991716527028, + "grad_norm": 0.11460808664560318, + "learning_rate": 2.9529610550222486e-05, + "loss": 0.0385, + "step": 37780 + }, + { + "epoch": 0.08334197114016845, + "grad_norm": 0.13260984420776367, + "learning_rate": 2.9529199631467496e-05, + "loss": 0.0392, + "step": 37790 + }, + { + "epoch": 0.08336402511506662, + "grad_norm": 0.11435170471668243, + "learning_rate": 2.9528788536169014e-05, + "loss": 0.0388, + "step": 37800 + }, + { + "epoch": 0.08338607908996477, + "grad_norm": 0.12660622596740723, + "learning_rate": 2.9528377264332034e-05, + "loss": 0.0362, + "step": 37810 + }, + { + "epoch": 0.08340813306486294, + "grad_norm": 0.1261378824710846, + "learning_rate": 2.9527965815961553e-05, + "loss": 0.0395, + "step": 37820 + }, + { + "epoch": 0.08343018703976111, + "grad_norm": 0.11825723946094513, + "learning_rate": 2.9527554191062575e-05, + "loss": 0.0409, + "step": 37830 + }, + { + "epoch": 0.08345224101465927, + "grad_norm": 0.10788806527853012, + "learning_rate": 2.9527142389640088e-05, + "loss": 0.0403, + "step": 37840 + }, + { + "epoch": 0.08347429498955744, + "grad_norm": 0.13237205147743225, + "learning_rate": 2.952673041169911e-05, + "loss": 0.0387, + "step": 37850 + }, + { + "epoch": 0.08349634896445561, + "grad_norm": 0.12193932384252548, + "learning_rate": 2.9526318257244643e-05, + "loss": 0.0384, + "step": 37860 + }, + { + "epoch": 0.08351840293935378, + "grad_norm": 0.103431835770607, + "learning_rate": 2.952590592628169e-05, + "loss": 0.038, + "step": 37870 + }, + { + "epoch": 0.08354045691425194, + "grad_norm": 0.11917581409215927, + "learning_rate": 2.952549341881527e-05, + "loss": 0.0387, + "step": 37880 + }, + { + "epoch": 0.08356251088915011, + "grad_norm": 0.12472769618034363, + "learning_rate": 2.9525080734850387e-05, + "loss": 0.0401, + "step": 37890 + }, + { + "epoch": 0.08358456486404828, + "grad_norm": 0.14881201088428497, + "learning_rate": 2.9524667874392062e-05, + "loss": 0.0392, + "step": 37900 + }, + { + "epoch": 0.08360661883894643, + "grad_norm": 0.1214800626039505, + "learning_rate": 2.9524254837445305e-05, + "loss": 0.0381, + "step": 37910 + }, + { + "epoch": 0.0836286728138446, + "grad_norm": 0.14704793691635132, + "learning_rate": 2.9523841624015144e-05, + "loss": 0.0389, + "step": 37920 + }, + { + "epoch": 0.08365072678874277, + "grad_norm": 0.1319938749074936, + "learning_rate": 2.9523428234106587e-05, + "loss": 0.0379, + "step": 37930 + }, + { + "epoch": 0.08367278076364093, + "grad_norm": 0.14566956460475922, + "learning_rate": 2.9523014667724675e-05, + "loss": 0.0399, + "step": 37940 + }, + { + "epoch": 0.0836948347385391, + "grad_norm": 0.10633587837219238, + "learning_rate": 2.9522600924874415e-05, + "loss": 0.0398, + "step": 37950 + }, + { + "epoch": 0.08371688871343727, + "grad_norm": 0.10923252999782562, + "learning_rate": 2.9522187005560844e-05, + "loss": 0.0388, + "step": 37960 + }, + { + "epoch": 0.08373894268833543, + "grad_norm": 0.13810546696186066, + "learning_rate": 2.952177290978899e-05, + "loss": 0.0392, + "step": 37970 + }, + { + "epoch": 0.0837609966632336, + "grad_norm": 0.11808567494153976, + "learning_rate": 2.9521358637563885e-05, + "loss": 0.0375, + "step": 37980 + }, + { + "epoch": 0.08378305063813177, + "grad_norm": 0.11978480964899063, + "learning_rate": 2.952094418889056e-05, + "loss": 0.0395, + "step": 37990 + }, + { + "epoch": 0.08380510461302992, + "grad_norm": 0.11523395031690598, + "learning_rate": 2.9520529563774053e-05, + "loss": 0.0387, + "step": 38000 + }, + { + "epoch": 0.0838271585879281, + "grad_norm": 0.11776938289403915, + "learning_rate": 2.9520114762219403e-05, + "loss": 0.0402, + "step": 38010 + }, + { + "epoch": 0.08384921256282626, + "grad_norm": 0.11054611206054688, + "learning_rate": 2.951969978423165e-05, + "loss": 0.0396, + "step": 38020 + }, + { + "epoch": 0.08387126653772442, + "grad_norm": 0.13197678327560425, + "learning_rate": 2.9519284629815837e-05, + "loss": 0.0394, + "step": 38030 + }, + { + "epoch": 0.08389332051262259, + "grad_norm": 0.11572219431400299, + "learning_rate": 2.9518869298977007e-05, + "loss": 0.0387, + "step": 38040 + }, + { + "epoch": 0.08391537448752076, + "grad_norm": 0.10321839898824692, + "learning_rate": 2.9518453791720202e-05, + "loss": 0.0413, + "step": 38050 + }, + { + "epoch": 0.08393742846241893, + "grad_norm": 0.14637888967990875, + "learning_rate": 2.9518038108050482e-05, + "loss": 0.0406, + "step": 38060 + }, + { + "epoch": 0.08395948243731709, + "grad_norm": 0.16124273836612701, + "learning_rate": 2.951762224797289e-05, + "loss": 0.0415, + "step": 38070 + }, + { + "epoch": 0.08398153641221526, + "grad_norm": 0.12225335836410522, + "learning_rate": 2.9517206211492477e-05, + "loss": 0.0403, + "step": 38080 + }, + { + "epoch": 0.08400359038711343, + "grad_norm": 0.1269814372062683, + "learning_rate": 2.9516789998614306e-05, + "loss": 0.0371, + "step": 38090 + }, + { + "epoch": 0.08402564436201158, + "grad_norm": 0.11370417475700378, + "learning_rate": 2.951637360934343e-05, + "loss": 0.039, + "step": 38100 + }, + { + "epoch": 0.08404769833690975, + "grad_norm": 0.12389306724071503, + "learning_rate": 2.951595704368491e-05, + "loss": 0.0394, + "step": 38110 + }, + { + "epoch": 0.08406975231180792, + "grad_norm": 0.12989665567874908, + "learning_rate": 2.9515540301643803e-05, + "loss": 0.0377, + "step": 38120 + }, + { + "epoch": 0.08409180628670608, + "grad_norm": 0.11240468174219131, + "learning_rate": 2.9515123383225177e-05, + "loss": 0.0392, + "step": 38130 + }, + { + "epoch": 0.08411386026160425, + "grad_norm": 0.13651815056800842, + "learning_rate": 2.95147062884341e-05, + "loss": 0.0387, + "step": 38140 + }, + { + "epoch": 0.08413591423650242, + "grad_norm": 0.1231968030333519, + "learning_rate": 2.9514289017275633e-05, + "loss": 0.0394, + "step": 38150 + }, + { + "epoch": 0.08415796821140058, + "grad_norm": 0.12302205711603165, + "learning_rate": 2.9513871569754855e-05, + "loss": 0.0358, + "step": 38160 + }, + { + "epoch": 0.08418002218629875, + "grad_norm": 0.1283150315284729, + "learning_rate": 2.9513453945876833e-05, + "loss": 0.0373, + "step": 38170 + }, + { + "epoch": 0.08420207616119692, + "grad_norm": 0.09787340462207794, + "learning_rate": 2.9513036145646643e-05, + "loss": 0.038, + "step": 38180 + }, + { + "epoch": 0.08422413013609507, + "grad_norm": 0.13784676790237427, + "learning_rate": 2.951261816906936e-05, + "loss": 0.0378, + "step": 38190 + }, + { + "epoch": 0.08424618411099324, + "grad_norm": 0.14695696532726288, + "learning_rate": 2.9512200016150064e-05, + "loss": 0.0362, + "step": 38200 + }, + { + "epoch": 0.08426823808589141, + "grad_norm": 0.1527569591999054, + "learning_rate": 2.951178168689384e-05, + "loss": 0.0372, + "step": 38210 + }, + { + "epoch": 0.08429029206078957, + "grad_norm": 0.1007312685251236, + "learning_rate": 2.9511363181305765e-05, + "loss": 0.0379, + "step": 38220 + }, + { + "epoch": 0.08431234603568774, + "grad_norm": 0.11053784936666489, + "learning_rate": 2.951094449939092e-05, + "loss": 0.0375, + "step": 38230 + }, + { + "epoch": 0.08433440001058591, + "grad_norm": 0.1276630461215973, + "learning_rate": 2.9510525641154403e-05, + "loss": 0.0367, + "step": 38240 + }, + { + "epoch": 0.08435645398548407, + "grad_norm": 0.11764645576477051, + "learning_rate": 2.95101066066013e-05, + "loss": 0.0408, + "step": 38250 + }, + { + "epoch": 0.08437850796038224, + "grad_norm": 0.1359863430261612, + "learning_rate": 2.9509687395736708e-05, + "loss": 0.0398, + "step": 38260 + }, + { + "epoch": 0.08440056193528041, + "grad_norm": 0.12676967680454254, + "learning_rate": 2.9509268008565707e-05, + "loss": 0.0384, + "step": 38270 + }, + { + "epoch": 0.08442261591017858, + "grad_norm": 0.14132513105869293, + "learning_rate": 2.95088484450934e-05, + "loss": 0.0377, + "step": 38280 + }, + { + "epoch": 0.08444466988507673, + "grad_norm": 0.1117006316781044, + "learning_rate": 2.9508428705324888e-05, + "loss": 0.0379, + "step": 38290 + }, + { + "epoch": 0.0844667238599749, + "grad_norm": 0.1081499382853508, + "learning_rate": 2.950800878926527e-05, + "loss": 0.0401, + "step": 38300 + }, + { + "epoch": 0.08448877783487307, + "grad_norm": 0.13754335045814514, + "learning_rate": 2.9507588696919646e-05, + "loss": 0.039, + "step": 38310 + }, + { + "epoch": 0.08451083180977123, + "grad_norm": 0.14488616585731506, + "learning_rate": 2.950716842829312e-05, + "loss": 0.0383, + "step": 38320 + }, + { + "epoch": 0.0845328857846694, + "grad_norm": 0.11461758613586426, + "learning_rate": 2.9506747983390807e-05, + "loss": 0.0383, + "step": 38330 + }, + { + "epoch": 0.08455493975956757, + "grad_norm": 0.14814133942127228, + "learning_rate": 2.9506327362217803e-05, + "loss": 0.0373, + "step": 38340 + }, + { + "epoch": 0.08457699373446573, + "grad_norm": 0.11662456393241882, + "learning_rate": 2.950590656477923e-05, + "loss": 0.0368, + "step": 38350 + }, + { + "epoch": 0.0845990477093639, + "grad_norm": 0.12058810144662857, + "learning_rate": 2.9505485591080193e-05, + "loss": 0.0382, + "step": 38360 + }, + { + "epoch": 0.08462110168426207, + "grad_norm": 0.0979432463645935, + "learning_rate": 2.9505064441125812e-05, + "loss": 0.0383, + "step": 38370 + }, + { + "epoch": 0.08464315565916022, + "grad_norm": 0.15413323044776917, + "learning_rate": 2.95046431149212e-05, + "loss": 0.0371, + "step": 38380 + }, + { + "epoch": 0.0846652096340584, + "grad_norm": 0.11248619854450226, + "learning_rate": 2.950422161247148e-05, + "loss": 0.0387, + "step": 38390 + }, + { + "epoch": 0.08468726360895656, + "grad_norm": 0.09041125327348709, + "learning_rate": 2.950379993378178e-05, + "loss": 0.0372, + "step": 38400 + }, + { + "epoch": 0.08470931758385472, + "grad_norm": 0.09975188225507736, + "learning_rate": 2.950337807885721e-05, + "loss": 0.0371, + "step": 38410 + }, + { + "epoch": 0.08473137155875289, + "grad_norm": 0.10716956853866577, + "learning_rate": 2.950295604770291e-05, + "loss": 0.039, + "step": 38420 + }, + { + "epoch": 0.08475342553365106, + "grad_norm": 0.1147739589214325, + "learning_rate": 2.9502533840323997e-05, + "loss": 0.0384, + "step": 38430 + }, + { + "epoch": 0.08477547950854922, + "grad_norm": 0.11336079984903336, + "learning_rate": 2.95021114567256e-05, + "loss": 0.0372, + "step": 38440 + }, + { + "epoch": 0.08479753348344739, + "grad_norm": 0.12527164816856384, + "learning_rate": 2.9501688896912864e-05, + "loss": 0.0364, + "step": 38450 + }, + { + "epoch": 0.08481958745834556, + "grad_norm": 0.10220848768949509, + "learning_rate": 2.950126616089091e-05, + "loss": 0.0399, + "step": 38460 + }, + { + "epoch": 0.08484164143324371, + "grad_norm": 0.13529489934444427, + "learning_rate": 2.9500843248664883e-05, + "loss": 0.0369, + "step": 38470 + }, + { + "epoch": 0.08486369540814188, + "grad_norm": 0.16729137301445007, + "learning_rate": 2.9500420160239922e-05, + "loss": 0.0373, + "step": 38480 + }, + { + "epoch": 0.08488574938304005, + "grad_norm": 0.11968916654586792, + "learning_rate": 2.949999689562116e-05, + "loss": 0.0402, + "step": 38490 + }, + { + "epoch": 0.08490780335793822, + "grad_norm": 0.14883625507354736, + "learning_rate": 2.9499573454813752e-05, + "loss": 0.0398, + "step": 38500 + }, + { + "epoch": 0.08492985733283638, + "grad_norm": 0.16578064858913422, + "learning_rate": 2.949914983782283e-05, + "loss": 0.0388, + "step": 38510 + }, + { + "epoch": 0.08495191130773455, + "grad_norm": 0.14715361595153809, + "learning_rate": 2.9498726044653555e-05, + "loss": 0.0407, + "step": 38520 + }, + { + "epoch": 0.08497396528263272, + "grad_norm": 0.15004101395606995, + "learning_rate": 2.949830207531107e-05, + "loss": 0.0385, + "step": 38530 + }, + { + "epoch": 0.08499601925753088, + "grad_norm": 0.12349579483270645, + "learning_rate": 2.949787792980052e-05, + "loss": 0.0404, + "step": 38540 + }, + { + "epoch": 0.08501807323242905, + "grad_norm": 0.1078343391418457, + "learning_rate": 2.9497453608127062e-05, + "loss": 0.0402, + "step": 38550 + }, + { + "epoch": 0.08504012720732722, + "grad_norm": 0.1465681791305542, + "learning_rate": 2.9497029110295865e-05, + "loss": 0.0371, + "step": 38560 + }, + { + "epoch": 0.08506218118222537, + "grad_norm": 0.1260446459054947, + "learning_rate": 2.949660443631207e-05, + "loss": 0.039, + "step": 38570 + }, + { + "epoch": 0.08508423515712354, + "grad_norm": 0.10330376774072647, + "learning_rate": 2.9496179586180843e-05, + "loss": 0.0384, + "step": 38580 + }, + { + "epoch": 0.08510628913202171, + "grad_norm": 0.18005375564098358, + "learning_rate": 2.9495754559907352e-05, + "loss": 0.0389, + "step": 38590 + }, + { + "epoch": 0.08512834310691987, + "grad_norm": 0.13968825340270996, + "learning_rate": 2.9495329357496755e-05, + "loss": 0.0383, + "step": 38600 + }, + { + "epoch": 0.08515039708181804, + "grad_norm": 0.13994736969470978, + "learning_rate": 2.949490397895422e-05, + "loss": 0.0384, + "step": 38610 + }, + { + "epoch": 0.08517245105671621, + "grad_norm": 0.10055051743984222, + "learning_rate": 2.949447842428492e-05, + "loss": 0.0385, + "step": 38620 + }, + { + "epoch": 0.08519450503161437, + "grad_norm": 0.1145385280251503, + "learning_rate": 2.9494052693494017e-05, + "loss": 0.0371, + "step": 38630 + }, + { + "epoch": 0.08521655900651254, + "grad_norm": 0.13322633504867554, + "learning_rate": 2.949362678658669e-05, + "loss": 0.0399, + "step": 38640 + }, + { + "epoch": 0.08523861298141071, + "grad_norm": 0.10773841291666031, + "learning_rate": 2.9493200703568115e-05, + "loss": 0.0384, + "step": 38650 + }, + { + "epoch": 0.08526066695630886, + "grad_norm": 0.12226152420043945, + "learning_rate": 2.9492774444443465e-05, + "loss": 0.036, + "step": 38660 + }, + { + "epoch": 0.08528272093120703, + "grad_norm": 0.1769694834947586, + "learning_rate": 2.9492348009217924e-05, + "loss": 0.0373, + "step": 38670 + }, + { + "epoch": 0.0853047749061052, + "grad_norm": 0.13174262642860413, + "learning_rate": 2.949192139789667e-05, + "loss": 0.0383, + "step": 38680 + }, + { + "epoch": 0.08532682888100336, + "grad_norm": 0.13587205111980438, + "learning_rate": 2.9491494610484893e-05, + "loss": 0.0383, + "step": 38690 + }, + { + "epoch": 0.08534888285590153, + "grad_norm": 0.10095226019620895, + "learning_rate": 2.949106764698777e-05, + "loss": 0.0379, + "step": 38700 + }, + { + "epoch": 0.0853709368307997, + "grad_norm": 0.10418447107076645, + "learning_rate": 2.9490640507410494e-05, + "loss": 0.0378, + "step": 38710 + }, + { + "epoch": 0.08539299080569787, + "grad_norm": 0.15486770868301392, + "learning_rate": 2.9490213191758254e-05, + "loss": 0.0389, + "step": 38720 + }, + { + "epoch": 0.08541504478059603, + "grad_norm": 0.11406948417425156, + "learning_rate": 2.9489785700036243e-05, + "loss": 0.0396, + "step": 38730 + }, + { + "epoch": 0.0854370987554942, + "grad_norm": 0.09906989336013794, + "learning_rate": 2.9489358032249656e-05, + "loss": 0.0373, + "step": 38740 + }, + { + "epoch": 0.08545915273039237, + "grad_norm": 0.12348903715610504, + "learning_rate": 2.9488930188403685e-05, + "loss": 0.0353, + "step": 38750 + }, + { + "epoch": 0.08548120670529052, + "grad_norm": 0.11721760034561157, + "learning_rate": 2.9488502168503537e-05, + "loss": 0.0393, + "step": 38760 + }, + { + "epoch": 0.0855032606801887, + "grad_norm": 0.1476026177406311, + "learning_rate": 2.9488073972554407e-05, + "loss": 0.0403, + "step": 38770 + }, + { + "epoch": 0.08552531465508686, + "grad_norm": 0.13994130492210388, + "learning_rate": 2.9487645600561503e-05, + "loss": 0.0382, + "step": 38780 + }, + { + "epoch": 0.08554736862998502, + "grad_norm": 0.16757149994373322, + "learning_rate": 2.948721705253002e-05, + "loss": 0.0368, + "step": 38790 + }, + { + "epoch": 0.08556942260488319, + "grad_norm": 0.12396430969238281, + "learning_rate": 2.9486788328465175e-05, + "loss": 0.0387, + "step": 38800 + }, + { + "epoch": 0.08559147657978136, + "grad_norm": 0.12255940586328506, + "learning_rate": 2.948635942837217e-05, + "loss": 0.038, + "step": 38810 + }, + { + "epoch": 0.08561353055467952, + "grad_norm": 0.11282986402511597, + "learning_rate": 2.9485930352256226e-05, + "loss": 0.0374, + "step": 38820 + }, + { + "epoch": 0.08563558452957769, + "grad_norm": 0.12644457817077637, + "learning_rate": 2.9485501100122547e-05, + "loss": 0.0395, + "step": 38830 + }, + { + "epoch": 0.08565763850447586, + "grad_norm": 0.13951057195663452, + "learning_rate": 2.9485071671976354e-05, + "loss": 0.0389, + "step": 38840 + }, + { + "epoch": 0.08567969247937401, + "grad_norm": 0.12037286162376404, + "learning_rate": 2.9484642067822867e-05, + "loss": 0.0361, + "step": 38850 + }, + { + "epoch": 0.08570174645427218, + "grad_norm": 0.13233061134815216, + "learning_rate": 2.94842122876673e-05, + "loss": 0.0401, + "step": 38860 + }, + { + "epoch": 0.08572380042917035, + "grad_norm": 0.12699727714061737, + "learning_rate": 2.948378233151488e-05, + "loss": 0.0378, + "step": 38870 + }, + { + "epoch": 0.08574585440406851, + "grad_norm": 0.13788023591041565, + "learning_rate": 2.948335219937083e-05, + "loss": 0.0389, + "step": 38880 + }, + { + "epoch": 0.08576790837896668, + "grad_norm": 0.1312612146139145, + "learning_rate": 2.9482921891240375e-05, + "loss": 0.0372, + "step": 38890 + }, + { + "epoch": 0.08578996235386485, + "grad_norm": 0.1566932201385498, + "learning_rate": 2.9482491407128744e-05, + "loss": 0.0402, + "step": 38900 + }, + { + "epoch": 0.08581201632876301, + "grad_norm": 0.1450723260641098, + "learning_rate": 2.948206074704117e-05, + "loss": 0.0388, + "step": 38910 + }, + { + "epoch": 0.08583407030366118, + "grad_norm": 0.17868490517139435, + "learning_rate": 2.9481629910982886e-05, + "loss": 0.0374, + "step": 38920 + }, + { + "epoch": 0.08585612427855935, + "grad_norm": 0.11471164971590042, + "learning_rate": 2.9481198898959122e-05, + "loss": 0.0367, + "step": 38930 + }, + { + "epoch": 0.08587817825345752, + "grad_norm": 0.13402031362056732, + "learning_rate": 2.9480767710975123e-05, + "loss": 0.0369, + "step": 38940 + }, + { + "epoch": 0.08590023222835567, + "grad_norm": 0.12761908769607544, + "learning_rate": 2.9480336347036122e-05, + "loss": 0.0349, + "step": 38950 + }, + { + "epoch": 0.08592228620325384, + "grad_norm": 0.10776767879724503, + "learning_rate": 2.947990480714736e-05, + "loss": 0.0374, + "step": 38960 + }, + { + "epoch": 0.08594434017815201, + "grad_norm": 0.09007249027490616, + "learning_rate": 2.947947309131409e-05, + "loss": 0.0383, + "step": 38970 + }, + { + "epoch": 0.08596639415305017, + "grad_norm": 0.14827676117420197, + "learning_rate": 2.9479041199541544e-05, + "loss": 0.0397, + "step": 38980 + }, + { + "epoch": 0.08598844812794834, + "grad_norm": 0.14749544858932495, + "learning_rate": 2.947860913183498e-05, + "loss": 0.0404, + "step": 38990 + }, + { + "epoch": 0.08601050210284651, + "grad_norm": 0.13573895394802094, + "learning_rate": 2.9478176888199646e-05, + "loss": 0.0412, + "step": 39000 + }, + { + "epoch": 0.08603255607774467, + "grad_norm": 0.12877345085144043, + "learning_rate": 2.9477744468640793e-05, + "loss": 0.0411, + "step": 39010 + }, + { + "epoch": 0.08605461005264284, + "grad_norm": 0.1168782040476799, + "learning_rate": 2.9477311873163673e-05, + "loss": 0.0373, + "step": 39020 + }, + { + "epoch": 0.08607666402754101, + "grad_norm": 0.10405486077070236, + "learning_rate": 2.9476879101773548e-05, + "loss": 0.0404, + "step": 39030 + }, + { + "epoch": 0.08609871800243916, + "grad_norm": 0.08296932280063629, + "learning_rate": 2.947644615447567e-05, + "loss": 0.0394, + "step": 39040 + }, + { + "epoch": 0.08612077197733733, + "grad_norm": 0.10324474424123764, + "learning_rate": 2.9476013031275305e-05, + "loss": 0.0363, + "step": 39050 + }, + { + "epoch": 0.0861428259522355, + "grad_norm": 0.12667441368103027, + "learning_rate": 2.9475579732177714e-05, + "loss": 0.0399, + "step": 39060 + }, + { + "epoch": 0.08616487992713366, + "grad_norm": 0.11353759467601776, + "learning_rate": 2.9475146257188162e-05, + "loss": 0.0395, + "step": 39070 + }, + { + "epoch": 0.08618693390203183, + "grad_norm": 0.14534927904605865, + "learning_rate": 2.9474712606311917e-05, + "loss": 0.0398, + "step": 39080 + }, + { + "epoch": 0.08620898787693, + "grad_norm": 0.14161381125450134, + "learning_rate": 2.9474278779554247e-05, + "loss": 0.0393, + "step": 39090 + }, + { + "epoch": 0.08623104185182816, + "grad_norm": 0.12052775919437408, + "learning_rate": 2.947384477692042e-05, + "loss": 0.0404, + "step": 39100 + }, + { + "epoch": 0.08625309582672633, + "grad_norm": 0.15191173553466797, + "learning_rate": 2.947341059841572e-05, + "loss": 0.0407, + "step": 39110 + }, + { + "epoch": 0.0862751498016245, + "grad_norm": 0.1425744593143463, + "learning_rate": 2.9472976244045417e-05, + "loss": 0.0381, + "step": 39120 + }, + { + "epoch": 0.08629720377652267, + "grad_norm": 0.14483210444450378, + "learning_rate": 2.947254171381478e-05, + "loss": 0.0401, + "step": 39130 + }, + { + "epoch": 0.08631925775142082, + "grad_norm": 0.09627696871757507, + "learning_rate": 2.9472107007729102e-05, + "loss": 0.0404, + "step": 39140 + }, + { + "epoch": 0.086341311726319, + "grad_norm": 0.12991058826446533, + "learning_rate": 2.947167212579366e-05, + "loss": 0.0379, + "step": 39150 + }, + { + "epoch": 0.08636336570121717, + "grad_norm": 0.128003790974617, + "learning_rate": 2.9471237068013735e-05, + "loss": 0.0397, + "step": 39160 + }, + { + "epoch": 0.08638541967611532, + "grad_norm": 0.11297795176506042, + "learning_rate": 2.947080183439462e-05, + "loss": 0.0367, + "step": 39170 + }, + { + "epoch": 0.08640747365101349, + "grad_norm": 0.11180391162633896, + "learning_rate": 2.9470366424941595e-05, + "loss": 0.0393, + "step": 39180 + }, + { + "epoch": 0.08642952762591166, + "grad_norm": 0.11769551038742065, + "learning_rate": 2.9469930839659962e-05, + "loss": 0.0382, + "step": 39190 + }, + { + "epoch": 0.08645158160080982, + "grad_norm": 0.10364988446235657, + "learning_rate": 2.9469495078555004e-05, + "loss": 0.0377, + "step": 39200 + }, + { + "epoch": 0.08647363557570799, + "grad_norm": 0.14348222315311432, + "learning_rate": 2.9469059141632017e-05, + "loss": 0.0397, + "step": 39210 + }, + { + "epoch": 0.08649568955060616, + "grad_norm": 0.12701226770877838, + "learning_rate": 2.9468623028896306e-05, + "loss": 0.0396, + "step": 39220 + }, + { + "epoch": 0.08651774352550431, + "grad_norm": 0.09173528105020523, + "learning_rate": 2.946818674035316e-05, + "loss": 0.0384, + "step": 39230 + }, + { + "epoch": 0.08653979750040248, + "grad_norm": 0.12179995328187943, + "learning_rate": 2.9467750276007887e-05, + "loss": 0.0374, + "step": 39240 + }, + { + "epoch": 0.08656185147530066, + "grad_norm": 0.10643289238214493, + "learning_rate": 2.946731363586579e-05, + "loss": 0.0389, + "step": 39250 + }, + { + "epoch": 0.08658390545019881, + "grad_norm": 0.1333206743001938, + "learning_rate": 2.946687681993217e-05, + "loss": 0.0366, + "step": 39260 + }, + { + "epoch": 0.08660595942509698, + "grad_norm": 0.11688581854104996, + "learning_rate": 2.9466439828212336e-05, + "loss": 0.0381, + "step": 39270 + }, + { + "epoch": 0.08662801339999515, + "grad_norm": 0.09014908224344254, + "learning_rate": 2.9466002660711605e-05, + "loss": 0.0377, + "step": 39280 + }, + { + "epoch": 0.08665006737489331, + "grad_norm": 0.10879282653331757, + "learning_rate": 2.946556531743528e-05, + "loss": 0.0396, + "step": 39290 + }, + { + "epoch": 0.08667212134979148, + "grad_norm": 0.11391344666481018, + "learning_rate": 2.946512779838868e-05, + "loss": 0.0392, + "step": 39300 + }, + { + "epoch": 0.08669417532468965, + "grad_norm": 0.11008788645267487, + "learning_rate": 2.946469010357712e-05, + "loss": 0.0391, + "step": 39310 + }, + { + "epoch": 0.0867162292995878, + "grad_norm": 0.1262618452310562, + "learning_rate": 2.9464252233005916e-05, + "loss": 0.0362, + "step": 39320 + }, + { + "epoch": 0.08673828327448597, + "grad_norm": 0.10088801383972168, + "learning_rate": 2.9463814186680392e-05, + "loss": 0.0372, + "step": 39330 + }, + { + "epoch": 0.08676033724938415, + "grad_norm": 0.16662131249904633, + "learning_rate": 2.9463375964605872e-05, + "loss": 0.0365, + "step": 39340 + }, + { + "epoch": 0.08678239122428232, + "grad_norm": 0.1693074256181717, + "learning_rate": 2.9462937566787672e-05, + "loss": 0.0404, + "step": 39350 + }, + { + "epoch": 0.08680444519918047, + "grad_norm": 0.10335110127925873, + "learning_rate": 2.946249899323113e-05, + "loss": 0.0396, + "step": 39360 + }, + { + "epoch": 0.08682649917407864, + "grad_norm": 0.12395534664392471, + "learning_rate": 2.9462060243941573e-05, + "loss": 0.039, + "step": 39370 + }, + { + "epoch": 0.08684855314897681, + "grad_norm": 0.11817148327827454, + "learning_rate": 2.9461621318924324e-05, + "loss": 0.0381, + "step": 39380 + }, + { + "epoch": 0.08687060712387497, + "grad_norm": 0.14405257999897003, + "learning_rate": 2.946118221818473e-05, + "loss": 0.0379, + "step": 39390 + }, + { + "epoch": 0.08689266109877314, + "grad_norm": 0.12640424072742462, + "learning_rate": 2.946074294172811e-05, + "loss": 0.0372, + "step": 39400 + }, + { + "epoch": 0.08691471507367131, + "grad_norm": 0.10869249701499939, + "learning_rate": 2.946030348955981e-05, + "loss": 0.0388, + "step": 39410 + }, + { + "epoch": 0.08693676904856946, + "grad_norm": 0.13177642226219177, + "learning_rate": 2.945986386168517e-05, + "loss": 0.0384, + "step": 39420 + }, + { + "epoch": 0.08695882302346764, + "grad_norm": 0.10335146635770798, + "learning_rate": 2.9459424058109537e-05, + "loss": 0.0402, + "step": 39430 + }, + { + "epoch": 0.0869808769983658, + "grad_norm": 0.11498601734638214, + "learning_rate": 2.945898407883825e-05, + "loss": 0.038, + "step": 39440 + }, + { + "epoch": 0.08700293097326396, + "grad_norm": 0.11618831753730774, + "learning_rate": 2.945854392387665e-05, + "loss": 0.04, + "step": 39450 + }, + { + "epoch": 0.08702498494816213, + "grad_norm": 0.13619637489318848, + "learning_rate": 2.9458103593230096e-05, + "loss": 0.0358, + "step": 39460 + }, + { + "epoch": 0.0870470389230603, + "grad_norm": 0.1380367875099182, + "learning_rate": 2.9457663086903927e-05, + "loss": 0.0395, + "step": 39470 + }, + { + "epoch": 0.08706909289795846, + "grad_norm": 0.13169685006141663, + "learning_rate": 2.94572224049035e-05, + "loss": 0.0385, + "step": 39480 + }, + { + "epoch": 0.08709114687285663, + "grad_norm": 0.11361347883939743, + "learning_rate": 2.9456781547234177e-05, + "loss": 0.0406, + "step": 39490 + }, + { + "epoch": 0.0871132008477548, + "grad_norm": 0.18444694578647614, + "learning_rate": 2.9456340513901303e-05, + "loss": 0.0398, + "step": 39500 + }, + { + "epoch": 0.08713525482265295, + "grad_norm": 0.12471342086791992, + "learning_rate": 2.945589930491025e-05, + "loss": 0.0365, + "step": 39510 + }, + { + "epoch": 0.08715730879755113, + "grad_norm": 0.12100452184677124, + "learning_rate": 2.9455457920266364e-05, + "loss": 0.0382, + "step": 39520 + }, + { + "epoch": 0.0871793627724493, + "grad_norm": 0.10710279643535614, + "learning_rate": 2.945501635997502e-05, + "loss": 0.0396, + "step": 39530 + }, + { + "epoch": 0.08720141674734745, + "grad_norm": 0.13543500006198883, + "learning_rate": 2.9454574624041576e-05, + "loss": 0.037, + "step": 39540 + }, + { + "epoch": 0.08722347072224562, + "grad_norm": 0.09813883900642395, + "learning_rate": 2.9454132712471407e-05, + "loss": 0.0389, + "step": 39550 + }, + { + "epoch": 0.08724552469714379, + "grad_norm": 0.138837993144989, + "learning_rate": 2.9453690625269877e-05, + "loss": 0.0353, + "step": 39560 + }, + { + "epoch": 0.08726757867204196, + "grad_norm": 0.1379396766424179, + "learning_rate": 2.945324836244236e-05, + "loss": 0.0367, + "step": 39570 + }, + { + "epoch": 0.08728963264694012, + "grad_norm": 0.1390370875597, + "learning_rate": 2.945280592399423e-05, + "loss": 0.0378, + "step": 39580 + }, + { + "epoch": 0.08731168662183829, + "grad_norm": 0.09801390767097473, + "learning_rate": 2.945236330993086e-05, + "loss": 0.0395, + "step": 39590 + }, + { + "epoch": 0.08733374059673646, + "grad_norm": 0.11304931342601776, + "learning_rate": 2.945192052025763e-05, + "loss": 0.0401, + "step": 39600 + }, + { + "epoch": 0.08735579457163462, + "grad_norm": 0.10326701402664185, + "learning_rate": 2.9451477554979922e-05, + "loss": 0.0369, + "step": 39610 + }, + { + "epoch": 0.08737784854653279, + "grad_norm": 0.11761586368083954, + "learning_rate": 2.9451034414103118e-05, + "loss": 0.0379, + "step": 39620 + }, + { + "epoch": 0.08739990252143096, + "grad_norm": 0.09622594714164734, + "learning_rate": 2.9450591097632603e-05, + "loss": 0.039, + "step": 39630 + }, + { + "epoch": 0.08742195649632911, + "grad_norm": 0.11474766582250595, + "learning_rate": 2.9450147605573757e-05, + "loss": 0.039, + "step": 39640 + }, + { + "epoch": 0.08744401047122728, + "grad_norm": 0.11851464211940765, + "learning_rate": 2.944970393793198e-05, + "loss": 0.0387, + "step": 39650 + }, + { + "epoch": 0.08746606444612545, + "grad_norm": 0.17261040210723877, + "learning_rate": 2.9449260094712656e-05, + "loss": 0.0402, + "step": 39660 + }, + { + "epoch": 0.08748811842102361, + "grad_norm": 0.13052567839622498, + "learning_rate": 2.944881607592118e-05, + "loss": 0.038, + "step": 39670 + }, + { + "epoch": 0.08751017239592178, + "grad_norm": 0.1902877241373062, + "learning_rate": 2.9448371881562945e-05, + "loss": 0.041, + "step": 39680 + }, + { + "epoch": 0.08753222637081995, + "grad_norm": 0.1087626963853836, + "learning_rate": 2.944792751164335e-05, + "loss": 0.0407, + "step": 39690 + }, + { + "epoch": 0.0875542803457181, + "grad_norm": 0.12123168259859085, + "learning_rate": 2.94474829661678e-05, + "loss": 0.0384, + "step": 39700 + }, + { + "epoch": 0.08757633432061628, + "grad_norm": 0.11111775040626526, + "learning_rate": 2.9447038245141684e-05, + "loss": 0.0375, + "step": 39710 + }, + { + "epoch": 0.08759838829551445, + "grad_norm": 0.12381768971681595, + "learning_rate": 2.944659334857042e-05, + "loss": 0.0381, + "step": 39720 + }, + { + "epoch": 0.0876204422704126, + "grad_norm": 0.14963825047016144, + "learning_rate": 2.9446148276459398e-05, + "loss": 0.0379, + "step": 39730 + }, + { + "epoch": 0.08764249624531077, + "grad_norm": 0.13896751403808594, + "learning_rate": 2.944570302881404e-05, + "loss": 0.0385, + "step": 39740 + }, + { + "epoch": 0.08766455022020894, + "grad_norm": 0.1498422473669052, + "learning_rate": 2.9445257605639755e-05, + "loss": 0.0397, + "step": 39750 + }, + { + "epoch": 0.0876866041951071, + "grad_norm": 0.12836381793022156, + "learning_rate": 2.944481200694195e-05, + "loss": 0.0376, + "step": 39760 + }, + { + "epoch": 0.08770865817000527, + "grad_norm": 0.13228678703308105, + "learning_rate": 2.9444366232726035e-05, + "loss": 0.0366, + "step": 39770 + }, + { + "epoch": 0.08773071214490344, + "grad_norm": 0.12962301075458527, + "learning_rate": 2.9443920282997437e-05, + "loss": 0.0394, + "step": 39780 + }, + { + "epoch": 0.08775276611980161, + "grad_norm": 0.09195326268672943, + "learning_rate": 2.944347415776157e-05, + "loss": 0.0395, + "step": 39790 + }, + { + "epoch": 0.08777482009469977, + "grad_norm": 0.12461750209331512, + "learning_rate": 2.9443027857023853e-05, + "loss": 0.0403, + "step": 39800 + }, + { + "epoch": 0.08779687406959794, + "grad_norm": 0.14281246066093445, + "learning_rate": 2.9442581380789712e-05, + "loss": 0.0405, + "step": 39810 + }, + { + "epoch": 0.0878189280444961, + "grad_norm": 0.10175448656082153, + "learning_rate": 2.9442134729064575e-05, + "loss": 0.038, + "step": 39820 + }, + { + "epoch": 0.08784098201939426, + "grad_norm": 0.14151513576507568, + "learning_rate": 2.9441687901853862e-05, + "loss": 0.0364, + "step": 39830 + }, + { + "epoch": 0.08786303599429243, + "grad_norm": 0.14561831951141357, + "learning_rate": 2.9441240899163005e-05, + "loss": 0.0387, + "step": 39840 + }, + { + "epoch": 0.0878850899691906, + "grad_norm": 0.10444963723421097, + "learning_rate": 2.9440793720997435e-05, + "loss": 0.0375, + "step": 39850 + }, + { + "epoch": 0.08790714394408876, + "grad_norm": 0.1423826515674591, + "learning_rate": 2.944034636736259e-05, + "loss": 0.0384, + "step": 39860 + }, + { + "epoch": 0.08792919791898693, + "grad_norm": 0.1333315074443817, + "learning_rate": 2.94398988382639e-05, + "loss": 0.04, + "step": 39870 + }, + { + "epoch": 0.0879512518938851, + "grad_norm": 0.10007838159799576, + "learning_rate": 2.943945113370681e-05, + "loss": 0.0388, + "step": 39880 + }, + { + "epoch": 0.08797330586878326, + "grad_norm": 0.1466890424489975, + "learning_rate": 2.9439003253696754e-05, + "loss": 0.0398, + "step": 39890 + }, + { + "epoch": 0.08799535984368143, + "grad_norm": 0.13903522491455078, + "learning_rate": 2.943855519823917e-05, + "loss": 0.0376, + "step": 39900 + }, + { + "epoch": 0.0880174138185796, + "grad_norm": 0.1349353790283203, + "learning_rate": 2.9438106967339515e-05, + "loss": 0.039, + "step": 39910 + }, + { + "epoch": 0.08803946779347775, + "grad_norm": 0.14193613827228546, + "learning_rate": 2.9437658561003226e-05, + "loss": 0.0402, + "step": 39920 + }, + { + "epoch": 0.08806152176837592, + "grad_norm": 0.1385023593902588, + "learning_rate": 2.9437209979235754e-05, + "loss": 0.0406, + "step": 39930 + }, + { + "epoch": 0.08808357574327409, + "grad_norm": 0.10578408092260361, + "learning_rate": 2.9436761222042554e-05, + "loss": 0.0392, + "step": 39940 + }, + { + "epoch": 0.08810562971817225, + "grad_norm": 0.1276647001504898, + "learning_rate": 2.9436312289429072e-05, + "loss": 0.0404, + "step": 39950 + }, + { + "epoch": 0.08812768369307042, + "grad_norm": 0.09450389444828033, + "learning_rate": 2.943586318140076e-05, + "loss": 0.0385, + "step": 39960 + }, + { + "epoch": 0.08814973766796859, + "grad_norm": 0.08902417123317719, + "learning_rate": 2.9435413897963085e-05, + "loss": 0.0401, + "step": 39970 + }, + { + "epoch": 0.08817179164286675, + "grad_norm": 0.0837063416838646, + "learning_rate": 2.9434964439121504e-05, + "loss": 0.0351, + "step": 39980 + }, + { + "epoch": 0.08819384561776492, + "grad_norm": 0.09029874205589294, + "learning_rate": 2.9434514804881473e-05, + "loss": 0.0375, + "step": 39990 + }, + { + "epoch": 0.08821589959266309, + "grad_norm": 0.16520173847675323, + "learning_rate": 2.943406499524846e-05, + "loss": 0.0387, + "step": 40000 + }, + { + "epoch": 0.08823795356756126, + "grad_norm": 0.10192395746707916, + "learning_rate": 2.943361501022793e-05, + "loss": 0.0384, + "step": 40010 + }, + { + "epoch": 0.08826000754245941, + "grad_norm": 0.10407900810241699, + "learning_rate": 2.943316484982535e-05, + "loss": 0.0382, + "step": 40020 + }, + { + "epoch": 0.08828206151735758, + "grad_norm": 0.11978653818368912, + "learning_rate": 2.9432714514046188e-05, + "loss": 0.0394, + "step": 40030 + }, + { + "epoch": 0.08830411549225575, + "grad_norm": 0.11635114252567291, + "learning_rate": 2.9432264002895922e-05, + "loss": 0.0393, + "step": 40040 + }, + { + "epoch": 0.08832616946715391, + "grad_norm": 0.12672129273414612, + "learning_rate": 2.9431813316380017e-05, + "loss": 0.037, + "step": 40050 + }, + { + "epoch": 0.08834822344205208, + "grad_norm": 0.13344413042068481, + "learning_rate": 2.9431362454503958e-05, + "loss": 0.0384, + "step": 40060 + }, + { + "epoch": 0.08837027741695025, + "grad_norm": 0.10693079233169556, + "learning_rate": 2.9430911417273217e-05, + "loss": 0.0372, + "step": 40070 + }, + { + "epoch": 0.0883923313918484, + "grad_norm": 0.09433147311210632, + "learning_rate": 2.9430460204693277e-05, + "loss": 0.0369, + "step": 40080 + }, + { + "epoch": 0.08841438536674658, + "grad_norm": 0.12508611381053925, + "learning_rate": 2.9430008816769625e-05, + "loss": 0.038, + "step": 40090 + }, + { + "epoch": 0.08843643934164475, + "grad_norm": 0.1449345350265503, + "learning_rate": 2.9429557253507735e-05, + "loss": 0.0392, + "step": 40100 + }, + { + "epoch": 0.0884584933165429, + "grad_norm": 0.1265994906425476, + "learning_rate": 2.9429105514913104e-05, + "loss": 0.0377, + "step": 40110 + }, + { + "epoch": 0.08848054729144107, + "grad_norm": 0.1246766746044159, + "learning_rate": 2.942865360099122e-05, + "loss": 0.039, + "step": 40120 + }, + { + "epoch": 0.08850260126633924, + "grad_norm": 0.1270890086889267, + "learning_rate": 2.9428201511747572e-05, + "loss": 0.0384, + "step": 40130 + }, + { + "epoch": 0.0885246552412374, + "grad_norm": 0.17577466368675232, + "learning_rate": 2.942774924718765e-05, + "loss": 0.0377, + "step": 40140 + }, + { + "epoch": 0.08854670921613557, + "grad_norm": 0.11983119696378708, + "learning_rate": 2.9427296807316956e-05, + "loss": 0.0366, + "step": 40150 + }, + { + "epoch": 0.08856876319103374, + "grad_norm": 0.1280146837234497, + "learning_rate": 2.942684419214098e-05, + "loss": 0.0381, + "step": 40160 + }, + { + "epoch": 0.0885908171659319, + "grad_norm": 0.13570359349250793, + "learning_rate": 2.9426391401665228e-05, + "loss": 0.0384, + "step": 40170 + }, + { + "epoch": 0.08861287114083007, + "grad_norm": 0.09924135357141495, + "learning_rate": 2.9425938435895196e-05, + "loss": 0.038, + "step": 40180 + }, + { + "epoch": 0.08863492511572824, + "grad_norm": 0.11044463515281677, + "learning_rate": 2.9425485294836398e-05, + "loss": 0.0412, + "step": 40190 + }, + { + "epoch": 0.08865697909062639, + "grad_norm": 0.12254835665225983, + "learning_rate": 2.942503197849433e-05, + "loss": 0.037, + "step": 40200 + }, + { + "epoch": 0.08867903306552456, + "grad_norm": 0.10407284647226334, + "learning_rate": 2.9424578486874507e-05, + "loss": 0.0391, + "step": 40210 + }, + { + "epoch": 0.08870108704042273, + "grad_norm": 0.1202598586678505, + "learning_rate": 2.9424124819982436e-05, + "loss": 0.0408, + "step": 40220 + }, + { + "epoch": 0.0887231410153209, + "grad_norm": 0.13378801941871643, + "learning_rate": 2.9423670977823625e-05, + "loss": 0.039, + "step": 40230 + }, + { + "epoch": 0.08874519499021906, + "grad_norm": 0.11080794781446457, + "learning_rate": 2.9423216960403595e-05, + "loss": 0.0376, + "step": 40240 + }, + { + "epoch": 0.08876724896511723, + "grad_norm": 0.12496542185544968, + "learning_rate": 2.9422762767727863e-05, + "loss": 0.0378, + "step": 40250 + }, + { + "epoch": 0.0887893029400154, + "grad_norm": 0.12851585447788239, + "learning_rate": 2.9422308399801946e-05, + "loss": 0.0369, + "step": 40260 + }, + { + "epoch": 0.08881135691491356, + "grad_norm": 0.144851952791214, + "learning_rate": 2.9421853856631364e-05, + "loss": 0.0379, + "step": 40270 + }, + { + "epoch": 0.08883341088981173, + "grad_norm": 0.10406078398227692, + "learning_rate": 2.9421399138221645e-05, + "loss": 0.0357, + "step": 40280 + }, + { + "epoch": 0.0888554648647099, + "grad_norm": 0.09640190005302429, + "learning_rate": 2.9420944244578306e-05, + "loss": 0.0384, + "step": 40290 + }, + { + "epoch": 0.08887751883960805, + "grad_norm": 0.15305683016777039, + "learning_rate": 2.942048917570688e-05, + "loss": 0.0376, + "step": 40300 + }, + { + "epoch": 0.08889957281450622, + "grad_norm": 0.10075454413890839, + "learning_rate": 2.9420033931612893e-05, + "loss": 0.0362, + "step": 40310 + }, + { + "epoch": 0.08892162678940439, + "grad_norm": 0.1188352033495903, + "learning_rate": 2.9419578512301883e-05, + "loss": 0.0383, + "step": 40320 + }, + { + "epoch": 0.08894368076430255, + "grad_norm": 0.11122369021177292, + "learning_rate": 2.941912291777938e-05, + "loss": 0.0387, + "step": 40330 + }, + { + "epoch": 0.08896573473920072, + "grad_norm": 0.11638285964727402, + "learning_rate": 2.9418667148050913e-05, + "loss": 0.0384, + "step": 40340 + }, + { + "epoch": 0.08898778871409889, + "grad_norm": 0.12494396418333054, + "learning_rate": 2.9418211203122034e-05, + "loss": 0.0394, + "step": 40350 + }, + { + "epoch": 0.08900984268899705, + "grad_norm": 0.13490258157253265, + "learning_rate": 2.9417755082998273e-05, + "loss": 0.0382, + "step": 40360 + }, + { + "epoch": 0.08903189666389522, + "grad_norm": 0.11373693495988846, + "learning_rate": 2.9417298787685175e-05, + "loss": 0.0374, + "step": 40370 + }, + { + "epoch": 0.08905395063879339, + "grad_norm": 0.11152423918247223, + "learning_rate": 2.9416842317188283e-05, + "loss": 0.0373, + "step": 40380 + }, + { + "epoch": 0.08907600461369154, + "grad_norm": 0.12016534060239792, + "learning_rate": 2.9416385671513152e-05, + "loss": 0.0369, + "step": 40390 + }, + { + "epoch": 0.08909805858858971, + "grad_norm": 0.09738421440124512, + "learning_rate": 2.9415928850665313e-05, + "loss": 0.0378, + "step": 40400 + }, + { + "epoch": 0.08912011256348788, + "grad_norm": 0.10488946735858917, + "learning_rate": 2.941547185465034e-05, + "loss": 0.0399, + "step": 40410 + }, + { + "epoch": 0.08914216653838605, + "grad_norm": 0.15275220572948456, + "learning_rate": 2.9415014683473762e-05, + "loss": 0.0404, + "step": 40420 + }, + { + "epoch": 0.08916422051328421, + "grad_norm": 0.11198516190052032, + "learning_rate": 2.941455733714115e-05, + "loss": 0.0385, + "step": 40430 + }, + { + "epoch": 0.08918627448818238, + "grad_norm": 0.14942756295204163, + "learning_rate": 2.9414099815658058e-05, + "loss": 0.0371, + "step": 40440 + }, + { + "epoch": 0.08920832846308055, + "grad_norm": 0.1307697296142578, + "learning_rate": 2.9413642119030044e-05, + "loss": 0.0402, + "step": 40450 + }, + { + "epoch": 0.0892303824379787, + "grad_norm": 0.15290993452072144, + "learning_rate": 2.9413184247262664e-05, + "loss": 0.0375, + "step": 40460 + }, + { + "epoch": 0.08925243641287688, + "grad_norm": 0.14017052948474884, + "learning_rate": 2.9412726200361498e-05, + "loss": 0.0381, + "step": 40470 + }, + { + "epoch": 0.08927449038777505, + "grad_norm": 0.1194700226187706, + "learning_rate": 2.941226797833209e-05, + "loss": 0.0361, + "step": 40480 + }, + { + "epoch": 0.0892965443626732, + "grad_norm": 0.10496971011161804, + "learning_rate": 2.941180958118003e-05, + "loss": 0.037, + "step": 40490 + }, + { + "epoch": 0.08931859833757137, + "grad_norm": 0.09110964089632034, + "learning_rate": 2.941135100891086e-05, + "loss": 0.0393, + "step": 40500 + }, + { + "epoch": 0.08934065231246954, + "grad_norm": 0.11545321345329285, + "learning_rate": 2.9410892261530182e-05, + "loss": 0.0365, + "step": 40510 + }, + { + "epoch": 0.0893627062873677, + "grad_norm": 0.12887094914913177, + "learning_rate": 2.9410433339043554e-05, + "loss": 0.0379, + "step": 40520 + }, + { + "epoch": 0.08938476026226587, + "grad_norm": 0.11644949018955231, + "learning_rate": 2.9409974241456554e-05, + "loss": 0.0384, + "step": 40530 + }, + { + "epoch": 0.08940681423716404, + "grad_norm": 0.122771255671978, + "learning_rate": 2.9409514968774762e-05, + "loss": 0.0384, + "step": 40540 + }, + { + "epoch": 0.0894288682120622, + "grad_norm": 0.12295845150947571, + "learning_rate": 2.9409055521003762e-05, + "loss": 0.0378, + "step": 40550 + }, + { + "epoch": 0.08945092218696037, + "grad_norm": 0.12061280012130737, + "learning_rate": 2.9408595898149127e-05, + "loss": 0.0389, + "step": 40560 + }, + { + "epoch": 0.08947297616185854, + "grad_norm": 0.12464261054992676, + "learning_rate": 2.9408136100216448e-05, + "loss": 0.0404, + "step": 40570 + }, + { + "epoch": 0.08949503013675669, + "grad_norm": 0.13193842768669128, + "learning_rate": 2.940767612721132e-05, + "loss": 0.0384, + "step": 40580 + }, + { + "epoch": 0.08951708411165486, + "grad_norm": 0.159087136387825, + "learning_rate": 2.9407215979139316e-05, + "loss": 0.0378, + "step": 40590 + }, + { + "epoch": 0.08953913808655303, + "grad_norm": 0.15363478660583496, + "learning_rate": 2.9406755656006036e-05, + "loss": 0.0369, + "step": 40600 + }, + { + "epoch": 0.08956119206145119, + "grad_norm": 0.16228686273097992, + "learning_rate": 2.9406295157817074e-05, + "loss": 0.0377, + "step": 40610 + }, + { + "epoch": 0.08958324603634936, + "grad_norm": 0.14005927741527557, + "learning_rate": 2.9405834484578024e-05, + "loss": 0.0383, + "step": 40620 + }, + { + "epoch": 0.08960530001124753, + "grad_norm": 0.0976240411400795, + "learning_rate": 2.9405373636294485e-05, + "loss": 0.039, + "step": 40630 + }, + { + "epoch": 0.0896273539861457, + "grad_norm": 0.11220163106918335, + "learning_rate": 2.940491261297205e-05, + "loss": 0.0368, + "step": 40640 + }, + { + "epoch": 0.08964940796104386, + "grad_norm": 0.13823576271533966, + "learning_rate": 2.9404451414616335e-05, + "loss": 0.0378, + "step": 40650 + }, + { + "epoch": 0.08967146193594203, + "grad_norm": 0.15163186192512512, + "learning_rate": 2.9403990041232928e-05, + "loss": 0.0375, + "step": 40660 + }, + { + "epoch": 0.0896935159108402, + "grad_norm": 0.10072950273752213, + "learning_rate": 2.940352849282745e-05, + "loss": 0.04, + "step": 40670 + }, + { + "epoch": 0.08971556988573835, + "grad_norm": 0.14352324604988098, + "learning_rate": 2.940306676940549e-05, + "loss": 0.0364, + "step": 40680 + }, + { + "epoch": 0.08973762386063652, + "grad_norm": 0.1967497169971466, + "learning_rate": 2.940260487097268e-05, + "loss": 0.0369, + "step": 40690 + }, + { + "epoch": 0.08975967783553469, + "grad_norm": 0.1747279018163681, + "learning_rate": 2.9402142797534618e-05, + "loss": 0.0393, + "step": 40700 + }, + { + "epoch": 0.08978173181043285, + "grad_norm": 0.14516125619411469, + "learning_rate": 2.9401680549096923e-05, + "loss": 0.0377, + "step": 40710 + }, + { + "epoch": 0.08980378578533102, + "grad_norm": 0.10412894934415817, + "learning_rate": 2.9401218125665217e-05, + "loss": 0.0379, + "step": 40720 + }, + { + "epoch": 0.08982583976022919, + "grad_norm": 0.12414682656526566, + "learning_rate": 2.9400755527245106e-05, + "loss": 0.0383, + "step": 40730 + }, + { + "epoch": 0.08984789373512735, + "grad_norm": 0.12463421374559402, + "learning_rate": 2.9400292753842225e-05, + "loss": 0.0374, + "step": 40740 + }, + { + "epoch": 0.08986994771002552, + "grad_norm": 0.12717726826667786, + "learning_rate": 2.9399829805462192e-05, + "loss": 0.039, + "step": 40750 + }, + { + "epoch": 0.08989200168492369, + "grad_norm": 0.09821084141731262, + "learning_rate": 2.9399366682110624e-05, + "loss": 0.0374, + "step": 40760 + }, + { + "epoch": 0.08991405565982184, + "grad_norm": 0.1065891906619072, + "learning_rate": 2.9398903383793158e-05, + "loss": 0.0365, + "step": 40770 + }, + { + "epoch": 0.08993610963472001, + "grad_norm": 0.1267148107290268, + "learning_rate": 2.9398439910515424e-05, + "loss": 0.0368, + "step": 40780 + }, + { + "epoch": 0.08995816360961818, + "grad_norm": 0.13047295808792114, + "learning_rate": 2.9397976262283047e-05, + "loss": 0.0373, + "step": 40790 + }, + { + "epoch": 0.08998021758451634, + "grad_norm": 0.12407954782247543, + "learning_rate": 2.9397512439101667e-05, + "loss": 0.0381, + "step": 40800 + }, + { + "epoch": 0.09000227155941451, + "grad_norm": 0.10350023210048676, + "learning_rate": 2.9397048440976917e-05, + "loss": 0.0359, + "step": 40810 + }, + { + "epoch": 0.09002432553431268, + "grad_norm": 0.1249629333615303, + "learning_rate": 2.9396584267914436e-05, + "loss": 0.0358, + "step": 40820 + }, + { + "epoch": 0.09004637950921084, + "grad_norm": 0.10562117397785187, + "learning_rate": 2.939611991991986e-05, + "loss": 0.0383, + "step": 40830 + }, + { + "epoch": 0.090068433484109, + "grad_norm": 0.13137872517108917, + "learning_rate": 2.939565539699884e-05, + "loss": 0.0377, + "step": 40840 + }, + { + "epoch": 0.09009048745900718, + "grad_norm": 0.11197701841592789, + "learning_rate": 2.9395190699157012e-05, + "loss": 0.0387, + "step": 40850 + }, + { + "epoch": 0.09011254143390535, + "grad_norm": 0.1401831954717636, + "learning_rate": 2.9394725826400028e-05, + "loss": 0.04, + "step": 40860 + }, + { + "epoch": 0.0901345954088035, + "grad_norm": 0.09524897485971451, + "learning_rate": 2.9394260778733535e-05, + "loss": 0.038, + "step": 40870 + }, + { + "epoch": 0.09015664938370167, + "grad_norm": 0.19407452642917633, + "learning_rate": 2.9393795556163178e-05, + "loss": 0.0394, + "step": 40880 + }, + { + "epoch": 0.09017870335859984, + "grad_norm": 0.1387416273355484, + "learning_rate": 2.939333015869462e-05, + "loss": 0.0389, + "step": 40890 + }, + { + "epoch": 0.090200757333498, + "grad_norm": 0.1491708904504776, + "learning_rate": 2.939286458633351e-05, + "loss": 0.039, + "step": 40900 + }, + { + "epoch": 0.09022281130839617, + "grad_norm": 0.0976652055978775, + "learning_rate": 2.9392398839085503e-05, + "loss": 0.0389, + "step": 40910 + }, + { + "epoch": 0.09024486528329434, + "grad_norm": 0.19492535293102264, + "learning_rate": 2.9391932916956262e-05, + "loss": 0.0399, + "step": 40920 + }, + { + "epoch": 0.0902669192581925, + "grad_norm": 0.136221244931221, + "learning_rate": 2.939146681995145e-05, + "loss": 0.0384, + "step": 40930 + }, + { + "epoch": 0.09028897323309067, + "grad_norm": 0.12411284446716309, + "learning_rate": 2.9391000548076728e-05, + "loss": 0.038, + "step": 40940 + }, + { + "epoch": 0.09031102720798884, + "grad_norm": 0.14701814949512482, + "learning_rate": 2.939053410133776e-05, + "loss": 0.0373, + "step": 40950 + }, + { + "epoch": 0.09033308118288699, + "grad_norm": 0.12007645517587662, + "learning_rate": 2.9390067479740218e-05, + "loss": 0.0369, + "step": 40960 + }, + { + "epoch": 0.09035513515778516, + "grad_norm": 0.13764631748199463, + "learning_rate": 2.938960068328977e-05, + "loss": 0.0387, + "step": 40970 + }, + { + "epoch": 0.09037718913268333, + "grad_norm": 0.10686460137367249, + "learning_rate": 2.9389133711992087e-05, + "loss": 0.0374, + "step": 40980 + }, + { + "epoch": 0.09039924310758149, + "grad_norm": 0.16225028038024902, + "learning_rate": 2.938866656585284e-05, + "loss": 0.0379, + "step": 40990 + }, + { + "epoch": 0.09042129708247966, + "grad_norm": 0.11017562448978424, + "learning_rate": 2.9388199244877715e-05, + "loss": 0.0412, + "step": 41000 + }, + { + "epoch": 0.09044335105737783, + "grad_norm": 0.1271398812532425, + "learning_rate": 2.9387731749072385e-05, + "loss": 0.0389, + "step": 41010 + }, + { + "epoch": 0.09046540503227599, + "grad_norm": 0.10555562376976013, + "learning_rate": 2.9387264078442526e-05, + "loss": 0.0385, + "step": 41020 + }, + { + "epoch": 0.09048745900717416, + "grad_norm": 0.0935279056429863, + "learning_rate": 2.9386796232993827e-05, + "loss": 0.0394, + "step": 41030 + }, + { + "epoch": 0.09050951298207233, + "grad_norm": 0.140291228890419, + "learning_rate": 2.938632821273197e-05, + "loss": 0.0373, + "step": 41040 + }, + { + "epoch": 0.09053156695697048, + "grad_norm": 0.1161622405052185, + "learning_rate": 2.9385860017662636e-05, + "loss": 0.039, + "step": 41050 + }, + { + "epoch": 0.09055362093186865, + "grad_norm": 0.13167884945869446, + "learning_rate": 2.9385391647791527e-05, + "loss": 0.0372, + "step": 41060 + }, + { + "epoch": 0.09057567490676682, + "grad_norm": 0.1282077431678772, + "learning_rate": 2.9384923103124327e-05, + "loss": 0.0372, + "step": 41070 + }, + { + "epoch": 0.090597728881665, + "grad_norm": 0.12324155122041702, + "learning_rate": 2.9384454383666728e-05, + "loss": 0.0372, + "step": 41080 + }, + { + "epoch": 0.09061978285656315, + "grad_norm": 0.091759592294693, + "learning_rate": 2.9383985489424427e-05, + "loss": 0.0383, + "step": 41090 + }, + { + "epoch": 0.09064183683146132, + "grad_norm": 0.11044289171695709, + "learning_rate": 2.9383516420403124e-05, + "loss": 0.0392, + "step": 41100 + }, + { + "epoch": 0.09066389080635949, + "grad_norm": 0.12048675864934921, + "learning_rate": 2.9383047176608516e-05, + "loss": 0.0377, + "step": 41110 + }, + { + "epoch": 0.09068594478125765, + "grad_norm": 0.15023894608020782, + "learning_rate": 2.93825777580463e-05, + "loss": 0.0364, + "step": 41120 + }, + { + "epoch": 0.09070799875615582, + "grad_norm": 0.12304969877004623, + "learning_rate": 2.938210816472219e-05, + "loss": 0.0383, + "step": 41130 + }, + { + "epoch": 0.09073005273105399, + "grad_norm": 0.1639946848154068, + "learning_rate": 2.9381638396641884e-05, + "loss": 0.0358, + "step": 41140 + }, + { + "epoch": 0.09075210670595214, + "grad_norm": 0.16287849843502045, + "learning_rate": 2.9381168453811097e-05, + "loss": 0.0395, + "step": 41150 + }, + { + "epoch": 0.09077416068085031, + "grad_norm": 0.11737319827079773, + "learning_rate": 2.9380698336235528e-05, + "loss": 0.0376, + "step": 41160 + }, + { + "epoch": 0.09079621465574848, + "grad_norm": 0.11484896391630173, + "learning_rate": 2.93802280439209e-05, + "loss": 0.0389, + "step": 41170 + }, + { + "epoch": 0.09081826863064664, + "grad_norm": 0.11637698113918304, + "learning_rate": 2.937975757687293e-05, + "loss": 0.0376, + "step": 41180 + }, + { + "epoch": 0.09084032260554481, + "grad_norm": 0.10104215890169144, + "learning_rate": 2.9379286935097323e-05, + "loss": 0.0395, + "step": 41190 + }, + { + "epoch": 0.09086237658044298, + "grad_norm": 0.1157793179154396, + "learning_rate": 2.9378816118599803e-05, + "loss": 0.0348, + "step": 41200 + }, + { + "epoch": 0.09088443055534114, + "grad_norm": 0.10445888340473175, + "learning_rate": 2.9378345127386094e-05, + "loss": 0.0386, + "step": 41210 + }, + { + "epoch": 0.0909064845302393, + "grad_norm": 0.16823945939540863, + "learning_rate": 2.9377873961461913e-05, + "loss": 0.0369, + "step": 41220 + }, + { + "epoch": 0.09092853850513748, + "grad_norm": 0.10556907951831818, + "learning_rate": 2.9377402620832988e-05, + "loss": 0.0377, + "step": 41230 + }, + { + "epoch": 0.09095059248003563, + "grad_norm": 0.09943126142024994, + "learning_rate": 2.937693110550505e-05, + "loss": 0.0382, + "step": 41240 + }, + { + "epoch": 0.0909726464549338, + "grad_norm": 0.15233294665813446, + "learning_rate": 2.9376459415483827e-05, + "loss": 0.0395, + "step": 41250 + }, + { + "epoch": 0.09099470042983197, + "grad_norm": 0.0978567972779274, + "learning_rate": 2.937598755077504e-05, + "loss": 0.037, + "step": 41260 + }, + { + "epoch": 0.09101675440473013, + "grad_norm": 0.0964326411485672, + "learning_rate": 2.9375515511384437e-05, + "loss": 0.0399, + "step": 41270 + }, + { + "epoch": 0.0910388083796283, + "grad_norm": 0.13486342132091522, + "learning_rate": 2.9375043297317746e-05, + "loss": 0.0367, + "step": 41280 + }, + { + "epoch": 0.09106086235452647, + "grad_norm": 0.12274035066366196, + "learning_rate": 2.937457090858071e-05, + "loss": 0.0373, + "step": 41290 + }, + { + "epoch": 0.09108291632942464, + "grad_norm": 0.09950529783964157, + "learning_rate": 2.9374098345179064e-05, + "loss": 0.0385, + "step": 41300 + }, + { + "epoch": 0.0911049703043228, + "grad_norm": 0.11343571543693542, + "learning_rate": 2.9373625607118554e-05, + "loss": 0.0395, + "step": 41310 + }, + { + "epoch": 0.09112702427922097, + "grad_norm": 0.09851599484682083, + "learning_rate": 2.9373152694404916e-05, + "loss": 0.037, + "step": 41320 + }, + { + "epoch": 0.09114907825411914, + "grad_norm": 0.12102072685956955, + "learning_rate": 2.9372679607043905e-05, + "loss": 0.0382, + "step": 41330 + }, + { + "epoch": 0.0911711322290173, + "grad_norm": 0.0977930799126625, + "learning_rate": 2.9372206345041266e-05, + "loss": 0.0393, + "step": 41340 + }, + { + "epoch": 0.09119318620391546, + "grad_norm": 0.13116678595542908, + "learning_rate": 2.937173290840275e-05, + "loss": 0.0394, + "step": 41350 + }, + { + "epoch": 0.09121524017881363, + "grad_norm": 0.1581413894891739, + "learning_rate": 2.9371259297134115e-05, + "loss": 0.0373, + "step": 41360 + }, + { + "epoch": 0.09123729415371179, + "grad_norm": 0.12742295861244202, + "learning_rate": 2.9370785511241107e-05, + "loss": 0.036, + "step": 41370 + }, + { + "epoch": 0.09125934812860996, + "grad_norm": 0.10673264414072037, + "learning_rate": 2.9370311550729485e-05, + "loss": 0.0376, + "step": 41380 + }, + { + "epoch": 0.09128140210350813, + "grad_norm": 0.14981861412525177, + "learning_rate": 2.9369837415605012e-05, + "loss": 0.0356, + "step": 41390 + }, + { + "epoch": 0.09130345607840629, + "grad_norm": 0.13111668825149536, + "learning_rate": 2.9369363105873446e-05, + "loss": 0.0371, + "step": 41400 + }, + { + "epoch": 0.09132551005330446, + "grad_norm": 0.1289243847131729, + "learning_rate": 2.9368888621540556e-05, + "loss": 0.0412, + "step": 41410 + }, + { + "epoch": 0.09134756402820263, + "grad_norm": 0.13201870024204254, + "learning_rate": 2.9368413962612098e-05, + "loss": 0.0382, + "step": 41420 + }, + { + "epoch": 0.09136961800310078, + "grad_norm": 0.15816183388233185, + "learning_rate": 2.9367939129093842e-05, + "loss": 0.038, + "step": 41430 + }, + { + "epoch": 0.09139167197799895, + "grad_norm": 0.09878762811422348, + "learning_rate": 2.9367464120991563e-05, + "loss": 0.038, + "step": 41440 + }, + { + "epoch": 0.09141372595289712, + "grad_norm": 0.13231238722801208, + "learning_rate": 2.9366988938311033e-05, + "loss": 0.0393, + "step": 41450 + }, + { + "epoch": 0.09143577992779528, + "grad_norm": 0.09929326921701431, + "learning_rate": 2.936651358105802e-05, + "loss": 0.0381, + "step": 41460 + }, + { + "epoch": 0.09145783390269345, + "grad_norm": 0.1029941737651825, + "learning_rate": 2.93660380492383e-05, + "loss": 0.038, + "step": 41470 + }, + { + "epoch": 0.09147988787759162, + "grad_norm": 0.10841158777475357, + "learning_rate": 2.9365562342857658e-05, + "loss": 0.0361, + "step": 41480 + }, + { + "epoch": 0.09150194185248979, + "grad_norm": 0.12611877918243408, + "learning_rate": 2.9365086461921865e-05, + "loss": 0.0364, + "step": 41490 + }, + { + "epoch": 0.09152399582738795, + "grad_norm": 0.11282400041818619, + "learning_rate": 2.9364610406436712e-05, + "loss": 0.0385, + "step": 41500 + }, + { + "epoch": 0.09154604980228612, + "grad_norm": 0.10735263675451279, + "learning_rate": 2.9364134176407983e-05, + "loss": 0.0346, + "step": 41510 + }, + { + "epoch": 0.09156810377718429, + "grad_norm": 0.11533910036087036, + "learning_rate": 2.936365777184146e-05, + "loss": 0.0366, + "step": 41520 + }, + { + "epoch": 0.09159015775208244, + "grad_norm": 0.10428056865930557, + "learning_rate": 2.936318119274293e-05, + "loss": 0.036, + "step": 41530 + }, + { + "epoch": 0.09161221172698061, + "grad_norm": 0.110701784491539, + "learning_rate": 2.9362704439118187e-05, + "loss": 0.0409, + "step": 41540 + }, + { + "epoch": 0.09163426570187878, + "grad_norm": 0.12510912120342255, + "learning_rate": 2.936222751097303e-05, + "loss": 0.0391, + "step": 41550 + }, + { + "epoch": 0.09165631967677694, + "grad_norm": 0.15900826454162598, + "learning_rate": 2.9361750408313246e-05, + "loss": 0.0389, + "step": 41560 + }, + { + "epoch": 0.09167837365167511, + "grad_norm": 0.1394481062889099, + "learning_rate": 2.9361273131144636e-05, + "loss": 0.0381, + "step": 41570 + }, + { + "epoch": 0.09170042762657328, + "grad_norm": 0.1138746440410614, + "learning_rate": 2.9360795679472996e-05, + "loss": 0.0379, + "step": 41580 + }, + { + "epoch": 0.09172248160147144, + "grad_norm": 0.11044515669345856, + "learning_rate": 2.9360318053304132e-05, + "loss": 0.0382, + "step": 41590 + }, + { + "epoch": 0.0917445355763696, + "grad_norm": 0.11222334206104279, + "learning_rate": 2.9359840252643842e-05, + "loss": 0.0384, + "step": 41600 + }, + { + "epoch": 0.09176658955126778, + "grad_norm": 0.12809142470359802, + "learning_rate": 2.935936227749794e-05, + "loss": 0.039, + "step": 41610 + }, + { + "epoch": 0.09178864352616593, + "grad_norm": 0.09270872920751572, + "learning_rate": 2.9358884127872224e-05, + "loss": 0.0363, + "step": 41620 + }, + { + "epoch": 0.0918106975010641, + "grad_norm": 0.10259988903999329, + "learning_rate": 2.935840580377251e-05, + "loss": 0.0369, + "step": 41630 + }, + { + "epoch": 0.09183275147596227, + "grad_norm": 0.09458877146244049, + "learning_rate": 2.935792730520461e-05, + "loss": 0.0377, + "step": 41640 + }, + { + "epoch": 0.09185480545086043, + "grad_norm": 0.154854416847229, + "learning_rate": 2.9357448632174335e-05, + "loss": 0.0388, + "step": 41650 + }, + { + "epoch": 0.0918768594257586, + "grad_norm": 0.1168934628367424, + "learning_rate": 2.9356969784687506e-05, + "loss": 0.0405, + "step": 41660 + }, + { + "epoch": 0.09189891340065677, + "grad_norm": 0.11098294705152512, + "learning_rate": 2.9356490762749935e-05, + "loss": 0.0373, + "step": 41670 + }, + { + "epoch": 0.09192096737555493, + "grad_norm": 0.10671605914831161, + "learning_rate": 2.9356011566367454e-05, + "loss": 0.0389, + "step": 41680 + }, + { + "epoch": 0.0919430213504531, + "grad_norm": 0.13437384366989136, + "learning_rate": 2.935553219554587e-05, + "loss": 0.0359, + "step": 41690 + }, + { + "epoch": 0.09196507532535127, + "grad_norm": 0.12054647505283356, + "learning_rate": 2.9355052650291017e-05, + "loss": 0.0388, + "step": 41700 + }, + { + "epoch": 0.09198712930024944, + "grad_norm": 0.10098985582590103, + "learning_rate": 2.9354572930608728e-05, + "loss": 0.0372, + "step": 41710 + }, + { + "epoch": 0.0920091832751476, + "grad_norm": 0.12139496207237244, + "learning_rate": 2.9354093036504818e-05, + "loss": 0.0374, + "step": 41720 + }, + { + "epoch": 0.09203123725004576, + "grad_norm": 0.15791942179203033, + "learning_rate": 2.9353612967985128e-05, + "loss": 0.0375, + "step": 41730 + }, + { + "epoch": 0.09205329122494393, + "grad_norm": 0.10313866287469864, + "learning_rate": 2.935313272505549e-05, + "loss": 0.0374, + "step": 41740 + }, + { + "epoch": 0.09207534519984209, + "grad_norm": 0.11503944545984268, + "learning_rate": 2.9352652307721733e-05, + "loss": 0.0399, + "step": 41750 + }, + { + "epoch": 0.09209739917474026, + "grad_norm": 0.10034582763910294, + "learning_rate": 2.9352171715989704e-05, + "loss": 0.0383, + "step": 41760 + }, + { + "epoch": 0.09211945314963843, + "grad_norm": 0.12086721509695053, + "learning_rate": 2.9351690949865235e-05, + "loss": 0.0362, + "step": 41770 + }, + { + "epoch": 0.09214150712453659, + "grad_norm": 0.1195158138871193, + "learning_rate": 2.9351210009354173e-05, + "loss": 0.039, + "step": 41780 + }, + { + "epoch": 0.09216356109943476, + "grad_norm": 0.1434926986694336, + "learning_rate": 2.9350728894462357e-05, + "loss": 0.0385, + "step": 41790 + }, + { + "epoch": 0.09218561507433293, + "grad_norm": 0.10922113060951233, + "learning_rate": 2.9350247605195636e-05, + "loss": 0.039, + "step": 41800 + }, + { + "epoch": 0.09220766904923108, + "grad_norm": 0.1333850771188736, + "learning_rate": 2.9349766141559858e-05, + "loss": 0.037, + "step": 41810 + }, + { + "epoch": 0.09222972302412925, + "grad_norm": 0.13189613819122314, + "learning_rate": 2.9349284503560874e-05, + "loss": 0.0371, + "step": 41820 + }, + { + "epoch": 0.09225177699902742, + "grad_norm": 0.12185729295015335, + "learning_rate": 2.9348802691204535e-05, + "loss": 0.0382, + "step": 41830 + }, + { + "epoch": 0.09227383097392558, + "grad_norm": 0.10813884437084198, + "learning_rate": 2.9348320704496695e-05, + "loss": 0.0403, + "step": 41840 + }, + { + "epoch": 0.09229588494882375, + "grad_norm": 0.11062601208686829, + "learning_rate": 2.934783854344321e-05, + "loss": 0.0362, + "step": 41850 + }, + { + "epoch": 0.09231793892372192, + "grad_norm": 0.1379811316728592, + "learning_rate": 2.9347356208049943e-05, + "loss": 0.0375, + "step": 41860 + }, + { + "epoch": 0.09233999289862008, + "grad_norm": 0.1322283148765564, + "learning_rate": 2.934687369832275e-05, + "loss": 0.0389, + "step": 41870 + }, + { + "epoch": 0.09236204687351825, + "grad_norm": 0.10885120928287506, + "learning_rate": 2.9346391014267497e-05, + "loss": 0.0363, + "step": 41880 + }, + { + "epoch": 0.09238410084841642, + "grad_norm": 0.16019213199615479, + "learning_rate": 2.934590815589005e-05, + "loss": 0.0368, + "step": 41890 + }, + { + "epoch": 0.09240615482331457, + "grad_norm": 0.1585218459367752, + "learning_rate": 2.934542512319627e-05, + "loss": 0.039, + "step": 41900 + }, + { + "epoch": 0.09242820879821274, + "grad_norm": 0.12076953798532486, + "learning_rate": 2.934494191619203e-05, + "loss": 0.0359, + "step": 41910 + }, + { + "epoch": 0.09245026277311091, + "grad_norm": 0.13828401267528534, + "learning_rate": 2.9344458534883205e-05, + "loss": 0.0384, + "step": 41920 + }, + { + "epoch": 0.09247231674800908, + "grad_norm": 0.1324581354856491, + "learning_rate": 2.9343974979275666e-05, + "loss": 0.0395, + "step": 41930 + }, + { + "epoch": 0.09249437072290724, + "grad_norm": 0.13714267313480377, + "learning_rate": 2.9343491249375285e-05, + "loss": 0.0376, + "step": 41940 + }, + { + "epoch": 0.09251642469780541, + "grad_norm": 0.09924141317605972, + "learning_rate": 2.9343007345187945e-05, + "loss": 0.0373, + "step": 41950 + }, + { + "epoch": 0.09253847867270358, + "grad_norm": 0.0994107574224472, + "learning_rate": 2.934252326671952e-05, + "loss": 0.0376, + "step": 41960 + }, + { + "epoch": 0.09256053264760174, + "grad_norm": 0.12204904109239578, + "learning_rate": 2.9342039013975903e-05, + "loss": 0.0385, + "step": 41970 + }, + { + "epoch": 0.09258258662249991, + "grad_norm": 0.10495898872613907, + "learning_rate": 2.9341554586962968e-05, + "loss": 0.0382, + "step": 41980 + }, + { + "epoch": 0.09260464059739808, + "grad_norm": 0.12589341402053833, + "learning_rate": 2.93410699856866e-05, + "loss": 0.0392, + "step": 41990 + }, + { + "epoch": 0.09262669457229623, + "grad_norm": 0.1484653353691101, + "learning_rate": 2.9340585210152693e-05, + "loss": 0.0387, + "step": 42000 + }, + { + "epoch": 0.0926487485471944, + "grad_norm": 0.13799960911273956, + "learning_rate": 2.9340100260367137e-05, + "loss": 0.0371, + "step": 42010 + }, + { + "epoch": 0.09267080252209257, + "grad_norm": 0.12033800780773163, + "learning_rate": 2.9339615136335825e-05, + "loss": 0.0392, + "step": 42020 + }, + { + "epoch": 0.09269285649699073, + "grad_norm": 0.10364531725645065, + "learning_rate": 2.9339129838064648e-05, + "loss": 0.0385, + "step": 42030 + }, + { + "epoch": 0.0927149104718889, + "grad_norm": 0.1242540180683136, + "learning_rate": 2.9338644365559506e-05, + "loss": 0.0358, + "step": 42040 + }, + { + "epoch": 0.09273696444678707, + "grad_norm": 0.10387483984231949, + "learning_rate": 2.9338158718826296e-05, + "loss": 0.0371, + "step": 42050 + }, + { + "epoch": 0.09275901842168523, + "grad_norm": 0.11993861943483353, + "learning_rate": 2.9337672897870925e-05, + "loss": 0.0387, + "step": 42060 + }, + { + "epoch": 0.0927810723965834, + "grad_norm": 0.11801736056804657, + "learning_rate": 2.9337186902699284e-05, + "loss": 0.0365, + "step": 42070 + }, + { + "epoch": 0.09280312637148157, + "grad_norm": 0.1090790182352066, + "learning_rate": 2.9336700733317293e-05, + "loss": 0.0369, + "step": 42080 + }, + { + "epoch": 0.09282518034637972, + "grad_norm": 0.09802231192588806, + "learning_rate": 2.9336214389730844e-05, + "loss": 0.0384, + "step": 42090 + }, + { + "epoch": 0.0928472343212779, + "grad_norm": 0.1263236403465271, + "learning_rate": 2.933572787194586e-05, + "loss": 0.0399, + "step": 42100 + }, + { + "epoch": 0.09286928829617606, + "grad_norm": 0.10474395751953125, + "learning_rate": 2.9335241179968245e-05, + "loss": 0.0363, + "step": 42110 + }, + { + "epoch": 0.09289134227107422, + "grad_norm": 0.11462212353944778, + "learning_rate": 2.9334754313803914e-05, + "loss": 0.0371, + "step": 42120 + }, + { + "epoch": 0.09291339624597239, + "grad_norm": 0.17800001800060272, + "learning_rate": 2.9334267273458784e-05, + "loss": 0.0381, + "step": 42130 + }, + { + "epoch": 0.09293545022087056, + "grad_norm": 0.12105704843997955, + "learning_rate": 2.9333780058938777e-05, + "loss": 0.0357, + "step": 42140 + }, + { + "epoch": 0.09295750419576873, + "grad_norm": 0.11721494048833847, + "learning_rate": 2.9333292670249806e-05, + "loss": 0.0382, + "step": 42150 + }, + { + "epoch": 0.09297955817066689, + "grad_norm": 0.12385644763708115, + "learning_rate": 2.9332805107397796e-05, + "loss": 0.0401, + "step": 42160 + }, + { + "epoch": 0.09300161214556506, + "grad_norm": 0.10140243172645569, + "learning_rate": 2.933231737038867e-05, + "loss": 0.0371, + "step": 42170 + }, + { + "epoch": 0.09302366612046323, + "grad_norm": 0.11478043347597122, + "learning_rate": 2.9331829459228364e-05, + "loss": 0.037, + "step": 42180 + }, + { + "epoch": 0.09304572009536138, + "grad_norm": 0.16095982491970062, + "learning_rate": 2.933134137392279e-05, + "loss": 0.0388, + "step": 42190 + }, + { + "epoch": 0.09306777407025955, + "grad_norm": 0.15388481318950653, + "learning_rate": 2.933085311447789e-05, + "loss": 0.0359, + "step": 42200 + }, + { + "epoch": 0.09308982804515772, + "grad_norm": 0.1245129182934761, + "learning_rate": 2.9330364680899592e-05, + "loss": 0.0368, + "step": 42210 + }, + { + "epoch": 0.09311188202005588, + "grad_norm": 0.10935511440038681, + "learning_rate": 2.9329876073193838e-05, + "loss": 0.0377, + "step": 42220 + }, + { + "epoch": 0.09313393599495405, + "grad_norm": 0.11046851426362991, + "learning_rate": 2.9329387291366556e-05, + "loss": 0.0411, + "step": 42230 + }, + { + "epoch": 0.09315598996985222, + "grad_norm": 0.12007760256528854, + "learning_rate": 2.932889833542369e-05, + "loss": 0.0376, + "step": 42240 + }, + { + "epoch": 0.09317804394475038, + "grad_norm": 0.10113875567913055, + "learning_rate": 2.9328409205371178e-05, + "loss": 0.0374, + "step": 42250 + }, + { + "epoch": 0.09320009791964855, + "grad_norm": 0.12066829204559326, + "learning_rate": 2.932791990121497e-05, + "loss": 0.0374, + "step": 42260 + }, + { + "epoch": 0.09322215189454672, + "grad_norm": 0.16097573935985565, + "learning_rate": 2.9327430422961008e-05, + "loss": 0.0394, + "step": 42270 + }, + { + "epoch": 0.09324420586944487, + "grad_norm": 0.12566037476062775, + "learning_rate": 2.9326940770615235e-05, + "loss": 0.0364, + "step": 42280 + }, + { + "epoch": 0.09326625984434304, + "grad_norm": 0.11418378353118896, + "learning_rate": 2.932645094418361e-05, + "loss": 0.0384, + "step": 42290 + }, + { + "epoch": 0.09328831381924121, + "grad_norm": 0.14458777010440826, + "learning_rate": 2.9325960943672072e-05, + "loss": 0.0384, + "step": 42300 + }, + { + "epoch": 0.09331036779413937, + "grad_norm": 0.13813254237174988, + "learning_rate": 2.9325470769086587e-05, + "loss": 0.0386, + "step": 42310 + }, + { + "epoch": 0.09333242176903754, + "grad_norm": 0.15544241666793823, + "learning_rate": 2.9324980420433108e-05, + "loss": 0.0381, + "step": 42320 + }, + { + "epoch": 0.09335447574393571, + "grad_norm": 0.12452424317598343, + "learning_rate": 2.932448989771759e-05, + "loss": 0.0393, + "step": 42330 + }, + { + "epoch": 0.09337652971883387, + "grad_norm": 0.09114623069763184, + "learning_rate": 2.9323999200946e-05, + "loss": 0.0391, + "step": 42340 + }, + { + "epoch": 0.09339858369373204, + "grad_norm": 0.1450238823890686, + "learning_rate": 2.9323508330124288e-05, + "loss": 0.0394, + "step": 42350 + }, + { + "epoch": 0.09342063766863021, + "grad_norm": 0.10342197120189667, + "learning_rate": 2.932301728525843e-05, + "loss": 0.0383, + "step": 42360 + }, + { + "epoch": 0.09344269164352838, + "grad_norm": 0.10464353114366531, + "learning_rate": 2.9322526066354386e-05, + "loss": 0.0371, + "step": 42370 + }, + { + "epoch": 0.09346474561842653, + "grad_norm": 0.10933813452720642, + "learning_rate": 2.9322034673418133e-05, + "loss": 0.0371, + "step": 42380 + }, + { + "epoch": 0.0934867995933247, + "grad_norm": 0.12545421719551086, + "learning_rate": 2.932154310645563e-05, + "loss": 0.0368, + "step": 42390 + }, + { + "epoch": 0.09350885356822287, + "grad_norm": 0.09778816252946854, + "learning_rate": 2.9321051365472865e-05, + "loss": 0.0386, + "step": 42400 + }, + { + "epoch": 0.09353090754312103, + "grad_norm": 0.11412692815065384, + "learning_rate": 2.9320559450475794e-05, + "loss": 0.0369, + "step": 42410 + }, + { + "epoch": 0.0935529615180192, + "grad_norm": 0.10671484470367432, + "learning_rate": 2.932006736147041e-05, + "loss": 0.0388, + "step": 42420 + }, + { + "epoch": 0.09357501549291737, + "grad_norm": 0.1273517608642578, + "learning_rate": 2.9319575098462692e-05, + "loss": 0.0379, + "step": 42430 + }, + { + "epoch": 0.09359706946781553, + "grad_norm": 0.1124381273984909, + "learning_rate": 2.931908266145861e-05, + "loss": 0.0374, + "step": 42440 + }, + { + "epoch": 0.0936191234427137, + "grad_norm": 0.08706898987293243, + "learning_rate": 2.9318590050464156e-05, + "loss": 0.0372, + "step": 42450 + }, + { + "epoch": 0.09364117741761187, + "grad_norm": 0.13870099186897278, + "learning_rate": 2.9318097265485315e-05, + "loss": 0.0393, + "step": 42460 + }, + { + "epoch": 0.09366323139251002, + "grad_norm": 0.12246199697256088, + "learning_rate": 2.9317604306528074e-05, + "loss": 0.0353, + "step": 42470 + }, + { + "epoch": 0.0936852853674082, + "grad_norm": 0.12415400892496109, + "learning_rate": 2.931711117359842e-05, + "loss": 0.0375, + "step": 42480 + }, + { + "epoch": 0.09370733934230636, + "grad_norm": 0.09920656681060791, + "learning_rate": 2.931661786670235e-05, + "loss": 0.0363, + "step": 42490 + }, + { + "epoch": 0.09372939331720452, + "grad_norm": 0.12931902706623077, + "learning_rate": 2.9316124385845853e-05, + "loss": 0.0373, + "step": 42500 + }, + { + "epoch": 0.09375144729210269, + "grad_norm": 0.13502027094364166, + "learning_rate": 2.931563073103493e-05, + "loss": 0.0378, + "step": 42510 + }, + { + "epoch": 0.09377350126700086, + "grad_norm": 0.11881709843873978, + "learning_rate": 2.9315136902275577e-05, + "loss": 0.04, + "step": 42520 + }, + { + "epoch": 0.09379555524189902, + "grad_norm": 0.08356548845767975, + "learning_rate": 2.9314642899573798e-05, + "loss": 0.0371, + "step": 42530 + }, + { + "epoch": 0.09381760921679719, + "grad_norm": 0.13152958452701569, + "learning_rate": 2.931414872293559e-05, + "loss": 0.0371, + "step": 42540 + }, + { + "epoch": 0.09383966319169536, + "grad_norm": 0.1156468391418457, + "learning_rate": 2.931365437236696e-05, + "loss": 0.0349, + "step": 42550 + }, + { + "epoch": 0.09386171716659351, + "grad_norm": 0.13543419539928436, + "learning_rate": 2.9313159847873912e-05, + "loss": 0.0391, + "step": 42560 + }, + { + "epoch": 0.09388377114149168, + "grad_norm": 0.11758094280958176, + "learning_rate": 2.9312665149462466e-05, + "loss": 0.0365, + "step": 42570 + }, + { + "epoch": 0.09390582511638985, + "grad_norm": 0.10928091406822205, + "learning_rate": 2.9312170277138618e-05, + "loss": 0.0392, + "step": 42580 + }, + { + "epoch": 0.09392787909128802, + "grad_norm": 0.09733477234840393, + "learning_rate": 2.931167523090839e-05, + "loss": 0.0357, + "step": 42590 + }, + { + "epoch": 0.09394993306618618, + "grad_norm": 0.11427047848701477, + "learning_rate": 2.9311180010777793e-05, + "loss": 0.0389, + "step": 42600 + }, + { + "epoch": 0.09397198704108435, + "grad_norm": 0.11734382063150406, + "learning_rate": 2.931068461675285e-05, + "loss": 0.039, + "step": 42610 + }, + { + "epoch": 0.09399404101598252, + "grad_norm": 0.1278837025165558, + "learning_rate": 2.931018904883958e-05, + "loss": 0.0379, + "step": 42620 + }, + { + "epoch": 0.09401609499088068, + "grad_norm": 0.13615626096725464, + "learning_rate": 2.9309693307043998e-05, + "loss": 0.0399, + "step": 42630 + }, + { + "epoch": 0.09403814896577885, + "grad_norm": 0.1382800042629242, + "learning_rate": 2.930919739137213e-05, + "loss": 0.0391, + "step": 42640 + }, + { + "epoch": 0.09406020294067702, + "grad_norm": 0.1177547350525856, + "learning_rate": 2.9308701301830008e-05, + "loss": 0.0378, + "step": 42650 + }, + { + "epoch": 0.09408225691557517, + "grad_norm": 0.13773256540298462, + "learning_rate": 2.9308205038423656e-05, + "loss": 0.0403, + "step": 42660 + }, + { + "epoch": 0.09410431089047334, + "grad_norm": 0.12833759188652039, + "learning_rate": 2.93077086011591e-05, + "loss": 0.0381, + "step": 42670 + }, + { + "epoch": 0.09412636486537151, + "grad_norm": 0.11163214594125748, + "learning_rate": 2.9307211990042378e-05, + "loss": 0.0385, + "step": 42680 + }, + { + "epoch": 0.09414841884026967, + "grad_norm": 0.11915844678878784, + "learning_rate": 2.930671520507952e-05, + "loss": 0.0361, + "step": 42690 + }, + { + "epoch": 0.09417047281516784, + "grad_norm": 0.11558433622121811, + "learning_rate": 2.9306218246276567e-05, + "loss": 0.0375, + "step": 42700 + }, + { + "epoch": 0.09419252679006601, + "grad_norm": 0.14620670676231384, + "learning_rate": 2.9305721113639553e-05, + "loss": 0.0361, + "step": 42710 + }, + { + "epoch": 0.09421458076496417, + "grad_norm": 0.1236579641699791, + "learning_rate": 2.9305223807174523e-05, + "loss": 0.036, + "step": 42720 + }, + { + "epoch": 0.09423663473986234, + "grad_norm": 0.1132175549864769, + "learning_rate": 2.9304726326887516e-05, + "loss": 0.0386, + "step": 42730 + }, + { + "epoch": 0.09425868871476051, + "grad_norm": 0.10363657772541046, + "learning_rate": 2.9304228672784578e-05, + "loss": 0.0382, + "step": 42740 + }, + { + "epoch": 0.09428074268965866, + "grad_norm": 0.13042707741260529, + "learning_rate": 2.9303730844871757e-05, + "loss": 0.0356, + "step": 42750 + }, + { + "epoch": 0.09430279666455683, + "grad_norm": 0.12094569951295853, + "learning_rate": 2.93032328431551e-05, + "loss": 0.0396, + "step": 42760 + }, + { + "epoch": 0.094324850639455, + "grad_norm": 0.16014160215854645, + "learning_rate": 2.930273466764066e-05, + "loss": 0.038, + "step": 42770 + }, + { + "epoch": 0.09434690461435317, + "grad_norm": 0.08492398262023926, + "learning_rate": 2.9302236318334488e-05, + "loss": 0.0366, + "step": 42780 + }, + { + "epoch": 0.09436895858925133, + "grad_norm": 0.1523280143737793, + "learning_rate": 2.9301737795242645e-05, + "loss": 0.0392, + "step": 42790 + }, + { + "epoch": 0.0943910125641495, + "grad_norm": 0.11226915568113327, + "learning_rate": 2.930123909837118e-05, + "loss": 0.0375, + "step": 42800 + }, + { + "epoch": 0.09441306653904767, + "grad_norm": 0.13479486107826233, + "learning_rate": 2.9300740227726158e-05, + "loss": 0.0389, + "step": 42810 + }, + { + "epoch": 0.09443512051394583, + "grad_norm": 0.10763884335756302, + "learning_rate": 2.9300241183313644e-05, + "loss": 0.0368, + "step": 42820 + }, + { + "epoch": 0.094457174488844, + "grad_norm": 0.11158758401870728, + "learning_rate": 2.92997419651397e-05, + "loss": 0.0389, + "step": 42830 + }, + { + "epoch": 0.09447922846374217, + "grad_norm": 0.12360669672489166, + "learning_rate": 2.9299242573210384e-05, + "loss": 0.0376, + "step": 42840 + }, + { + "epoch": 0.09450128243864032, + "grad_norm": 0.1311352401971817, + "learning_rate": 2.9298743007531772e-05, + "loss": 0.037, + "step": 42850 + }, + { + "epoch": 0.0945233364135385, + "grad_norm": 0.08878181129693985, + "learning_rate": 2.9298243268109934e-05, + "loss": 0.0381, + "step": 42860 + }, + { + "epoch": 0.09454539038843666, + "grad_norm": 0.10777803510427475, + "learning_rate": 2.929774335495094e-05, + "loss": 0.0381, + "step": 42870 + }, + { + "epoch": 0.09456744436333482, + "grad_norm": 0.14417962729930878, + "learning_rate": 2.929724326806086e-05, + "loss": 0.0382, + "step": 42880 + }, + { + "epoch": 0.09458949833823299, + "grad_norm": 0.11303258687257767, + "learning_rate": 2.929674300744578e-05, + "loss": 0.0367, + "step": 42890 + }, + { + "epoch": 0.09461155231313116, + "grad_norm": 0.13935615122318268, + "learning_rate": 2.9296242573111773e-05, + "loss": 0.0382, + "step": 42900 + }, + { + "epoch": 0.09463360628802932, + "grad_norm": 0.1065802276134491, + "learning_rate": 2.929574196506492e-05, + "loss": 0.0374, + "step": 42910 + }, + { + "epoch": 0.09465566026292749, + "grad_norm": 0.11271431297063828, + "learning_rate": 2.929524118331131e-05, + "loss": 0.0384, + "step": 42920 + }, + { + "epoch": 0.09467771423782566, + "grad_norm": 0.12254568189382553, + "learning_rate": 2.9294740227857015e-05, + "loss": 0.0379, + "step": 42930 + }, + { + "epoch": 0.09469976821272381, + "grad_norm": 0.09571337699890137, + "learning_rate": 2.9294239098708132e-05, + "loss": 0.0369, + "step": 42940 + }, + { + "epoch": 0.09472182218762198, + "grad_norm": 0.13314788043498993, + "learning_rate": 2.929373779587075e-05, + "loss": 0.0358, + "step": 42950 + }, + { + "epoch": 0.09474387616252015, + "grad_norm": 0.1123686209321022, + "learning_rate": 2.929323631935096e-05, + "loss": 0.0384, + "step": 42960 + }, + { + "epoch": 0.09476593013741831, + "grad_norm": 0.16027987003326416, + "learning_rate": 2.929273466915485e-05, + "loss": 0.0371, + "step": 42970 + }, + { + "epoch": 0.09478798411231648, + "grad_norm": 0.12855620682239532, + "learning_rate": 2.9292232845288518e-05, + "loss": 0.0379, + "step": 42980 + }, + { + "epoch": 0.09481003808721465, + "grad_norm": 0.1539454460144043, + "learning_rate": 2.9291730847758063e-05, + "loss": 0.0372, + "step": 42990 + }, + { + "epoch": 0.09483209206211282, + "grad_norm": 0.13456647098064423, + "learning_rate": 2.929122867656959e-05, + "loss": 0.0362, + "step": 43000 + }, + { + "epoch": 0.09485414603701098, + "grad_norm": 0.15980441868305206, + "learning_rate": 2.9290726331729193e-05, + "loss": 0.039, + "step": 43010 + }, + { + "epoch": 0.09487620001190915, + "grad_norm": 0.13290619850158691, + "learning_rate": 2.929022381324298e-05, + "loss": 0.0363, + "step": 43020 + }, + { + "epoch": 0.09489825398680732, + "grad_norm": 0.12077811360359192, + "learning_rate": 2.928972112111705e-05, + "loss": 0.0382, + "step": 43030 + }, + { + "epoch": 0.09492030796170547, + "grad_norm": 0.11219900846481323, + "learning_rate": 2.928921825535752e-05, + "loss": 0.0364, + "step": 43040 + }, + { + "epoch": 0.09494236193660364, + "grad_norm": 0.13137556612491608, + "learning_rate": 2.9288715215970495e-05, + "loss": 0.04, + "step": 43050 + }, + { + "epoch": 0.09496441591150181, + "grad_norm": 0.1488749235868454, + "learning_rate": 2.9288212002962093e-05, + "loss": 0.0369, + "step": 43060 + }, + { + "epoch": 0.09498646988639997, + "grad_norm": 0.13913634419441223, + "learning_rate": 2.928770861633842e-05, + "loss": 0.0349, + "step": 43070 + }, + { + "epoch": 0.09500852386129814, + "grad_norm": 0.12495540082454681, + "learning_rate": 2.9287205056105602e-05, + "loss": 0.0375, + "step": 43080 + }, + { + "epoch": 0.09503057783619631, + "grad_norm": 0.10272639989852905, + "learning_rate": 2.9286701322269753e-05, + "loss": 0.0388, + "step": 43090 + }, + { + "epoch": 0.09505263181109447, + "grad_norm": 0.12464030832052231, + "learning_rate": 2.928619741483699e-05, + "loss": 0.037, + "step": 43100 + }, + { + "epoch": 0.09507468578599264, + "grad_norm": 0.10544134676456451, + "learning_rate": 2.9285693333813442e-05, + "loss": 0.0362, + "step": 43110 + }, + { + "epoch": 0.09509673976089081, + "grad_norm": 0.11926769465208054, + "learning_rate": 2.9285189079205233e-05, + "loss": 0.0399, + "step": 43120 + }, + { + "epoch": 0.09511879373578896, + "grad_norm": 0.12551626563072205, + "learning_rate": 2.928468465101849e-05, + "loss": 0.0381, + "step": 43130 + }, + { + "epoch": 0.09514084771068713, + "grad_norm": 0.11342190951108932, + "learning_rate": 2.9284180049259342e-05, + "loss": 0.0376, + "step": 43140 + }, + { + "epoch": 0.0951629016855853, + "grad_norm": 0.1156434714794159, + "learning_rate": 2.9283675273933915e-05, + "loss": 0.0361, + "step": 43150 + }, + { + "epoch": 0.09518495566048346, + "grad_norm": 0.10629839450120926, + "learning_rate": 2.928317032504835e-05, + "loss": 0.0364, + "step": 43160 + }, + { + "epoch": 0.09520700963538163, + "grad_norm": 0.15053795278072357, + "learning_rate": 2.928266520260878e-05, + "loss": 0.0387, + "step": 43170 + }, + { + "epoch": 0.0952290636102798, + "grad_norm": 0.12595954537391663, + "learning_rate": 2.9282159906621343e-05, + "loss": 0.0397, + "step": 43180 + }, + { + "epoch": 0.09525111758517796, + "grad_norm": 0.10139691084623337, + "learning_rate": 2.928165443709218e-05, + "loss": 0.0369, + "step": 43190 + }, + { + "epoch": 0.09527317156007613, + "grad_norm": 0.11087370663881302, + "learning_rate": 2.9281148794027433e-05, + "loss": 0.0368, + "step": 43200 + }, + { + "epoch": 0.0952952255349743, + "grad_norm": 0.17984934151172638, + "learning_rate": 2.928064297743324e-05, + "loss": 0.0375, + "step": 43210 + }, + { + "epoch": 0.09531727950987247, + "grad_norm": 0.13282066583633423, + "learning_rate": 2.9280136987315755e-05, + "loss": 0.0376, + "step": 43220 + }, + { + "epoch": 0.09533933348477062, + "grad_norm": 0.11799374222755432, + "learning_rate": 2.927963082368112e-05, + "loss": 0.0369, + "step": 43230 + }, + { + "epoch": 0.0953613874596688, + "grad_norm": 0.12031092494726181, + "learning_rate": 2.927912448653549e-05, + "loss": 0.0397, + "step": 43240 + }, + { + "epoch": 0.09538344143456697, + "grad_norm": 0.1437768191099167, + "learning_rate": 2.9278617975885016e-05, + "loss": 0.0382, + "step": 43250 + }, + { + "epoch": 0.09540549540946512, + "grad_norm": 0.12914924323558807, + "learning_rate": 2.927811129173585e-05, + "loss": 0.0393, + "step": 43260 + }, + { + "epoch": 0.09542754938436329, + "grad_norm": 0.10998117923736572, + "learning_rate": 2.9277604434094157e-05, + "loss": 0.0393, + "step": 43270 + }, + { + "epoch": 0.09544960335926146, + "grad_norm": 0.10230831056833267, + "learning_rate": 2.9277097402966082e-05, + "loss": 0.0381, + "step": 43280 + }, + { + "epoch": 0.09547165733415962, + "grad_norm": 0.12738943099975586, + "learning_rate": 2.92765901983578e-05, + "loss": 0.0386, + "step": 43290 + }, + { + "epoch": 0.09549371130905779, + "grad_norm": 0.10346487164497375, + "learning_rate": 2.9276082820275464e-05, + "loss": 0.0376, + "step": 43300 + }, + { + "epoch": 0.09551576528395596, + "grad_norm": 0.20038536190986633, + "learning_rate": 2.9275575268725244e-05, + "loss": 0.0377, + "step": 43310 + }, + { + "epoch": 0.09553781925885411, + "grad_norm": 0.13251182436943054, + "learning_rate": 2.92750675437133e-05, + "loss": 0.0395, + "step": 43320 + }, + { + "epoch": 0.09555987323375228, + "grad_norm": 0.18763825297355652, + "learning_rate": 2.927455964524582e-05, + "loss": 0.0381, + "step": 43330 + }, + { + "epoch": 0.09558192720865046, + "grad_norm": 0.09472404420375824, + "learning_rate": 2.9274051573328956e-05, + "loss": 0.0382, + "step": 43340 + }, + { + "epoch": 0.09560398118354861, + "grad_norm": 0.12351585179567337, + "learning_rate": 2.927354332796889e-05, + "loss": 0.0384, + "step": 43350 + }, + { + "epoch": 0.09562603515844678, + "grad_norm": 0.09571579098701477, + "learning_rate": 2.9273034909171795e-05, + "loss": 0.0361, + "step": 43360 + }, + { + "epoch": 0.09564808913334495, + "grad_norm": 0.10978981852531433, + "learning_rate": 2.9272526316943848e-05, + "loss": 0.0377, + "step": 43370 + }, + { + "epoch": 0.09567014310824311, + "grad_norm": 0.08769593387842178, + "learning_rate": 2.9272017551291235e-05, + "loss": 0.0391, + "step": 43380 + }, + { + "epoch": 0.09569219708314128, + "grad_norm": 0.12209740281105042, + "learning_rate": 2.9271508612220128e-05, + "loss": 0.0378, + "step": 43390 + }, + { + "epoch": 0.09571425105803945, + "grad_norm": 0.11742759495973587, + "learning_rate": 2.9270999499736725e-05, + "loss": 0.037, + "step": 43400 + }, + { + "epoch": 0.0957363050329376, + "grad_norm": 0.13214151561260223, + "learning_rate": 2.9270490213847196e-05, + "loss": 0.0394, + "step": 43410 + }, + { + "epoch": 0.09575835900783577, + "grad_norm": 0.14308562874794006, + "learning_rate": 2.926998075455774e-05, + "loss": 0.0364, + "step": 43420 + }, + { + "epoch": 0.09578041298273395, + "grad_norm": 0.11223950237035751, + "learning_rate": 2.926947112187455e-05, + "loss": 0.0364, + "step": 43430 + }, + { + "epoch": 0.09580246695763212, + "grad_norm": 0.12959446012973785, + "learning_rate": 2.9268961315803804e-05, + "loss": 0.0365, + "step": 43440 + }, + { + "epoch": 0.09582452093253027, + "grad_norm": 0.14676034450531006, + "learning_rate": 2.926845133635171e-05, + "loss": 0.0372, + "step": 43450 + }, + { + "epoch": 0.09584657490742844, + "grad_norm": 0.10368715226650238, + "learning_rate": 2.926794118352446e-05, + "loss": 0.0384, + "step": 43460 + }, + { + "epoch": 0.09586862888232661, + "grad_norm": 0.14133653044700623, + "learning_rate": 2.9267430857328256e-05, + "loss": 0.0375, + "step": 43470 + }, + { + "epoch": 0.09589068285722477, + "grad_norm": 0.11820298433303833, + "learning_rate": 2.9266920357769296e-05, + "loss": 0.0379, + "step": 43480 + }, + { + "epoch": 0.09591273683212294, + "grad_norm": 0.127200648188591, + "learning_rate": 2.9266409684853784e-05, + "loss": 0.0373, + "step": 43490 + }, + { + "epoch": 0.09593479080702111, + "grad_norm": 0.13150882720947266, + "learning_rate": 2.9265898838587922e-05, + "loss": 0.0386, + "step": 43500 + }, + { + "epoch": 0.09595684478191926, + "grad_norm": 0.1328737735748291, + "learning_rate": 2.926538781897792e-05, + "loss": 0.0374, + "step": 43510 + }, + { + "epoch": 0.09597889875681744, + "grad_norm": 0.14952749013900757, + "learning_rate": 2.926487662602999e-05, + "loss": 0.0371, + "step": 43520 + }, + { + "epoch": 0.0960009527317156, + "grad_norm": 0.12126050144433975, + "learning_rate": 2.9264365259750336e-05, + "loss": 0.038, + "step": 43530 + }, + { + "epoch": 0.09602300670661376, + "grad_norm": 0.1332438737154007, + "learning_rate": 2.926385372014518e-05, + "loss": 0.0398, + "step": 43540 + }, + { + "epoch": 0.09604506068151193, + "grad_norm": 0.11673562973737717, + "learning_rate": 2.9263342007220732e-05, + "loss": 0.0367, + "step": 43550 + }, + { + "epoch": 0.0960671146564101, + "grad_norm": 0.17867028713226318, + "learning_rate": 2.9262830120983216e-05, + "loss": 0.0383, + "step": 43560 + }, + { + "epoch": 0.09608916863130826, + "grad_norm": 0.13550777733325958, + "learning_rate": 2.9262318061438842e-05, + "loss": 0.0363, + "step": 43570 + }, + { + "epoch": 0.09611122260620643, + "grad_norm": 0.16821932792663574, + "learning_rate": 2.926180582859384e-05, + "loss": 0.0376, + "step": 43580 + }, + { + "epoch": 0.0961332765811046, + "grad_norm": 0.1465987265110016, + "learning_rate": 2.926129342245443e-05, + "loss": 0.0371, + "step": 43590 + }, + { + "epoch": 0.09615533055600275, + "grad_norm": 0.13747350871562958, + "learning_rate": 2.9260780843026845e-05, + "loss": 0.0407, + "step": 43600 + }, + { + "epoch": 0.09617738453090093, + "grad_norm": 0.1214483454823494, + "learning_rate": 2.9260268090317305e-05, + "loss": 0.0368, + "step": 43610 + }, + { + "epoch": 0.0961994385057991, + "grad_norm": 0.1477283537387848, + "learning_rate": 2.9259755164332043e-05, + "loss": 0.0388, + "step": 43620 + }, + { + "epoch": 0.09622149248069725, + "grad_norm": 0.12295669317245483, + "learning_rate": 2.9259242065077295e-05, + "loss": 0.0377, + "step": 43630 + }, + { + "epoch": 0.09624354645559542, + "grad_norm": 0.1147606149315834, + "learning_rate": 2.925872879255929e-05, + "loss": 0.0389, + "step": 43640 + }, + { + "epoch": 0.09626560043049359, + "grad_norm": 0.12649662792682648, + "learning_rate": 2.9258215346784272e-05, + "loss": 0.0361, + "step": 43650 + }, + { + "epoch": 0.09628765440539176, + "grad_norm": 0.13776111602783203, + "learning_rate": 2.925770172775847e-05, + "loss": 0.0399, + "step": 43660 + }, + { + "epoch": 0.09630970838028992, + "grad_norm": 0.10707046836614609, + "learning_rate": 2.9257187935488133e-05, + "loss": 0.038, + "step": 43670 + }, + { + "epoch": 0.09633176235518809, + "grad_norm": 0.09717532992362976, + "learning_rate": 2.9256673969979502e-05, + "loss": 0.0386, + "step": 43680 + }, + { + "epoch": 0.09635381633008626, + "grad_norm": 0.10541761666536331, + "learning_rate": 2.925615983123882e-05, + "loss": 0.0365, + "step": 43690 + }, + { + "epoch": 0.09637587030498442, + "grad_norm": 0.11124259233474731, + "learning_rate": 2.9255645519272337e-05, + "loss": 0.0389, + "step": 43700 + }, + { + "epoch": 0.09639792427988259, + "grad_norm": 0.10201480239629745, + "learning_rate": 2.9255131034086305e-05, + "loss": 0.0401, + "step": 43710 + }, + { + "epoch": 0.09641997825478076, + "grad_norm": 0.12762020528316498, + "learning_rate": 2.9254616375686966e-05, + "loss": 0.0374, + "step": 43720 + }, + { + "epoch": 0.09644203222967891, + "grad_norm": 0.10910973697900772, + "learning_rate": 2.9254101544080586e-05, + "loss": 0.0416, + "step": 43730 + }, + { + "epoch": 0.09646408620457708, + "grad_norm": 0.13650842010974884, + "learning_rate": 2.9253586539273408e-05, + "loss": 0.0367, + "step": 43740 + }, + { + "epoch": 0.09648614017947525, + "grad_norm": 0.10275691002607346, + "learning_rate": 2.9253071361271702e-05, + "loss": 0.0366, + "step": 43750 + }, + { + "epoch": 0.09650819415437341, + "grad_norm": 0.10123338550329208, + "learning_rate": 2.9252556010081716e-05, + "loss": 0.0372, + "step": 43760 + }, + { + "epoch": 0.09653024812927158, + "grad_norm": 0.10758042335510254, + "learning_rate": 2.9252040485709722e-05, + "loss": 0.0371, + "step": 43770 + }, + { + "epoch": 0.09655230210416975, + "grad_norm": 0.12804870307445526, + "learning_rate": 2.9251524788161977e-05, + "loss": 0.0368, + "step": 43780 + }, + { + "epoch": 0.0965743560790679, + "grad_norm": 0.10727541893720627, + "learning_rate": 2.925100891744475e-05, + "loss": 0.0367, + "step": 43790 + }, + { + "epoch": 0.09659641005396608, + "grad_norm": 0.12193100154399872, + "learning_rate": 2.925049287356431e-05, + "loss": 0.0386, + "step": 43800 + }, + { + "epoch": 0.09661846402886425, + "grad_norm": 0.09935840964317322, + "learning_rate": 2.924997665652693e-05, + "loss": 0.0379, + "step": 43810 + }, + { + "epoch": 0.0966405180037624, + "grad_norm": 0.12270904332399368, + "learning_rate": 2.924946026633887e-05, + "loss": 0.0387, + "step": 43820 + }, + { + "epoch": 0.09666257197866057, + "grad_norm": 0.13366851210594177, + "learning_rate": 2.9248943703006424e-05, + "loss": 0.0387, + "step": 43830 + }, + { + "epoch": 0.09668462595355874, + "grad_norm": 0.13697129487991333, + "learning_rate": 2.9248426966535856e-05, + "loss": 0.038, + "step": 43840 + }, + { + "epoch": 0.09670667992845691, + "grad_norm": 0.10623210668563843, + "learning_rate": 2.924791005693345e-05, + "loss": 0.0384, + "step": 43850 + }, + { + "epoch": 0.09672873390335507, + "grad_norm": 0.1389344334602356, + "learning_rate": 2.924739297420548e-05, + "loss": 0.0378, + "step": 43860 + }, + { + "epoch": 0.09675078787825324, + "grad_norm": 0.10787855833768845, + "learning_rate": 2.9246875718358235e-05, + "loss": 0.037, + "step": 43870 + }, + { + "epoch": 0.09677284185315141, + "grad_norm": 0.146098330616951, + "learning_rate": 2.9246358289398002e-05, + "loss": 0.037, + "step": 43880 + }, + { + "epoch": 0.09679489582804957, + "grad_norm": 0.1265479475259781, + "learning_rate": 2.924584068733106e-05, + "loss": 0.0368, + "step": 43890 + }, + { + "epoch": 0.09681694980294774, + "grad_norm": 0.12238486856222153, + "learning_rate": 2.9245322912163705e-05, + "loss": 0.037, + "step": 43900 + }, + { + "epoch": 0.0968390037778459, + "grad_norm": 0.10222179442644119, + "learning_rate": 2.9244804963902226e-05, + "loss": 0.0365, + "step": 43910 + }, + { + "epoch": 0.09686105775274406, + "grad_norm": 0.1111965924501419, + "learning_rate": 2.924428684255292e-05, + "loss": 0.0396, + "step": 43920 + }, + { + "epoch": 0.09688311172764223, + "grad_norm": 0.09735002368688583, + "learning_rate": 2.924376854812208e-05, + "loss": 0.0394, + "step": 43930 + }, + { + "epoch": 0.0969051657025404, + "grad_norm": 0.1357758641242981, + "learning_rate": 2.9243250080616e-05, + "loss": 0.0399, + "step": 43940 + }, + { + "epoch": 0.09692721967743856, + "grad_norm": 0.13115014135837555, + "learning_rate": 2.9242731440040987e-05, + "loss": 0.0386, + "step": 43950 + }, + { + "epoch": 0.09694927365233673, + "grad_norm": 0.10622481256723404, + "learning_rate": 2.924221262640334e-05, + "loss": 0.036, + "step": 43960 + }, + { + "epoch": 0.0969713276272349, + "grad_norm": 0.15727776288986206, + "learning_rate": 2.924169363970936e-05, + "loss": 0.0371, + "step": 43970 + }, + { + "epoch": 0.09699338160213306, + "grad_norm": 0.16309477388858795, + "learning_rate": 2.924117447996536e-05, + "loss": 0.0364, + "step": 43980 + }, + { + "epoch": 0.09701543557703123, + "grad_norm": 0.12792815268039703, + "learning_rate": 2.924065514717764e-05, + "loss": 0.0356, + "step": 43990 + }, + { + "epoch": 0.0970374895519294, + "grad_norm": 0.13036775588989258, + "learning_rate": 2.9240135641352515e-05, + "loss": 0.0367, + "step": 44000 + }, + { + "epoch": 0.09705954352682755, + "grad_norm": 0.13930007815361023, + "learning_rate": 2.92396159624963e-05, + "loss": 0.0393, + "step": 44010 + }, + { + "epoch": 0.09708159750172572, + "grad_norm": 0.10135197639465332, + "learning_rate": 2.9239096110615305e-05, + "loss": 0.0353, + "step": 44020 + }, + { + "epoch": 0.09710365147662389, + "grad_norm": 0.10104450583457947, + "learning_rate": 2.923857608571585e-05, + "loss": 0.0362, + "step": 44030 + }, + { + "epoch": 0.09712570545152205, + "grad_norm": 0.10539667308330536, + "learning_rate": 2.923805588780425e-05, + "loss": 0.0392, + "step": 44040 + }, + { + "epoch": 0.09714775942642022, + "grad_norm": 0.08823982626199722, + "learning_rate": 2.9237535516886828e-05, + "loss": 0.0383, + "step": 44050 + }, + { + "epoch": 0.09716981340131839, + "grad_norm": 0.11430806666612625, + "learning_rate": 2.92370149729699e-05, + "loss": 0.0404, + "step": 44060 + }, + { + "epoch": 0.09719186737621656, + "grad_norm": 0.123573899269104, + "learning_rate": 2.9236494256059812e-05, + "loss": 0.0365, + "step": 44070 + }, + { + "epoch": 0.09721392135111472, + "grad_norm": 0.12952221930027008, + "learning_rate": 2.923597336616287e-05, + "loss": 0.0353, + "step": 44080 + }, + { + "epoch": 0.09723597532601289, + "grad_norm": 0.14488016068935394, + "learning_rate": 2.923545230328541e-05, + "loss": 0.0393, + "step": 44090 + }, + { + "epoch": 0.09725802930091106, + "grad_norm": 0.15090516209602356, + "learning_rate": 2.9234931067433762e-05, + "loss": 0.0378, + "step": 44100 + }, + { + "epoch": 0.09728008327580921, + "grad_norm": 0.15638744831085205, + "learning_rate": 2.923440965861427e-05, + "loss": 0.0377, + "step": 44110 + }, + { + "epoch": 0.09730213725070738, + "grad_norm": 0.13895025849342346, + "learning_rate": 2.9233888076833253e-05, + "loss": 0.0387, + "step": 44120 + }, + { + "epoch": 0.09732419122560555, + "grad_norm": 0.16247431933879852, + "learning_rate": 2.9233366322097062e-05, + "loss": 0.0356, + "step": 44130 + }, + { + "epoch": 0.09734624520050371, + "grad_norm": 0.0917709469795227, + "learning_rate": 2.923284439441203e-05, + "loss": 0.0368, + "step": 44140 + }, + { + "epoch": 0.09736829917540188, + "grad_norm": 0.10512479394674301, + "learning_rate": 2.92323222937845e-05, + "loss": 0.0384, + "step": 44150 + }, + { + "epoch": 0.09739035315030005, + "grad_norm": 0.1359962671995163, + "learning_rate": 2.9231800020220815e-05, + "loss": 0.0369, + "step": 44160 + }, + { + "epoch": 0.0974124071251982, + "grad_norm": 0.1311248391866684, + "learning_rate": 2.9231277573727324e-05, + "loss": 0.0386, + "step": 44170 + }, + { + "epoch": 0.09743446110009638, + "grad_norm": 0.1031687930226326, + "learning_rate": 2.9230754954310374e-05, + "loss": 0.0392, + "step": 44180 + }, + { + "epoch": 0.09745651507499455, + "grad_norm": 0.13100433349609375, + "learning_rate": 2.9230232161976316e-05, + "loss": 0.0385, + "step": 44190 + }, + { + "epoch": 0.0974785690498927, + "grad_norm": 0.12485647201538086, + "learning_rate": 2.92297091967315e-05, + "loss": 0.0396, + "step": 44200 + }, + { + "epoch": 0.09750062302479087, + "grad_norm": 0.11420433223247528, + "learning_rate": 2.9229186058582286e-05, + "loss": 0.0356, + "step": 44210 + }, + { + "epoch": 0.09752267699968904, + "grad_norm": 0.14247845113277435, + "learning_rate": 2.9228662747535026e-05, + "loss": 0.0356, + "step": 44220 + }, + { + "epoch": 0.0975447309745872, + "grad_norm": 0.10474889725446701, + "learning_rate": 2.9228139263596073e-05, + "loss": 0.0366, + "step": 44230 + }, + { + "epoch": 0.09756678494948537, + "grad_norm": 0.14965665340423584, + "learning_rate": 2.92276156067718e-05, + "loss": 0.0402, + "step": 44240 + }, + { + "epoch": 0.09758883892438354, + "grad_norm": 0.14255283772945404, + "learning_rate": 2.9227091777068563e-05, + "loss": 0.0373, + "step": 44250 + }, + { + "epoch": 0.0976108928992817, + "grad_norm": 0.11116693168878555, + "learning_rate": 2.9226567774492726e-05, + "loss": 0.0387, + "step": 44260 + }, + { + "epoch": 0.09763294687417987, + "grad_norm": 0.1506444662809372, + "learning_rate": 2.9226043599050664e-05, + "loss": 0.0386, + "step": 44270 + }, + { + "epoch": 0.09765500084907804, + "grad_norm": 0.12181026488542557, + "learning_rate": 2.9225519250748737e-05, + "loss": 0.0371, + "step": 44280 + }, + { + "epoch": 0.0976770548239762, + "grad_norm": 0.1331682652235031, + "learning_rate": 2.9224994729593322e-05, + "loss": 0.0362, + "step": 44290 + }, + { + "epoch": 0.09769910879887436, + "grad_norm": 0.12690545618534088, + "learning_rate": 2.922447003559079e-05, + "loss": 0.0374, + "step": 44300 + }, + { + "epoch": 0.09772116277377253, + "grad_norm": 0.13983765244483948, + "learning_rate": 2.9223945168747516e-05, + "loss": 0.0389, + "step": 44310 + }, + { + "epoch": 0.0977432167486707, + "grad_norm": 0.15204600989818573, + "learning_rate": 2.922342012906988e-05, + "loss": 0.0371, + "step": 44320 + }, + { + "epoch": 0.09776527072356886, + "grad_norm": 0.11573798209428787, + "learning_rate": 2.9222894916564257e-05, + "loss": 0.0357, + "step": 44330 + }, + { + "epoch": 0.09778732469846703, + "grad_norm": 0.09383400529623032, + "learning_rate": 2.9222369531237037e-05, + "loss": 0.0374, + "step": 44340 + }, + { + "epoch": 0.0978093786733652, + "grad_norm": 0.11011325567960739, + "learning_rate": 2.9221843973094594e-05, + "loss": 0.0389, + "step": 44350 + }, + { + "epoch": 0.09783143264826336, + "grad_norm": 0.11506042629480362, + "learning_rate": 2.922131824214332e-05, + "loss": 0.0366, + "step": 44360 + }, + { + "epoch": 0.09785348662316153, + "grad_norm": 0.1283738762140274, + "learning_rate": 2.9220792338389606e-05, + "loss": 0.0385, + "step": 44370 + }, + { + "epoch": 0.0978755405980597, + "grad_norm": 0.12557467818260193, + "learning_rate": 2.9220266261839835e-05, + "loss": 0.0366, + "step": 44380 + }, + { + "epoch": 0.09789759457295785, + "grad_norm": 0.11840956658124924, + "learning_rate": 2.9219740012500404e-05, + "loss": 0.0373, + "step": 44390 + }, + { + "epoch": 0.09791964854785602, + "grad_norm": 0.11130756139755249, + "learning_rate": 2.9219213590377704e-05, + "loss": 0.0361, + "step": 44400 + }, + { + "epoch": 0.09794170252275419, + "grad_norm": 0.1017335057258606, + "learning_rate": 2.9218686995478135e-05, + "loss": 0.0373, + "step": 44410 + }, + { + "epoch": 0.09796375649765235, + "grad_norm": 0.10760676860809326, + "learning_rate": 2.9218160227808096e-05, + "loss": 0.0384, + "step": 44420 + }, + { + "epoch": 0.09798581047255052, + "grad_norm": 0.1522226333618164, + "learning_rate": 2.9217633287373986e-05, + "loss": 0.0381, + "step": 44430 + }, + { + "epoch": 0.09800786444744869, + "grad_norm": 0.11688366532325745, + "learning_rate": 2.9217106174182207e-05, + "loss": 0.0397, + "step": 44440 + }, + { + "epoch": 0.09802991842234685, + "grad_norm": 0.13239936530590057, + "learning_rate": 2.9216578888239166e-05, + "loss": 0.0386, + "step": 44450 + }, + { + "epoch": 0.09805197239724502, + "grad_norm": 0.10199157148599625, + "learning_rate": 2.9216051429551264e-05, + "loss": 0.0391, + "step": 44460 + }, + { + "epoch": 0.09807402637214319, + "grad_norm": 0.11486851423978806, + "learning_rate": 2.921552379812492e-05, + "loss": 0.0396, + "step": 44470 + }, + { + "epoch": 0.09809608034704134, + "grad_norm": 0.142496719956398, + "learning_rate": 2.9214995993966537e-05, + "loss": 0.0409, + "step": 44480 + }, + { + "epoch": 0.09811813432193951, + "grad_norm": 0.1298545002937317, + "learning_rate": 2.9214468017082536e-05, + "loss": 0.0376, + "step": 44490 + }, + { + "epoch": 0.09814018829683768, + "grad_norm": 0.10724953562021255, + "learning_rate": 2.9213939867479325e-05, + "loss": 0.0366, + "step": 44500 + }, + { + "epoch": 0.09816224227173585, + "grad_norm": 0.12550613284111023, + "learning_rate": 2.921341154516332e-05, + "loss": 0.0373, + "step": 44510 + }, + { + "epoch": 0.09818429624663401, + "grad_norm": 0.12960900366306305, + "learning_rate": 2.9212883050140952e-05, + "loss": 0.039, + "step": 44520 + }, + { + "epoch": 0.09820635022153218, + "grad_norm": 0.11459114402532578, + "learning_rate": 2.921235438241863e-05, + "loss": 0.0381, + "step": 44530 + }, + { + "epoch": 0.09822840419643035, + "grad_norm": 0.13606274127960205, + "learning_rate": 2.921182554200279e-05, + "loss": 0.0393, + "step": 44540 + }, + { + "epoch": 0.0982504581713285, + "grad_norm": 0.11066463589668274, + "learning_rate": 2.921129652889985e-05, + "loss": 0.0387, + "step": 44550 + }, + { + "epoch": 0.09827251214622668, + "grad_norm": 0.1504870355129242, + "learning_rate": 2.9210767343116236e-05, + "loss": 0.0369, + "step": 44560 + }, + { + "epoch": 0.09829456612112485, + "grad_norm": 0.12283969670534134, + "learning_rate": 2.921023798465838e-05, + "loss": 0.0375, + "step": 44570 + }, + { + "epoch": 0.098316620096023, + "grad_norm": 0.09976045787334442, + "learning_rate": 2.9209708453532724e-05, + "loss": 0.0357, + "step": 44580 + }, + { + "epoch": 0.09833867407092117, + "grad_norm": 0.1595856249332428, + "learning_rate": 2.9209178749745685e-05, + "loss": 0.0415, + "step": 44590 + }, + { + "epoch": 0.09836072804581934, + "grad_norm": 0.1512090116739273, + "learning_rate": 2.9208648873303716e-05, + "loss": 0.0399, + "step": 44600 + }, + { + "epoch": 0.0983827820207175, + "grad_norm": 0.10249580442905426, + "learning_rate": 2.9208118824213244e-05, + "loss": 0.0376, + "step": 44610 + }, + { + "epoch": 0.09840483599561567, + "grad_norm": 0.12122775614261627, + "learning_rate": 2.9207588602480714e-05, + "loss": 0.0382, + "step": 44620 + }, + { + "epoch": 0.09842688997051384, + "grad_norm": 0.11958222836256027, + "learning_rate": 2.9207058208112572e-05, + "loss": 0.0368, + "step": 44630 + }, + { + "epoch": 0.098448943945412, + "grad_norm": 0.13160699605941772, + "learning_rate": 2.9206527641115256e-05, + "loss": 0.0361, + "step": 44640 + }, + { + "epoch": 0.09847099792031017, + "grad_norm": 0.1076062023639679, + "learning_rate": 2.920599690149521e-05, + "loss": 0.0377, + "step": 44650 + }, + { + "epoch": 0.09849305189520834, + "grad_norm": 0.10505980998277664, + "learning_rate": 2.9205465989258894e-05, + "loss": 0.0354, + "step": 44660 + }, + { + "epoch": 0.09851510587010649, + "grad_norm": 0.10469616204500198, + "learning_rate": 2.9204934904412757e-05, + "loss": 0.0369, + "step": 44670 + }, + { + "epoch": 0.09853715984500466, + "grad_norm": 0.12357405573129654, + "learning_rate": 2.920440364696325e-05, + "loss": 0.0372, + "step": 44680 + }, + { + "epoch": 0.09855921381990283, + "grad_norm": 0.11335019022226334, + "learning_rate": 2.920387221691682e-05, + "loss": 0.0382, + "step": 44690 + }, + { + "epoch": 0.09858126779480099, + "grad_norm": 0.1023576408624649, + "learning_rate": 2.9203340614279936e-05, + "loss": 0.0366, + "step": 44700 + }, + { + "epoch": 0.09860332176969916, + "grad_norm": 0.13375185430049896, + "learning_rate": 2.920280883905906e-05, + "loss": 0.0377, + "step": 44710 + }, + { + "epoch": 0.09862537574459733, + "grad_norm": 0.1414109766483307, + "learning_rate": 2.9202276891260635e-05, + "loss": 0.0356, + "step": 44720 + }, + { + "epoch": 0.0986474297194955, + "grad_norm": 0.09997530281543732, + "learning_rate": 2.9201744770891145e-05, + "loss": 0.0367, + "step": 44730 + }, + { + "epoch": 0.09866948369439366, + "grad_norm": 0.13700911402702332, + "learning_rate": 2.9201212477957043e-05, + "loss": 0.0373, + "step": 44740 + }, + { + "epoch": 0.09869153766929183, + "grad_norm": 0.11504513025283813, + "learning_rate": 2.9200680012464803e-05, + "loss": 0.0381, + "step": 44750 + }, + { + "epoch": 0.09871359164419, + "grad_norm": 0.11944910138845444, + "learning_rate": 2.9200147374420896e-05, + "loss": 0.0358, + "step": 44760 + }, + { + "epoch": 0.09873564561908815, + "grad_norm": 0.11435167491436005, + "learning_rate": 2.9199614563831787e-05, + "loss": 0.0373, + "step": 44770 + }, + { + "epoch": 0.09875769959398632, + "grad_norm": 0.09770482033491135, + "learning_rate": 2.9199081580703953e-05, + "loss": 0.0361, + "step": 44780 + }, + { + "epoch": 0.09877975356888449, + "grad_norm": 0.1197504997253418, + "learning_rate": 2.9198548425043874e-05, + "loss": 0.0393, + "step": 44790 + }, + { + "epoch": 0.09880180754378265, + "grad_norm": 0.1527128964662552, + "learning_rate": 2.9198015096858026e-05, + "loss": 0.0395, + "step": 44800 + }, + { + "epoch": 0.09882386151868082, + "grad_norm": 0.08281707763671875, + "learning_rate": 2.9197481596152892e-05, + "loss": 0.0377, + "step": 44810 + }, + { + "epoch": 0.09884591549357899, + "grad_norm": 0.12534701824188232, + "learning_rate": 2.919694792293495e-05, + "loss": 0.0375, + "step": 44820 + }, + { + "epoch": 0.09886796946847715, + "grad_norm": 0.08409936726093292, + "learning_rate": 2.9196414077210687e-05, + "loss": 0.038, + "step": 44830 + }, + { + "epoch": 0.09889002344337532, + "grad_norm": 0.160761296749115, + "learning_rate": 2.919588005898659e-05, + "loss": 0.0387, + "step": 44840 + }, + { + "epoch": 0.09891207741827349, + "grad_norm": 0.13598452508449554, + "learning_rate": 2.9195345868269142e-05, + "loss": 0.0364, + "step": 44850 + }, + { + "epoch": 0.09893413139317164, + "grad_norm": 0.08993897587060928, + "learning_rate": 2.9194811505064842e-05, + "loss": 0.0364, + "step": 44860 + }, + { + "epoch": 0.09895618536806981, + "grad_norm": 0.10366132110357285, + "learning_rate": 2.919427696938018e-05, + "loss": 0.0367, + "step": 44870 + }, + { + "epoch": 0.09897823934296798, + "grad_norm": 0.14724357426166534, + "learning_rate": 2.9193742261221657e-05, + "loss": 0.0374, + "step": 44880 + }, + { + "epoch": 0.09900029331786614, + "grad_norm": 0.0908462181687355, + "learning_rate": 2.919320738059576e-05, + "loss": 0.0373, + "step": 44890 + }, + { + "epoch": 0.09902234729276431, + "grad_norm": 0.13204284012317657, + "learning_rate": 2.9192672327508988e-05, + "loss": 0.0377, + "step": 44900 + }, + { + "epoch": 0.09904440126766248, + "grad_norm": 0.1299864500761032, + "learning_rate": 2.9192137101967855e-05, + "loss": 0.0379, + "step": 44910 + }, + { + "epoch": 0.09906645524256065, + "grad_norm": 0.1324508935213089, + "learning_rate": 2.9191601703978853e-05, + "loss": 0.038, + "step": 44920 + }, + { + "epoch": 0.0990885092174588, + "grad_norm": 0.15731103718280792, + "learning_rate": 2.9191066133548486e-05, + "loss": 0.038, + "step": 44930 + }, + { + "epoch": 0.09911056319235698, + "grad_norm": 0.16685760021209717, + "learning_rate": 2.9190530390683278e-05, + "loss": 0.0382, + "step": 44940 + }, + { + "epoch": 0.09913261716725515, + "grad_norm": 0.08045167475938797, + "learning_rate": 2.918999447538972e-05, + "loss": 0.0349, + "step": 44950 + }, + { + "epoch": 0.0991546711421533, + "grad_norm": 0.13191638886928558, + "learning_rate": 2.9189458387674333e-05, + "loss": 0.0363, + "step": 44960 + }, + { + "epoch": 0.09917672511705147, + "grad_norm": 0.12008613348007202, + "learning_rate": 2.9188922127543627e-05, + "loss": 0.0376, + "step": 44970 + }, + { + "epoch": 0.09919877909194964, + "grad_norm": 0.12696132063865662, + "learning_rate": 2.9188385695004125e-05, + "loss": 0.0371, + "step": 44980 + }, + { + "epoch": 0.0992208330668478, + "grad_norm": 0.12794092297554016, + "learning_rate": 2.9187849090062336e-05, + "loss": 0.0358, + "step": 44990 + }, + { + "epoch": 0.09924288704174597, + "grad_norm": 0.10579078644514084, + "learning_rate": 2.918731231272479e-05, + "loss": 0.0388, + "step": 45000 + }, + { + "epoch": 0.09926494101664414, + "grad_norm": 0.12450622767210007, + "learning_rate": 2.9186775362998004e-05, + "loss": 0.0382, + "step": 45010 + }, + { + "epoch": 0.0992869949915423, + "grad_norm": 0.11749054491519928, + "learning_rate": 2.9186238240888502e-05, + "loss": 0.0366, + "step": 45020 + }, + { + "epoch": 0.09930904896644047, + "grad_norm": 0.11553019285202026, + "learning_rate": 2.918570094640281e-05, + "loss": 0.0371, + "step": 45030 + }, + { + "epoch": 0.09933110294133864, + "grad_norm": 0.11630476266145706, + "learning_rate": 2.9185163479547458e-05, + "loss": 0.0364, + "step": 45040 + }, + { + "epoch": 0.09935315691623679, + "grad_norm": 0.12275945395231247, + "learning_rate": 2.9184625840328976e-05, + "loss": 0.0377, + "step": 45050 + }, + { + "epoch": 0.09937521089113496, + "grad_norm": 0.12841123342514038, + "learning_rate": 2.91840880287539e-05, + "loss": 0.0393, + "step": 45060 + }, + { + "epoch": 0.09939726486603313, + "grad_norm": 0.14343711733818054, + "learning_rate": 2.918355004482876e-05, + "loss": 0.0357, + "step": 45070 + }, + { + "epoch": 0.09941931884093129, + "grad_norm": 0.1394302099943161, + "learning_rate": 2.91830118885601e-05, + "loss": 0.0375, + "step": 45080 + }, + { + "epoch": 0.09944137281582946, + "grad_norm": 0.15860800445079803, + "learning_rate": 2.918247355995445e-05, + "loss": 0.0387, + "step": 45090 + }, + { + "epoch": 0.09946342679072763, + "grad_norm": 0.12147045880556107, + "learning_rate": 2.9181935059018357e-05, + "loss": 0.0374, + "step": 45100 + }, + { + "epoch": 0.09948548076562579, + "grad_norm": 0.11304612457752228, + "learning_rate": 2.9181396385758364e-05, + "loss": 0.0374, + "step": 45110 + }, + { + "epoch": 0.09950753474052396, + "grad_norm": 0.08928494900465012, + "learning_rate": 2.9180857540181018e-05, + "loss": 0.0374, + "step": 45120 + }, + { + "epoch": 0.09952958871542213, + "grad_norm": 0.12170293182134628, + "learning_rate": 2.918031852229286e-05, + "loss": 0.0409, + "step": 45130 + }, + { + "epoch": 0.0995516426903203, + "grad_norm": 0.11698776483535767, + "learning_rate": 2.917977933210044e-05, + "loss": 0.0394, + "step": 45140 + }, + { + "epoch": 0.09957369666521845, + "grad_norm": 0.12910783290863037, + "learning_rate": 2.9179239969610317e-05, + "loss": 0.0376, + "step": 45150 + }, + { + "epoch": 0.09959575064011662, + "grad_norm": 0.10041418671607971, + "learning_rate": 2.917870043482904e-05, + "loss": 0.0371, + "step": 45160 + }, + { + "epoch": 0.0996178046150148, + "grad_norm": 0.11911646276712418, + "learning_rate": 2.917816072776317e-05, + "loss": 0.0382, + "step": 45170 + }, + { + "epoch": 0.09963985858991295, + "grad_norm": 0.13230785727500916, + "learning_rate": 2.917762084841926e-05, + "loss": 0.0401, + "step": 45180 + }, + { + "epoch": 0.09966191256481112, + "grad_norm": 0.12252291291952133, + "learning_rate": 2.9177080796803864e-05, + "loss": 0.0392, + "step": 45190 + }, + { + "epoch": 0.09968396653970929, + "grad_norm": 0.12087313085794449, + "learning_rate": 2.9176540572923555e-05, + "loss": 0.038, + "step": 45200 + }, + { + "epoch": 0.09970602051460745, + "grad_norm": 0.10890162736177444, + "learning_rate": 2.9176000176784897e-05, + "loss": 0.0392, + "step": 45210 + }, + { + "epoch": 0.09972807448950562, + "grad_norm": 0.07596047967672348, + "learning_rate": 2.917545960839445e-05, + "loss": 0.0351, + "step": 45220 + }, + { + "epoch": 0.09975012846440379, + "grad_norm": 0.12390361726284027, + "learning_rate": 2.9174918867758783e-05, + "loss": 0.0378, + "step": 45230 + }, + { + "epoch": 0.09977218243930194, + "grad_norm": 0.10867664963006973, + "learning_rate": 2.9174377954884467e-05, + "loss": 0.0386, + "step": 45240 + }, + { + "epoch": 0.09979423641420011, + "grad_norm": 0.11090076714754105, + "learning_rate": 2.9173836869778083e-05, + "loss": 0.0354, + "step": 45250 + }, + { + "epoch": 0.09981629038909828, + "grad_norm": 0.09985237568616867, + "learning_rate": 2.9173295612446194e-05, + "loss": 0.0357, + "step": 45260 + }, + { + "epoch": 0.09983834436399644, + "grad_norm": 0.11309163272380829, + "learning_rate": 2.917275418289538e-05, + "loss": 0.0361, + "step": 45270 + }, + { + "epoch": 0.09986039833889461, + "grad_norm": 0.14593505859375, + "learning_rate": 2.9172212581132223e-05, + "loss": 0.0371, + "step": 45280 + }, + { + "epoch": 0.09988245231379278, + "grad_norm": 0.0991249531507492, + "learning_rate": 2.91716708071633e-05, + "loss": 0.0348, + "step": 45290 + }, + { + "epoch": 0.09990450628869094, + "grad_norm": 0.1239691972732544, + "learning_rate": 2.91711288609952e-05, + "loss": 0.0398, + "step": 45300 + }, + { + "epoch": 0.0999265602635891, + "grad_norm": 0.12775474786758423, + "learning_rate": 2.9170586742634503e-05, + "loss": 0.0389, + "step": 45310 + }, + { + "epoch": 0.09994861423848728, + "grad_norm": 0.13801352679729462, + "learning_rate": 2.91700444520878e-05, + "loss": 0.0377, + "step": 45320 + }, + { + "epoch": 0.09997066821338543, + "grad_norm": 0.11557254940271378, + "learning_rate": 2.9169501989361677e-05, + "loss": 0.0377, + "step": 45330 + }, + { + "epoch": 0.0999927221882836, + "grad_norm": 0.1768360733985901, + "learning_rate": 2.9168959354462722e-05, + "loss": 0.0367, + "step": 45340 + }, + { + "epoch": 0.10001477616318177, + "grad_norm": 0.1618933528661728, + "learning_rate": 2.916841654739754e-05, + "loss": 0.0382, + "step": 45350 + }, + { + "epoch": 0.10003683013807994, + "grad_norm": 0.1429319977760315, + "learning_rate": 2.9167873568172718e-05, + "loss": 0.0344, + "step": 45360 + }, + { + "epoch": 0.1000588841129781, + "grad_norm": 0.10607057064771652, + "learning_rate": 2.9167330416794854e-05, + "loss": 0.0378, + "step": 45370 + }, + { + "epoch": 0.10008093808787627, + "grad_norm": 0.12309157848358154, + "learning_rate": 2.916678709327055e-05, + "loss": 0.038, + "step": 45380 + }, + { + "epoch": 0.10010299206277444, + "grad_norm": 0.13293077051639557, + "learning_rate": 2.9166243597606406e-05, + "loss": 0.0364, + "step": 45390 + }, + { + "epoch": 0.1001250460376726, + "grad_norm": 0.11051381379365921, + "learning_rate": 2.9165699929809027e-05, + "loss": 0.0369, + "step": 45400 + }, + { + "epoch": 0.10014710001257077, + "grad_norm": 0.10889843106269836, + "learning_rate": 2.916515608988502e-05, + "loss": 0.0357, + "step": 45410 + }, + { + "epoch": 0.10016915398746894, + "grad_norm": 0.10520482063293457, + "learning_rate": 2.9164612077840994e-05, + "loss": 0.0389, + "step": 45420 + }, + { + "epoch": 0.1001912079623671, + "grad_norm": 0.13283993303775787, + "learning_rate": 2.9164067893683555e-05, + "loss": 0.0371, + "step": 45430 + }, + { + "epoch": 0.10021326193726526, + "grad_norm": 0.12653803825378418, + "learning_rate": 2.916352353741932e-05, + "loss": 0.037, + "step": 45440 + }, + { + "epoch": 0.10023531591216343, + "grad_norm": 0.10016951709985733, + "learning_rate": 2.9162979009054902e-05, + "loss": 0.0364, + "step": 45450 + }, + { + "epoch": 0.10025736988706159, + "grad_norm": 0.11385589092969894, + "learning_rate": 2.9162434308596912e-05, + "loss": 0.0378, + "step": 45460 + }, + { + "epoch": 0.10027942386195976, + "grad_norm": 0.14161008596420288, + "learning_rate": 2.9161889436051978e-05, + "loss": 0.0398, + "step": 45470 + }, + { + "epoch": 0.10030147783685793, + "grad_norm": 0.11360528320074081, + "learning_rate": 2.9161344391426717e-05, + "loss": 0.0379, + "step": 45480 + }, + { + "epoch": 0.10032353181175609, + "grad_norm": 0.09382177889347076, + "learning_rate": 2.916079917472775e-05, + "loss": 0.038, + "step": 45490 + }, + { + "epoch": 0.10034558578665426, + "grad_norm": 0.11792922019958496, + "learning_rate": 2.9160253785961703e-05, + "loss": 0.0362, + "step": 45500 + }, + { + "epoch": 0.10036763976155243, + "grad_norm": 0.1090627908706665, + "learning_rate": 2.9159708225135204e-05, + "loss": 0.0357, + "step": 45510 + }, + { + "epoch": 0.10038969373645058, + "grad_norm": 0.15080980956554413, + "learning_rate": 2.9159162492254877e-05, + "loss": 0.0381, + "step": 45520 + }, + { + "epoch": 0.10041174771134875, + "grad_norm": 0.11488493531942368, + "learning_rate": 2.9158616587327363e-05, + "loss": 0.0377, + "step": 45530 + }, + { + "epoch": 0.10043380168624692, + "grad_norm": 0.12844565510749817, + "learning_rate": 2.9158070510359285e-05, + "loss": 0.0386, + "step": 45540 + }, + { + "epoch": 0.10045585566114508, + "grad_norm": 0.12213526666164398, + "learning_rate": 2.9157524261357283e-05, + "loss": 0.0386, + "step": 45550 + }, + { + "epoch": 0.10047790963604325, + "grad_norm": 0.13246412575244904, + "learning_rate": 2.9156977840327996e-05, + "loss": 0.0365, + "step": 45560 + }, + { + "epoch": 0.10049996361094142, + "grad_norm": 0.10433603078126907, + "learning_rate": 2.915643124727806e-05, + "loss": 0.0361, + "step": 45570 + }, + { + "epoch": 0.10052201758583959, + "grad_norm": 0.12960416078567505, + "learning_rate": 2.9155884482214125e-05, + "loss": 0.0383, + "step": 45580 + }, + { + "epoch": 0.10054407156073775, + "grad_norm": 0.11875034868717194, + "learning_rate": 2.9155337545142822e-05, + "loss": 0.0377, + "step": 45590 + }, + { + "epoch": 0.10056612553563592, + "grad_norm": 0.14657673239707947, + "learning_rate": 2.9154790436070803e-05, + "loss": 0.0368, + "step": 45600 + }, + { + "epoch": 0.10058817951053409, + "grad_norm": 0.13483916223049164, + "learning_rate": 2.915424315500472e-05, + "loss": 0.0369, + "step": 45610 + }, + { + "epoch": 0.10061023348543224, + "grad_norm": 0.11952594667673111, + "learning_rate": 2.915369570195121e-05, + "loss": 0.037, + "step": 45620 + }, + { + "epoch": 0.10063228746033041, + "grad_norm": 0.12008289992809296, + "learning_rate": 2.9153148076916943e-05, + "loss": 0.0356, + "step": 45630 + }, + { + "epoch": 0.10065434143522858, + "grad_norm": 0.10276904702186584, + "learning_rate": 2.9152600279908562e-05, + "loss": 0.0359, + "step": 45640 + }, + { + "epoch": 0.10067639541012674, + "grad_norm": 0.10359478741884232, + "learning_rate": 2.915205231093272e-05, + "loss": 0.0365, + "step": 45650 + }, + { + "epoch": 0.10069844938502491, + "grad_norm": 0.10094120353460312, + "learning_rate": 2.9151504169996085e-05, + "loss": 0.039, + "step": 45660 + }, + { + "epoch": 0.10072050335992308, + "grad_norm": 0.11577059328556061, + "learning_rate": 2.9150955857105314e-05, + "loss": 0.0382, + "step": 45670 + }, + { + "epoch": 0.10074255733482124, + "grad_norm": 0.11713777482509613, + "learning_rate": 2.9150407372267067e-05, + "loss": 0.0381, + "step": 45680 + }, + { + "epoch": 0.1007646113097194, + "grad_norm": 0.13077546656131744, + "learning_rate": 2.914985871548801e-05, + "loss": 0.037, + "step": 45690 + }, + { + "epoch": 0.10078666528461758, + "grad_norm": 0.12592434883117676, + "learning_rate": 2.9149309886774812e-05, + "loss": 0.0368, + "step": 45700 + }, + { + "epoch": 0.10080871925951573, + "grad_norm": 0.11944019049406052, + "learning_rate": 2.914876088613414e-05, + "loss": 0.0374, + "step": 45710 + }, + { + "epoch": 0.1008307732344139, + "grad_norm": 0.08893699944019318, + "learning_rate": 2.914821171357266e-05, + "loss": 0.0382, + "step": 45720 + }, + { + "epoch": 0.10085282720931207, + "grad_norm": 0.15596777200698853, + "learning_rate": 2.914766236909705e-05, + "loss": 0.0363, + "step": 45730 + }, + { + "epoch": 0.10087488118421023, + "grad_norm": 0.13720904290676117, + "learning_rate": 2.914711285271399e-05, + "loss": 0.0369, + "step": 45740 + }, + { + "epoch": 0.1008969351591084, + "grad_norm": 0.1417921930551529, + "learning_rate": 2.914656316443015e-05, + "loss": 0.0366, + "step": 45750 + }, + { + "epoch": 0.10091898913400657, + "grad_norm": 0.1300865113735199, + "learning_rate": 2.914601330425221e-05, + "loss": 0.0386, + "step": 45760 + }, + { + "epoch": 0.10094104310890473, + "grad_norm": 0.12429486215114594, + "learning_rate": 2.9145463272186854e-05, + "loss": 0.0379, + "step": 45770 + }, + { + "epoch": 0.1009630970838029, + "grad_norm": 0.11834459006786346, + "learning_rate": 2.914491306824076e-05, + "loss": 0.0357, + "step": 45780 + }, + { + "epoch": 0.10098515105870107, + "grad_norm": 0.10950759798288345, + "learning_rate": 2.914436269242062e-05, + "loss": 0.0373, + "step": 45790 + }, + { + "epoch": 0.10100720503359924, + "grad_norm": 0.10517150908708572, + "learning_rate": 2.9143812144733116e-05, + "loss": 0.0366, + "step": 45800 + }, + { + "epoch": 0.1010292590084974, + "grad_norm": 0.1498105823993683, + "learning_rate": 2.914326142518494e-05, + "loss": 0.0373, + "step": 45810 + }, + { + "epoch": 0.10105131298339556, + "grad_norm": 0.10326079279184341, + "learning_rate": 2.9142710533782788e-05, + "loss": 0.0383, + "step": 45820 + }, + { + "epoch": 0.10107336695829373, + "grad_norm": 0.1134495735168457, + "learning_rate": 2.914215947053335e-05, + "loss": 0.0384, + "step": 45830 + }, + { + "epoch": 0.10109542093319189, + "grad_norm": 0.13497091829776764, + "learning_rate": 2.914160823544332e-05, + "loss": 0.0392, + "step": 45840 + }, + { + "epoch": 0.10111747490809006, + "grad_norm": 0.0920683816075325, + "learning_rate": 2.91410568285194e-05, + "loss": 0.0355, + "step": 45850 + }, + { + "epoch": 0.10113952888298823, + "grad_norm": 0.1036752462387085, + "learning_rate": 2.9140505249768285e-05, + "loss": 0.0388, + "step": 45860 + }, + { + "epoch": 0.10116158285788639, + "grad_norm": 0.10840488970279694, + "learning_rate": 2.9139953499196686e-05, + "loss": 0.0395, + "step": 45870 + }, + { + "epoch": 0.10118363683278456, + "grad_norm": 0.13162866234779358, + "learning_rate": 2.9139401576811297e-05, + "loss": 0.0381, + "step": 45880 + }, + { + "epoch": 0.10120569080768273, + "grad_norm": 0.11726013571023941, + "learning_rate": 2.913884948261883e-05, + "loss": 0.0367, + "step": 45890 + }, + { + "epoch": 0.10122774478258088, + "grad_norm": 0.17545638978481293, + "learning_rate": 2.9138297216625993e-05, + "loss": 0.0404, + "step": 45900 + }, + { + "epoch": 0.10124979875747905, + "grad_norm": 0.11170405894517899, + "learning_rate": 2.9137744778839498e-05, + "loss": 0.0371, + "step": 45910 + }, + { + "epoch": 0.10127185273237722, + "grad_norm": 0.10083901882171631, + "learning_rate": 2.913719216926605e-05, + "loss": 0.0374, + "step": 45920 + }, + { + "epoch": 0.10129390670727538, + "grad_norm": 0.10762768238782883, + "learning_rate": 2.913663938791237e-05, + "loss": 0.0375, + "step": 45930 + }, + { + "epoch": 0.10131596068217355, + "grad_norm": 0.14354529976844788, + "learning_rate": 2.913608643478518e-05, + "loss": 0.0398, + "step": 45940 + }, + { + "epoch": 0.10133801465707172, + "grad_norm": 0.1344933956861496, + "learning_rate": 2.913553330989119e-05, + "loss": 0.0374, + "step": 45950 + }, + { + "epoch": 0.10136006863196988, + "grad_norm": 0.09981706738471985, + "learning_rate": 2.9134980013237126e-05, + "loss": 0.0363, + "step": 45960 + }, + { + "epoch": 0.10138212260686805, + "grad_norm": 0.0981733426451683, + "learning_rate": 2.9134426544829706e-05, + "loss": 0.0365, + "step": 45970 + }, + { + "epoch": 0.10140417658176622, + "grad_norm": 0.09803875535726547, + "learning_rate": 2.913387290467566e-05, + "loss": 0.0378, + "step": 45980 + }, + { + "epoch": 0.10142623055666437, + "grad_norm": 0.12178022414445877, + "learning_rate": 2.9133319092781715e-05, + "loss": 0.0383, + "step": 45990 + }, + { + "epoch": 0.10144828453156254, + "grad_norm": 0.13770955801010132, + "learning_rate": 2.9132765109154596e-05, + "loss": 0.0377, + "step": 46000 + }, + { + "epoch": 0.10147033850646071, + "grad_norm": 0.13428817689418793, + "learning_rate": 2.9132210953801036e-05, + "loss": 0.0382, + "step": 46010 + }, + { + "epoch": 0.10149239248135888, + "grad_norm": 0.11139379441738129, + "learning_rate": 2.9131656626727773e-05, + "loss": 0.0395, + "step": 46020 + }, + { + "epoch": 0.10151444645625704, + "grad_norm": 0.1318839192390442, + "learning_rate": 2.913110212794154e-05, + "loss": 0.0391, + "step": 46030 + }, + { + "epoch": 0.10153650043115521, + "grad_norm": 0.10849083214998245, + "learning_rate": 2.913054745744907e-05, + "loss": 0.0373, + "step": 46040 + }, + { + "epoch": 0.10155855440605338, + "grad_norm": 0.12272291630506516, + "learning_rate": 2.912999261525711e-05, + "loss": 0.0351, + "step": 46050 + }, + { + "epoch": 0.10158060838095154, + "grad_norm": 0.13228999078273773, + "learning_rate": 2.91294376013724e-05, + "loss": 0.0354, + "step": 46060 + }, + { + "epoch": 0.10160266235584971, + "grad_norm": 0.11049360781908035, + "learning_rate": 2.9128882415801686e-05, + "loss": 0.0389, + "step": 46070 + }, + { + "epoch": 0.10162471633074788, + "grad_norm": 0.11158935725688934, + "learning_rate": 2.9128327058551706e-05, + "loss": 0.0369, + "step": 46080 + }, + { + "epoch": 0.10164677030564603, + "grad_norm": 0.11589881032705307, + "learning_rate": 2.912777152962921e-05, + "loss": 0.0408, + "step": 46090 + }, + { + "epoch": 0.1016688242805442, + "grad_norm": 0.1352037936449051, + "learning_rate": 2.9127215829040957e-05, + "loss": 0.0389, + "step": 46100 + }, + { + "epoch": 0.10169087825544237, + "grad_norm": 0.0929180309176445, + "learning_rate": 2.912665995679369e-05, + "loss": 0.0369, + "step": 46110 + }, + { + "epoch": 0.10171293223034053, + "grad_norm": 0.11073780059814453, + "learning_rate": 2.9126103912894167e-05, + "loss": 0.0373, + "step": 46120 + }, + { + "epoch": 0.1017349862052387, + "grad_norm": 0.08411310613155365, + "learning_rate": 2.9125547697349143e-05, + "loss": 0.038, + "step": 46130 + }, + { + "epoch": 0.10175704018013687, + "grad_norm": 0.1054982990026474, + "learning_rate": 2.912499131016538e-05, + "loss": 0.0393, + "step": 46140 + }, + { + "epoch": 0.10177909415503503, + "grad_norm": 0.10351766645908356, + "learning_rate": 2.9124434751349637e-05, + "loss": 0.0377, + "step": 46150 + }, + { + "epoch": 0.1018011481299332, + "grad_norm": 0.12929944694042206, + "learning_rate": 2.9123878020908677e-05, + "loss": 0.0374, + "step": 46160 + }, + { + "epoch": 0.10182320210483137, + "grad_norm": 0.10718733072280884, + "learning_rate": 2.9123321118849263e-05, + "loss": 0.0379, + "step": 46170 + }, + { + "epoch": 0.10184525607972952, + "grad_norm": 0.13563290238380432, + "learning_rate": 2.912276404517816e-05, + "loss": 0.0375, + "step": 46180 + }, + { + "epoch": 0.1018673100546277, + "grad_norm": 0.12019297480583191, + "learning_rate": 2.912220679990214e-05, + "loss": 0.0362, + "step": 46190 + }, + { + "epoch": 0.10188936402952586, + "grad_norm": 0.1357831209897995, + "learning_rate": 2.9121649383027977e-05, + "loss": 0.0371, + "step": 46200 + }, + { + "epoch": 0.10191141800442403, + "grad_norm": 0.13220693171024323, + "learning_rate": 2.9121091794562438e-05, + "loss": 0.0381, + "step": 46210 + }, + { + "epoch": 0.10193347197932219, + "grad_norm": 0.14979304373264313, + "learning_rate": 2.9120534034512303e-05, + "loss": 0.0403, + "step": 46220 + }, + { + "epoch": 0.10195552595422036, + "grad_norm": 0.1444408893585205, + "learning_rate": 2.9119976102884345e-05, + "loss": 0.0366, + "step": 46230 + }, + { + "epoch": 0.10197757992911853, + "grad_norm": 0.10966868698596954, + "learning_rate": 2.9119417999685348e-05, + "loss": 0.0391, + "step": 46240 + }, + { + "epoch": 0.10199963390401669, + "grad_norm": 0.12861675024032593, + "learning_rate": 2.911885972492209e-05, + "loss": 0.0365, + "step": 46250 + }, + { + "epoch": 0.10202168787891486, + "grad_norm": 0.12193726748228073, + "learning_rate": 2.911830127860136e-05, + "loss": 0.0365, + "step": 46260 + }, + { + "epoch": 0.10204374185381303, + "grad_norm": 0.09668684750795364, + "learning_rate": 2.9117742660729937e-05, + "loss": 0.0372, + "step": 46270 + }, + { + "epoch": 0.10206579582871118, + "grad_norm": 0.12244599312543869, + "learning_rate": 2.9117183871314614e-05, + "loss": 0.0367, + "step": 46280 + }, + { + "epoch": 0.10208784980360935, + "grad_norm": 0.106871098279953, + "learning_rate": 2.9116624910362173e-05, + "loss": 0.0375, + "step": 46290 + }, + { + "epoch": 0.10210990377850752, + "grad_norm": 0.10951198637485504, + "learning_rate": 2.911606577787941e-05, + "loss": 0.0369, + "step": 46300 + }, + { + "epoch": 0.10213195775340568, + "grad_norm": 0.09900548309087753, + "learning_rate": 2.9115506473873127e-05, + "loss": 0.0377, + "step": 46310 + }, + { + "epoch": 0.10215401172830385, + "grad_norm": 0.1155342161655426, + "learning_rate": 2.911494699835011e-05, + "loss": 0.0384, + "step": 46320 + }, + { + "epoch": 0.10217606570320202, + "grad_norm": 0.08806316554546356, + "learning_rate": 2.9114387351317155e-05, + "loss": 0.0392, + "step": 46330 + }, + { + "epoch": 0.10219811967810018, + "grad_norm": 0.1250787377357483, + "learning_rate": 2.911382753278107e-05, + "loss": 0.0388, + "step": 46340 + }, + { + "epoch": 0.10222017365299835, + "grad_norm": 0.10284461081027985, + "learning_rate": 2.911326754274866e-05, + "loss": 0.0368, + "step": 46350 + }, + { + "epoch": 0.10224222762789652, + "grad_norm": 0.12095363438129425, + "learning_rate": 2.911270738122672e-05, + "loss": 0.0391, + "step": 46360 + }, + { + "epoch": 0.10226428160279467, + "grad_norm": 0.12978509068489075, + "learning_rate": 2.9112147048222064e-05, + "loss": 0.0373, + "step": 46370 + }, + { + "epoch": 0.10228633557769284, + "grad_norm": 0.11636801809072495, + "learning_rate": 2.9111586543741494e-05, + "loss": 0.0383, + "step": 46380 + }, + { + "epoch": 0.10230838955259101, + "grad_norm": 0.14056958258152008, + "learning_rate": 2.911102586779183e-05, + "loss": 0.0372, + "step": 46390 + }, + { + "epoch": 0.10233044352748917, + "grad_norm": 0.10192166268825531, + "learning_rate": 2.911046502037987e-05, + "loss": 0.0379, + "step": 46400 + }, + { + "epoch": 0.10235249750238734, + "grad_norm": 0.11542250216007233, + "learning_rate": 2.910990400151244e-05, + "loss": 0.0363, + "step": 46410 + }, + { + "epoch": 0.10237455147728551, + "grad_norm": 0.13574329018592834, + "learning_rate": 2.9109342811196356e-05, + "loss": 0.0364, + "step": 46420 + }, + { + "epoch": 0.10239660545218368, + "grad_norm": 0.1270887404680252, + "learning_rate": 2.9108781449438433e-05, + "loss": 0.0384, + "step": 46430 + }, + { + "epoch": 0.10241865942708184, + "grad_norm": 0.13016264140605927, + "learning_rate": 2.91082199162455e-05, + "loss": 0.0363, + "step": 46440 + }, + { + "epoch": 0.10244071340198001, + "grad_norm": 0.13622526824474335, + "learning_rate": 2.910765821162437e-05, + "loss": 0.0378, + "step": 46450 + }, + { + "epoch": 0.10246276737687818, + "grad_norm": 0.13896986842155457, + "learning_rate": 2.9107096335581874e-05, + "loss": 0.0375, + "step": 46460 + }, + { + "epoch": 0.10248482135177633, + "grad_norm": 0.11438757181167603, + "learning_rate": 2.9106534288124836e-05, + "loss": 0.0378, + "step": 46470 + }, + { + "epoch": 0.1025068753266745, + "grad_norm": 0.1518443375825882, + "learning_rate": 2.910597206926009e-05, + "loss": 0.0393, + "step": 46480 + }, + { + "epoch": 0.10252892930157267, + "grad_norm": 0.11636842787265778, + "learning_rate": 2.9105409678994467e-05, + "loss": 0.0385, + "step": 46490 + }, + { + "epoch": 0.10255098327647083, + "grad_norm": 0.11367147415876389, + "learning_rate": 2.91048471173348e-05, + "loss": 0.037, + "step": 46500 + }, + { + "epoch": 0.102573037251369, + "grad_norm": 0.14609365165233612, + "learning_rate": 2.910428438428792e-05, + "loss": 0.0383, + "step": 46510 + }, + { + "epoch": 0.10259509122626717, + "grad_norm": 0.10375959426164627, + "learning_rate": 2.9103721479860663e-05, + "loss": 0.0386, + "step": 46520 + }, + { + "epoch": 0.10261714520116533, + "grad_norm": 0.09729067236185074, + "learning_rate": 2.910315840405988e-05, + "loss": 0.0375, + "step": 46530 + }, + { + "epoch": 0.1026391991760635, + "grad_norm": 0.09343072772026062, + "learning_rate": 2.9102595156892407e-05, + "loss": 0.0381, + "step": 46540 + }, + { + "epoch": 0.10266125315096167, + "grad_norm": 0.08640685677528381, + "learning_rate": 2.9102031738365086e-05, + "loss": 0.0374, + "step": 46550 + }, + { + "epoch": 0.10268330712585982, + "grad_norm": 0.11533930897712708, + "learning_rate": 2.9101468148484765e-05, + "loss": 0.0397, + "step": 46560 + }, + { + "epoch": 0.102705361100758, + "grad_norm": 0.08839742094278336, + "learning_rate": 2.9100904387258292e-05, + "loss": 0.0359, + "step": 46570 + }, + { + "epoch": 0.10272741507565616, + "grad_norm": 0.12593525648117065, + "learning_rate": 2.9100340454692517e-05, + "loss": 0.0368, + "step": 46580 + }, + { + "epoch": 0.10274946905055432, + "grad_norm": 0.12176503241062164, + "learning_rate": 2.9099776350794294e-05, + "loss": 0.0376, + "step": 46590 + }, + { + "epoch": 0.10277152302545249, + "grad_norm": 0.09820359200239182, + "learning_rate": 2.9099212075570477e-05, + "loss": 0.0348, + "step": 46600 + }, + { + "epoch": 0.10279357700035066, + "grad_norm": 0.11656316369771957, + "learning_rate": 2.909864762902792e-05, + "loss": 0.0354, + "step": 46610 + }, + { + "epoch": 0.10281563097524882, + "grad_norm": 0.09612737596035004, + "learning_rate": 2.909808301117348e-05, + "loss": 0.0374, + "step": 46620 + }, + { + "epoch": 0.10283768495014699, + "grad_norm": 0.09097306430339813, + "learning_rate": 2.9097518222014023e-05, + "loss": 0.038, + "step": 46630 + }, + { + "epoch": 0.10285973892504516, + "grad_norm": 0.09672588109970093, + "learning_rate": 2.9096953261556413e-05, + "loss": 0.0377, + "step": 46640 + }, + { + "epoch": 0.10288179289994333, + "grad_norm": 0.14301994442939758, + "learning_rate": 2.9096388129807506e-05, + "loss": 0.0373, + "step": 46650 + }, + { + "epoch": 0.10290384687484148, + "grad_norm": 0.14896433055400848, + "learning_rate": 2.9095822826774176e-05, + "loss": 0.0367, + "step": 46660 + }, + { + "epoch": 0.10292590084973965, + "grad_norm": 0.13764716684818268, + "learning_rate": 2.9095257352463286e-05, + "loss": 0.0392, + "step": 46670 + }, + { + "epoch": 0.10294795482463782, + "grad_norm": 0.11526267230510712, + "learning_rate": 2.909469170688172e-05, + "loss": 0.0393, + "step": 46680 + }, + { + "epoch": 0.10297000879953598, + "grad_norm": 0.15489087998867035, + "learning_rate": 2.9094125890036337e-05, + "loss": 0.0354, + "step": 46690 + }, + { + "epoch": 0.10299206277443415, + "grad_norm": 0.10401146858930588, + "learning_rate": 2.9093559901934018e-05, + "loss": 0.0366, + "step": 46700 + }, + { + "epoch": 0.10301411674933232, + "grad_norm": 0.08119211345911026, + "learning_rate": 2.9092993742581637e-05, + "loss": 0.0362, + "step": 46710 + }, + { + "epoch": 0.10303617072423048, + "grad_norm": 0.11046617478132248, + "learning_rate": 2.909242741198608e-05, + "loss": 0.0382, + "step": 46720 + }, + { + "epoch": 0.10305822469912865, + "grad_norm": 0.09451530873775482, + "learning_rate": 2.909186091015422e-05, + "loss": 0.0383, + "step": 46730 + }, + { + "epoch": 0.10308027867402682, + "grad_norm": 0.11983360350131989, + "learning_rate": 2.9091294237092946e-05, + "loss": 0.0368, + "step": 46740 + }, + { + "epoch": 0.10310233264892497, + "grad_norm": 0.14327740669250488, + "learning_rate": 2.9090727392809147e-05, + "loss": 0.037, + "step": 46750 + }, + { + "epoch": 0.10312438662382314, + "grad_norm": 0.14220090210437775, + "learning_rate": 2.90901603773097e-05, + "loss": 0.0358, + "step": 46760 + }, + { + "epoch": 0.10314644059872131, + "grad_norm": 0.10328055173158646, + "learning_rate": 2.9089593190601506e-05, + "loss": 0.0351, + "step": 46770 + }, + { + "epoch": 0.10316849457361947, + "grad_norm": 0.1224428191781044, + "learning_rate": 2.908902583269145e-05, + "loss": 0.0387, + "step": 46780 + }, + { + "epoch": 0.10319054854851764, + "grad_norm": 0.12402389943599701, + "learning_rate": 2.908845830358643e-05, + "loss": 0.0383, + "step": 46790 + }, + { + "epoch": 0.10321260252341581, + "grad_norm": 0.1318332850933075, + "learning_rate": 2.9087890603293337e-05, + "loss": 0.0376, + "step": 46800 + }, + { + "epoch": 0.10323465649831397, + "grad_norm": 0.12372402846813202, + "learning_rate": 2.9087322731819074e-05, + "loss": 0.0371, + "step": 46810 + }, + { + "epoch": 0.10325671047321214, + "grad_norm": 0.13325943052768707, + "learning_rate": 2.9086754689170538e-05, + "loss": 0.038, + "step": 46820 + }, + { + "epoch": 0.10327876444811031, + "grad_norm": 0.10866396129131317, + "learning_rate": 2.9086186475354634e-05, + "loss": 0.0368, + "step": 46830 + }, + { + "epoch": 0.10330081842300846, + "grad_norm": 0.14262697100639343, + "learning_rate": 2.908561809037826e-05, + "loss": 0.0391, + "step": 46840 + }, + { + "epoch": 0.10332287239790663, + "grad_norm": 0.11439855396747589, + "learning_rate": 2.9085049534248333e-05, + "loss": 0.0377, + "step": 46850 + }, + { + "epoch": 0.1033449263728048, + "grad_norm": 0.17330598831176758, + "learning_rate": 2.9084480806971756e-05, + "loss": 0.0373, + "step": 46860 + }, + { + "epoch": 0.10336698034770297, + "grad_norm": 0.10806398838758469, + "learning_rate": 2.9083911908555435e-05, + "loss": 0.0389, + "step": 46870 + }, + { + "epoch": 0.10338903432260113, + "grad_norm": 0.114628367125988, + "learning_rate": 2.9083342839006285e-05, + "loss": 0.0402, + "step": 46880 + }, + { + "epoch": 0.1034110882974993, + "grad_norm": 0.13461586833000183, + "learning_rate": 2.9082773598331228e-05, + "loss": 0.038, + "step": 46890 + }, + { + "epoch": 0.10343314227239747, + "grad_norm": 0.13130109012126923, + "learning_rate": 2.9082204186537172e-05, + "loss": 0.0377, + "step": 46900 + }, + { + "epoch": 0.10345519624729563, + "grad_norm": 0.11325838416814804, + "learning_rate": 2.9081634603631046e-05, + "loss": 0.0371, + "step": 46910 + }, + { + "epoch": 0.1034772502221938, + "grad_norm": 0.12376372516155243, + "learning_rate": 2.9081064849619755e-05, + "loss": 0.0374, + "step": 46920 + }, + { + "epoch": 0.10349930419709197, + "grad_norm": 0.12167591601610184, + "learning_rate": 2.908049492451024e-05, + "loss": 0.0392, + "step": 46930 + }, + { + "epoch": 0.10352135817199012, + "grad_norm": 0.12807705998420715, + "learning_rate": 2.907992482830941e-05, + "loss": 0.0355, + "step": 46940 + }, + { + "epoch": 0.1035434121468883, + "grad_norm": 0.12477331608533859, + "learning_rate": 2.9079354561024204e-05, + "loss": 0.0383, + "step": 46950 + }, + { + "epoch": 0.10356546612178646, + "grad_norm": 0.11121120303869247, + "learning_rate": 2.9078784122661548e-05, + "loss": 0.0387, + "step": 46960 + }, + { + "epoch": 0.10358752009668462, + "grad_norm": 0.11421477049589157, + "learning_rate": 2.9078213513228367e-05, + "loss": 0.0386, + "step": 46970 + }, + { + "epoch": 0.10360957407158279, + "grad_norm": 0.12248174846172333, + "learning_rate": 2.9077642732731607e-05, + "loss": 0.0356, + "step": 46980 + }, + { + "epoch": 0.10363162804648096, + "grad_norm": 0.12638646364212036, + "learning_rate": 2.907707178117819e-05, + "loss": 0.0367, + "step": 46990 + }, + { + "epoch": 0.10365368202137912, + "grad_norm": 0.11392219364643097, + "learning_rate": 2.907650065857506e-05, + "loss": 0.0367, + "step": 47000 + }, + { + "epoch": 0.10367573599627729, + "grad_norm": 0.12198812514543533, + "learning_rate": 2.907592936492916e-05, + "loss": 0.0354, + "step": 47010 + }, + { + "epoch": 0.10369778997117546, + "grad_norm": 0.15838441252708435, + "learning_rate": 2.9075357900247428e-05, + "loss": 0.0374, + "step": 47020 + }, + { + "epoch": 0.10371984394607361, + "grad_norm": 0.10909576714038849, + "learning_rate": 2.907478626453681e-05, + "loss": 0.0415, + "step": 47030 + }, + { + "epoch": 0.10374189792097178, + "grad_norm": 0.12022197246551514, + "learning_rate": 2.9074214457804244e-05, + "loss": 0.0367, + "step": 47040 + }, + { + "epoch": 0.10376395189586995, + "grad_norm": 0.11173058301210403, + "learning_rate": 2.9073642480056686e-05, + "loss": 0.0375, + "step": 47050 + }, + { + "epoch": 0.10378600587076811, + "grad_norm": 0.12360398471355438, + "learning_rate": 2.9073070331301087e-05, + "loss": 0.0378, + "step": 47060 + }, + { + "epoch": 0.10380805984566628, + "grad_norm": 0.12041526287794113, + "learning_rate": 2.9072498011544393e-05, + "loss": 0.0352, + "step": 47070 + }, + { + "epoch": 0.10383011382056445, + "grad_norm": 0.10741525888442993, + "learning_rate": 2.9071925520793566e-05, + "loss": 0.0365, + "step": 47080 + }, + { + "epoch": 0.10385216779546262, + "grad_norm": 0.12002436816692352, + "learning_rate": 2.9071352859055553e-05, + "loss": 0.0374, + "step": 47090 + }, + { + "epoch": 0.10387422177036078, + "grad_norm": 0.10665277391672134, + "learning_rate": 2.907078002633732e-05, + "loss": 0.0356, + "step": 47100 + }, + { + "epoch": 0.10389627574525895, + "grad_norm": 0.11048771440982819, + "learning_rate": 2.9070207022645823e-05, + "loss": 0.0359, + "step": 47110 + }, + { + "epoch": 0.10391832972015712, + "grad_norm": 0.09142384678125381, + "learning_rate": 2.9069633847988025e-05, + "loss": 0.0375, + "step": 47120 + }, + { + "epoch": 0.10394038369505527, + "grad_norm": 0.09164609014987946, + "learning_rate": 2.9069060502370897e-05, + "loss": 0.038, + "step": 47130 + }, + { + "epoch": 0.10396243766995344, + "grad_norm": 0.10543341934680939, + "learning_rate": 2.90684869858014e-05, + "loss": 0.0376, + "step": 47140 + }, + { + "epoch": 0.10398449164485161, + "grad_norm": 0.09939872473478317, + "learning_rate": 2.90679132982865e-05, + "loss": 0.0366, + "step": 47150 + }, + { + "epoch": 0.10400654561974977, + "grad_norm": 0.11565528064966202, + "learning_rate": 2.9067339439833173e-05, + "loss": 0.039, + "step": 47160 + }, + { + "epoch": 0.10402859959464794, + "grad_norm": 0.12787917256355286, + "learning_rate": 2.9066765410448387e-05, + "loss": 0.04, + "step": 47170 + }, + { + "epoch": 0.10405065356954611, + "grad_norm": 0.1351260542869568, + "learning_rate": 2.906619121013912e-05, + "loss": 0.0364, + "step": 47180 + }, + { + "epoch": 0.10407270754444427, + "grad_norm": 0.10736798495054245, + "learning_rate": 2.9065616838912353e-05, + "loss": 0.0374, + "step": 47190 + }, + { + "epoch": 0.10409476151934244, + "grad_norm": 0.12187587469816208, + "learning_rate": 2.906504229677506e-05, + "loss": 0.0371, + "step": 47200 + }, + { + "epoch": 0.10411681549424061, + "grad_norm": 0.10181988030672073, + "learning_rate": 2.906446758373422e-05, + "loss": 0.0361, + "step": 47210 + }, + { + "epoch": 0.10413886946913876, + "grad_norm": 0.1113915666937828, + "learning_rate": 2.9063892699796825e-05, + "loss": 0.0391, + "step": 47220 + }, + { + "epoch": 0.10416092344403693, + "grad_norm": 0.08711808919906616, + "learning_rate": 2.9063317644969853e-05, + "loss": 0.0371, + "step": 47230 + }, + { + "epoch": 0.1041829774189351, + "grad_norm": 0.1113455519080162, + "learning_rate": 2.9062742419260293e-05, + "loss": 0.0387, + "step": 47240 + }, + { + "epoch": 0.10420503139383326, + "grad_norm": 0.11734091490507126, + "learning_rate": 2.9062167022675134e-05, + "loss": 0.0392, + "step": 47250 + }, + { + "epoch": 0.10422708536873143, + "grad_norm": 0.12895981967449188, + "learning_rate": 2.906159145522137e-05, + "loss": 0.037, + "step": 47260 + }, + { + "epoch": 0.1042491393436296, + "grad_norm": 0.0976707711815834, + "learning_rate": 2.9061015716905996e-05, + "loss": 0.0396, + "step": 47270 + }, + { + "epoch": 0.10427119331852777, + "grad_norm": 0.11513590812683105, + "learning_rate": 2.9060439807736003e-05, + "loss": 0.0369, + "step": 47280 + }, + { + "epoch": 0.10429324729342593, + "grad_norm": 0.16201208531856537, + "learning_rate": 2.9059863727718394e-05, + "loss": 0.0377, + "step": 47290 + }, + { + "epoch": 0.1043153012683241, + "grad_norm": 0.10229127109050751, + "learning_rate": 2.905928747686016e-05, + "loss": 0.0368, + "step": 47300 + }, + { + "epoch": 0.10433735524322227, + "grad_norm": 0.10307623445987701, + "learning_rate": 2.9058711055168313e-05, + "loss": 0.0364, + "step": 47310 + }, + { + "epoch": 0.10435940921812042, + "grad_norm": 0.10040172189474106, + "learning_rate": 2.905813446264985e-05, + "loss": 0.0365, + "step": 47320 + }, + { + "epoch": 0.1043814631930186, + "grad_norm": 0.11734889447689056, + "learning_rate": 2.9057557699311784e-05, + "loss": 0.0364, + "step": 47330 + }, + { + "epoch": 0.10440351716791677, + "grad_norm": 0.14550690352916718, + "learning_rate": 2.905698076516112e-05, + "loss": 0.0379, + "step": 47340 + }, + { + "epoch": 0.10442557114281492, + "grad_norm": 0.12374600768089294, + "learning_rate": 2.9056403660204865e-05, + "loss": 0.039, + "step": 47350 + }, + { + "epoch": 0.10444762511771309, + "grad_norm": 0.13883133232593536, + "learning_rate": 2.9055826384450035e-05, + "loss": 0.0372, + "step": 47360 + }, + { + "epoch": 0.10446967909261126, + "grad_norm": 0.1260153353214264, + "learning_rate": 2.9055248937903642e-05, + "loss": 0.0362, + "step": 47370 + }, + { + "epoch": 0.10449173306750942, + "grad_norm": 0.12143665552139282, + "learning_rate": 2.9054671320572706e-05, + "loss": 0.037, + "step": 47380 + }, + { + "epoch": 0.10451378704240759, + "grad_norm": 0.12415716052055359, + "learning_rate": 2.905409353246424e-05, + "loss": 0.0376, + "step": 47390 + }, + { + "epoch": 0.10453584101730576, + "grad_norm": 0.14062714576721191, + "learning_rate": 2.9053515573585274e-05, + "loss": 0.036, + "step": 47400 + }, + { + "epoch": 0.10455789499220391, + "grad_norm": 0.13110949099063873, + "learning_rate": 2.9052937443942824e-05, + "loss": 0.0385, + "step": 47410 + }, + { + "epoch": 0.10457994896710208, + "grad_norm": 0.15555912256240845, + "learning_rate": 2.9052359143543913e-05, + "loss": 0.0361, + "step": 47420 + }, + { + "epoch": 0.10460200294200026, + "grad_norm": 0.12060219049453735, + "learning_rate": 2.9051780672395574e-05, + "loss": 0.0376, + "step": 47430 + }, + { + "epoch": 0.10462405691689841, + "grad_norm": 0.10529151558876038, + "learning_rate": 2.9051202030504827e-05, + "loss": 0.0358, + "step": 47440 + }, + { + "epoch": 0.10464611089179658, + "grad_norm": 0.11101441830396652, + "learning_rate": 2.9050623217878716e-05, + "loss": 0.0373, + "step": 47450 + }, + { + "epoch": 0.10466816486669475, + "grad_norm": 0.10422898828983307, + "learning_rate": 2.905004423452426e-05, + "loss": 0.0365, + "step": 47460 + }, + { + "epoch": 0.10469021884159291, + "grad_norm": 0.12731201946735382, + "learning_rate": 2.9049465080448504e-05, + "loss": 0.0361, + "step": 47470 + }, + { + "epoch": 0.10471227281649108, + "grad_norm": 0.09277889132499695, + "learning_rate": 2.9048885755658487e-05, + "loss": 0.0371, + "step": 47480 + }, + { + "epoch": 0.10473432679138925, + "grad_norm": 0.11369840800762177, + "learning_rate": 2.9048306260161237e-05, + "loss": 0.0374, + "step": 47490 + }, + { + "epoch": 0.10475638076628742, + "grad_norm": 0.10821578651666641, + "learning_rate": 2.9047726593963803e-05, + "loss": 0.0365, + "step": 47500 + }, + { + "epoch": 0.10477843474118557, + "grad_norm": 0.10877927392721176, + "learning_rate": 2.904714675707323e-05, + "loss": 0.0372, + "step": 47510 + }, + { + "epoch": 0.10480048871608375, + "grad_norm": 0.11382758617401123, + "learning_rate": 2.9046566749496562e-05, + "loss": 0.0391, + "step": 47520 + }, + { + "epoch": 0.10482254269098192, + "grad_norm": 0.12171291559934616, + "learning_rate": 2.9045986571240844e-05, + "loss": 0.0351, + "step": 47530 + }, + { + "epoch": 0.10484459666588007, + "grad_norm": 0.13351991772651672, + "learning_rate": 2.9045406222313126e-05, + "loss": 0.0388, + "step": 47540 + }, + { + "epoch": 0.10486665064077824, + "grad_norm": 0.10298920422792435, + "learning_rate": 2.9044825702720463e-05, + "loss": 0.0374, + "step": 47550 + }, + { + "epoch": 0.10488870461567641, + "grad_norm": 0.1288832128047943, + "learning_rate": 2.9044245012469913e-05, + "loss": 0.0354, + "step": 47560 + }, + { + "epoch": 0.10491075859057457, + "grad_norm": 0.1163022518157959, + "learning_rate": 2.904366415156852e-05, + "loss": 0.0358, + "step": 47570 + }, + { + "epoch": 0.10493281256547274, + "grad_norm": 0.1131763905286789, + "learning_rate": 2.9043083120023345e-05, + "loss": 0.0376, + "step": 47580 + }, + { + "epoch": 0.10495486654037091, + "grad_norm": 0.11960461735725403, + "learning_rate": 2.9042501917841454e-05, + "loss": 0.0355, + "step": 47590 + }, + { + "epoch": 0.10497692051526906, + "grad_norm": 0.15057791769504547, + "learning_rate": 2.904192054502991e-05, + "loss": 0.0373, + "step": 47600 + }, + { + "epoch": 0.10499897449016724, + "grad_norm": 0.11244943737983704, + "learning_rate": 2.9041339001595772e-05, + "loss": 0.0357, + "step": 47610 + }, + { + "epoch": 0.1050210284650654, + "grad_norm": 0.10088935494422913, + "learning_rate": 2.9040757287546104e-05, + "loss": 0.0383, + "step": 47620 + }, + { + "epoch": 0.10504308243996356, + "grad_norm": 0.12546589970588684, + "learning_rate": 2.9040175402887984e-05, + "loss": 0.0372, + "step": 47630 + }, + { + "epoch": 0.10506513641486173, + "grad_norm": 0.08847393095493317, + "learning_rate": 2.9039593347628474e-05, + "loss": 0.0376, + "step": 47640 + }, + { + "epoch": 0.1050871903897599, + "grad_norm": 0.1163749024271965, + "learning_rate": 2.9039011121774648e-05, + "loss": 0.0367, + "step": 47650 + }, + { + "epoch": 0.10510924436465806, + "grad_norm": 0.10562411695718765, + "learning_rate": 2.903842872533358e-05, + "loss": 0.0364, + "step": 47660 + }, + { + "epoch": 0.10513129833955623, + "grad_norm": 0.10924631357192993, + "learning_rate": 2.9037846158312356e-05, + "loss": 0.038, + "step": 47670 + }, + { + "epoch": 0.1051533523144544, + "grad_norm": 0.09216903895139694, + "learning_rate": 2.9037263420718042e-05, + "loss": 0.0374, + "step": 47680 + }, + { + "epoch": 0.10517540628935255, + "grad_norm": 0.11709034442901611, + "learning_rate": 2.9036680512557724e-05, + "loss": 0.0387, + "step": 47690 + }, + { + "epoch": 0.10519746026425073, + "grad_norm": 0.18966121971607208, + "learning_rate": 2.9036097433838485e-05, + "loss": 0.0395, + "step": 47700 + }, + { + "epoch": 0.1052195142391489, + "grad_norm": 0.11056603491306305, + "learning_rate": 2.903551418456741e-05, + "loss": 0.0379, + "step": 47710 + }, + { + "epoch": 0.10524156821404707, + "grad_norm": 0.1215781569480896, + "learning_rate": 2.9034930764751588e-05, + "loss": 0.0355, + "step": 47720 + }, + { + "epoch": 0.10526362218894522, + "grad_norm": 0.11148396879434586, + "learning_rate": 2.9034347174398103e-05, + "loss": 0.0389, + "step": 47730 + }, + { + "epoch": 0.10528567616384339, + "grad_norm": 0.1288038194179535, + "learning_rate": 2.9033763413514053e-05, + "loss": 0.036, + "step": 47740 + }, + { + "epoch": 0.10530773013874156, + "grad_norm": 0.1598220318555832, + "learning_rate": 2.9033179482106523e-05, + "loss": 0.0358, + "step": 47750 + }, + { + "epoch": 0.10532978411363972, + "grad_norm": 0.1135997548699379, + "learning_rate": 2.9032595380182617e-05, + "loss": 0.0365, + "step": 47760 + }, + { + "epoch": 0.10535183808853789, + "grad_norm": 0.129895880818367, + "learning_rate": 2.9032011107749427e-05, + "loss": 0.0392, + "step": 47770 + }, + { + "epoch": 0.10537389206343606, + "grad_norm": 0.1128111481666565, + "learning_rate": 2.9031426664814055e-05, + "loss": 0.0375, + "step": 47780 + }, + { + "epoch": 0.10539594603833422, + "grad_norm": 0.19279536604881287, + "learning_rate": 2.90308420513836e-05, + "loss": 0.0383, + "step": 47790 + }, + { + "epoch": 0.10541800001323239, + "grad_norm": 0.15493930876255035, + "learning_rate": 2.9030257267465166e-05, + "loss": 0.036, + "step": 47800 + }, + { + "epoch": 0.10544005398813056, + "grad_norm": 0.12367372959852219, + "learning_rate": 2.902967231306586e-05, + "loss": 0.0379, + "step": 47810 + }, + { + "epoch": 0.10546210796302871, + "grad_norm": 0.16871346533298492, + "learning_rate": 2.9029087188192788e-05, + "loss": 0.0379, + "step": 47820 + }, + { + "epoch": 0.10548416193792688, + "grad_norm": 0.11533831804990768, + "learning_rate": 2.902850189285306e-05, + "loss": 0.0395, + "step": 47830 + }, + { + "epoch": 0.10550621591282505, + "grad_norm": 0.12168990820646286, + "learning_rate": 2.9027916427053792e-05, + "loss": 0.0372, + "step": 47840 + }, + { + "epoch": 0.10552826988772321, + "grad_norm": 0.13496416807174683, + "learning_rate": 2.902733079080209e-05, + "loss": 0.0352, + "step": 47850 + }, + { + "epoch": 0.10555032386262138, + "grad_norm": 0.11916735023260117, + "learning_rate": 2.902674498410508e-05, + "loss": 0.037, + "step": 47860 + }, + { + "epoch": 0.10557237783751955, + "grad_norm": 0.12591281533241272, + "learning_rate": 2.9026159006969873e-05, + "loss": 0.0383, + "step": 47870 + }, + { + "epoch": 0.1055944318124177, + "grad_norm": 0.11547011137008667, + "learning_rate": 2.9025572859403588e-05, + "loss": 0.0378, + "step": 47880 + }, + { + "epoch": 0.10561648578731588, + "grad_norm": 0.10765259712934494, + "learning_rate": 2.9024986541413356e-05, + "loss": 0.0376, + "step": 47890 + }, + { + "epoch": 0.10563853976221405, + "grad_norm": 0.13068045675754547, + "learning_rate": 2.9024400053006295e-05, + "loss": 0.0368, + "step": 47900 + }, + { + "epoch": 0.1056605937371122, + "grad_norm": 0.11294940859079361, + "learning_rate": 2.902381339418953e-05, + "loss": 0.0353, + "step": 47910 + }, + { + "epoch": 0.10568264771201037, + "grad_norm": 0.12274716049432755, + "learning_rate": 2.9023226564970195e-05, + "loss": 0.0386, + "step": 47920 + }, + { + "epoch": 0.10570470168690854, + "grad_norm": 0.13125580549240112, + "learning_rate": 2.9022639565355413e-05, + "loss": 0.0387, + "step": 47930 + }, + { + "epoch": 0.10572675566180671, + "grad_norm": 0.11510296165943146, + "learning_rate": 2.902205239535232e-05, + "loss": 0.0343, + "step": 47940 + }, + { + "epoch": 0.10574880963670487, + "grad_norm": 0.1109059527516365, + "learning_rate": 2.9021465054968055e-05, + "loss": 0.0386, + "step": 47950 + }, + { + "epoch": 0.10577086361160304, + "grad_norm": 0.11606606841087341, + "learning_rate": 2.9020877544209747e-05, + "loss": 0.0379, + "step": 47960 + }, + { + "epoch": 0.10579291758650121, + "grad_norm": 0.09528462588787079, + "learning_rate": 2.9020289863084544e-05, + "loss": 0.0373, + "step": 47970 + }, + { + "epoch": 0.10581497156139937, + "grad_norm": 0.13356390595436096, + "learning_rate": 2.9019702011599576e-05, + "loss": 0.0377, + "step": 47980 + }, + { + "epoch": 0.10583702553629754, + "grad_norm": 0.08524816483259201, + "learning_rate": 2.9019113989761997e-05, + "loss": 0.0378, + "step": 47990 + }, + { + "epoch": 0.1058590795111957, + "grad_norm": 0.12561430037021637, + "learning_rate": 2.9018525797578944e-05, + "loss": 0.0379, + "step": 48000 + }, + { + "epoch": 0.10588113348609386, + "grad_norm": 0.11207062751054764, + "learning_rate": 2.901793743505757e-05, + "loss": 0.0375, + "step": 48010 + }, + { + "epoch": 0.10590318746099203, + "grad_norm": 0.11102648824453354, + "learning_rate": 2.9017348902205016e-05, + "loss": 0.0367, + "step": 48020 + }, + { + "epoch": 0.1059252414358902, + "grad_norm": 0.13048319518566132, + "learning_rate": 2.9016760199028443e-05, + "loss": 0.0362, + "step": 48030 + }, + { + "epoch": 0.10594729541078836, + "grad_norm": 0.22160698473453522, + "learning_rate": 2.9016171325534995e-05, + "loss": 0.0351, + "step": 48040 + }, + { + "epoch": 0.10596934938568653, + "grad_norm": 0.12098623067140579, + "learning_rate": 2.9015582281731833e-05, + "loss": 0.0354, + "step": 48050 + }, + { + "epoch": 0.1059914033605847, + "grad_norm": 0.09328830987215042, + "learning_rate": 2.9014993067626118e-05, + "loss": 0.0353, + "step": 48060 + }, + { + "epoch": 0.10601345733548286, + "grad_norm": 0.10235077142715454, + "learning_rate": 2.9014403683225004e-05, + "loss": 0.0362, + "step": 48070 + }, + { + "epoch": 0.10603551131038103, + "grad_norm": 0.12256377935409546, + "learning_rate": 2.901381412853565e-05, + "loss": 0.0366, + "step": 48080 + }, + { + "epoch": 0.1060575652852792, + "grad_norm": 0.11350253969430923, + "learning_rate": 2.9013224403565227e-05, + "loss": 0.036, + "step": 48090 + }, + { + "epoch": 0.10607961926017735, + "grad_norm": 0.12180650979280472, + "learning_rate": 2.901263450832089e-05, + "loss": 0.0365, + "step": 48100 + }, + { + "epoch": 0.10610167323507552, + "grad_norm": 0.13065093755722046, + "learning_rate": 2.9012044442809817e-05, + "loss": 0.0364, + "step": 48110 + }, + { + "epoch": 0.10612372720997369, + "grad_norm": 0.11683124303817749, + "learning_rate": 2.9011454207039175e-05, + "loss": 0.0377, + "step": 48120 + }, + { + "epoch": 0.10614578118487185, + "grad_norm": 0.13854819536209106, + "learning_rate": 2.901086380101614e-05, + "loss": 0.0353, + "step": 48130 + }, + { + "epoch": 0.10616783515977002, + "grad_norm": 0.12721426784992218, + "learning_rate": 2.9010273224747875e-05, + "loss": 0.0387, + "step": 48140 + }, + { + "epoch": 0.10618988913466819, + "grad_norm": 0.10794396698474884, + "learning_rate": 2.9009682478241563e-05, + "loss": 0.036, + "step": 48150 + }, + { + "epoch": 0.10621194310956636, + "grad_norm": 0.11254454404115677, + "learning_rate": 2.900909156150438e-05, + "loss": 0.0367, + "step": 48160 + }, + { + "epoch": 0.10623399708446452, + "grad_norm": 0.10268665105104446, + "learning_rate": 2.9008500474543516e-05, + "loss": 0.0357, + "step": 48170 + }, + { + "epoch": 0.10625605105936269, + "grad_norm": 0.11587397009134293, + "learning_rate": 2.900790921736614e-05, + "loss": 0.0339, + "step": 48180 + }, + { + "epoch": 0.10627810503426086, + "grad_norm": 0.11871334165334702, + "learning_rate": 2.9007317789979443e-05, + "loss": 0.0383, + "step": 48190 + }, + { + "epoch": 0.10630015900915901, + "grad_norm": 0.1302928328514099, + "learning_rate": 2.9006726192390604e-05, + "loss": 0.0349, + "step": 48200 + }, + { + "epoch": 0.10632221298405718, + "grad_norm": 0.10638412088155746, + "learning_rate": 2.900613442460682e-05, + "loss": 0.0371, + "step": 48210 + }, + { + "epoch": 0.10634426695895535, + "grad_norm": 0.12212522327899933, + "learning_rate": 2.9005542486635282e-05, + "loss": 0.0382, + "step": 48220 + }, + { + "epoch": 0.10636632093385351, + "grad_norm": 0.14888492226600647, + "learning_rate": 2.9004950378483175e-05, + "loss": 0.0376, + "step": 48230 + }, + { + "epoch": 0.10638837490875168, + "grad_norm": 0.09808258712291718, + "learning_rate": 2.9004358100157704e-05, + "loss": 0.0367, + "step": 48240 + }, + { + "epoch": 0.10641042888364985, + "grad_norm": 0.10615026205778122, + "learning_rate": 2.900376565166605e-05, + "loss": 0.0366, + "step": 48250 + }, + { + "epoch": 0.106432482858548, + "grad_norm": 0.11322744935750961, + "learning_rate": 2.9003173033015433e-05, + "loss": 0.0388, + "step": 48260 + }, + { + "epoch": 0.10645453683344618, + "grad_norm": 0.11220695823431015, + "learning_rate": 2.9002580244213034e-05, + "loss": 0.0362, + "step": 48270 + }, + { + "epoch": 0.10647659080834435, + "grad_norm": 0.10314548760652542, + "learning_rate": 2.9001987285266072e-05, + "loss": 0.0369, + "step": 48280 + }, + { + "epoch": 0.1064986447832425, + "grad_norm": 0.1445523351430893, + "learning_rate": 2.900139415618174e-05, + "loss": 0.0375, + "step": 48290 + }, + { + "epoch": 0.10652069875814067, + "grad_norm": 0.11453820019960403, + "learning_rate": 2.9000800856967246e-05, + "loss": 0.0361, + "step": 48300 + }, + { + "epoch": 0.10654275273303884, + "grad_norm": 0.13843926787376404, + "learning_rate": 2.900020738762981e-05, + "loss": 0.0384, + "step": 48310 + }, + { + "epoch": 0.106564806707937, + "grad_norm": 0.12354561686515808, + "learning_rate": 2.8999613748176628e-05, + "loss": 0.0393, + "step": 48320 + }, + { + "epoch": 0.10658686068283517, + "grad_norm": 0.11117976903915405, + "learning_rate": 2.8999019938614922e-05, + "loss": 0.035, + "step": 48330 + }, + { + "epoch": 0.10660891465773334, + "grad_norm": 0.11039365828037262, + "learning_rate": 2.8998425958951914e-05, + "loss": 0.0359, + "step": 48340 + }, + { + "epoch": 0.10663096863263151, + "grad_norm": 0.13085488975048065, + "learning_rate": 2.8997831809194807e-05, + "loss": 0.0372, + "step": 48350 + }, + { + "epoch": 0.10665302260752967, + "grad_norm": 0.11113300919532776, + "learning_rate": 2.899723748935083e-05, + "loss": 0.0371, + "step": 48360 + }, + { + "epoch": 0.10667507658242784, + "grad_norm": 0.09449741244316101, + "learning_rate": 2.89966429994272e-05, + "loss": 0.0355, + "step": 48370 + }, + { + "epoch": 0.106697130557326, + "grad_norm": 0.12594552338123322, + "learning_rate": 2.899604833943115e-05, + "loss": 0.0384, + "step": 48380 + }, + { + "epoch": 0.10671918453222416, + "grad_norm": 0.13223403692245483, + "learning_rate": 2.899545350936989e-05, + "loss": 0.0382, + "step": 48390 + }, + { + "epoch": 0.10674123850712233, + "grad_norm": 0.12319513410329819, + "learning_rate": 2.899485850925066e-05, + "loss": 0.0364, + "step": 48400 + }, + { + "epoch": 0.1067632924820205, + "grad_norm": 0.1256737858057022, + "learning_rate": 2.899426333908068e-05, + "loss": 0.0389, + "step": 48410 + }, + { + "epoch": 0.10678534645691866, + "grad_norm": 0.1339431256055832, + "learning_rate": 2.8993667998867194e-05, + "loss": 0.0408, + "step": 48420 + }, + { + "epoch": 0.10680740043181683, + "grad_norm": 0.10434652864933014, + "learning_rate": 2.8993072488617426e-05, + "loss": 0.0375, + "step": 48430 + }, + { + "epoch": 0.106829454406715, + "grad_norm": 0.10077085345983505, + "learning_rate": 2.8992476808338617e-05, + "loss": 0.0366, + "step": 48440 + }, + { + "epoch": 0.10685150838161316, + "grad_norm": 0.08993852138519287, + "learning_rate": 2.8991880958038005e-05, + "loss": 0.0382, + "step": 48450 + }, + { + "epoch": 0.10687356235651133, + "grad_norm": 0.11615649610757828, + "learning_rate": 2.8991284937722828e-05, + "loss": 0.0362, + "step": 48460 + }, + { + "epoch": 0.1068956163314095, + "grad_norm": 0.11554452031850815, + "learning_rate": 2.899068874740033e-05, + "loss": 0.0361, + "step": 48470 + }, + { + "epoch": 0.10691767030630765, + "grad_norm": 0.11232718825340271, + "learning_rate": 2.8990092387077753e-05, + "loss": 0.0403, + "step": 48480 + }, + { + "epoch": 0.10693972428120582, + "grad_norm": 0.10125042498111725, + "learning_rate": 2.898949585676234e-05, + "loss": 0.0347, + "step": 48490 + }, + { + "epoch": 0.10696177825610399, + "grad_norm": 0.09555652737617493, + "learning_rate": 2.898889915646135e-05, + "loss": 0.0387, + "step": 48500 + }, + { + "epoch": 0.10698383223100215, + "grad_norm": 0.12043873965740204, + "learning_rate": 2.8988302286182028e-05, + "loss": 0.0361, + "step": 48510 + }, + { + "epoch": 0.10700588620590032, + "grad_norm": 0.14538879692554474, + "learning_rate": 2.8987705245931626e-05, + "loss": 0.0386, + "step": 48520 + }, + { + "epoch": 0.10702794018079849, + "grad_norm": 0.1295866221189499, + "learning_rate": 2.8987108035717395e-05, + "loss": 0.0375, + "step": 48530 + }, + { + "epoch": 0.10704999415569665, + "grad_norm": 0.09819881618022919, + "learning_rate": 2.8986510655546596e-05, + "loss": 0.0368, + "step": 48540 + }, + { + "epoch": 0.10707204813059482, + "grad_norm": 0.08620725572109222, + "learning_rate": 2.898591310542649e-05, + "loss": 0.0366, + "step": 48550 + }, + { + "epoch": 0.10709410210549299, + "grad_norm": 0.12999558448791504, + "learning_rate": 2.8985315385364334e-05, + "loss": 0.036, + "step": 48560 + }, + { + "epoch": 0.10711615608039116, + "grad_norm": 0.10719660669565201, + "learning_rate": 2.8984717495367392e-05, + "loss": 0.0356, + "step": 48570 + }, + { + "epoch": 0.10713821005528931, + "grad_norm": 0.09642937779426575, + "learning_rate": 2.898411943544293e-05, + "loss": 0.0381, + "step": 48580 + }, + { + "epoch": 0.10716026403018748, + "grad_norm": 0.10002025961875916, + "learning_rate": 2.8983521205598205e-05, + "loss": 0.0383, + "step": 48590 + }, + { + "epoch": 0.10718231800508565, + "grad_norm": 0.15130013227462769, + "learning_rate": 2.8982922805840502e-05, + "loss": 0.0365, + "step": 48600 + }, + { + "epoch": 0.10720437197998381, + "grad_norm": 0.15478506684303284, + "learning_rate": 2.8982324236177086e-05, + "loss": 0.0366, + "step": 48610 + }, + { + "epoch": 0.10722642595488198, + "grad_norm": 0.11076344549655914, + "learning_rate": 2.8981725496615226e-05, + "loss": 0.0368, + "step": 48620 + }, + { + "epoch": 0.10724847992978015, + "grad_norm": 0.12325908243656158, + "learning_rate": 2.8981126587162203e-05, + "loss": 0.0383, + "step": 48630 + }, + { + "epoch": 0.1072705339046783, + "grad_norm": 0.10053098946809769, + "learning_rate": 2.8980527507825287e-05, + "loss": 0.0373, + "step": 48640 + }, + { + "epoch": 0.10729258787957648, + "grad_norm": 0.12586215138435364, + "learning_rate": 2.8979928258611766e-05, + "loss": 0.0396, + "step": 48650 + }, + { + "epoch": 0.10731464185447465, + "grad_norm": 0.1409597545862198, + "learning_rate": 2.897932883952892e-05, + "loss": 0.0374, + "step": 48660 + }, + { + "epoch": 0.1073366958293728, + "grad_norm": 0.12828978896141052, + "learning_rate": 2.897872925058402e-05, + "loss": 0.0375, + "step": 48670 + }, + { + "epoch": 0.10735874980427097, + "grad_norm": 0.13638414442539215, + "learning_rate": 2.897812949178437e-05, + "loss": 0.0387, + "step": 48680 + }, + { + "epoch": 0.10738080377916914, + "grad_norm": 0.13088230788707733, + "learning_rate": 2.897752956313725e-05, + "loss": 0.0382, + "step": 48690 + }, + { + "epoch": 0.1074028577540673, + "grad_norm": 0.1122949942946434, + "learning_rate": 2.897692946464994e-05, + "loss": 0.0377, + "step": 48700 + }, + { + "epoch": 0.10742491172896547, + "grad_norm": 0.11446374654769897, + "learning_rate": 2.897632919632975e-05, + "loss": 0.0363, + "step": 48710 + }, + { + "epoch": 0.10744696570386364, + "grad_norm": 0.08975990116596222, + "learning_rate": 2.897572875818396e-05, + "loss": 0.0354, + "step": 48720 + }, + { + "epoch": 0.1074690196787618, + "grad_norm": 0.12208852171897888, + "learning_rate": 2.897512815021987e-05, + "loss": 0.0369, + "step": 48730 + }, + { + "epoch": 0.10749107365365997, + "grad_norm": 0.10516811162233353, + "learning_rate": 2.897452737244478e-05, + "loss": 0.038, + "step": 48740 + }, + { + "epoch": 0.10751312762855814, + "grad_norm": 0.10812509804964066, + "learning_rate": 2.897392642486599e-05, + "loss": 0.0381, + "step": 48750 + }, + { + "epoch": 0.10753518160345629, + "grad_norm": 0.08562520891427994, + "learning_rate": 2.8973325307490796e-05, + "loss": 0.0347, + "step": 48760 + }, + { + "epoch": 0.10755723557835446, + "grad_norm": 0.09898020327091217, + "learning_rate": 2.8972724020326507e-05, + "loss": 0.0365, + "step": 48770 + }, + { + "epoch": 0.10757928955325263, + "grad_norm": 0.08648630231618881, + "learning_rate": 2.897212256338043e-05, + "loss": 0.0369, + "step": 48780 + }, + { + "epoch": 0.1076013435281508, + "grad_norm": 0.14650340378284454, + "learning_rate": 2.897152093665987e-05, + "loss": 0.0364, + "step": 48790 + }, + { + "epoch": 0.10762339750304896, + "grad_norm": 0.13871736824512482, + "learning_rate": 2.8970919140172144e-05, + "loss": 0.0371, + "step": 48800 + }, + { + "epoch": 0.10764545147794713, + "grad_norm": 0.11927012354135513, + "learning_rate": 2.8970317173924557e-05, + "loss": 0.0363, + "step": 48810 + }, + { + "epoch": 0.1076675054528453, + "grad_norm": 0.10303732007741928, + "learning_rate": 2.8969715037924426e-05, + "loss": 0.0342, + "step": 48820 + }, + { + "epoch": 0.10768955942774346, + "grad_norm": 0.07737764716148376, + "learning_rate": 2.8969112732179067e-05, + "loss": 0.0373, + "step": 48830 + }, + { + "epoch": 0.10771161340264163, + "grad_norm": 0.08261024206876755, + "learning_rate": 2.8968510256695804e-05, + "loss": 0.0369, + "step": 48840 + }, + { + "epoch": 0.1077336673775398, + "grad_norm": 0.10382513701915741, + "learning_rate": 2.8967907611481945e-05, + "loss": 0.0366, + "step": 48850 + }, + { + "epoch": 0.10775572135243795, + "grad_norm": 0.1217176765203476, + "learning_rate": 2.8967304796544824e-05, + "loss": 0.0383, + "step": 48860 + }, + { + "epoch": 0.10777777532733612, + "grad_norm": 0.11081919074058533, + "learning_rate": 2.8966701811891767e-05, + "loss": 0.0346, + "step": 48870 + }, + { + "epoch": 0.10779982930223429, + "grad_norm": 0.1178009882569313, + "learning_rate": 2.896609865753009e-05, + "loss": 0.0365, + "step": 48880 + }, + { + "epoch": 0.10782188327713245, + "grad_norm": 0.11217958480119705, + "learning_rate": 2.8965495333467133e-05, + "loss": 0.0374, + "step": 48890 + }, + { + "epoch": 0.10784393725203062, + "grad_norm": 0.14670990407466888, + "learning_rate": 2.896489183971022e-05, + "loss": 0.0379, + "step": 48900 + }, + { + "epoch": 0.10786599122692879, + "grad_norm": 0.09661231935024261, + "learning_rate": 2.8964288176266686e-05, + "loss": 0.0382, + "step": 48910 + }, + { + "epoch": 0.10788804520182695, + "grad_norm": 0.11099178344011307, + "learning_rate": 2.896368434314386e-05, + "loss": 0.0362, + "step": 48920 + }, + { + "epoch": 0.10791009917672512, + "grad_norm": 0.1683979630470276, + "learning_rate": 2.8963080340349096e-05, + "loss": 0.0362, + "step": 48930 + }, + { + "epoch": 0.10793215315162329, + "grad_norm": 0.11070804297924042, + "learning_rate": 2.8962476167889716e-05, + "loss": 0.0366, + "step": 48940 + }, + { + "epoch": 0.10795420712652144, + "grad_norm": 0.15536633133888245, + "learning_rate": 2.896187182577307e-05, + "loss": 0.0378, + "step": 48950 + }, + { + "epoch": 0.10797626110141961, + "grad_norm": 0.11002229899168015, + "learning_rate": 2.8961267314006497e-05, + "loss": 0.0366, + "step": 48960 + }, + { + "epoch": 0.10799831507631778, + "grad_norm": 0.1041623130440712, + "learning_rate": 2.896066263259735e-05, + "loss": 0.0359, + "step": 48970 + }, + { + "epoch": 0.10802036905121594, + "grad_norm": 0.10696808248758316, + "learning_rate": 2.8960057781552963e-05, + "loss": 0.0349, + "step": 48980 + }, + { + "epoch": 0.10804242302611411, + "grad_norm": 0.12310606986284256, + "learning_rate": 2.8959452760880703e-05, + "loss": 0.0385, + "step": 48990 + }, + { + "epoch": 0.10806447700101228, + "grad_norm": 0.14359380304813385, + "learning_rate": 2.8958847570587902e-05, + "loss": 0.0372, + "step": 49000 + }, + { + "epoch": 0.10808653097591045, + "grad_norm": 0.12153300642967224, + "learning_rate": 2.895824221068193e-05, + "loss": 0.0382, + "step": 49010 + }, + { + "epoch": 0.1081085849508086, + "grad_norm": 0.1123574897646904, + "learning_rate": 2.8957636681170134e-05, + "loss": 0.0377, + "step": 49020 + }, + { + "epoch": 0.10813063892570678, + "grad_norm": 0.12682366371154785, + "learning_rate": 2.8957030982059878e-05, + "loss": 0.0377, + "step": 49030 + }, + { + "epoch": 0.10815269290060495, + "grad_norm": 0.11341097950935364, + "learning_rate": 2.8956425113358512e-05, + "loss": 0.0356, + "step": 49040 + }, + { + "epoch": 0.1081747468755031, + "grad_norm": 0.10793343186378479, + "learning_rate": 2.8955819075073408e-05, + "loss": 0.0352, + "step": 49050 + }, + { + "epoch": 0.10819680085040127, + "grad_norm": 0.11406457424163818, + "learning_rate": 2.8955212867211925e-05, + "loss": 0.0368, + "step": 49060 + }, + { + "epoch": 0.10821885482529944, + "grad_norm": 0.1081133484840393, + "learning_rate": 2.8954606489781425e-05, + "loss": 0.0364, + "step": 49070 + }, + { + "epoch": 0.1082409088001976, + "grad_norm": 0.12507636845111847, + "learning_rate": 2.8953999942789285e-05, + "loss": 0.0345, + "step": 49080 + }, + { + "epoch": 0.10826296277509577, + "grad_norm": 0.10982969403266907, + "learning_rate": 2.8953393226242873e-05, + "loss": 0.0375, + "step": 49090 + }, + { + "epoch": 0.10828501674999394, + "grad_norm": 0.12208002805709839, + "learning_rate": 2.8952786340149556e-05, + "loss": 0.0366, + "step": 49100 + }, + { + "epoch": 0.1083070707248921, + "grad_norm": 0.14345088601112366, + "learning_rate": 2.8952179284516717e-05, + "loss": 0.0347, + "step": 49110 + }, + { + "epoch": 0.10832912469979027, + "grad_norm": 0.11081928014755249, + "learning_rate": 2.895157205935172e-05, + "loss": 0.0353, + "step": 49120 + }, + { + "epoch": 0.10835117867468844, + "grad_norm": 0.10327325761318207, + "learning_rate": 2.8950964664661952e-05, + "loss": 0.0383, + "step": 49130 + }, + { + "epoch": 0.10837323264958659, + "grad_norm": 0.15292437374591827, + "learning_rate": 2.895035710045479e-05, + "loss": 0.0369, + "step": 49140 + }, + { + "epoch": 0.10839528662448476, + "grad_norm": 0.12823118269443512, + "learning_rate": 2.8949749366737618e-05, + "loss": 0.0376, + "step": 49150 + }, + { + "epoch": 0.10841734059938293, + "grad_norm": 0.11432742327451706, + "learning_rate": 2.8949141463517824e-05, + "loss": 0.0364, + "step": 49160 + }, + { + "epoch": 0.10843939457428109, + "grad_norm": 0.10209045559167862, + "learning_rate": 2.894853339080279e-05, + "loss": 0.0384, + "step": 49170 + }, + { + "epoch": 0.10846144854917926, + "grad_norm": 0.1361459642648697, + "learning_rate": 2.8947925148599906e-05, + "loss": 0.0383, + "step": 49180 + }, + { + "epoch": 0.10848350252407743, + "grad_norm": 0.09252086281776428, + "learning_rate": 2.8947316736916557e-05, + "loss": 0.0339, + "step": 49190 + }, + { + "epoch": 0.10850555649897559, + "grad_norm": 0.11405982822179794, + "learning_rate": 2.8946708155760147e-05, + "loss": 0.0367, + "step": 49200 + }, + { + "epoch": 0.10852761047387376, + "grad_norm": 0.14775967597961426, + "learning_rate": 2.8946099405138066e-05, + "loss": 0.0373, + "step": 49210 + }, + { + "epoch": 0.10854966444877193, + "grad_norm": 0.10778163373470306, + "learning_rate": 2.8945490485057705e-05, + "loss": 0.0378, + "step": 49220 + }, + { + "epoch": 0.1085717184236701, + "grad_norm": 0.10838132351636887, + "learning_rate": 2.894488139552647e-05, + "loss": 0.037, + "step": 49230 + }, + { + "epoch": 0.10859377239856825, + "grad_norm": 0.11131361126899719, + "learning_rate": 2.8944272136551757e-05, + "loss": 0.036, + "step": 49240 + }, + { + "epoch": 0.10861582637346642, + "grad_norm": 0.1070583313703537, + "learning_rate": 2.894366270814098e-05, + "loss": 0.0358, + "step": 49250 + }, + { + "epoch": 0.1086378803483646, + "grad_norm": 0.09255139529705048, + "learning_rate": 2.8943053110301527e-05, + "loss": 0.0383, + "step": 49260 + }, + { + "epoch": 0.10865993432326275, + "grad_norm": 0.09901907294988632, + "learning_rate": 2.894244334304082e-05, + "loss": 0.0376, + "step": 49270 + }, + { + "epoch": 0.10868198829816092, + "grad_norm": 0.13445062935352325, + "learning_rate": 2.8941833406366264e-05, + "loss": 0.0347, + "step": 49280 + }, + { + "epoch": 0.10870404227305909, + "grad_norm": 0.11826397478580475, + "learning_rate": 2.894122330028526e-05, + "loss": 0.0376, + "step": 49290 + }, + { + "epoch": 0.10872609624795725, + "grad_norm": 0.17071670293807983, + "learning_rate": 2.8940613024805235e-05, + "loss": 0.0379, + "step": 49300 + }, + { + "epoch": 0.10874815022285542, + "grad_norm": 0.1604515165090561, + "learning_rate": 2.8940002579933606e-05, + "loss": 0.0373, + "step": 49310 + }, + { + "epoch": 0.10877020419775359, + "grad_norm": 0.0941697433590889, + "learning_rate": 2.8939391965677775e-05, + "loss": 0.0369, + "step": 49320 + }, + { + "epoch": 0.10879225817265174, + "grad_norm": 0.10263404995203018, + "learning_rate": 2.8938781182045174e-05, + "loss": 0.0381, + "step": 49330 + }, + { + "epoch": 0.10881431214754991, + "grad_norm": 0.11799103766679764, + "learning_rate": 2.8938170229043223e-05, + "loss": 0.0374, + "step": 49340 + }, + { + "epoch": 0.10883636612244808, + "grad_norm": 0.15290270745754242, + "learning_rate": 2.893755910667934e-05, + "loss": 0.0393, + "step": 49350 + }, + { + "epoch": 0.10885842009734624, + "grad_norm": 0.10274884849786758, + "learning_rate": 2.8936947814960962e-05, + "loss": 0.035, + "step": 49360 + }, + { + "epoch": 0.10888047407224441, + "grad_norm": 0.14106734097003937, + "learning_rate": 2.89363363538955e-05, + "loss": 0.0386, + "step": 49370 + }, + { + "epoch": 0.10890252804714258, + "grad_norm": 0.13826072216033936, + "learning_rate": 2.8935724723490397e-05, + "loss": 0.0382, + "step": 49380 + }, + { + "epoch": 0.10892458202204074, + "grad_norm": 0.10347804427146912, + "learning_rate": 2.8935112923753085e-05, + "loss": 0.036, + "step": 49390 + }, + { + "epoch": 0.1089466359969389, + "grad_norm": 0.10758652538061142, + "learning_rate": 2.893450095469099e-05, + "loss": 0.038, + "step": 49400 + }, + { + "epoch": 0.10896868997183708, + "grad_norm": 0.0948413759469986, + "learning_rate": 2.8933888816311555e-05, + "loss": 0.0358, + "step": 49410 + }, + { + "epoch": 0.10899074394673523, + "grad_norm": 0.1190941333770752, + "learning_rate": 2.893327650862221e-05, + "loss": 0.0371, + "step": 49420 + }, + { + "epoch": 0.1090127979216334, + "grad_norm": 0.15529319643974304, + "learning_rate": 2.8932664031630402e-05, + "loss": 0.036, + "step": 49430 + }, + { + "epoch": 0.10903485189653157, + "grad_norm": 0.12483032047748566, + "learning_rate": 2.8932051385343575e-05, + "loss": 0.0386, + "step": 49440 + }, + { + "epoch": 0.10905690587142974, + "grad_norm": 0.14006006717681885, + "learning_rate": 2.8931438569769166e-05, + "loss": 0.0376, + "step": 49450 + }, + { + "epoch": 0.1090789598463279, + "grad_norm": 0.13953106105327606, + "learning_rate": 2.8930825584914627e-05, + "loss": 0.0377, + "step": 49460 + }, + { + "epoch": 0.10910101382122607, + "grad_norm": 0.13171279430389404, + "learning_rate": 2.8930212430787404e-05, + "loss": 0.0386, + "step": 49470 + }, + { + "epoch": 0.10912306779612424, + "grad_norm": 0.13368885219097137, + "learning_rate": 2.8929599107394947e-05, + "loss": 0.038, + "step": 49480 + }, + { + "epoch": 0.1091451217710224, + "grad_norm": 0.12139935046434402, + "learning_rate": 2.8928985614744708e-05, + "loss": 0.0396, + "step": 49490 + }, + { + "epoch": 0.10916717574592057, + "grad_norm": 0.12625807523727417, + "learning_rate": 2.8928371952844143e-05, + "loss": 0.0363, + "step": 49500 + }, + { + "epoch": 0.10918922972081874, + "grad_norm": 0.13142792880535126, + "learning_rate": 2.892775812170071e-05, + "loss": 0.0369, + "step": 49510 + }, + { + "epoch": 0.1092112836957169, + "grad_norm": 0.15272794663906097, + "learning_rate": 2.8927144121321867e-05, + "loss": 0.0386, + "step": 49520 + }, + { + "epoch": 0.10923333767061506, + "grad_norm": 0.16251537203788757, + "learning_rate": 2.892652995171507e-05, + "loss": 0.0359, + "step": 49530 + }, + { + "epoch": 0.10925539164551323, + "grad_norm": 0.1386343091726303, + "learning_rate": 2.8925915612887787e-05, + "loss": 0.0349, + "step": 49540 + }, + { + "epoch": 0.10927744562041139, + "grad_norm": 0.11735685914754868, + "learning_rate": 2.892530110484748e-05, + "loss": 0.035, + "step": 49550 + }, + { + "epoch": 0.10929949959530956, + "grad_norm": 0.1075180396437645, + "learning_rate": 2.8924686427601618e-05, + "loss": 0.0367, + "step": 49560 + }, + { + "epoch": 0.10932155357020773, + "grad_norm": 0.11121457815170288, + "learning_rate": 2.8924071581157667e-05, + "loss": 0.0372, + "step": 49570 + }, + { + "epoch": 0.10934360754510589, + "grad_norm": 0.13335099816322327, + "learning_rate": 2.89234565655231e-05, + "loss": 0.0368, + "step": 49580 + }, + { + "epoch": 0.10936566152000406, + "grad_norm": 0.13024941086769104, + "learning_rate": 2.8922841380705395e-05, + "loss": 0.0369, + "step": 49590 + }, + { + "epoch": 0.10938771549490223, + "grad_norm": 0.10342292487621307, + "learning_rate": 2.8922226026712018e-05, + "loss": 0.0387, + "step": 49600 + }, + { + "epoch": 0.10940976946980038, + "grad_norm": 0.11216527968645096, + "learning_rate": 2.8921610503550447e-05, + "loss": 0.037, + "step": 49610 + }, + { + "epoch": 0.10943182344469855, + "grad_norm": 0.09910457581281662, + "learning_rate": 2.8920994811228167e-05, + "loss": 0.0392, + "step": 49620 + }, + { + "epoch": 0.10945387741959672, + "grad_norm": 0.11007647961378098, + "learning_rate": 2.8920378949752658e-05, + "loss": 0.0383, + "step": 49630 + }, + { + "epoch": 0.1094759313944949, + "grad_norm": 0.10398736596107483, + "learning_rate": 2.8919762919131402e-05, + "loss": 0.0361, + "step": 49640 + }, + { + "epoch": 0.10949798536939305, + "grad_norm": 0.10479118674993515, + "learning_rate": 2.891914671937188e-05, + "loss": 0.0371, + "step": 49650 + }, + { + "epoch": 0.10952003934429122, + "grad_norm": 0.11497005820274353, + "learning_rate": 2.891853035048159e-05, + "loss": 0.0386, + "step": 49660 + }, + { + "epoch": 0.10954209331918939, + "grad_norm": 0.1148923859000206, + "learning_rate": 2.8917913812468008e-05, + "loss": 0.0353, + "step": 49670 + }, + { + "epoch": 0.10956414729408755, + "grad_norm": 0.12058228999376297, + "learning_rate": 2.8917297105338637e-05, + "loss": 0.0376, + "step": 49680 + }, + { + "epoch": 0.10958620126898572, + "grad_norm": 0.10860712826251984, + "learning_rate": 2.8916680229100964e-05, + "loss": 0.0367, + "step": 49690 + }, + { + "epoch": 0.10960825524388389, + "grad_norm": 0.11998029798269272, + "learning_rate": 2.8916063183762485e-05, + "loss": 0.0357, + "step": 49700 + }, + { + "epoch": 0.10963030921878204, + "grad_norm": 0.1285526156425476, + "learning_rate": 2.8915445969330706e-05, + "loss": 0.0385, + "step": 49710 + }, + { + "epoch": 0.10965236319368021, + "grad_norm": 0.14169400930404663, + "learning_rate": 2.8914828585813112e-05, + "loss": 0.0381, + "step": 49720 + }, + { + "epoch": 0.10967441716857838, + "grad_norm": 0.13958804309368134, + "learning_rate": 2.8914211033217215e-05, + "loss": 0.0391, + "step": 49730 + }, + { + "epoch": 0.10969647114347654, + "grad_norm": 0.09864061325788498, + "learning_rate": 2.891359331155052e-05, + "loss": 0.0355, + "step": 49740 + }, + { + "epoch": 0.10971852511837471, + "grad_norm": 0.14356014132499695, + "learning_rate": 2.8912975420820528e-05, + "loss": 0.0358, + "step": 49750 + }, + { + "epoch": 0.10974057909327288, + "grad_norm": 0.16205188632011414, + "learning_rate": 2.891235736103475e-05, + "loss": 0.0373, + "step": 49760 + }, + { + "epoch": 0.10976263306817104, + "grad_norm": 0.12317553907632828, + "learning_rate": 2.891173913220069e-05, + "loss": 0.0393, + "step": 49770 + }, + { + "epoch": 0.1097846870430692, + "grad_norm": 0.13344717025756836, + "learning_rate": 2.8911120734325872e-05, + "loss": 0.0372, + "step": 49780 + }, + { + "epoch": 0.10980674101796738, + "grad_norm": 0.12470288574695587, + "learning_rate": 2.8910502167417796e-05, + "loss": 0.0371, + "step": 49790 + }, + { + "epoch": 0.10982879499286553, + "grad_norm": 0.11848391592502594, + "learning_rate": 2.8909883431483987e-05, + "loss": 0.0368, + "step": 49800 + }, + { + "epoch": 0.1098508489677637, + "grad_norm": 0.09697425365447998, + "learning_rate": 2.8909264526531963e-05, + "loss": 0.0392, + "step": 49810 + }, + { + "epoch": 0.10987290294266187, + "grad_norm": 0.12477347254753113, + "learning_rate": 2.890864545256924e-05, + "loss": 0.036, + "step": 49820 + }, + { + "epoch": 0.10989495691756003, + "grad_norm": 0.13950799405574799, + "learning_rate": 2.8908026209603344e-05, + "loss": 0.0363, + "step": 49830 + }, + { + "epoch": 0.1099170108924582, + "grad_norm": 0.1073121577501297, + "learning_rate": 2.89074067976418e-05, + "loss": 0.0367, + "step": 49840 + }, + { + "epoch": 0.10993906486735637, + "grad_norm": 0.10883168131113052, + "learning_rate": 2.890678721669213e-05, + "loss": 0.0358, + "step": 49850 + }, + { + "epoch": 0.10996111884225454, + "grad_norm": 0.12659451365470886, + "learning_rate": 2.8906167466761867e-05, + "loss": 0.0361, + "step": 49860 + }, + { + "epoch": 0.1099831728171527, + "grad_norm": 0.14020223915576935, + "learning_rate": 2.890554754785854e-05, + "loss": 0.0371, + "step": 49870 + }, + { + "epoch": 0.11000522679205087, + "grad_norm": 0.11622954905033112, + "learning_rate": 2.890492745998968e-05, + "loss": 0.0353, + "step": 49880 + }, + { + "epoch": 0.11002728076694904, + "grad_norm": 0.102421835064888, + "learning_rate": 2.8904307203162824e-05, + "loss": 0.0382, + "step": 49890 + }, + { + "epoch": 0.1100493347418472, + "grad_norm": 0.11565852910280228, + "learning_rate": 2.89036867773855e-05, + "loss": 0.0352, + "step": 49900 + }, + { + "epoch": 0.11007138871674536, + "grad_norm": 0.10457325726747513, + "learning_rate": 2.8903066182665265e-05, + "loss": 0.0354, + "step": 49910 + }, + { + "epoch": 0.11009344269164353, + "grad_norm": 0.11987300962209702, + "learning_rate": 2.8902445419009648e-05, + "loss": 0.0381, + "step": 49920 + }, + { + "epoch": 0.11011549666654169, + "grad_norm": 0.15685568749904633, + "learning_rate": 2.8901824486426187e-05, + "loss": 0.0368, + "step": 49930 + }, + { + "epoch": 0.11013755064143986, + "grad_norm": 0.09586663544178009, + "learning_rate": 2.890120338492244e-05, + "loss": 0.035, + "step": 49940 + }, + { + "epoch": 0.11015960461633803, + "grad_norm": 0.11260093748569489, + "learning_rate": 2.890058211450594e-05, + "loss": 0.0362, + "step": 49950 + }, + { + "epoch": 0.11018165859123619, + "grad_norm": 0.11937306821346283, + "learning_rate": 2.889996067518425e-05, + "loss": 0.0377, + "step": 49960 + }, + { + "epoch": 0.11020371256613436, + "grad_norm": 0.09282633662223816, + "learning_rate": 2.8899339066964907e-05, + "loss": 0.0366, + "step": 49970 + }, + { + "epoch": 0.11022576654103253, + "grad_norm": 0.07655873894691467, + "learning_rate": 2.8898717289855476e-05, + "loss": 0.0366, + "step": 49980 + }, + { + "epoch": 0.11024782051593068, + "grad_norm": 0.12480916827917099, + "learning_rate": 2.8898095343863508e-05, + "loss": 0.0389, + "step": 49990 + }, + { + "epoch": 0.11026987449082885, + "grad_norm": 0.14633485674858093, + "learning_rate": 2.8897473228996555e-05, + "loss": 0.0371, + "step": 50000 + }, + { + "epoch": 0.11029192846572702, + "grad_norm": 0.11004500836133957, + "learning_rate": 2.8896850945262183e-05, + "loss": 0.0365, + "step": 50010 + }, + { + "epoch": 0.11031398244062518, + "grad_norm": 0.1326117068529129, + "learning_rate": 2.889622849266795e-05, + "loss": 0.0357, + "step": 50020 + }, + { + "epoch": 0.11033603641552335, + "grad_norm": 0.09702982008457184, + "learning_rate": 2.8895605871221422e-05, + "loss": 0.0362, + "step": 50030 + }, + { + "epoch": 0.11035809039042152, + "grad_norm": 0.17038455605506897, + "learning_rate": 2.8894983080930163e-05, + "loss": 0.0376, + "step": 50040 + }, + { + "epoch": 0.11038014436531968, + "grad_norm": 0.08192599564790726, + "learning_rate": 2.889436012180174e-05, + "loss": 0.0375, + "step": 50050 + }, + { + "epoch": 0.11040219834021785, + "grad_norm": 0.12171066552400589, + "learning_rate": 2.889373699384373e-05, + "loss": 0.0376, + "step": 50060 + }, + { + "epoch": 0.11042425231511602, + "grad_norm": 0.1423521488904953, + "learning_rate": 2.8893113697063685e-05, + "loss": 0.0375, + "step": 50070 + }, + { + "epoch": 0.11044630629001419, + "grad_norm": 0.11107887327671051, + "learning_rate": 2.8892490231469203e-05, + "loss": 0.0376, + "step": 50080 + }, + { + "epoch": 0.11046836026491234, + "grad_norm": 0.11972881108522415, + "learning_rate": 2.8891866597067843e-05, + "loss": 0.0379, + "step": 50090 + }, + { + "epoch": 0.11049041423981051, + "grad_norm": 0.15135350823402405, + "learning_rate": 2.8891242793867187e-05, + "loss": 0.038, + "step": 50100 + }, + { + "epoch": 0.11051246821470868, + "grad_norm": 0.09850630164146423, + "learning_rate": 2.8890618821874815e-05, + "loss": 0.037, + "step": 50110 + }, + { + "epoch": 0.11053452218960684, + "grad_norm": 0.10163868963718414, + "learning_rate": 2.8889994681098312e-05, + "loss": 0.0362, + "step": 50120 + }, + { + "epoch": 0.11055657616450501, + "grad_norm": 0.12780790030956268, + "learning_rate": 2.8889370371545256e-05, + "loss": 0.0367, + "step": 50130 + }, + { + "epoch": 0.11057863013940318, + "grad_norm": 0.13657182455062866, + "learning_rate": 2.8888745893223236e-05, + "loss": 0.0361, + "step": 50140 + }, + { + "epoch": 0.11060068411430134, + "grad_norm": 0.136609748005867, + "learning_rate": 2.8888121246139846e-05, + "loss": 0.0351, + "step": 50150 + }, + { + "epoch": 0.11062273808919951, + "grad_norm": 0.12162820249795914, + "learning_rate": 2.8887496430302664e-05, + "loss": 0.0376, + "step": 50160 + }, + { + "epoch": 0.11064479206409768, + "grad_norm": 0.10368850827217102, + "learning_rate": 2.8886871445719294e-05, + "loss": 0.036, + "step": 50170 + }, + { + "epoch": 0.11066684603899583, + "grad_norm": 0.1049179658293724, + "learning_rate": 2.888624629239732e-05, + "loss": 0.0366, + "step": 50180 + }, + { + "epoch": 0.110688900013894, + "grad_norm": 0.10058040171861649, + "learning_rate": 2.888562097034434e-05, + "loss": 0.0356, + "step": 50190 + }, + { + "epoch": 0.11071095398879217, + "grad_norm": 0.10106854140758514, + "learning_rate": 2.8884995479567962e-05, + "loss": 0.0365, + "step": 50200 + }, + { + "epoch": 0.11073300796369033, + "grad_norm": 0.10782953351736069, + "learning_rate": 2.888436982007578e-05, + "loss": 0.0366, + "step": 50210 + }, + { + "epoch": 0.1107550619385885, + "grad_norm": 0.14592459797859192, + "learning_rate": 2.8883743991875392e-05, + "loss": 0.0356, + "step": 50220 + }, + { + "epoch": 0.11077711591348667, + "grad_norm": 0.10927551984786987, + "learning_rate": 2.8883117994974405e-05, + "loss": 0.0382, + "step": 50230 + }, + { + "epoch": 0.11079916988838483, + "grad_norm": 0.1143256351351738, + "learning_rate": 2.8882491829380427e-05, + "loss": 0.0373, + "step": 50240 + }, + { + "epoch": 0.110821223863283, + "grad_norm": 0.12684284150600433, + "learning_rate": 2.8881865495101067e-05, + "loss": 0.036, + "step": 50250 + }, + { + "epoch": 0.11084327783818117, + "grad_norm": 0.1110215038061142, + "learning_rate": 2.8881238992143936e-05, + "loss": 0.0376, + "step": 50260 + }, + { + "epoch": 0.11086533181307932, + "grad_norm": 0.12241734564304352, + "learning_rate": 2.8880612320516643e-05, + "loss": 0.0382, + "step": 50270 + }, + { + "epoch": 0.1108873857879775, + "grad_norm": 0.1426723301410675, + "learning_rate": 2.8879985480226807e-05, + "loss": 0.0376, + "step": 50280 + }, + { + "epoch": 0.11090943976287566, + "grad_norm": 0.10688091814517975, + "learning_rate": 2.8879358471282042e-05, + "loss": 0.0371, + "step": 50290 + }, + { + "epoch": 0.11093149373777383, + "grad_norm": 0.09886077046394348, + "learning_rate": 2.8878731293689965e-05, + "loss": 0.0377, + "step": 50300 + }, + { + "epoch": 0.11095354771267199, + "grad_norm": 0.15306732058525085, + "learning_rate": 2.88781039474582e-05, + "loss": 0.0383, + "step": 50310 + }, + { + "epoch": 0.11097560168757016, + "grad_norm": 0.11480870097875595, + "learning_rate": 2.8877476432594367e-05, + "loss": 0.036, + "step": 50320 + }, + { + "epoch": 0.11099765566246833, + "grad_norm": 0.1129210963845253, + "learning_rate": 2.8876848749106095e-05, + "loss": 0.037, + "step": 50330 + }, + { + "epoch": 0.11101970963736649, + "grad_norm": 0.10388495773077011, + "learning_rate": 2.8876220897001008e-05, + "loss": 0.0358, + "step": 50340 + }, + { + "epoch": 0.11104176361226466, + "grad_norm": 0.09869541227817535, + "learning_rate": 2.8875592876286738e-05, + "loss": 0.0375, + "step": 50350 + }, + { + "epoch": 0.11106381758716283, + "grad_norm": 0.13086862862110138, + "learning_rate": 2.8874964686970915e-05, + "loss": 0.0366, + "step": 50360 + }, + { + "epoch": 0.11108587156206098, + "grad_norm": 0.17325298488140106, + "learning_rate": 2.8874336329061167e-05, + "loss": 0.0358, + "step": 50370 + }, + { + "epoch": 0.11110792553695915, + "grad_norm": 0.10568134486675262, + "learning_rate": 2.8873707802565137e-05, + "loss": 0.0377, + "step": 50380 + }, + { + "epoch": 0.11112997951185732, + "grad_norm": 0.08856698870658875, + "learning_rate": 2.8873079107490455e-05, + "loss": 0.0376, + "step": 50390 + }, + { + "epoch": 0.11115203348675548, + "grad_norm": 0.14193320274353027, + "learning_rate": 2.8872450243844765e-05, + "loss": 0.039, + "step": 50400 + }, + { + "epoch": 0.11117408746165365, + "grad_norm": 0.11664783954620361, + "learning_rate": 2.8871821211635706e-05, + "loss": 0.0395, + "step": 50410 + }, + { + "epoch": 0.11119614143655182, + "grad_norm": 0.11808901280164719, + "learning_rate": 2.8871192010870923e-05, + "loss": 0.0344, + "step": 50420 + }, + { + "epoch": 0.11121819541144998, + "grad_norm": 0.10160952061414719, + "learning_rate": 2.8870562641558055e-05, + "loss": 0.0363, + "step": 50430 + }, + { + "epoch": 0.11124024938634815, + "grad_norm": 0.15201696753501892, + "learning_rate": 2.886993310370476e-05, + "loss": 0.0381, + "step": 50440 + }, + { + "epoch": 0.11126230336124632, + "grad_norm": 0.10259070247411728, + "learning_rate": 2.8869303397318683e-05, + "loss": 0.0369, + "step": 50450 + }, + { + "epoch": 0.11128435733614447, + "grad_norm": 0.1012197881937027, + "learning_rate": 2.8868673522407475e-05, + "loss": 0.0378, + "step": 50460 + }, + { + "epoch": 0.11130641131104264, + "grad_norm": 0.0967710018157959, + "learning_rate": 2.8868043478978788e-05, + "loss": 0.0356, + "step": 50470 + }, + { + "epoch": 0.11132846528594081, + "grad_norm": 0.13180376589298248, + "learning_rate": 2.8867413267040282e-05, + "loss": 0.0378, + "step": 50480 + }, + { + "epoch": 0.11135051926083897, + "grad_norm": 0.15565356612205505, + "learning_rate": 2.8866782886599608e-05, + "loss": 0.0383, + "step": 50490 + }, + { + "epoch": 0.11137257323573714, + "grad_norm": 0.08130210638046265, + "learning_rate": 2.886615233766443e-05, + "loss": 0.0374, + "step": 50500 + }, + { + "epoch": 0.11139462721063531, + "grad_norm": 0.14992065727710724, + "learning_rate": 2.886552162024241e-05, + "loss": 0.0367, + "step": 50510 + }, + { + "epoch": 0.11141668118553348, + "grad_norm": 0.12834857404232025, + "learning_rate": 2.8864890734341212e-05, + "loss": 0.0357, + "step": 50520 + }, + { + "epoch": 0.11143873516043164, + "grad_norm": 0.10070917010307312, + "learning_rate": 2.88642596799685e-05, + "loss": 0.0373, + "step": 50530 + }, + { + "epoch": 0.11146078913532981, + "grad_norm": 0.14311806857585907, + "learning_rate": 2.886362845713194e-05, + "loss": 0.0371, + "step": 50540 + }, + { + "epoch": 0.11148284311022798, + "grad_norm": 0.09870386123657227, + "learning_rate": 2.886299706583921e-05, + "loss": 0.0362, + "step": 50550 + }, + { + "epoch": 0.11150489708512613, + "grad_norm": 0.09300004690885544, + "learning_rate": 2.8862365506097975e-05, + "loss": 0.0368, + "step": 50560 + }, + { + "epoch": 0.1115269510600243, + "grad_norm": 0.09282853454351425, + "learning_rate": 2.886173377791591e-05, + "loss": 0.0354, + "step": 50570 + }, + { + "epoch": 0.11154900503492247, + "grad_norm": 0.15317043662071228, + "learning_rate": 2.8861101881300695e-05, + "loss": 0.0351, + "step": 50580 + }, + { + "epoch": 0.11157105900982063, + "grad_norm": 0.09935332834720612, + "learning_rate": 2.886046981626e-05, + "loss": 0.0363, + "step": 50590 + }, + { + "epoch": 0.1115931129847188, + "grad_norm": 0.12358365207910538, + "learning_rate": 2.8859837582801516e-05, + "loss": 0.0366, + "step": 50600 + }, + { + "epoch": 0.11161516695961697, + "grad_norm": 0.11584683507680893, + "learning_rate": 2.8859205180932916e-05, + "loss": 0.0374, + "step": 50610 + }, + { + "epoch": 0.11163722093451513, + "grad_norm": 0.149075448513031, + "learning_rate": 2.8858572610661886e-05, + "loss": 0.0369, + "step": 50620 + }, + { + "epoch": 0.1116592749094133, + "grad_norm": 0.12275632470846176, + "learning_rate": 2.8857939871996116e-05, + "loss": 0.0357, + "step": 50630 + }, + { + "epoch": 0.11168132888431147, + "grad_norm": 0.1337832361459732, + "learning_rate": 2.8857306964943293e-05, + "loss": 0.0362, + "step": 50640 + }, + { + "epoch": 0.11170338285920962, + "grad_norm": 0.1538434773683548, + "learning_rate": 2.8856673889511108e-05, + "loss": 0.0382, + "step": 50650 + }, + { + "epoch": 0.1117254368341078, + "grad_norm": 0.13550475239753723, + "learning_rate": 2.885604064570725e-05, + "loss": 0.0353, + "step": 50660 + }, + { + "epoch": 0.11174749080900596, + "grad_norm": 0.10926550626754761, + "learning_rate": 2.8855407233539415e-05, + "loss": 0.0368, + "step": 50670 + }, + { + "epoch": 0.11176954478390412, + "grad_norm": 0.10808711498975754, + "learning_rate": 2.8854773653015304e-05, + "loss": 0.0367, + "step": 50680 + }, + { + "epoch": 0.11179159875880229, + "grad_norm": 0.09606818854808807, + "learning_rate": 2.885413990414261e-05, + "loss": 0.0364, + "step": 50690 + }, + { + "epoch": 0.11181365273370046, + "grad_norm": 0.12329952418804169, + "learning_rate": 2.8853505986929035e-05, + "loss": 0.0343, + "step": 50700 + }, + { + "epoch": 0.11183570670859863, + "grad_norm": 0.10712160915136337, + "learning_rate": 2.8852871901382288e-05, + "loss": 0.0362, + "step": 50710 + }, + { + "epoch": 0.11185776068349679, + "grad_norm": 0.09850238263607025, + "learning_rate": 2.885223764751006e-05, + "loss": 0.0373, + "step": 50720 + }, + { + "epoch": 0.11187981465839496, + "grad_norm": 0.10896178334951401, + "learning_rate": 2.885160322532007e-05, + "loss": 0.0348, + "step": 50730 + }, + { + "epoch": 0.11190186863329313, + "grad_norm": 0.13180825114250183, + "learning_rate": 2.8850968634820024e-05, + "loss": 0.0374, + "step": 50740 + }, + { + "epoch": 0.11192392260819128, + "grad_norm": 0.11272256821393967, + "learning_rate": 2.885033387601763e-05, + "loss": 0.0351, + "step": 50750 + }, + { + "epoch": 0.11194597658308945, + "grad_norm": 0.08520720899105072, + "learning_rate": 2.8849698948920603e-05, + "loss": 0.0365, + "step": 50760 + }, + { + "epoch": 0.11196803055798762, + "grad_norm": 0.10255647450685501, + "learning_rate": 2.8849063853536656e-05, + "loss": 0.0365, + "step": 50770 + }, + { + "epoch": 0.11199008453288578, + "grad_norm": 0.08924225717782974, + "learning_rate": 2.884842858987351e-05, + "loss": 0.0376, + "step": 50780 + }, + { + "epoch": 0.11201213850778395, + "grad_norm": 0.0899752527475357, + "learning_rate": 2.8847793157938877e-05, + "loss": 0.0376, + "step": 50790 + }, + { + "epoch": 0.11203419248268212, + "grad_norm": 0.1054849624633789, + "learning_rate": 2.884715755774049e-05, + "loss": 0.0377, + "step": 50800 + }, + { + "epoch": 0.11205624645758028, + "grad_norm": 0.11209829896688461, + "learning_rate": 2.8846521789286056e-05, + "loss": 0.0369, + "step": 50810 + }, + { + "epoch": 0.11207830043247845, + "grad_norm": 0.12419876456260681, + "learning_rate": 2.8845885852583317e-05, + "loss": 0.0357, + "step": 50820 + }, + { + "epoch": 0.11210035440737662, + "grad_norm": 0.12829162180423737, + "learning_rate": 2.8845249747639987e-05, + "loss": 0.0384, + "step": 50830 + }, + { + "epoch": 0.11212240838227477, + "grad_norm": 0.10164876282215118, + "learning_rate": 2.8844613474463805e-05, + "loss": 0.0358, + "step": 50840 + }, + { + "epoch": 0.11214446235717294, + "grad_norm": 0.11862093955278397, + "learning_rate": 2.884397703306249e-05, + "loss": 0.038, + "step": 50850 + }, + { + "epoch": 0.11216651633207111, + "grad_norm": 0.11019720882177353, + "learning_rate": 2.884334042344379e-05, + "loss": 0.0375, + "step": 50860 + }, + { + "epoch": 0.11218857030696927, + "grad_norm": 0.15046058595180511, + "learning_rate": 2.884270364561543e-05, + "loss": 0.037, + "step": 50870 + }, + { + "epoch": 0.11221062428186744, + "grad_norm": 0.12001251429319382, + "learning_rate": 2.8842066699585155e-05, + "loss": 0.0388, + "step": 50880 + }, + { + "epoch": 0.11223267825676561, + "grad_norm": 0.10738545656204224, + "learning_rate": 2.8841429585360696e-05, + "loss": 0.0369, + "step": 50890 + }, + { + "epoch": 0.11225473223166377, + "grad_norm": 0.10968439280986786, + "learning_rate": 2.8840792302949802e-05, + "loss": 0.0372, + "step": 50900 + }, + { + "epoch": 0.11227678620656194, + "grad_norm": 0.11818333715200424, + "learning_rate": 2.8840154852360215e-05, + "loss": 0.036, + "step": 50910 + }, + { + "epoch": 0.11229884018146011, + "grad_norm": 0.09756477922201157, + "learning_rate": 2.8839517233599673e-05, + "loss": 0.0377, + "step": 50920 + }, + { + "epoch": 0.11232089415635828, + "grad_norm": 0.11456423997879028, + "learning_rate": 2.8838879446675935e-05, + "loss": 0.0366, + "step": 50930 + }, + { + "epoch": 0.11234294813125643, + "grad_norm": 0.12317612022161484, + "learning_rate": 2.8838241491596747e-05, + "loss": 0.0377, + "step": 50940 + }, + { + "epoch": 0.1123650021061546, + "grad_norm": 0.13245394825935364, + "learning_rate": 2.8837603368369856e-05, + "loss": 0.0386, + "step": 50950 + }, + { + "epoch": 0.11238705608105277, + "grad_norm": 0.1125238761305809, + "learning_rate": 2.8836965077003023e-05, + "loss": 0.0356, + "step": 50960 + }, + { + "epoch": 0.11240911005595093, + "grad_norm": 0.10975339263677597, + "learning_rate": 2.883632661750399e-05, + "loss": 0.0363, + "step": 50970 + }, + { + "epoch": 0.1124311640308491, + "grad_norm": 0.1237613707780838, + "learning_rate": 2.8835687989880534e-05, + "loss": 0.036, + "step": 50980 + }, + { + "epoch": 0.11245321800574727, + "grad_norm": 0.12139171361923218, + "learning_rate": 2.8835049194140407e-05, + "loss": 0.0363, + "step": 50990 + }, + { + "epoch": 0.11247527198064543, + "grad_norm": 0.09537120908498764, + "learning_rate": 2.8834410230291365e-05, + "loss": 0.0375, + "step": 51000 + }, + { + "epoch": 0.1124973259555436, + "grad_norm": 0.1132306307554245, + "learning_rate": 2.883377109834118e-05, + "loss": 0.0371, + "step": 51010 + }, + { + "epoch": 0.11251937993044177, + "grad_norm": 0.11240583658218384, + "learning_rate": 2.8833131798297614e-05, + "loss": 0.0394, + "step": 51020 + }, + { + "epoch": 0.11254143390533992, + "grad_norm": 0.1265658736228943, + "learning_rate": 2.8832492330168435e-05, + "loss": 0.0366, + "step": 51030 + }, + { + "epoch": 0.1125634878802381, + "grad_norm": 0.11448143422603607, + "learning_rate": 2.8831852693961415e-05, + "loss": 0.0362, + "step": 51040 + }, + { + "epoch": 0.11258554185513626, + "grad_norm": 0.10247133672237396, + "learning_rate": 2.8831212889684325e-05, + "loss": 0.0376, + "step": 51050 + }, + { + "epoch": 0.11260759583003442, + "grad_norm": 0.15146981179714203, + "learning_rate": 2.883057291734494e-05, + "loss": 0.0385, + "step": 51060 + }, + { + "epoch": 0.11262964980493259, + "grad_norm": 0.11223481595516205, + "learning_rate": 2.8829932776951034e-05, + "loss": 0.0356, + "step": 51070 + }, + { + "epoch": 0.11265170377983076, + "grad_norm": 0.14172160625457764, + "learning_rate": 2.8829292468510393e-05, + "loss": 0.0359, + "step": 51080 + }, + { + "epoch": 0.11267375775472892, + "grad_norm": 0.1416669636964798, + "learning_rate": 2.8828651992030788e-05, + "loss": 0.0363, + "step": 51090 + }, + { + "epoch": 0.11269581172962709, + "grad_norm": 0.11878044903278351, + "learning_rate": 2.882801134752e-05, + "loss": 0.035, + "step": 51100 + }, + { + "epoch": 0.11271786570452526, + "grad_norm": 0.08468864113092422, + "learning_rate": 2.8827370534985826e-05, + "loss": 0.0352, + "step": 51110 + }, + { + "epoch": 0.11273991967942341, + "grad_norm": 0.1004520133137703, + "learning_rate": 2.882672955443604e-05, + "loss": 0.0358, + "step": 51120 + }, + { + "epoch": 0.11276197365432158, + "grad_norm": 0.1096731424331665, + "learning_rate": 2.8826088405878434e-05, + "loss": 0.0367, + "step": 51130 + }, + { + "epoch": 0.11278402762921975, + "grad_norm": 0.15097133815288544, + "learning_rate": 2.8825447089320805e-05, + "loss": 0.0364, + "step": 51140 + }, + { + "epoch": 0.11280608160411792, + "grad_norm": 0.16244935989379883, + "learning_rate": 2.8824805604770936e-05, + "loss": 0.0354, + "step": 51150 + }, + { + "epoch": 0.11282813557901608, + "grad_norm": 0.14080694317817688, + "learning_rate": 2.882416395223663e-05, + "loss": 0.0371, + "step": 51160 + }, + { + "epoch": 0.11285018955391425, + "grad_norm": 0.11691562831401825, + "learning_rate": 2.8823522131725678e-05, + "loss": 0.0364, + "step": 51170 + }, + { + "epoch": 0.11287224352881242, + "grad_norm": 0.1037931740283966, + "learning_rate": 2.8822880143245882e-05, + "loss": 0.0342, + "step": 51180 + }, + { + "epoch": 0.11289429750371058, + "grad_norm": 0.10279977321624756, + "learning_rate": 2.882223798680504e-05, + "loss": 0.0368, + "step": 51190 + }, + { + "epoch": 0.11291635147860875, + "grad_norm": 0.11768930405378342, + "learning_rate": 2.8821595662410954e-05, + "loss": 0.0377, + "step": 51200 + }, + { + "epoch": 0.11293840545350692, + "grad_norm": 0.14768525958061218, + "learning_rate": 2.8820953170071435e-05, + "loss": 0.0375, + "step": 51210 + }, + { + "epoch": 0.11296045942840507, + "grad_norm": 0.11688806861639023, + "learning_rate": 2.882031050979428e-05, + "loss": 0.0359, + "step": 51220 + }, + { + "epoch": 0.11298251340330324, + "grad_norm": 0.11812650412321091, + "learning_rate": 2.881966768158731e-05, + "loss": 0.0387, + "step": 51230 + }, + { + "epoch": 0.11300456737820141, + "grad_norm": 0.11942964792251587, + "learning_rate": 2.8819024685458328e-05, + "loss": 0.0371, + "step": 51240 + }, + { + "epoch": 0.11302662135309957, + "grad_norm": 0.08943287283182144, + "learning_rate": 2.8818381521415146e-05, + "loss": 0.0371, + "step": 51250 + }, + { + "epoch": 0.11304867532799774, + "grad_norm": 0.0924573615193367, + "learning_rate": 2.8817738189465584e-05, + "loss": 0.0394, + "step": 51260 + }, + { + "epoch": 0.11307072930289591, + "grad_norm": 0.13725464046001434, + "learning_rate": 2.8817094689617457e-05, + "loss": 0.0372, + "step": 51270 + }, + { + "epoch": 0.11309278327779407, + "grad_norm": 0.133152574300766, + "learning_rate": 2.8816451021878583e-05, + "loss": 0.0366, + "step": 51280 + }, + { + "epoch": 0.11311483725269224, + "grad_norm": 0.13011306524276733, + "learning_rate": 2.8815807186256787e-05, + "loss": 0.0356, + "step": 51290 + }, + { + "epoch": 0.11313689122759041, + "grad_norm": 0.11582458019256592, + "learning_rate": 2.881516318275988e-05, + "loss": 0.0385, + "step": 51300 + }, + { + "epoch": 0.11315894520248856, + "grad_norm": 0.12589263916015625, + "learning_rate": 2.881451901139571e-05, + "loss": 0.0377, + "step": 51310 + }, + { + "epoch": 0.11318099917738673, + "grad_norm": 0.07806511223316193, + "learning_rate": 2.8813874672172083e-05, + "loss": 0.0361, + "step": 51320 + }, + { + "epoch": 0.1132030531522849, + "grad_norm": 0.114406518638134, + "learning_rate": 2.881323016509684e-05, + "loss": 0.0364, + "step": 51330 + }, + { + "epoch": 0.11322510712718306, + "grad_norm": 0.12912264466285706, + "learning_rate": 2.8812585490177806e-05, + "loss": 0.0364, + "step": 51340 + }, + { + "epoch": 0.11324716110208123, + "grad_norm": 0.10222966223955154, + "learning_rate": 2.881194064742282e-05, + "loss": 0.0361, + "step": 51350 + }, + { + "epoch": 0.1132692150769794, + "grad_norm": 0.10319889336824417, + "learning_rate": 2.881129563683971e-05, + "loss": 0.0347, + "step": 51360 + }, + { + "epoch": 0.11329126905187757, + "grad_norm": 0.12545636296272278, + "learning_rate": 2.881065045843632e-05, + "loss": 0.0368, + "step": 51370 + }, + { + "epoch": 0.11331332302677573, + "grad_norm": 0.12497909367084503, + "learning_rate": 2.881000511222049e-05, + "loss": 0.0373, + "step": 51380 + }, + { + "epoch": 0.1133353770016739, + "grad_norm": 0.10647062212228775, + "learning_rate": 2.8809359598200055e-05, + "loss": 0.0396, + "step": 51390 + }, + { + "epoch": 0.11335743097657207, + "grad_norm": 0.12936921417713165, + "learning_rate": 2.880871391638286e-05, + "loss": 0.0379, + "step": 51400 + }, + { + "epoch": 0.11337948495147022, + "grad_norm": 0.11438639461994171, + "learning_rate": 2.880806806677676e-05, + "loss": 0.0368, + "step": 51410 + }, + { + "epoch": 0.1134015389263684, + "grad_norm": 0.10932036489248276, + "learning_rate": 2.8807422049389595e-05, + "loss": 0.0342, + "step": 51420 + }, + { + "epoch": 0.11342359290126657, + "grad_norm": 0.08980931341648102, + "learning_rate": 2.8806775864229215e-05, + "loss": 0.0335, + "step": 51430 + }, + { + "epoch": 0.11344564687616472, + "grad_norm": 0.1011543720960617, + "learning_rate": 2.8806129511303474e-05, + "loss": 0.036, + "step": 51440 + }, + { + "epoch": 0.11346770085106289, + "grad_norm": 0.12574607133865356, + "learning_rate": 2.8805482990620222e-05, + "loss": 0.0362, + "step": 51450 + }, + { + "epoch": 0.11348975482596106, + "grad_norm": 0.12677550315856934, + "learning_rate": 2.8804836302187316e-05, + "loss": 0.0376, + "step": 51460 + }, + { + "epoch": 0.11351180880085922, + "grad_norm": 0.12566649913787842, + "learning_rate": 2.8804189446012616e-05, + "loss": 0.0394, + "step": 51470 + }, + { + "epoch": 0.11353386277575739, + "grad_norm": 0.10307037830352783, + "learning_rate": 2.8803542422103986e-05, + "loss": 0.0357, + "step": 51480 + }, + { + "epoch": 0.11355591675065556, + "grad_norm": 0.10702461749315262, + "learning_rate": 2.8802895230469274e-05, + "loss": 0.037, + "step": 51490 + }, + { + "epoch": 0.11357797072555371, + "grad_norm": 0.08624618500471115, + "learning_rate": 2.880224787111636e-05, + "loss": 0.0347, + "step": 51500 + }, + { + "epoch": 0.11360002470045188, + "grad_norm": 0.13555414974689484, + "learning_rate": 2.88016003440531e-05, + "loss": 0.0363, + "step": 51510 + }, + { + "epoch": 0.11362207867535006, + "grad_norm": 0.09757118672132492, + "learning_rate": 2.8800952649287362e-05, + "loss": 0.0357, + "step": 51520 + }, + { + "epoch": 0.11364413265024821, + "grad_norm": 0.11125796288251877, + "learning_rate": 2.8800304786827025e-05, + "loss": 0.0371, + "step": 51530 + }, + { + "epoch": 0.11366618662514638, + "grad_norm": 0.14222458004951477, + "learning_rate": 2.8799656756679954e-05, + "loss": 0.0335, + "step": 51540 + }, + { + "epoch": 0.11368824060004455, + "grad_norm": 0.17735832929611206, + "learning_rate": 2.8799008558854017e-05, + "loss": 0.0374, + "step": 51550 + }, + { + "epoch": 0.11371029457494271, + "grad_norm": 0.0999303013086319, + "learning_rate": 2.8798360193357106e-05, + "loss": 0.0366, + "step": 51560 + }, + { + "epoch": 0.11373234854984088, + "grad_norm": 0.11652437597513199, + "learning_rate": 2.8797711660197087e-05, + "loss": 0.0372, + "step": 51570 + }, + { + "epoch": 0.11375440252473905, + "grad_norm": 0.12206529080867767, + "learning_rate": 2.8797062959381847e-05, + "loss": 0.0374, + "step": 51580 + }, + { + "epoch": 0.11377645649963722, + "grad_norm": 0.1123926043510437, + "learning_rate": 2.8796414090919262e-05, + "loss": 0.0342, + "step": 51590 + }, + { + "epoch": 0.11379851047453537, + "grad_norm": 0.1265707015991211, + "learning_rate": 2.8795765054817225e-05, + "loss": 0.0382, + "step": 51600 + }, + { + "epoch": 0.11382056444943355, + "grad_norm": 0.11509910970926285, + "learning_rate": 2.8795115851083614e-05, + "loss": 0.0353, + "step": 51610 + }, + { + "epoch": 0.11384261842433172, + "grad_norm": 0.1192476749420166, + "learning_rate": 2.8794466479726316e-05, + "loss": 0.0366, + "step": 51620 + }, + { + "epoch": 0.11386467239922987, + "grad_norm": 0.13140936195850372, + "learning_rate": 2.8793816940753232e-05, + "loss": 0.0391, + "step": 51630 + }, + { + "epoch": 0.11388672637412804, + "grad_norm": 0.11588428914546967, + "learning_rate": 2.8793167234172247e-05, + "loss": 0.0364, + "step": 51640 + }, + { + "epoch": 0.11390878034902621, + "grad_norm": 0.1122388243675232, + "learning_rate": 2.8792517359991258e-05, + "loss": 0.0371, + "step": 51650 + }, + { + "epoch": 0.11393083432392437, + "grad_norm": 0.10651290416717529, + "learning_rate": 2.879186731821816e-05, + "loss": 0.0383, + "step": 51660 + }, + { + "epoch": 0.11395288829882254, + "grad_norm": 0.14689834415912628, + "learning_rate": 2.879121710886085e-05, + "loss": 0.0377, + "step": 51670 + }, + { + "epoch": 0.11397494227372071, + "grad_norm": 0.12251947075128555, + "learning_rate": 2.8790566731927233e-05, + "loss": 0.0372, + "step": 51680 + }, + { + "epoch": 0.11399699624861886, + "grad_norm": 0.1247425302863121, + "learning_rate": 2.8789916187425214e-05, + "loss": 0.0362, + "step": 51690 + }, + { + "epoch": 0.11401905022351704, + "grad_norm": 0.11163453757762909, + "learning_rate": 2.8789265475362686e-05, + "loss": 0.0364, + "step": 51700 + }, + { + "epoch": 0.1140411041984152, + "grad_norm": 0.12173338979482651, + "learning_rate": 2.8788614595747568e-05, + "loss": 0.0405, + "step": 51710 + }, + { + "epoch": 0.11406315817331336, + "grad_norm": 0.08666561543941498, + "learning_rate": 2.8787963548587764e-05, + "loss": 0.0358, + "step": 51720 + }, + { + "epoch": 0.11408521214821153, + "grad_norm": 0.13090643286705017, + "learning_rate": 2.8787312333891186e-05, + "loss": 0.039, + "step": 51730 + }, + { + "epoch": 0.1141072661231097, + "grad_norm": 0.09410922229290009, + "learning_rate": 2.8786660951665742e-05, + "loss": 0.0368, + "step": 51740 + }, + { + "epoch": 0.11412932009800786, + "grad_norm": 0.1113494262099266, + "learning_rate": 2.8786009401919356e-05, + "loss": 0.0388, + "step": 51750 + }, + { + "epoch": 0.11415137407290603, + "grad_norm": 0.11414692550897598, + "learning_rate": 2.8785357684659933e-05, + "loss": 0.0364, + "step": 51760 + }, + { + "epoch": 0.1141734280478042, + "grad_norm": 0.1343584507703781, + "learning_rate": 2.8784705799895404e-05, + "loss": 0.0365, + "step": 51770 + }, + { + "epoch": 0.11419548202270237, + "grad_norm": 0.10087946057319641, + "learning_rate": 2.8784053747633677e-05, + "loss": 0.0381, + "step": 51780 + }, + { + "epoch": 0.11421753599760052, + "grad_norm": 0.13961364328861237, + "learning_rate": 2.8783401527882692e-05, + "loss": 0.0367, + "step": 51790 + }, + { + "epoch": 0.1142395899724987, + "grad_norm": 0.11159726232290268, + "learning_rate": 2.8782749140650362e-05, + "loss": 0.0354, + "step": 51800 + }, + { + "epoch": 0.11426164394739687, + "grad_norm": 0.11673061549663544, + "learning_rate": 2.8782096585944615e-05, + "loss": 0.0353, + "step": 51810 + }, + { + "epoch": 0.11428369792229502, + "grad_norm": 0.11684982478618622, + "learning_rate": 2.878144386377338e-05, + "loss": 0.0357, + "step": 51820 + }, + { + "epoch": 0.11430575189719319, + "grad_norm": 0.11049848049879074, + "learning_rate": 2.8780790974144593e-05, + "loss": 0.0369, + "step": 51830 + }, + { + "epoch": 0.11432780587209136, + "grad_norm": 0.11743580549955368, + "learning_rate": 2.8780137917066184e-05, + "loss": 0.0379, + "step": 51840 + }, + { + "epoch": 0.11434985984698952, + "grad_norm": 0.17647048830986023, + "learning_rate": 2.877948469254609e-05, + "loss": 0.0369, + "step": 51850 + }, + { + "epoch": 0.11437191382188769, + "grad_norm": 0.13086257874965668, + "learning_rate": 2.8778831300592245e-05, + "loss": 0.0372, + "step": 51860 + }, + { + "epoch": 0.11439396779678586, + "grad_norm": 0.0931781455874443, + "learning_rate": 2.877817774121259e-05, + "loss": 0.0378, + "step": 51870 + }, + { + "epoch": 0.11441602177168401, + "grad_norm": 0.13370119035243988, + "learning_rate": 2.877752401441507e-05, + "loss": 0.0383, + "step": 51880 + }, + { + "epoch": 0.11443807574658219, + "grad_norm": 0.1191844567656517, + "learning_rate": 2.8776870120207623e-05, + "loss": 0.0356, + "step": 51890 + }, + { + "epoch": 0.11446012972148036, + "grad_norm": 0.10721000283956528, + "learning_rate": 2.8776216058598194e-05, + "loss": 0.039, + "step": 51900 + }, + { + "epoch": 0.11448218369637851, + "grad_norm": 0.11086495220661163, + "learning_rate": 2.8775561829594737e-05, + "loss": 0.0382, + "step": 51910 + }, + { + "epoch": 0.11450423767127668, + "grad_norm": 0.14001351594924927, + "learning_rate": 2.87749074332052e-05, + "loss": 0.0374, + "step": 51920 + }, + { + "epoch": 0.11452629164617485, + "grad_norm": 0.11915892362594604, + "learning_rate": 2.8774252869437526e-05, + "loss": 0.0403, + "step": 51930 + }, + { + "epoch": 0.11454834562107301, + "grad_norm": 0.11265476047992706, + "learning_rate": 2.8773598138299677e-05, + "loss": 0.0368, + "step": 51940 + }, + { + "epoch": 0.11457039959597118, + "grad_norm": 0.12503862380981445, + "learning_rate": 2.8772943239799603e-05, + "loss": 0.0361, + "step": 51950 + }, + { + "epoch": 0.11459245357086935, + "grad_norm": 0.11598363518714905, + "learning_rate": 2.877228817394527e-05, + "loss": 0.0358, + "step": 51960 + }, + { + "epoch": 0.1146145075457675, + "grad_norm": 0.107354536652565, + "learning_rate": 2.8771632940744632e-05, + "loss": 0.0386, + "step": 51970 + }, + { + "epoch": 0.11463656152066568, + "grad_norm": 0.0980900377035141, + "learning_rate": 2.8770977540205647e-05, + "loss": 0.0366, + "step": 51980 + }, + { + "epoch": 0.11465861549556385, + "grad_norm": 0.09572836756706238, + "learning_rate": 2.8770321972336285e-05, + "loss": 0.036, + "step": 51990 + }, + { + "epoch": 0.11468066947046202, + "grad_norm": 0.09344921261072159, + "learning_rate": 2.876966623714451e-05, + "loss": 0.0366, + "step": 52000 + }, + { + "epoch": 0.11470272344536017, + "grad_norm": 0.10799221694469452, + "learning_rate": 2.8769010334638288e-05, + "loss": 0.0355, + "step": 52010 + }, + { + "epoch": 0.11472477742025834, + "grad_norm": 0.12002857029438019, + "learning_rate": 2.876835426482559e-05, + "loss": 0.0369, + "step": 52020 + }, + { + "epoch": 0.11474683139515651, + "grad_norm": 0.1274573802947998, + "learning_rate": 2.8767698027714393e-05, + "loss": 0.0359, + "step": 52030 + }, + { + "epoch": 0.11476888537005467, + "grad_norm": 0.1184447854757309, + "learning_rate": 2.8767041623312664e-05, + "loss": 0.0355, + "step": 52040 + }, + { + "epoch": 0.11479093934495284, + "grad_norm": 0.1063583716750145, + "learning_rate": 2.8766385051628378e-05, + "loss": 0.0382, + "step": 52050 + }, + { + "epoch": 0.11481299331985101, + "grad_norm": 0.13820555806159973, + "learning_rate": 2.8765728312669522e-05, + "loss": 0.0354, + "step": 52060 + }, + { + "epoch": 0.11483504729474917, + "grad_norm": 0.1250060498714447, + "learning_rate": 2.8765071406444065e-05, + "loss": 0.0358, + "step": 52070 + }, + { + "epoch": 0.11485710126964734, + "grad_norm": 0.1147456169128418, + "learning_rate": 2.8764414332959992e-05, + "loss": 0.0368, + "step": 52080 + }, + { + "epoch": 0.1148791552445455, + "grad_norm": 0.08725687116384506, + "learning_rate": 2.8763757092225296e-05, + "loss": 0.0358, + "step": 52090 + }, + { + "epoch": 0.11490120921944366, + "grad_norm": 0.10624540597200394, + "learning_rate": 2.876309968424795e-05, + "loss": 0.0374, + "step": 52100 + }, + { + "epoch": 0.11492326319434183, + "grad_norm": 0.13178229331970215, + "learning_rate": 2.8762442109035952e-05, + "loss": 0.037, + "step": 52110 + }, + { + "epoch": 0.11494531716924, + "grad_norm": 0.1405642181634903, + "learning_rate": 2.8761784366597285e-05, + "loss": 0.0341, + "step": 52120 + }, + { + "epoch": 0.11496737114413816, + "grad_norm": 0.11319096386432648, + "learning_rate": 2.876112645693995e-05, + "loss": 0.0364, + "step": 52130 + }, + { + "epoch": 0.11498942511903633, + "grad_norm": 0.1462252289056778, + "learning_rate": 2.8760468380071933e-05, + "loss": 0.0371, + "step": 52140 + }, + { + "epoch": 0.1150114790939345, + "grad_norm": 0.10166312009096146, + "learning_rate": 2.8759810136001232e-05, + "loss": 0.0368, + "step": 52150 + }, + { + "epoch": 0.11503353306883266, + "grad_norm": 0.11611755937337875, + "learning_rate": 2.8759151724735847e-05, + "loss": 0.037, + "step": 52160 + }, + { + "epoch": 0.11505558704373083, + "grad_norm": 0.14239317178726196, + "learning_rate": 2.875849314628378e-05, + "loss": 0.0376, + "step": 52170 + }, + { + "epoch": 0.115077641018629, + "grad_norm": 0.14307141304016113, + "learning_rate": 2.875783440065303e-05, + "loss": 0.0382, + "step": 52180 + }, + { + "epoch": 0.11509969499352715, + "grad_norm": 0.11765050143003464, + "learning_rate": 2.87571754878516e-05, + "loss": 0.0372, + "step": 52190 + }, + { + "epoch": 0.11512174896842532, + "grad_norm": 0.09596963226795197, + "learning_rate": 2.87565164078875e-05, + "loss": 0.0367, + "step": 52200 + }, + { + "epoch": 0.11514380294332349, + "grad_norm": 0.12350983917713165, + "learning_rate": 2.875585716076874e-05, + "loss": 0.0376, + "step": 52210 + }, + { + "epoch": 0.11516585691822166, + "grad_norm": 0.1233515813946724, + "learning_rate": 2.8755197746503324e-05, + "loss": 0.0372, + "step": 52220 + }, + { + "epoch": 0.11518791089311982, + "grad_norm": 0.1006171703338623, + "learning_rate": 2.875453816509927e-05, + "loss": 0.0365, + "step": 52230 + }, + { + "epoch": 0.11520996486801799, + "grad_norm": 0.11755616217851639, + "learning_rate": 2.8753878416564592e-05, + "loss": 0.0361, + "step": 52240 + }, + { + "epoch": 0.11523201884291616, + "grad_norm": 0.1189027801156044, + "learning_rate": 2.8753218500907303e-05, + "loss": 0.0357, + "step": 52250 + }, + { + "epoch": 0.11525407281781432, + "grad_norm": 0.13196441531181335, + "learning_rate": 2.8752558418135424e-05, + "loss": 0.0376, + "step": 52260 + }, + { + "epoch": 0.11527612679271249, + "grad_norm": 0.11637350171804428, + "learning_rate": 2.8751898168256975e-05, + "loss": 0.0362, + "step": 52270 + }, + { + "epoch": 0.11529818076761066, + "grad_norm": 0.1498968005180359, + "learning_rate": 2.875123775127998e-05, + "loss": 0.0368, + "step": 52280 + }, + { + "epoch": 0.11532023474250881, + "grad_norm": 0.10589331388473511, + "learning_rate": 2.875057716721247e-05, + "loss": 0.035, + "step": 52290 + }, + { + "epoch": 0.11534228871740698, + "grad_norm": 0.0988290086388588, + "learning_rate": 2.8749916416062456e-05, + "loss": 0.0364, + "step": 52300 + }, + { + "epoch": 0.11536434269230515, + "grad_norm": 0.12932705879211426, + "learning_rate": 2.874925549783798e-05, + "loss": 0.0346, + "step": 52310 + }, + { + "epoch": 0.11538639666720331, + "grad_norm": 0.11132166534662247, + "learning_rate": 2.874859441254707e-05, + "loss": 0.0356, + "step": 52320 + }, + { + "epoch": 0.11540845064210148, + "grad_norm": 0.135649174451828, + "learning_rate": 2.8747933160197752e-05, + "loss": 0.0394, + "step": 52330 + }, + { + "epoch": 0.11543050461699965, + "grad_norm": 0.12729895114898682, + "learning_rate": 2.874727174079807e-05, + "loss": 0.0367, + "step": 52340 + }, + { + "epoch": 0.1154525585918978, + "grad_norm": 0.10635272413492203, + "learning_rate": 2.874661015435605e-05, + "loss": 0.0367, + "step": 52350 + }, + { + "epoch": 0.11547461256679598, + "grad_norm": 0.11206366121768951, + "learning_rate": 2.8745948400879742e-05, + "loss": 0.038, + "step": 52360 + }, + { + "epoch": 0.11549666654169415, + "grad_norm": 0.08633596450090408, + "learning_rate": 2.8745286480377188e-05, + "loss": 0.0366, + "step": 52370 + }, + { + "epoch": 0.1155187205165923, + "grad_norm": 0.13574880361557007, + "learning_rate": 2.8744624392856417e-05, + "loss": 0.0368, + "step": 52380 + }, + { + "epoch": 0.11554077449149047, + "grad_norm": 0.13902749121189117, + "learning_rate": 2.8743962138325485e-05, + "loss": 0.036, + "step": 52390 + }, + { + "epoch": 0.11556282846638864, + "grad_norm": 0.1105431318283081, + "learning_rate": 2.8743299716792437e-05, + "loss": 0.0367, + "step": 52400 + }, + { + "epoch": 0.1155848824412868, + "grad_norm": 0.10578035563230515, + "learning_rate": 2.8742637128265324e-05, + "loss": 0.0364, + "step": 52410 + }, + { + "epoch": 0.11560693641618497, + "grad_norm": 0.0880003497004509, + "learning_rate": 2.8741974372752192e-05, + "loss": 0.0373, + "step": 52420 + }, + { + "epoch": 0.11562899039108314, + "grad_norm": 0.12178879976272583, + "learning_rate": 2.8741311450261097e-05, + "loss": 0.0373, + "step": 52430 + }, + { + "epoch": 0.11565104436598131, + "grad_norm": 0.12886735796928406, + "learning_rate": 2.8740648360800096e-05, + "loss": 0.0372, + "step": 52440 + }, + { + "epoch": 0.11567309834087947, + "grad_norm": 0.13820213079452515, + "learning_rate": 2.8739985104377246e-05, + "loss": 0.0359, + "step": 52450 + }, + { + "epoch": 0.11569515231577764, + "grad_norm": 0.12656675279140472, + "learning_rate": 2.8739321681000597e-05, + "loss": 0.0349, + "step": 52460 + }, + { + "epoch": 0.1157172062906758, + "grad_norm": 0.13708342611789703, + "learning_rate": 2.8738658090678223e-05, + "loss": 0.0356, + "step": 52470 + }, + { + "epoch": 0.11573926026557396, + "grad_norm": 0.1483454406261444, + "learning_rate": 2.8737994333418177e-05, + "loss": 0.0357, + "step": 52480 + }, + { + "epoch": 0.11576131424047213, + "grad_norm": 0.15652824938297272, + "learning_rate": 2.8737330409228534e-05, + "loss": 0.0373, + "step": 52490 + }, + { + "epoch": 0.1157833682153703, + "grad_norm": 0.13985377550125122, + "learning_rate": 2.8736666318117354e-05, + "loss": 0.0388, + "step": 52500 + }, + { + "epoch": 0.11580542219026846, + "grad_norm": 0.11567416042089462, + "learning_rate": 2.873600206009271e-05, + "loss": 0.0358, + "step": 52510 + }, + { + "epoch": 0.11582747616516663, + "grad_norm": 0.11455462127923965, + "learning_rate": 2.873533763516267e-05, + "loss": 0.0372, + "step": 52520 + }, + { + "epoch": 0.1158495301400648, + "grad_norm": 0.12905892729759216, + "learning_rate": 2.873467304333531e-05, + "loss": 0.0359, + "step": 52530 + }, + { + "epoch": 0.11587158411496296, + "grad_norm": 0.1318463534116745, + "learning_rate": 2.8734008284618705e-05, + "loss": 0.0379, + "step": 52540 + }, + { + "epoch": 0.11589363808986113, + "grad_norm": 0.13277851045131683, + "learning_rate": 2.8733343359020935e-05, + "loss": 0.0367, + "step": 52550 + }, + { + "epoch": 0.1159156920647593, + "grad_norm": 0.11862439662218094, + "learning_rate": 2.873267826655007e-05, + "loss": 0.0365, + "step": 52560 + }, + { + "epoch": 0.11593774603965745, + "grad_norm": 0.14942121505737305, + "learning_rate": 2.8732013007214208e-05, + "loss": 0.0382, + "step": 52570 + }, + { + "epoch": 0.11595980001455562, + "grad_norm": 0.13044731318950653, + "learning_rate": 2.873134758102142e-05, + "loss": 0.0354, + "step": 52580 + }, + { + "epoch": 0.11598185398945379, + "grad_norm": 0.11043106764554977, + "learning_rate": 2.873068198797979e-05, + "loss": 0.0375, + "step": 52590 + }, + { + "epoch": 0.11600390796435195, + "grad_norm": 0.1417398452758789, + "learning_rate": 2.873001622809741e-05, + "loss": 0.0382, + "step": 52600 + }, + { + "epoch": 0.11602596193925012, + "grad_norm": 0.09119123220443726, + "learning_rate": 2.8729350301382372e-05, + "loss": 0.0369, + "step": 52610 + }, + { + "epoch": 0.11604801591414829, + "grad_norm": 0.0859856903553009, + "learning_rate": 2.8728684207842764e-05, + "loss": 0.0346, + "step": 52620 + }, + { + "epoch": 0.11607006988904645, + "grad_norm": 0.12263856828212738, + "learning_rate": 2.8728017947486684e-05, + "loss": 0.0364, + "step": 52630 + }, + { + "epoch": 0.11609212386394462, + "grad_norm": 0.10968364775180817, + "learning_rate": 2.8727351520322217e-05, + "loss": 0.0367, + "step": 52640 + }, + { + "epoch": 0.11611417783884279, + "grad_norm": 0.14351245760917664, + "learning_rate": 2.8726684926357478e-05, + "loss": 0.037, + "step": 52650 + }, + { + "epoch": 0.11613623181374096, + "grad_norm": 0.1236344650387764, + "learning_rate": 2.872601816560055e-05, + "loss": 0.0374, + "step": 52660 + }, + { + "epoch": 0.11615828578863911, + "grad_norm": 0.11758816987276077, + "learning_rate": 2.8725351238059545e-05, + "loss": 0.0379, + "step": 52670 + }, + { + "epoch": 0.11618033976353728, + "grad_norm": 0.1402938812971115, + "learning_rate": 2.872468414374256e-05, + "loss": 0.0368, + "step": 52680 + }, + { + "epoch": 0.11620239373843545, + "grad_norm": 0.10813295841217041, + "learning_rate": 2.8724016882657708e-05, + "loss": 0.0352, + "step": 52690 + }, + { + "epoch": 0.11622444771333361, + "grad_norm": 0.12038804590702057, + "learning_rate": 2.8723349454813095e-05, + "loss": 0.0395, + "step": 52700 + }, + { + "epoch": 0.11624650168823178, + "grad_norm": 0.10998043417930603, + "learning_rate": 2.8722681860216825e-05, + "loss": 0.036, + "step": 52710 + }, + { + "epoch": 0.11626855566312995, + "grad_norm": 0.11119844764471054, + "learning_rate": 2.8722014098877014e-05, + "loss": 0.0374, + "step": 52720 + }, + { + "epoch": 0.1162906096380281, + "grad_norm": 0.09641611576080322, + "learning_rate": 2.872134617080178e-05, + "loss": 0.0376, + "step": 52730 + }, + { + "epoch": 0.11631266361292628, + "grad_norm": 0.10838323086500168, + "learning_rate": 2.8720678075999232e-05, + "loss": 0.035, + "step": 52740 + }, + { + "epoch": 0.11633471758782445, + "grad_norm": 0.10568714141845703, + "learning_rate": 2.8720009814477494e-05, + "loss": 0.036, + "step": 52750 + }, + { + "epoch": 0.1163567715627226, + "grad_norm": 0.14401768147945404, + "learning_rate": 2.8719341386244676e-05, + "loss": 0.0379, + "step": 52760 + }, + { + "epoch": 0.11637882553762077, + "grad_norm": 0.11833459883928299, + "learning_rate": 2.8718672791308913e-05, + "loss": 0.0368, + "step": 52770 + }, + { + "epoch": 0.11640087951251894, + "grad_norm": 0.10978454351425171, + "learning_rate": 2.871800402967832e-05, + "loss": 0.0347, + "step": 52780 + }, + { + "epoch": 0.1164229334874171, + "grad_norm": 0.10640864074230194, + "learning_rate": 2.8717335101361027e-05, + "loss": 0.0379, + "step": 52790 + }, + { + "epoch": 0.11644498746231527, + "grad_norm": 0.13648384809494019, + "learning_rate": 2.8716666006365164e-05, + "loss": 0.0368, + "step": 52800 + }, + { + "epoch": 0.11646704143721344, + "grad_norm": 0.1145019382238388, + "learning_rate": 2.8715996744698855e-05, + "loss": 0.0356, + "step": 52810 + }, + { + "epoch": 0.1164890954121116, + "grad_norm": 0.11119532585144043, + "learning_rate": 2.8715327316370238e-05, + "loss": 0.0363, + "step": 52820 + }, + { + "epoch": 0.11651114938700977, + "grad_norm": 0.1854621320962906, + "learning_rate": 2.871465772138744e-05, + "loss": 0.0378, + "step": 52830 + }, + { + "epoch": 0.11653320336190794, + "grad_norm": 0.12470456212759018, + "learning_rate": 2.8713987959758607e-05, + "loss": 0.0348, + "step": 52840 + }, + { + "epoch": 0.11655525733680609, + "grad_norm": 0.10480581969022751, + "learning_rate": 2.871331803149187e-05, + "loss": 0.0371, + "step": 52850 + }, + { + "epoch": 0.11657731131170426, + "grad_norm": 0.10213132202625275, + "learning_rate": 2.871264793659537e-05, + "loss": 0.0354, + "step": 52860 + }, + { + "epoch": 0.11659936528660243, + "grad_norm": 0.11678337305784225, + "learning_rate": 2.8711977675077252e-05, + "loss": 0.0365, + "step": 52870 + }, + { + "epoch": 0.1166214192615006, + "grad_norm": 0.09728939086198807, + "learning_rate": 2.8711307246945662e-05, + "loss": 0.0359, + "step": 52880 + }, + { + "epoch": 0.11664347323639876, + "grad_norm": 0.14366157352924347, + "learning_rate": 2.871063665220874e-05, + "loss": 0.0383, + "step": 52890 + }, + { + "epoch": 0.11666552721129693, + "grad_norm": 0.09898746758699417, + "learning_rate": 2.8709965890874636e-05, + "loss": 0.036, + "step": 52900 + }, + { + "epoch": 0.1166875811861951, + "grad_norm": 0.10092207044363022, + "learning_rate": 2.8709294962951503e-05, + "loss": 0.0357, + "step": 52910 + }, + { + "epoch": 0.11670963516109326, + "grad_norm": 0.08578582108020782, + "learning_rate": 2.8708623868447493e-05, + "loss": 0.0357, + "step": 52920 + }, + { + "epoch": 0.11673168913599143, + "grad_norm": 0.1427755206823349, + "learning_rate": 2.870795260737076e-05, + "loss": 0.0378, + "step": 52930 + }, + { + "epoch": 0.1167537431108896, + "grad_norm": 0.13417072594165802, + "learning_rate": 2.8707281179729465e-05, + "loss": 0.0354, + "step": 52940 + }, + { + "epoch": 0.11677579708578775, + "grad_norm": 0.10606072098016739, + "learning_rate": 2.8706609585531757e-05, + "loss": 0.0392, + "step": 52950 + }, + { + "epoch": 0.11679785106068592, + "grad_norm": 0.10720183700323105, + "learning_rate": 2.87059378247858e-05, + "loss": 0.0375, + "step": 52960 + }, + { + "epoch": 0.11681990503558409, + "grad_norm": 0.10161130130290985, + "learning_rate": 2.8705265897499762e-05, + "loss": 0.0361, + "step": 52970 + }, + { + "epoch": 0.11684195901048225, + "grad_norm": 0.14522750675678253, + "learning_rate": 2.87045938036818e-05, + "loss": 0.037, + "step": 52980 + }, + { + "epoch": 0.11686401298538042, + "grad_norm": 0.12174498289823532, + "learning_rate": 2.8703921543340088e-05, + "loss": 0.0358, + "step": 52990 + }, + { + "epoch": 0.11688606696027859, + "grad_norm": 0.12422267347574234, + "learning_rate": 2.8703249116482786e-05, + "loss": 0.036, + "step": 53000 + }, + { + "epoch": 0.11690812093517675, + "grad_norm": 0.12647157907485962, + "learning_rate": 2.870257652311807e-05, + "loss": 0.038, + "step": 53010 + }, + { + "epoch": 0.11693017491007492, + "grad_norm": 0.10192607343196869, + "learning_rate": 2.8701903763254116e-05, + "loss": 0.0379, + "step": 53020 + }, + { + "epoch": 0.11695222888497309, + "grad_norm": 0.12150759994983673, + "learning_rate": 2.8701230836899094e-05, + "loss": 0.0363, + "step": 53030 + }, + { + "epoch": 0.11697428285987124, + "grad_norm": 0.09856786578893661, + "learning_rate": 2.8700557744061178e-05, + "loss": 0.0373, + "step": 53040 + }, + { + "epoch": 0.11699633683476941, + "grad_norm": 0.0980854257941246, + "learning_rate": 2.869988448474855e-05, + "loss": 0.0375, + "step": 53050 + }, + { + "epoch": 0.11701839080966758, + "grad_norm": 0.11601142585277557, + "learning_rate": 2.869921105896939e-05, + "loss": 0.0371, + "step": 53060 + }, + { + "epoch": 0.11704044478456575, + "grad_norm": 0.11621163785457611, + "learning_rate": 2.8698537466731886e-05, + "loss": 0.0372, + "step": 53070 + }, + { + "epoch": 0.11706249875946391, + "grad_norm": 0.11111751198768616, + "learning_rate": 2.8697863708044213e-05, + "loss": 0.0382, + "step": 53080 + }, + { + "epoch": 0.11708455273436208, + "grad_norm": 0.10950391739606857, + "learning_rate": 2.8697189782914566e-05, + "loss": 0.0364, + "step": 53090 + }, + { + "epoch": 0.11710660670926025, + "grad_norm": 0.108127161860466, + "learning_rate": 2.869651569135113e-05, + "loss": 0.0367, + "step": 53100 + }, + { + "epoch": 0.1171286606841584, + "grad_norm": 0.12058988958597183, + "learning_rate": 2.86958414333621e-05, + "loss": 0.0367, + "step": 53110 + }, + { + "epoch": 0.11715071465905658, + "grad_norm": 0.0970897451043129, + "learning_rate": 2.8695167008955655e-05, + "loss": 0.0332, + "step": 53120 + }, + { + "epoch": 0.11717276863395475, + "grad_norm": 0.12823812663555145, + "learning_rate": 2.8694492418140013e-05, + "loss": 0.0352, + "step": 53130 + }, + { + "epoch": 0.1171948226088529, + "grad_norm": 0.11412573605775833, + "learning_rate": 2.8693817660923355e-05, + "loss": 0.0356, + "step": 53140 + }, + { + "epoch": 0.11721687658375107, + "grad_norm": 0.1267995536327362, + "learning_rate": 2.869314273731388e-05, + "loss": 0.0371, + "step": 53150 + }, + { + "epoch": 0.11723893055864924, + "grad_norm": 0.10229874402284622, + "learning_rate": 2.8692467647319793e-05, + "loss": 0.0345, + "step": 53160 + }, + { + "epoch": 0.1172609845335474, + "grad_norm": 0.1141900047659874, + "learning_rate": 2.8691792390949294e-05, + "loss": 0.0391, + "step": 53170 + }, + { + "epoch": 0.11728303850844557, + "grad_norm": 0.09771287441253662, + "learning_rate": 2.8691116968210596e-05, + "loss": 0.0332, + "step": 53180 + }, + { + "epoch": 0.11730509248334374, + "grad_norm": 0.14289110898971558, + "learning_rate": 2.8690441379111897e-05, + "loss": 0.0388, + "step": 53190 + }, + { + "epoch": 0.1173271464582419, + "grad_norm": 0.10042130947113037, + "learning_rate": 2.868976562366141e-05, + "loss": 0.0361, + "step": 53200 + }, + { + "epoch": 0.11734920043314007, + "grad_norm": 0.10840047895908356, + "learning_rate": 2.8689089701867346e-05, + "loss": 0.0353, + "step": 53210 + }, + { + "epoch": 0.11737125440803824, + "grad_norm": 0.11397384852170944, + "learning_rate": 2.8688413613737915e-05, + "loss": 0.0369, + "step": 53220 + }, + { + "epoch": 0.11739330838293639, + "grad_norm": 0.11997738480567932, + "learning_rate": 2.8687737359281338e-05, + "loss": 0.0382, + "step": 53230 + }, + { + "epoch": 0.11741536235783456, + "grad_norm": 0.11704482138156891, + "learning_rate": 2.8687060938505827e-05, + "loss": 0.0369, + "step": 53240 + }, + { + "epoch": 0.11743741633273273, + "grad_norm": 0.1129021868109703, + "learning_rate": 2.8686384351419604e-05, + "loss": 0.0377, + "step": 53250 + }, + { + "epoch": 0.11745947030763089, + "grad_norm": 0.10537468641996384, + "learning_rate": 2.8685707598030887e-05, + "loss": 0.0397, + "step": 53260 + }, + { + "epoch": 0.11748152428252906, + "grad_norm": 0.1310987025499344, + "learning_rate": 2.86850306783479e-05, + "loss": 0.038, + "step": 53270 + }, + { + "epoch": 0.11750357825742723, + "grad_norm": 0.12411578744649887, + "learning_rate": 2.8684353592378872e-05, + "loss": 0.0372, + "step": 53280 + }, + { + "epoch": 0.1175256322323254, + "grad_norm": 0.09909118711948395, + "learning_rate": 2.868367634013203e-05, + "loss": 0.0353, + "step": 53290 + }, + { + "epoch": 0.11754768620722356, + "grad_norm": 0.08377044647932053, + "learning_rate": 2.86829989216156e-05, + "loss": 0.0335, + "step": 53300 + }, + { + "epoch": 0.11756974018212173, + "grad_norm": 0.10936357080936432, + "learning_rate": 2.8682321336837812e-05, + "loss": 0.0368, + "step": 53310 + }, + { + "epoch": 0.1175917941570199, + "grad_norm": 0.11985906958580017, + "learning_rate": 2.86816435858069e-05, + "loss": 0.0361, + "step": 53320 + }, + { + "epoch": 0.11761384813191805, + "grad_norm": 0.1411646157503128, + "learning_rate": 2.86809656685311e-05, + "loss": 0.0381, + "step": 53330 + }, + { + "epoch": 0.11763590210681622, + "grad_norm": 0.1897161602973938, + "learning_rate": 2.8680287585018655e-05, + "loss": 0.0367, + "step": 53340 + }, + { + "epoch": 0.1176579560817144, + "grad_norm": 0.14071263372898102, + "learning_rate": 2.86796093352778e-05, + "loss": 0.038, + "step": 53350 + }, + { + "epoch": 0.11768001005661255, + "grad_norm": 0.16462132334709167, + "learning_rate": 2.8678930919316773e-05, + "loss": 0.0376, + "step": 53360 + }, + { + "epoch": 0.11770206403151072, + "grad_norm": 0.10713130235671997, + "learning_rate": 2.867825233714382e-05, + "loss": 0.0379, + "step": 53370 + }, + { + "epoch": 0.11772411800640889, + "grad_norm": 0.10699378699064255, + "learning_rate": 2.8677573588767185e-05, + "loss": 0.0354, + "step": 53380 + }, + { + "epoch": 0.11774617198130705, + "grad_norm": 0.08968318998813629, + "learning_rate": 2.8676894674195122e-05, + "loss": 0.0365, + "step": 53390 + }, + { + "epoch": 0.11776822595620522, + "grad_norm": 0.09208106994628906, + "learning_rate": 2.867621559343587e-05, + "loss": 0.0352, + "step": 53400 + }, + { + "epoch": 0.11779027993110339, + "grad_norm": 0.10643988102674484, + "learning_rate": 2.8675536346497687e-05, + "loss": 0.0361, + "step": 53410 + }, + { + "epoch": 0.11781233390600154, + "grad_norm": 0.10423756390810013, + "learning_rate": 2.867485693338883e-05, + "loss": 0.0361, + "step": 53420 + }, + { + "epoch": 0.11783438788089971, + "grad_norm": 0.1149992123246193, + "learning_rate": 2.8674177354117547e-05, + "loss": 0.0382, + "step": 53430 + }, + { + "epoch": 0.11785644185579788, + "grad_norm": 0.11469116806983948, + "learning_rate": 2.86734976086921e-05, + "loss": 0.0375, + "step": 53440 + }, + { + "epoch": 0.11787849583069604, + "grad_norm": 0.11624376475811005, + "learning_rate": 2.8672817697120747e-05, + "loss": 0.0359, + "step": 53450 + }, + { + "epoch": 0.11790054980559421, + "grad_norm": 0.10162261873483658, + "learning_rate": 2.867213761941175e-05, + "loss": 0.0358, + "step": 53460 + }, + { + "epoch": 0.11792260378049238, + "grad_norm": 0.12040507793426514, + "learning_rate": 2.867145737557337e-05, + "loss": 0.0367, + "step": 53470 + }, + { + "epoch": 0.11794465775539054, + "grad_norm": 0.10938528925180435, + "learning_rate": 2.8670776965613878e-05, + "loss": 0.0351, + "step": 53480 + }, + { + "epoch": 0.1179667117302887, + "grad_norm": 0.1247611939907074, + "learning_rate": 2.8670096389541536e-05, + "loss": 0.0366, + "step": 53490 + }, + { + "epoch": 0.11798876570518688, + "grad_norm": 0.10191145539283752, + "learning_rate": 2.8669415647364616e-05, + "loss": 0.035, + "step": 53500 + }, + { + "epoch": 0.11801081968008505, + "grad_norm": 0.13042068481445312, + "learning_rate": 2.8668734739091394e-05, + "loss": 0.0374, + "step": 53510 + }, + { + "epoch": 0.1180328736549832, + "grad_norm": 0.11297444254159927, + "learning_rate": 2.8668053664730134e-05, + "loss": 0.0371, + "step": 53520 + }, + { + "epoch": 0.11805492762988137, + "grad_norm": 0.12114790081977844, + "learning_rate": 2.8667372424289123e-05, + "loss": 0.0366, + "step": 53530 + }, + { + "epoch": 0.11807698160477954, + "grad_norm": 0.12667807936668396, + "learning_rate": 2.8666691017776626e-05, + "loss": 0.0346, + "step": 53540 + }, + { + "epoch": 0.1180990355796777, + "grad_norm": 0.18129447102546692, + "learning_rate": 2.866600944520093e-05, + "loss": 0.0363, + "step": 53550 + }, + { + "epoch": 0.11812108955457587, + "grad_norm": 0.12620587646961212, + "learning_rate": 2.8665327706570323e-05, + "loss": 0.0355, + "step": 53560 + }, + { + "epoch": 0.11814314352947404, + "grad_norm": 0.11107857525348663, + "learning_rate": 2.8664645801893077e-05, + "loss": 0.0352, + "step": 53570 + }, + { + "epoch": 0.1181651975043722, + "grad_norm": 0.09305272251367569, + "learning_rate": 2.8663963731177486e-05, + "loss": 0.0363, + "step": 53580 + }, + { + "epoch": 0.11818725147927037, + "grad_norm": 0.13406872749328613, + "learning_rate": 2.8663281494431832e-05, + "loss": 0.0361, + "step": 53590 + }, + { + "epoch": 0.11820930545416854, + "grad_norm": 0.09178531169891357, + "learning_rate": 2.866259909166441e-05, + "loss": 0.0361, + "step": 53600 + }, + { + "epoch": 0.11823135942906669, + "grad_norm": 0.14175492525100708, + "learning_rate": 2.866191652288351e-05, + "loss": 0.0356, + "step": 53610 + }, + { + "epoch": 0.11825341340396486, + "grad_norm": 0.12747414410114288, + "learning_rate": 2.8661233788097424e-05, + "loss": 0.0364, + "step": 53620 + }, + { + "epoch": 0.11827546737886303, + "grad_norm": 0.1127319410443306, + "learning_rate": 2.8660550887314447e-05, + "loss": 0.0364, + "step": 53630 + }, + { + "epoch": 0.11829752135376119, + "grad_norm": 0.0814526304602623, + "learning_rate": 2.865986782054288e-05, + "loss": 0.0374, + "step": 53640 + }, + { + "epoch": 0.11831957532865936, + "grad_norm": 0.08936960995197296, + "learning_rate": 2.865918458779102e-05, + "loss": 0.0339, + "step": 53650 + }, + { + "epoch": 0.11834162930355753, + "grad_norm": 0.11909769475460052, + "learning_rate": 2.8658501189067176e-05, + "loss": 0.0371, + "step": 53660 + }, + { + "epoch": 0.11836368327845569, + "grad_norm": 0.11889540404081345, + "learning_rate": 2.8657817624379643e-05, + "loss": 0.0349, + "step": 53670 + }, + { + "epoch": 0.11838573725335386, + "grad_norm": 0.10747946053743362, + "learning_rate": 2.8657133893736732e-05, + "loss": 0.0383, + "step": 53680 + }, + { + "epoch": 0.11840779122825203, + "grad_norm": 0.13131973147392273, + "learning_rate": 2.865644999714675e-05, + "loss": 0.0352, + "step": 53690 + }, + { + "epoch": 0.11842984520315018, + "grad_norm": 0.09656970202922821, + "learning_rate": 2.8655765934618006e-05, + "loss": 0.0346, + "step": 53700 + }, + { + "epoch": 0.11845189917804835, + "grad_norm": 0.13476939499378204, + "learning_rate": 2.8655081706158816e-05, + "loss": 0.0388, + "step": 53710 + }, + { + "epoch": 0.11847395315294652, + "grad_norm": 0.11780855804681778, + "learning_rate": 2.8654397311777486e-05, + "loss": 0.0369, + "step": 53720 + }, + { + "epoch": 0.1184960071278447, + "grad_norm": 0.08955869823694229, + "learning_rate": 2.8653712751482338e-05, + "loss": 0.0361, + "step": 53730 + }, + { + "epoch": 0.11851806110274285, + "grad_norm": 0.14765560626983643, + "learning_rate": 2.8653028025281692e-05, + "loss": 0.0378, + "step": 53740 + }, + { + "epoch": 0.11854011507764102, + "grad_norm": 0.1223127692937851, + "learning_rate": 2.8652343133183863e-05, + "loss": 0.0359, + "step": 53750 + }, + { + "epoch": 0.11856216905253919, + "grad_norm": 0.08917534351348877, + "learning_rate": 2.865165807519717e-05, + "loss": 0.034, + "step": 53760 + }, + { + "epoch": 0.11858422302743735, + "grad_norm": 0.10759514570236206, + "learning_rate": 2.8650972851329946e-05, + "loss": 0.0366, + "step": 53770 + }, + { + "epoch": 0.11860627700233552, + "grad_norm": 0.09118101000785828, + "learning_rate": 2.8650287461590513e-05, + "loss": 0.0369, + "step": 53780 + }, + { + "epoch": 0.11862833097723369, + "grad_norm": 0.12919369339942932, + "learning_rate": 2.8649601905987203e-05, + "loss": 0.0376, + "step": 53790 + }, + { + "epoch": 0.11865038495213184, + "grad_norm": 0.13726010918617249, + "learning_rate": 2.864891618452834e-05, + "loss": 0.0373, + "step": 53800 + }, + { + "epoch": 0.11867243892703001, + "grad_norm": 0.10380840301513672, + "learning_rate": 2.8648230297222255e-05, + "loss": 0.039, + "step": 53810 + }, + { + "epoch": 0.11869449290192818, + "grad_norm": 0.16713587939739227, + "learning_rate": 2.864754424407729e-05, + "loss": 0.0376, + "step": 53820 + }, + { + "epoch": 0.11871654687682634, + "grad_norm": 0.11307572573423386, + "learning_rate": 2.8646858025101778e-05, + "loss": 0.0375, + "step": 53830 + }, + { + "epoch": 0.11873860085172451, + "grad_norm": 0.11685813963413239, + "learning_rate": 2.8646171640304054e-05, + "loss": 0.0354, + "step": 53840 + }, + { + "epoch": 0.11876065482662268, + "grad_norm": 0.10804219543933868, + "learning_rate": 2.864548508969246e-05, + "loss": 0.0353, + "step": 53850 + }, + { + "epoch": 0.11878270880152084, + "grad_norm": 0.15649615228176117, + "learning_rate": 2.864479837327534e-05, + "loss": 0.039, + "step": 53860 + }, + { + "epoch": 0.118804762776419, + "grad_norm": 0.10882746428251266, + "learning_rate": 2.8644111491061035e-05, + "loss": 0.0354, + "step": 53870 + }, + { + "epoch": 0.11882681675131718, + "grad_norm": 0.12117626518011093, + "learning_rate": 2.8643424443057893e-05, + "loss": 0.0384, + "step": 53880 + }, + { + "epoch": 0.11884887072621533, + "grad_norm": 0.12133438885211945, + "learning_rate": 2.8642737229274266e-05, + "loss": 0.0352, + "step": 53890 + }, + { + "epoch": 0.1188709247011135, + "grad_norm": 0.10458351671695709, + "learning_rate": 2.8642049849718498e-05, + "loss": 0.0357, + "step": 53900 + }, + { + "epoch": 0.11889297867601167, + "grad_norm": 0.11088942736387253, + "learning_rate": 2.8641362304398944e-05, + "loss": 0.0345, + "step": 53910 + }, + { + "epoch": 0.11891503265090983, + "grad_norm": 0.1072159856557846, + "learning_rate": 2.8640674593323958e-05, + "loss": 0.0379, + "step": 53920 + }, + { + "epoch": 0.118937086625808, + "grad_norm": 0.10959680378437042, + "learning_rate": 2.8639986716501902e-05, + "loss": 0.0369, + "step": 53930 + }, + { + "epoch": 0.11895914060070617, + "grad_norm": 0.1221817210316658, + "learning_rate": 2.8639298673941124e-05, + "loss": 0.0378, + "step": 53940 + }, + { + "epoch": 0.11898119457560434, + "grad_norm": 0.12481055408716202, + "learning_rate": 2.8638610465649988e-05, + "loss": 0.037, + "step": 53950 + }, + { + "epoch": 0.1190032485505025, + "grad_norm": 0.10282843559980392, + "learning_rate": 2.863792209163686e-05, + "loss": 0.0373, + "step": 53960 + }, + { + "epoch": 0.11902530252540067, + "grad_norm": 0.1485426425933838, + "learning_rate": 2.86372335519101e-05, + "loss": 0.0349, + "step": 53970 + }, + { + "epoch": 0.11904735650029884, + "grad_norm": 0.10957573354244232, + "learning_rate": 2.8636544846478086e-05, + "loss": 0.0363, + "step": 53980 + }, + { + "epoch": 0.119069410475197, + "grad_norm": 0.11953684687614441, + "learning_rate": 2.863585597534917e-05, + "loss": 0.0357, + "step": 53990 + }, + { + "epoch": 0.11909146445009516, + "grad_norm": 0.12264787405729294, + "learning_rate": 2.863516693853173e-05, + "loss": 0.0356, + "step": 54000 + }, + { + "epoch": 0.11911351842499333, + "grad_norm": 0.10219204425811768, + "learning_rate": 2.8634477736034137e-05, + "loss": 0.0361, + "step": 54010 + }, + { + "epoch": 0.11913557239989149, + "grad_norm": 0.10468599200248718, + "learning_rate": 2.863378836786477e-05, + "loss": 0.0376, + "step": 54020 + }, + { + "epoch": 0.11915762637478966, + "grad_norm": 0.11337630450725555, + "learning_rate": 2.8633098834031998e-05, + "loss": 0.0369, + "step": 54030 + }, + { + "epoch": 0.11917968034968783, + "grad_norm": 0.11147312819957733, + "learning_rate": 2.8632409134544203e-05, + "loss": 0.0367, + "step": 54040 + }, + { + "epoch": 0.11920173432458599, + "grad_norm": 0.11236387491226196, + "learning_rate": 2.8631719269409767e-05, + "loss": 0.0368, + "step": 54050 + }, + { + "epoch": 0.11922378829948416, + "grad_norm": 0.08961142599582672, + "learning_rate": 2.8631029238637072e-05, + "loss": 0.0319, + "step": 54060 + }, + { + "epoch": 0.11924584227438233, + "grad_norm": 0.11018389463424683, + "learning_rate": 2.8630339042234502e-05, + "loss": 0.0351, + "step": 54070 + }, + { + "epoch": 0.11926789624928048, + "grad_norm": 0.09545683115720749, + "learning_rate": 2.8629648680210444e-05, + "loss": 0.0334, + "step": 54080 + }, + { + "epoch": 0.11928995022417865, + "grad_norm": 0.1013362854719162, + "learning_rate": 2.8628958152573285e-05, + "loss": 0.0362, + "step": 54090 + }, + { + "epoch": 0.11931200419907682, + "grad_norm": 0.11207222938537598, + "learning_rate": 2.8628267459331417e-05, + "loss": 0.0366, + "step": 54100 + }, + { + "epoch": 0.11933405817397498, + "grad_norm": 0.10338132083415985, + "learning_rate": 2.862757660049323e-05, + "loss": 0.0374, + "step": 54110 + }, + { + "epoch": 0.11935611214887315, + "grad_norm": 0.11099694669246674, + "learning_rate": 2.8626885576067122e-05, + "loss": 0.0346, + "step": 54120 + }, + { + "epoch": 0.11937816612377132, + "grad_norm": 0.1045118197798729, + "learning_rate": 2.8626194386061494e-05, + "loss": 0.0367, + "step": 54130 + }, + { + "epoch": 0.11940022009866949, + "grad_norm": 0.07807383686304092, + "learning_rate": 2.8625503030484733e-05, + "loss": 0.0346, + "step": 54140 + }, + { + "epoch": 0.11942227407356765, + "grad_norm": 0.13767877221107483, + "learning_rate": 2.862481150934525e-05, + "loss": 0.0373, + "step": 54150 + }, + { + "epoch": 0.11944432804846582, + "grad_norm": 0.1421361118555069, + "learning_rate": 2.8624119822651434e-05, + "loss": 0.0361, + "step": 54160 + }, + { + "epoch": 0.11946638202336399, + "grad_norm": 0.11546171456575394, + "learning_rate": 2.8623427970411707e-05, + "loss": 0.0362, + "step": 54170 + }, + { + "epoch": 0.11948843599826214, + "grad_norm": 0.12112683802843094, + "learning_rate": 2.8622735952634468e-05, + "loss": 0.037, + "step": 54180 + }, + { + "epoch": 0.11951048997316031, + "grad_norm": 0.1101100817322731, + "learning_rate": 2.862204376932812e-05, + "loss": 0.036, + "step": 54190 + }, + { + "epoch": 0.11953254394805848, + "grad_norm": 0.1299213171005249, + "learning_rate": 2.862135142050108e-05, + "loss": 0.0357, + "step": 54200 + }, + { + "epoch": 0.11955459792295664, + "grad_norm": 0.10580649971961975, + "learning_rate": 2.862065890616176e-05, + "loss": 0.0363, + "step": 54210 + }, + { + "epoch": 0.11957665189785481, + "grad_norm": 0.12155134230852127, + "learning_rate": 2.8619966226318575e-05, + "loss": 0.0345, + "step": 54220 + }, + { + "epoch": 0.11959870587275298, + "grad_norm": 0.11353043466806412, + "learning_rate": 2.861927338097994e-05, + "loss": 0.0378, + "step": 54230 + }, + { + "epoch": 0.11962075984765114, + "grad_norm": 0.11309969425201416, + "learning_rate": 2.8618580370154278e-05, + "loss": 0.0369, + "step": 54240 + }, + { + "epoch": 0.11964281382254931, + "grad_norm": 0.13436168432235718, + "learning_rate": 2.8617887193850006e-05, + "loss": 0.0376, + "step": 54250 + }, + { + "epoch": 0.11966486779744748, + "grad_norm": 0.13807988166809082, + "learning_rate": 2.8617193852075546e-05, + "loss": 0.0365, + "step": 54260 + }, + { + "epoch": 0.11968692177234563, + "grad_norm": 0.12543293833732605, + "learning_rate": 2.861650034483932e-05, + "loss": 0.0379, + "step": 54270 + }, + { + "epoch": 0.1197089757472438, + "grad_norm": 0.14233016967773438, + "learning_rate": 2.8615806672149766e-05, + "loss": 0.0369, + "step": 54280 + }, + { + "epoch": 0.11973102972214197, + "grad_norm": 0.0947253555059433, + "learning_rate": 2.8615112834015297e-05, + "loss": 0.0342, + "step": 54290 + }, + { + "epoch": 0.11975308369704013, + "grad_norm": 0.1261618733406067, + "learning_rate": 2.861441883044436e-05, + "loss": 0.0353, + "step": 54300 + }, + { + "epoch": 0.1197751376719383, + "grad_norm": 0.09374737739562988, + "learning_rate": 2.8613724661445375e-05, + "loss": 0.0359, + "step": 54310 + }, + { + "epoch": 0.11979719164683647, + "grad_norm": 0.13960705697536469, + "learning_rate": 2.8613030327026784e-05, + "loss": 0.0362, + "step": 54320 + }, + { + "epoch": 0.11981924562173463, + "grad_norm": 0.11379251629114151, + "learning_rate": 2.861233582719702e-05, + "loss": 0.0366, + "step": 54330 + }, + { + "epoch": 0.1198412995966328, + "grad_norm": 0.08691638708114624, + "learning_rate": 2.861164116196453e-05, + "loss": 0.0351, + "step": 54340 + }, + { + "epoch": 0.11986335357153097, + "grad_norm": 0.10258477181196213, + "learning_rate": 2.861094633133774e-05, + "loss": 0.0368, + "step": 54350 + }, + { + "epoch": 0.11988540754642914, + "grad_norm": 0.1322575807571411, + "learning_rate": 2.8610251335325103e-05, + "loss": 0.0353, + "step": 54360 + }, + { + "epoch": 0.1199074615213273, + "grad_norm": 0.10453425347805023, + "learning_rate": 2.8609556173935066e-05, + "loss": 0.0394, + "step": 54370 + }, + { + "epoch": 0.11992951549622546, + "grad_norm": 0.11420521885156631, + "learning_rate": 2.8608860847176066e-05, + "loss": 0.0377, + "step": 54380 + }, + { + "epoch": 0.11995156947112363, + "grad_norm": 0.11850845813751221, + "learning_rate": 2.8608165355056563e-05, + "loss": 0.0366, + "step": 54390 + }, + { + "epoch": 0.11997362344602179, + "grad_norm": 0.10642782598733902, + "learning_rate": 2.8607469697585e-05, + "loss": 0.0378, + "step": 54400 + }, + { + "epoch": 0.11999567742091996, + "grad_norm": 0.1124080941081047, + "learning_rate": 2.8606773874769835e-05, + "loss": 0.0363, + "step": 54410 + }, + { + "epoch": 0.12001773139581813, + "grad_norm": 0.1130475327372551, + "learning_rate": 2.8606077886619517e-05, + "loss": 0.0383, + "step": 54420 + }, + { + "epoch": 0.12003978537071629, + "grad_norm": 0.14595595002174377, + "learning_rate": 2.860538173314251e-05, + "loss": 0.038, + "step": 54430 + }, + { + "epoch": 0.12006183934561446, + "grad_norm": 0.1163080707192421, + "learning_rate": 2.8604685414347267e-05, + "loss": 0.0366, + "step": 54440 + }, + { + "epoch": 0.12008389332051263, + "grad_norm": 0.09602975845336914, + "learning_rate": 2.8603988930242252e-05, + "loss": 0.0368, + "step": 54450 + }, + { + "epoch": 0.12010594729541078, + "grad_norm": 0.12693971395492554, + "learning_rate": 2.8603292280835928e-05, + "loss": 0.0368, + "step": 54460 + }, + { + "epoch": 0.12012800127030895, + "grad_norm": 0.1241244450211525, + "learning_rate": 2.8602595466136762e-05, + "loss": 0.0363, + "step": 54470 + }, + { + "epoch": 0.12015005524520712, + "grad_norm": 0.138409823179245, + "learning_rate": 2.8601898486153214e-05, + "loss": 0.0354, + "step": 54480 + }, + { + "epoch": 0.12017210922010528, + "grad_norm": 0.134502112865448, + "learning_rate": 2.8601201340893755e-05, + "loss": 0.0351, + "step": 54490 + }, + { + "epoch": 0.12019416319500345, + "grad_norm": 0.09419815242290497, + "learning_rate": 2.860050403036686e-05, + "loss": 0.0356, + "step": 54500 + }, + { + "epoch": 0.12021621716990162, + "grad_norm": 0.15964846312999725, + "learning_rate": 2.8599806554581002e-05, + "loss": 0.0382, + "step": 54510 + }, + { + "epoch": 0.12023827114479978, + "grad_norm": 0.1298273801803589, + "learning_rate": 2.859910891354465e-05, + "loss": 0.0385, + "step": 54520 + }, + { + "epoch": 0.12026032511969795, + "grad_norm": 0.15954835712909698, + "learning_rate": 2.859841110726629e-05, + "loss": 0.0358, + "step": 54530 + }, + { + "epoch": 0.12028237909459612, + "grad_norm": 0.14495041966438293, + "learning_rate": 2.8597713135754395e-05, + "loss": 0.0374, + "step": 54540 + }, + { + "epoch": 0.12030443306949427, + "grad_norm": 0.10672736912965775, + "learning_rate": 2.8597014999017444e-05, + "loss": 0.0355, + "step": 54550 + }, + { + "epoch": 0.12032648704439244, + "grad_norm": 0.10341043770313263, + "learning_rate": 2.8596316697063924e-05, + "loss": 0.0363, + "step": 54560 + }, + { + "epoch": 0.12034854101929061, + "grad_norm": 0.11488503217697144, + "learning_rate": 2.859561822990232e-05, + "loss": 0.0369, + "step": 54570 + }, + { + "epoch": 0.12037059499418878, + "grad_norm": 0.10719151794910431, + "learning_rate": 2.859491959754112e-05, + "loss": 0.0384, + "step": 54580 + }, + { + "epoch": 0.12039264896908694, + "grad_norm": 0.1109982579946518, + "learning_rate": 2.8594220799988808e-05, + "loss": 0.0342, + "step": 54590 + }, + { + "epoch": 0.12041470294398511, + "grad_norm": 0.11585430055856705, + "learning_rate": 2.8593521837253878e-05, + "loss": 0.0357, + "step": 54600 + }, + { + "epoch": 0.12043675691888328, + "grad_norm": 0.11794432252645493, + "learning_rate": 2.8592822709344826e-05, + "loss": 0.0371, + "step": 54610 + }, + { + "epoch": 0.12045881089378144, + "grad_norm": 0.12109601497650146, + "learning_rate": 2.8592123416270144e-05, + "loss": 0.0355, + "step": 54620 + }, + { + "epoch": 0.12048086486867961, + "grad_norm": 0.11049606651067734, + "learning_rate": 2.8591423958038328e-05, + "loss": 0.0365, + "step": 54630 + }, + { + "epoch": 0.12050291884357778, + "grad_norm": 0.1475624293088913, + "learning_rate": 2.859072433465788e-05, + "loss": 0.0356, + "step": 54640 + }, + { + "epoch": 0.12052497281847593, + "grad_norm": 0.11892829835414886, + "learning_rate": 2.8590024546137298e-05, + "loss": 0.0347, + "step": 54650 + }, + { + "epoch": 0.1205470267933741, + "grad_norm": 0.1478489637374878, + "learning_rate": 2.8589324592485086e-05, + "loss": 0.0363, + "step": 54660 + }, + { + "epoch": 0.12056908076827227, + "grad_norm": 0.12237542867660522, + "learning_rate": 2.8588624473709754e-05, + "loss": 0.0361, + "step": 54670 + }, + { + "epoch": 0.12059113474317043, + "grad_norm": 0.11380130797624588, + "learning_rate": 2.8587924189819796e-05, + "loss": 0.038, + "step": 54680 + }, + { + "epoch": 0.1206131887180686, + "grad_norm": 0.10095558315515518, + "learning_rate": 2.858722374082374e-05, + "loss": 0.0377, + "step": 54690 + }, + { + "epoch": 0.12063524269296677, + "grad_norm": 0.11032489687204361, + "learning_rate": 2.8586523126730082e-05, + "loss": 0.037, + "step": 54700 + }, + { + "epoch": 0.12065729666786493, + "grad_norm": 0.10857129096984863, + "learning_rate": 2.8585822347547342e-05, + "loss": 0.0366, + "step": 54710 + }, + { + "epoch": 0.1206793506427631, + "grad_norm": 0.11436378955841064, + "learning_rate": 2.858512140328403e-05, + "loss": 0.036, + "step": 54720 + }, + { + "epoch": 0.12070140461766127, + "grad_norm": 0.12080439925193787, + "learning_rate": 2.8584420293948668e-05, + "loss": 0.0367, + "step": 54730 + }, + { + "epoch": 0.12072345859255942, + "grad_norm": 0.10806689411401749, + "learning_rate": 2.8583719019549775e-05, + "loss": 0.0362, + "step": 54740 + }, + { + "epoch": 0.1207455125674576, + "grad_norm": 0.11770495027303696, + "learning_rate": 2.8583017580095868e-05, + "loss": 0.0349, + "step": 54750 + }, + { + "epoch": 0.12076756654235576, + "grad_norm": 0.11098920553922653, + "learning_rate": 2.858231597559548e-05, + "loss": 0.0374, + "step": 54760 + }, + { + "epoch": 0.12078962051725392, + "grad_norm": 0.15458156168460846, + "learning_rate": 2.858161420605712e-05, + "loss": 0.0369, + "step": 54770 + }, + { + "epoch": 0.12081167449215209, + "grad_norm": 0.14472737908363342, + "learning_rate": 2.858091227148933e-05, + "loss": 0.0368, + "step": 54780 + }, + { + "epoch": 0.12083372846705026, + "grad_norm": 0.10524512827396393, + "learning_rate": 2.8580210171900632e-05, + "loss": 0.0346, + "step": 54790 + }, + { + "epoch": 0.12085578244194843, + "grad_norm": 0.13114047050476074, + "learning_rate": 2.857950790729956e-05, + "loss": 0.0371, + "step": 54800 + }, + { + "epoch": 0.12087783641684659, + "grad_norm": 0.10079353302717209, + "learning_rate": 2.857880547769464e-05, + "loss": 0.0373, + "step": 54810 + }, + { + "epoch": 0.12089989039174476, + "grad_norm": 0.09791164100170135, + "learning_rate": 2.8578102883094422e-05, + "loss": 0.0365, + "step": 54820 + }, + { + "epoch": 0.12092194436664293, + "grad_norm": 0.10136181116104126, + "learning_rate": 2.8577400123507424e-05, + "loss": 0.0377, + "step": 54830 + }, + { + "epoch": 0.12094399834154108, + "grad_norm": 0.11820393055677414, + "learning_rate": 2.8576697198942204e-05, + "loss": 0.039, + "step": 54840 + }, + { + "epoch": 0.12096605231643925, + "grad_norm": 0.11633570492267609, + "learning_rate": 2.857599410940729e-05, + "loss": 0.0352, + "step": 54850 + }, + { + "epoch": 0.12098810629133742, + "grad_norm": 0.10528622567653656, + "learning_rate": 2.857529085491123e-05, + "loss": 0.0358, + "step": 54860 + }, + { + "epoch": 0.12101016026623558, + "grad_norm": 0.10457893460988998, + "learning_rate": 2.8574587435462573e-05, + "loss": 0.0359, + "step": 54870 + }, + { + "epoch": 0.12103221424113375, + "grad_norm": 0.11775577068328857, + "learning_rate": 2.8573883851069857e-05, + "loss": 0.037, + "step": 54880 + }, + { + "epoch": 0.12105426821603192, + "grad_norm": 0.15633408725261688, + "learning_rate": 2.857318010174164e-05, + "loss": 0.0364, + "step": 54890 + }, + { + "epoch": 0.12107632219093008, + "grad_norm": 0.1691293865442276, + "learning_rate": 2.8572476187486463e-05, + "loss": 0.0358, + "step": 54900 + }, + { + "epoch": 0.12109837616582825, + "grad_norm": 0.10036072134971619, + "learning_rate": 2.857177210831289e-05, + "loss": 0.034, + "step": 54910 + }, + { + "epoch": 0.12112043014072642, + "grad_norm": 0.12024730443954468, + "learning_rate": 2.8571067864229476e-05, + "loss": 0.0375, + "step": 54920 + }, + { + "epoch": 0.12114248411562457, + "grad_norm": 0.12372595816850662, + "learning_rate": 2.8570363455244768e-05, + "loss": 0.035, + "step": 54930 + }, + { + "epoch": 0.12116453809052274, + "grad_norm": 0.1353723704814911, + "learning_rate": 2.8569658881367333e-05, + "loss": 0.0383, + "step": 54940 + }, + { + "epoch": 0.12118659206542091, + "grad_norm": 0.1060323491692543, + "learning_rate": 2.8568954142605733e-05, + "loss": 0.0366, + "step": 54950 + }, + { + "epoch": 0.12120864604031907, + "grad_norm": 0.11287534981966019, + "learning_rate": 2.856824923896853e-05, + "loss": 0.0375, + "step": 54960 + }, + { + "epoch": 0.12123070001521724, + "grad_norm": 0.10760021954774857, + "learning_rate": 2.8567544170464284e-05, + "loss": 0.035, + "step": 54970 + }, + { + "epoch": 0.12125275399011541, + "grad_norm": 0.14307093620300293, + "learning_rate": 2.856683893710157e-05, + "loss": 0.0356, + "step": 54980 + }, + { + "epoch": 0.12127480796501357, + "grad_norm": 0.10255424678325653, + "learning_rate": 2.8566133538888952e-05, + "loss": 0.037, + "step": 54990 + }, + { + "epoch": 0.12129686193991174, + "grad_norm": 0.10919228941202164, + "learning_rate": 2.8565427975835003e-05, + "loss": 0.0359, + "step": 55000 + }, + { + "epoch": 0.12131891591480991, + "grad_norm": 0.11612504720687866, + "learning_rate": 2.8564722247948293e-05, + "loss": 0.0353, + "step": 55010 + }, + { + "epoch": 0.12134096988970808, + "grad_norm": 0.10338300466537476, + "learning_rate": 2.8564016355237402e-05, + "loss": 0.0356, + "step": 55020 + }, + { + "epoch": 0.12136302386460623, + "grad_norm": 0.11633966118097305, + "learning_rate": 2.856331029771091e-05, + "loss": 0.0366, + "step": 55030 + }, + { + "epoch": 0.1213850778395044, + "grad_norm": 0.12234903126955032, + "learning_rate": 2.8562604075377392e-05, + "loss": 0.0369, + "step": 55040 + }, + { + "epoch": 0.12140713181440257, + "grad_norm": 0.09198884665966034, + "learning_rate": 2.8561897688245424e-05, + "loss": 0.0367, + "step": 55050 + }, + { + "epoch": 0.12142918578930073, + "grad_norm": 0.12026362866163254, + "learning_rate": 2.85611911363236e-05, + "loss": 0.037, + "step": 55060 + }, + { + "epoch": 0.1214512397641989, + "grad_norm": 0.13614629209041595, + "learning_rate": 2.8560484419620496e-05, + "loss": 0.037, + "step": 55070 + }, + { + "epoch": 0.12147329373909707, + "grad_norm": 0.12172897905111313, + "learning_rate": 2.85597775381447e-05, + "loss": 0.0367, + "step": 55080 + }, + { + "epoch": 0.12149534771399523, + "grad_norm": 0.1409125030040741, + "learning_rate": 2.8559070491904813e-05, + "loss": 0.0364, + "step": 55090 + }, + { + "epoch": 0.1215174016888934, + "grad_norm": 0.1304679960012436, + "learning_rate": 2.855836328090941e-05, + "loss": 0.0372, + "step": 55100 + }, + { + "epoch": 0.12153945566379157, + "grad_norm": 0.1361554116010666, + "learning_rate": 2.8557655905167095e-05, + "loss": 0.0348, + "step": 55110 + }, + { + "epoch": 0.12156150963868972, + "grad_norm": 0.10689328610897064, + "learning_rate": 2.855694836468646e-05, + "loss": 0.0354, + "step": 55120 + }, + { + "epoch": 0.1215835636135879, + "grad_norm": 0.12269337475299835, + "learning_rate": 2.8556240659476104e-05, + "loss": 0.0385, + "step": 55130 + }, + { + "epoch": 0.12160561758848606, + "grad_norm": 0.13376444578170776, + "learning_rate": 2.8555532789544623e-05, + "loss": 0.0366, + "step": 55140 + }, + { + "epoch": 0.12162767156338422, + "grad_norm": 0.11724287271499634, + "learning_rate": 2.855482475490062e-05, + "loss": 0.0364, + "step": 55150 + }, + { + "epoch": 0.12164972553828239, + "grad_norm": 0.11287922412157059, + "learning_rate": 2.85541165555527e-05, + "loss": 0.0351, + "step": 55160 + }, + { + "epoch": 0.12167177951318056, + "grad_norm": 0.10838688910007477, + "learning_rate": 2.8553408191509466e-05, + "loss": 0.0358, + "step": 55170 + }, + { + "epoch": 0.12169383348807872, + "grad_norm": 0.14261089265346527, + "learning_rate": 2.8552699662779526e-05, + "loss": 0.0349, + "step": 55180 + }, + { + "epoch": 0.12171588746297689, + "grad_norm": 0.10028168559074402, + "learning_rate": 2.8551990969371488e-05, + "loss": 0.0366, + "step": 55190 + }, + { + "epoch": 0.12173794143787506, + "grad_norm": 0.10064594447612762, + "learning_rate": 2.8551282111293966e-05, + "loss": 0.0353, + "step": 55200 + }, + { + "epoch": 0.12175999541277321, + "grad_norm": 0.12891064584255219, + "learning_rate": 2.855057308855557e-05, + "loss": 0.0365, + "step": 55210 + }, + { + "epoch": 0.12178204938767138, + "grad_norm": 0.11013331264257431, + "learning_rate": 2.854986390116492e-05, + "loss": 0.0362, + "step": 55220 + }, + { + "epoch": 0.12180410336256955, + "grad_norm": 0.1380404829978943, + "learning_rate": 2.8549154549130628e-05, + "loss": 0.0381, + "step": 55230 + }, + { + "epoch": 0.12182615733746772, + "grad_norm": 0.13265137374401093, + "learning_rate": 2.8548445032461317e-05, + "loss": 0.0375, + "step": 55240 + }, + { + "epoch": 0.12184821131236588, + "grad_norm": 0.1237972155213356, + "learning_rate": 2.8547735351165603e-05, + "loss": 0.034, + "step": 55250 + }, + { + "epoch": 0.12187026528726405, + "grad_norm": 0.11976461112499237, + "learning_rate": 2.854702550525212e-05, + "loss": 0.0359, + "step": 55260 + }, + { + "epoch": 0.12189231926216222, + "grad_norm": 0.18291163444519043, + "learning_rate": 2.8546315494729484e-05, + "loss": 0.0396, + "step": 55270 + }, + { + "epoch": 0.12191437323706038, + "grad_norm": 0.1086520180106163, + "learning_rate": 2.854560531960632e-05, + "loss": 0.037, + "step": 55280 + }, + { + "epoch": 0.12193642721195855, + "grad_norm": 0.1337893307209015, + "learning_rate": 2.854489497989127e-05, + "loss": 0.0379, + "step": 55290 + }, + { + "epoch": 0.12195848118685672, + "grad_norm": 0.13802842795848846, + "learning_rate": 2.854418447559295e-05, + "loss": 0.0348, + "step": 55300 + }, + { + "epoch": 0.12198053516175487, + "grad_norm": 0.11799688637256622, + "learning_rate": 2.8543473806720005e-05, + "loss": 0.0361, + "step": 55310 + }, + { + "epoch": 0.12200258913665304, + "grad_norm": 0.10663525015115738, + "learning_rate": 2.8542762973281067e-05, + "loss": 0.0373, + "step": 55320 + }, + { + "epoch": 0.12202464311155121, + "grad_norm": 0.13608355820178986, + "learning_rate": 2.8542051975284772e-05, + "loss": 0.0367, + "step": 55330 + }, + { + "epoch": 0.12204669708644937, + "grad_norm": 0.10538366436958313, + "learning_rate": 2.8541340812739758e-05, + "loss": 0.0352, + "step": 55340 + }, + { + "epoch": 0.12206875106134754, + "grad_norm": 0.11753302067518234, + "learning_rate": 2.854062948565467e-05, + "loss": 0.039, + "step": 55350 + }, + { + "epoch": 0.12209080503624571, + "grad_norm": 0.10751186311244965, + "learning_rate": 2.853991799403815e-05, + "loss": 0.0378, + "step": 55360 + }, + { + "epoch": 0.12211285901114387, + "grad_norm": 0.1024186760187149, + "learning_rate": 2.8539206337898837e-05, + "loss": 0.0335, + "step": 55370 + }, + { + "epoch": 0.12213491298604204, + "grad_norm": 0.13886550068855286, + "learning_rate": 2.8538494517245387e-05, + "loss": 0.036, + "step": 55380 + }, + { + "epoch": 0.12215696696094021, + "grad_norm": 0.12929542362689972, + "learning_rate": 2.8537782532086443e-05, + "loss": 0.0348, + "step": 55390 + }, + { + "epoch": 0.12217902093583836, + "grad_norm": 0.11564287543296814, + "learning_rate": 2.8537070382430666e-05, + "loss": 0.0385, + "step": 55400 + }, + { + "epoch": 0.12220107491073653, + "grad_norm": 0.15343979001045227, + "learning_rate": 2.85363580682867e-05, + "loss": 0.0377, + "step": 55410 + }, + { + "epoch": 0.1222231288856347, + "grad_norm": 0.12080874294042587, + "learning_rate": 2.8535645589663204e-05, + "loss": 0.0377, + "step": 55420 + }, + { + "epoch": 0.12224518286053287, + "grad_norm": 0.08427444100379944, + "learning_rate": 2.8534932946568833e-05, + "loss": 0.0366, + "step": 55430 + }, + { + "epoch": 0.12226723683543103, + "grad_norm": 0.13922640681266785, + "learning_rate": 2.853422013901225e-05, + "loss": 0.0363, + "step": 55440 + }, + { + "epoch": 0.1222892908103292, + "grad_norm": 0.13888481259346008, + "learning_rate": 2.853350716700211e-05, + "loss": 0.0374, + "step": 55450 + }, + { + "epoch": 0.12231134478522737, + "grad_norm": 0.11128872632980347, + "learning_rate": 2.8532794030547084e-05, + "loss": 0.0358, + "step": 55460 + }, + { + "epoch": 0.12233339876012553, + "grad_norm": 0.100505031645298, + "learning_rate": 2.853208072965583e-05, + "loss": 0.0348, + "step": 55470 + }, + { + "epoch": 0.1223554527350237, + "grad_norm": 0.12906990945339203, + "learning_rate": 2.8531367264337017e-05, + "loss": 0.0373, + "step": 55480 + }, + { + "epoch": 0.12237750670992187, + "grad_norm": 0.09238699823617935, + "learning_rate": 2.853065363459932e-05, + "loss": 0.0374, + "step": 55490 + }, + { + "epoch": 0.12239956068482002, + "grad_norm": 0.12566497921943665, + "learning_rate": 2.8529939840451406e-05, + "loss": 0.0354, + "step": 55500 + }, + { + "epoch": 0.1224216146597182, + "grad_norm": 0.09970642626285553, + "learning_rate": 2.8529225881901946e-05, + "loss": 0.0343, + "step": 55510 + }, + { + "epoch": 0.12244366863461636, + "grad_norm": 0.13135173916816711, + "learning_rate": 2.852851175895962e-05, + "loss": 0.0379, + "step": 55520 + }, + { + "epoch": 0.12246572260951452, + "grad_norm": 0.10390087962150574, + "learning_rate": 2.8527797471633102e-05, + "loss": 0.0366, + "step": 55530 + }, + { + "epoch": 0.12248777658441269, + "grad_norm": 0.10911908000707626, + "learning_rate": 2.852708301993107e-05, + "loss": 0.0368, + "step": 55540 + }, + { + "epoch": 0.12250983055931086, + "grad_norm": 0.09750349074602127, + "learning_rate": 2.8526368403862213e-05, + "loss": 0.0354, + "step": 55550 + }, + { + "epoch": 0.12253188453420902, + "grad_norm": 0.11394196003675461, + "learning_rate": 2.8525653623435208e-05, + "loss": 0.0373, + "step": 55560 + }, + { + "epoch": 0.12255393850910719, + "grad_norm": 0.13663320243358612, + "learning_rate": 2.8524938678658738e-05, + "loss": 0.0366, + "step": 55570 + }, + { + "epoch": 0.12257599248400536, + "grad_norm": 0.08977756649255753, + "learning_rate": 2.8524223569541498e-05, + "loss": 0.035, + "step": 55580 + }, + { + "epoch": 0.12259804645890351, + "grad_norm": 0.11322048306465149, + "learning_rate": 2.852350829609217e-05, + "loss": 0.0368, + "step": 55590 + }, + { + "epoch": 0.12262010043380168, + "grad_norm": 0.12166154384613037, + "learning_rate": 2.852279285831945e-05, + "loss": 0.0371, + "step": 55600 + }, + { + "epoch": 0.12264215440869985, + "grad_norm": 0.13199622929096222, + "learning_rate": 2.8522077256232023e-05, + "loss": 0.036, + "step": 55610 + }, + { + "epoch": 0.12266420838359801, + "grad_norm": 0.1041097491979599, + "learning_rate": 2.8521361489838592e-05, + "loss": 0.0352, + "step": 55620 + }, + { + "epoch": 0.12268626235849618, + "grad_norm": 0.12214743345975876, + "learning_rate": 2.8520645559147854e-05, + "loss": 0.0365, + "step": 55630 + }, + { + "epoch": 0.12270831633339435, + "grad_norm": 0.12501099705696106, + "learning_rate": 2.8519929464168513e-05, + "loss": 0.0351, + "step": 55640 + }, + { + "epoch": 0.12273037030829252, + "grad_norm": 0.14108113944530487, + "learning_rate": 2.851921320490926e-05, + "loss": 0.0371, + "step": 55650 + }, + { + "epoch": 0.12275242428319068, + "grad_norm": 0.15309828519821167, + "learning_rate": 2.8518496781378797e-05, + "loss": 0.0376, + "step": 55660 + }, + { + "epoch": 0.12277447825808885, + "grad_norm": 0.10605038702487946, + "learning_rate": 2.851778019358584e-05, + "loss": 0.037, + "step": 55670 + }, + { + "epoch": 0.12279653223298702, + "grad_norm": 0.21486636996269226, + "learning_rate": 2.851706344153909e-05, + "loss": 0.0359, + "step": 55680 + }, + { + "epoch": 0.12281858620788517, + "grad_norm": 0.12396419048309326, + "learning_rate": 2.8516346525247254e-05, + "loss": 0.0363, + "step": 55690 + }, + { + "epoch": 0.12284064018278334, + "grad_norm": 0.11310689151287079, + "learning_rate": 2.851562944471905e-05, + "loss": 0.0381, + "step": 55700 + }, + { + "epoch": 0.12286269415768152, + "grad_norm": 0.11641238629817963, + "learning_rate": 2.8514912199963184e-05, + "loss": 0.0384, + "step": 55710 + }, + { + "epoch": 0.12288474813257967, + "grad_norm": 0.10086093097925186, + "learning_rate": 2.8514194790988378e-05, + "loss": 0.0347, + "step": 55720 + }, + { + "epoch": 0.12290680210747784, + "grad_norm": 0.09964287281036377, + "learning_rate": 2.8513477217803346e-05, + "loss": 0.0339, + "step": 55730 + }, + { + "epoch": 0.12292885608237601, + "grad_norm": 0.1157551035284996, + "learning_rate": 2.85127594804168e-05, + "loss": 0.0362, + "step": 55740 + }, + { + "epoch": 0.12295091005727417, + "grad_norm": 0.10721907019615173, + "learning_rate": 2.8512041578837477e-05, + "loss": 0.0351, + "step": 55750 + }, + { + "epoch": 0.12297296403217234, + "grad_norm": 0.12374704331159592, + "learning_rate": 2.851132351307409e-05, + "loss": 0.036, + "step": 55760 + }, + { + "epoch": 0.12299501800707051, + "grad_norm": 0.08694836497306824, + "learning_rate": 2.8510605283135357e-05, + "loss": 0.036, + "step": 55770 + }, + { + "epoch": 0.12301707198196866, + "grad_norm": 0.1559562385082245, + "learning_rate": 2.850988688903002e-05, + "loss": 0.0361, + "step": 55780 + }, + { + "epoch": 0.12303912595686683, + "grad_norm": 0.09460888057947159, + "learning_rate": 2.8509168330766804e-05, + "loss": 0.0352, + "step": 55790 + }, + { + "epoch": 0.123061179931765, + "grad_norm": 0.09799159318208694, + "learning_rate": 2.850844960835444e-05, + "loss": 0.0362, + "step": 55800 + }, + { + "epoch": 0.12308323390666316, + "grad_norm": 0.10731955617666245, + "learning_rate": 2.8507730721801654e-05, + "loss": 0.0336, + "step": 55810 + }, + { + "epoch": 0.12310528788156133, + "grad_norm": 0.09868357330560684, + "learning_rate": 2.8507011671117184e-05, + "loss": 0.0365, + "step": 55820 + }, + { + "epoch": 0.1231273418564595, + "grad_norm": 0.152689591050148, + "learning_rate": 2.8506292456309774e-05, + "loss": 0.0355, + "step": 55830 + }, + { + "epoch": 0.12314939583135766, + "grad_norm": 0.10449371486902237, + "learning_rate": 2.8505573077388155e-05, + "loss": 0.0365, + "step": 55840 + }, + { + "epoch": 0.12317144980625583, + "grad_norm": 0.10347781330347061, + "learning_rate": 2.8504853534361076e-05, + "loss": 0.036, + "step": 55850 + }, + { + "epoch": 0.123193503781154, + "grad_norm": 0.11796530336141586, + "learning_rate": 2.8504133827237273e-05, + "loss": 0.0374, + "step": 55860 + }, + { + "epoch": 0.12321555775605217, + "grad_norm": 0.11058538407087326, + "learning_rate": 2.8503413956025496e-05, + "loss": 0.0351, + "step": 55870 + }, + { + "epoch": 0.12323761173095032, + "grad_norm": 0.12972207367420197, + "learning_rate": 2.8502693920734485e-05, + "loss": 0.0367, + "step": 55880 + }, + { + "epoch": 0.1232596657058485, + "grad_norm": 0.0880013108253479, + "learning_rate": 2.8501973721373003e-05, + "loss": 0.0352, + "step": 55890 + }, + { + "epoch": 0.12328171968074667, + "grad_norm": 0.12994486093521118, + "learning_rate": 2.8501253357949787e-05, + "loss": 0.0383, + "step": 55900 + }, + { + "epoch": 0.12330377365564482, + "grad_norm": 0.11669479310512543, + "learning_rate": 2.8500532830473596e-05, + "loss": 0.0406, + "step": 55910 + }, + { + "epoch": 0.12332582763054299, + "grad_norm": 0.13433238863945007, + "learning_rate": 2.8499812138953184e-05, + "loss": 0.0349, + "step": 55920 + }, + { + "epoch": 0.12334788160544116, + "grad_norm": 0.12416653335094452, + "learning_rate": 2.8499091283397307e-05, + "loss": 0.0362, + "step": 55930 + }, + { + "epoch": 0.12336993558033932, + "grad_norm": 0.11425416171550751, + "learning_rate": 2.849837026381473e-05, + "loss": 0.0358, + "step": 55940 + }, + { + "epoch": 0.12339198955523749, + "grad_norm": 0.1361493468284607, + "learning_rate": 2.8497649080214207e-05, + "loss": 0.0351, + "step": 55950 + }, + { + "epoch": 0.12341404353013566, + "grad_norm": 0.10712093859910965, + "learning_rate": 2.8496927732604504e-05, + "loss": 0.0365, + "step": 55960 + }, + { + "epoch": 0.12343609750503381, + "grad_norm": 0.10964956134557724, + "learning_rate": 2.8496206220994385e-05, + "loss": 0.0362, + "step": 55970 + }, + { + "epoch": 0.12345815147993199, + "grad_norm": 0.09816332906484604, + "learning_rate": 2.849548454539262e-05, + "loss": 0.0345, + "step": 55980 + }, + { + "epoch": 0.12348020545483016, + "grad_norm": 0.1319718211889267, + "learning_rate": 2.8494762705807974e-05, + "loss": 0.0355, + "step": 55990 + }, + { + "epoch": 0.12350225942972831, + "grad_norm": 0.1466495543718338, + "learning_rate": 2.849404070224922e-05, + "loss": 0.0372, + "step": 56000 + }, + { + "epoch": 0.12352431340462648, + "grad_norm": 0.12692765891551971, + "learning_rate": 2.849331853472513e-05, + "loss": 0.0376, + "step": 56010 + }, + { + "epoch": 0.12354636737952465, + "grad_norm": 0.10033860057592392, + "learning_rate": 2.8492596203244483e-05, + "loss": 0.0358, + "step": 56020 + }, + { + "epoch": 0.12356842135442281, + "grad_norm": 0.12852507829666138, + "learning_rate": 2.8491873707816052e-05, + "loss": 0.038, + "step": 56030 + }, + { + "epoch": 0.12359047532932098, + "grad_norm": 0.12027076631784439, + "learning_rate": 2.8491151048448616e-05, + "loss": 0.0352, + "step": 56040 + }, + { + "epoch": 0.12361252930421915, + "grad_norm": 0.12055892497301102, + "learning_rate": 2.849042822515096e-05, + "loss": 0.0387, + "step": 56050 + }, + { + "epoch": 0.1236345832791173, + "grad_norm": 0.11139371991157532, + "learning_rate": 2.848970523793186e-05, + "loss": 0.0386, + "step": 56060 + }, + { + "epoch": 0.12365663725401548, + "grad_norm": 0.11908916383981705, + "learning_rate": 2.8488982086800106e-05, + "loss": 0.0354, + "step": 56070 + }, + { + "epoch": 0.12367869122891365, + "grad_norm": 0.10039331763982773, + "learning_rate": 2.8488258771764483e-05, + "loss": 0.0354, + "step": 56080 + }, + { + "epoch": 0.12370074520381182, + "grad_norm": 0.15638010203838348, + "learning_rate": 2.8487535292833785e-05, + "loss": 0.0354, + "step": 56090 + }, + { + "epoch": 0.12372279917870997, + "grad_norm": 0.09307905286550522, + "learning_rate": 2.8486811650016797e-05, + "loss": 0.0357, + "step": 56100 + }, + { + "epoch": 0.12374485315360814, + "grad_norm": 0.09608066082000732, + "learning_rate": 2.8486087843322312e-05, + "loss": 0.035, + "step": 56110 + }, + { + "epoch": 0.12376690712850631, + "grad_norm": 0.1106601431965828, + "learning_rate": 2.848536387275913e-05, + "loss": 0.0339, + "step": 56120 + }, + { + "epoch": 0.12378896110340447, + "grad_norm": 0.14246952533721924, + "learning_rate": 2.848463973833604e-05, + "loss": 0.0366, + "step": 56130 + }, + { + "epoch": 0.12381101507830264, + "grad_norm": 0.14454615116119385, + "learning_rate": 2.848391544006185e-05, + "loss": 0.0349, + "step": 56140 + }, + { + "epoch": 0.12383306905320081, + "grad_norm": 0.09448477625846863, + "learning_rate": 2.8483190977945353e-05, + "loss": 0.0366, + "step": 56150 + }, + { + "epoch": 0.12385512302809897, + "grad_norm": 0.10989703983068466, + "learning_rate": 2.848246635199536e-05, + "loss": 0.0347, + "step": 56160 + }, + { + "epoch": 0.12387717700299714, + "grad_norm": 0.1128547191619873, + "learning_rate": 2.8481741562220664e-05, + "loss": 0.0347, + "step": 56170 + }, + { + "epoch": 0.1238992309778953, + "grad_norm": 0.10096445679664612, + "learning_rate": 2.8481016608630087e-05, + "loss": 0.0348, + "step": 56180 + }, + { + "epoch": 0.12392128495279346, + "grad_norm": 0.12027732282876968, + "learning_rate": 2.8480291491232425e-05, + "loss": 0.0359, + "step": 56190 + }, + { + "epoch": 0.12394333892769163, + "grad_norm": 0.12466797232627869, + "learning_rate": 2.8479566210036494e-05, + "loss": 0.0357, + "step": 56200 + }, + { + "epoch": 0.1239653929025898, + "grad_norm": 0.1424541473388672, + "learning_rate": 2.8478840765051107e-05, + "loss": 0.0355, + "step": 56210 + }, + { + "epoch": 0.12398744687748796, + "grad_norm": 0.12884314358234406, + "learning_rate": 2.8478115156285082e-05, + "loss": 0.0358, + "step": 56220 + }, + { + "epoch": 0.12400950085238613, + "grad_norm": 0.11975356936454773, + "learning_rate": 2.847738938374723e-05, + "loss": 0.0359, + "step": 56230 + }, + { + "epoch": 0.1240315548272843, + "grad_norm": 0.10636302083730698, + "learning_rate": 2.847666344744637e-05, + "loss": 0.0352, + "step": 56240 + }, + { + "epoch": 0.12405360880218246, + "grad_norm": 0.13732773065567017, + "learning_rate": 2.8475937347391326e-05, + "loss": 0.0374, + "step": 56250 + }, + { + "epoch": 0.12407566277708063, + "grad_norm": 0.1150466650724411, + "learning_rate": 2.8475211083590922e-05, + "loss": 0.0346, + "step": 56260 + }, + { + "epoch": 0.1240977167519788, + "grad_norm": 0.12605531513690948, + "learning_rate": 2.8474484656053983e-05, + "loss": 0.0358, + "step": 56270 + }, + { + "epoch": 0.12411977072687695, + "grad_norm": 0.11404235661029816, + "learning_rate": 2.847375806478933e-05, + "loss": 0.0361, + "step": 56280 + }, + { + "epoch": 0.12414182470177512, + "grad_norm": 0.10727445036172867, + "learning_rate": 2.8473031309805795e-05, + "loss": 0.0369, + "step": 56290 + }, + { + "epoch": 0.12416387867667329, + "grad_norm": 0.1130601167678833, + "learning_rate": 2.8472304391112208e-05, + "loss": 0.0338, + "step": 56300 + }, + { + "epoch": 0.12418593265157146, + "grad_norm": 0.13698215782642365, + "learning_rate": 2.8471577308717404e-05, + "loss": 0.0378, + "step": 56310 + }, + { + "epoch": 0.12420798662646962, + "grad_norm": 0.13251768052577972, + "learning_rate": 2.8470850062630214e-05, + "loss": 0.034, + "step": 56320 + }, + { + "epoch": 0.12423004060136779, + "grad_norm": 0.11546394228935242, + "learning_rate": 2.847012265285948e-05, + "loss": 0.0362, + "step": 56330 + }, + { + "epoch": 0.12425209457626596, + "grad_norm": 0.12594449520111084, + "learning_rate": 2.846939507941404e-05, + "loss": 0.035, + "step": 56340 + }, + { + "epoch": 0.12427414855116412, + "grad_norm": 0.11696597188711166, + "learning_rate": 2.8468667342302728e-05, + "loss": 0.0359, + "step": 56350 + }, + { + "epoch": 0.12429620252606229, + "grad_norm": 0.09374811500310898, + "learning_rate": 2.8467939441534395e-05, + "loss": 0.0374, + "step": 56360 + }, + { + "epoch": 0.12431825650096046, + "grad_norm": 0.1210138276219368, + "learning_rate": 2.846721137711788e-05, + "loss": 0.0369, + "step": 56370 + }, + { + "epoch": 0.12434031047585861, + "grad_norm": 0.12703926861286163, + "learning_rate": 2.846648314906203e-05, + "loss": 0.0361, + "step": 56380 + }, + { + "epoch": 0.12436236445075678, + "grad_norm": 0.09298336505889893, + "learning_rate": 2.8465754757375694e-05, + "loss": 0.0353, + "step": 56390 + }, + { + "epoch": 0.12438441842565495, + "grad_norm": 0.11110848188400269, + "learning_rate": 2.8465026202067733e-05, + "loss": 0.038, + "step": 56400 + }, + { + "epoch": 0.12440647240055311, + "grad_norm": 0.09933200478553772, + "learning_rate": 2.8464297483146983e-05, + "loss": 0.0326, + "step": 56410 + }, + { + "epoch": 0.12442852637545128, + "grad_norm": 0.10953380912542343, + "learning_rate": 2.8463568600622307e-05, + "loss": 0.0369, + "step": 56420 + }, + { + "epoch": 0.12445058035034945, + "grad_norm": 0.13505151867866516, + "learning_rate": 2.846283955450256e-05, + "loss": 0.0361, + "step": 56430 + }, + { + "epoch": 0.1244726343252476, + "grad_norm": 0.14108577370643616, + "learning_rate": 2.8462110344796603e-05, + "loss": 0.0373, + "step": 56440 + }, + { + "epoch": 0.12449468830014578, + "grad_norm": 0.1022389754652977, + "learning_rate": 2.8461380971513292e-05, + "loss": 0.0353, + "step": 56450 + }, + { + "epoch": 0.12451674227504395, + "grad_norm": 0.09524697810411453, + "learning_rate": 2.84606514346615e-05, + "loss": 0.0347, + "step": 56460 + }, + { + "epoch": 0.1245387962499421, + "grad_norm": 0.08495964854955673, + "learning_rate": 2.8459921734250076e-05, + "loss": 0.0381, + "step": 56470 + }, + { + "epoch": 0.12456085022484027, + "grad_norm": 0.1258312612771988, + "learning_rate": 2.84591918702879e-05, + "loss": 0.0372, + "step": 56480 + }, + { + "epoch": 0.12458290419973844, + "grad_norm": 0.11054422706365585, + "learning_rate": 2.8458461842783832e-05, + "loss": 0.0378, + "step": 56490 + }, + { + "epoch": 0.12460495817463661, + "grad_norm": 0.08666245639324188, + "learning_rate": 2.845773165174675e-05, + "loss": 0.0345, + "step": 56500 + }, + { + "epoch": 0.12462701214953477, + "grad_norm": 0.1033553034067154, + "learning_rate": 2.8457001297185516e-05, + "loss": 0.0349, + "step": 56510 + }, + { + "epoch": 0.12464906612443294, + "grad_norm": 0.14275102317333221, + "learning_rate": 2.8456270779109015e-05, + "loss": 0.0371, + "step": 56520 + }, + { + "epoch": 0.12467112009933111, + "grad_norm": 0.13300824165344238, + "learning_rate": 2.845554009752612e-05, + "loss": 0.0337, + "step": 56530 + }, + { + "epoch": 0.12469317407422927, + "grad_norm": 0.10222247987985611, + "learning_rate": 2.8454809252445707e-05, + "loss": 0.0343, + "step": 56540 + }, + { + "epoch": 0.12471522804912744, + "grad_norm": 0.12254957854747772, + "learning_rate": 2.8454078243876655e-05, + "loss": 0.0361, + "step": 56550 + }, + { + "epoch": 0.1247372820240256, + "grad_norm": 0.14581654965877533, + "learning_rate": 2.8453347071827855e-05, + "loss": 0.0359, + "step": 56560 + }, + { + "epoch": 0.12475933599892376, + "grad_norm": 0.08563745766878128, + "learning_rate": 2.8452615736308184e-05, + "loss": 0.0375, + "step": 56570 + }, + { + "epoch": 0.12478138997382193, + "grad_norm": 0.11398310214281082, + "learning_rate": 2.845188423732653e-05, + "loss": 0.0379, + "step": 56580 + }, + { + "epoch": 0.1248034439487201, + "grad_norm": 0.09767467528581619, + "learning_rate": 2.845115257489178e-05, + "loss": 0.0359, + "step": 56590 + }, + { + "epoch": 0.12482549792361826, + "grad_norm": 0.08994822204113007, + "learning_rate": 2.845042074901283e-05, + "loss": 0.0357, + "step": 56600 + }, + { + "epoch": 0.12484755189851643, + "grad_norm": 0.10056556016206741, + "learning_rate": 2.8449688759698565e-05, + "loss": 0.0347, + "step": 56610 + }, + { + "epoch": 0.1248696058734146, + "grad_norm": 0.10748528689146042, + "learning_rate": 2.8448956606957885e-05, + "loss": 0.0364, + "step": 56620 + }, + { + "epoch": 0.12489165984831276, + "grad_norm": 0.11224949359893799, + "learning_rate": 2.8448224290799684e-05, + "loss": 0.0363, + "step": 56630 + }, + { + "epoch": 0.12491371382321093, + "grad_norm": 0.16271540522575378, + "learning_rate": 2.844749181123286e-05, + "loss": 0.0356, + "step": 56640 + }, + { + "epoch": 0.1249357677981091, + "grad_norm": 0.12507517635822296, + "learning_rate": 2.8446759168266315e-05, + "loss": 0.0342, + "step": 56650 + }, + { + "epoch": 0.12495782177300725, + "grad_norm": 0.11491760611534119, + "learning_rate": 2.8446026361908948e-05, + "loss": 0.0378, + "step": 56660 + }, + { + "epoch": 0.12497987574790542, + "grad_norm": 0.10160423815250397, + "learning_rate": 2.844529339216967e-05, + "loss": 0.0351, + "step": 56670 + }, + { + "epoch": 0.12500192972280358, + "grad_norm": 0.10798949003219604, + "learning_rate": 2.844456025905738e-05, + "loss": 0.0367, + "step": 56680 + }, + { + "epoch": 0.12502398369770176, + "grad_norm": 0.12049974501132965, + "learning_rate": 2.8443826962580985e-05, + "loss": 0.0364, + "step": 56690 + }, + { + "epoch": 0.12504603767259992, + "grad_norm": 0.10166595131158829, + "learning_rate": 2.8443093502749406e-05, + "loss": 0.0371, + "step": 56700 + }, + { + "epoch": 0.12506809164749808, + "grad_norm": 0.08801330626010895, + "learning_rate": 2.8442359879571547e-05, + "loss": 0.0374, + "step": 56710 + }, + { + "epoch": 0.12509014562239626, + "grad_norm": 0.13054251670837402, + "learning_rate": 2.8441626093056324e-05, + "loss": 0.0354, + "step": 56720 + }, + { + "epoch": 0.12511219959729442, + "grad_norm": 0.11167958378791809, + "learning_rate": 2.844089214321265e-05, + "loss": 0.0359, + "step": 56730 + }, + { + "epoch": 0.12513425357219257, + "grad_norm": 0.10311771929264069, + "learning_rate": 2.844015803004945e-05, + "loss": 0.0347, + "step": 56740 + }, + { + "epoch": 0.12515630754709076, + "grad_norm": 0.11519575864076614, + "learning_rate": 2.8439423753575635e-05, + "loss": 0.0387, + "step": 56750 + }, + { + "epoch": 0.1251783615219889, + "grad_norm": 0.10235583782196045, + "learning_rate": 2.843868931380014e-05, + "loss": 0.0379, + "step": 56760 + }, + { + "epoch": 0.12520041549688707, + "grad_norm": 0.1277591437101364, + "learning_rate": 2.8437954710731873e-05, + "loss": 0.0371, + "step": 56770 + }, + { + "epoch": 0.12522246947178525, + "grad_norm": 0.09848971664905548, + "learning_rate": 2.8437219944379775e-05, + "loss": 0.0378, + "step": 56780 + }, + { + "epoch": 0.1252445234466834, + "grad_norm": 0.11871598660945892, + "learning_rate": 2.8436485014752766e-05, + "loss": 0.0344, + "step": 56790 + }, + { + "epoch": 0.12526657742158157, + "grad_norm": 0.13914358615875244, + "learning_rate": 2.8435749921859777e-05, + "loss": 0.0365, + "step": 56800 + }, + { + "epoch": 0.12528863139647975, + "grad_norm": 0.1167268455028534, + "learning_rate": 2.8435014665709742e-05, + "loss": 0.0366, + "step": 56810 + }, + { + "epoch": 0.1253106853713779, + "grad_norm": 0.1250753253698349, + "learning_rate": 2.8434279246311593e-05, + "loss": 0.0367, + "step": 56820 + }, + { + "epoch": 0.1253327393462761, + "grad_norm": 0.11516052484512329, + "learning_rate": 2.843354366367427e-05, + "loss": 0.0353, + "step": 56830 + }, + { + "epoch": 0.12535479332117425, + "grad_norm": 0.10682957619428635, + "learning_rate": 2.8432807917806706e-05, + "loss": 0.0371, + "step": 56840 + }, + { + "epoch": 0.1253768472960724, + "grad_norm": 0.09360182285308838, + "learning_rate": 2.843207200871784e-05, + "loss": 0.0362, + "step": 56850 + }, + { + "epoch": 0.1253989012709706, + "grad_norm": 0.11046741157770157, + "learning_rate": 2.8431335936416623e-05, + "loss": 0.0373, + "step": 56860 + }, + { + "epoch": 0.12542095524586874, + "grad_norm": 0.11715017259120941, + "learning_rate": 2.843059970091199e-05, + "loss": 0.0354, + "step": 56870 + }, + { + "epoch": 0.1254430092207669, + "grad_norm": 0.1144053116440773, + "learning_rate": 2.842986330221289e-05, + "loss": 0.0354, + "step": 56880 + }, + { + "epoch": 0.12546506319566508, + "grad_norm": 0.1080484613776207, + "learning_rate": 2.8429126740328273e-05, + "loss": 0.0368, + "step": 56890 + }, + { + "epoch": 0.12548711717056324, + "grad_norm": 0.141514852643013, + "learning_rate": 2.8428390015267088e-05, + "loss": 0.0392, + "step": 56900 + }, + { + "epoch": 0.1255091711454614, + "grad_norm": 0.09360972791910172, + "learning_rate": 2.842765312703828e-05, + "loss": 0.0348, + "step": 56910 + }, + { + "epoch": 0.12553122512035958, + "grad_norm": 0.10126577317714691, + "learning_rate": 2.842691607565081e-05, + "loss": 0.0358, + "step": 56920 + }, + { + "epoch": 0.12555327909525774, + "grad_norm": 0.12002134323120117, + "learning_rate": 2.842617886111364e-05, + "loss": 0.0358, + "step": 56930 + }, + { + "epoch": 0.1255753330701559, + "grad_norm": 0.15443654358386993, + "learning_rate": 2.842544148343571e-05, + "loss": 0.0355, + "step": 56940 + }, + { + "epoch": 0.12559738704505408, + "grad_norm": 0.09843308478593826, + "learning_rate": 2.8424703942625996e-05, + "loss": 0.0367, + "step": 56950 + }, + { + "epoch": 0.12561944101995223, + "grad_norm": 0.11480768769979477, + "learning_rate": 2.8423966238693456e-05, + "loss": 0.0361, + "step": 56960 + }, + { + "epoch": 0.1256414949948504, + "grad_norm": 0.13385827839374542, + "learning_rate": 2.8423228371647048e-05, + "loss": 0.036, + "step": 56970 + }, + { + "epoch": 0.12566354896974857, + "grad_norm": 0.10029374063014984, + "learning_rate": 2.8422490341495743e-05, + "loss": 0.0369, + "step": 56980 + }, + { + "epoch": 0.12568560294464673, + "grad_norm": 0.1632385104894638, + "learning_rate": 2.8421752148248504e-05, + "loss": 0.0389, + "step": 56990 + }, + { + "epoch": 0.12570765691954489, + "grad_norm": 0.15299804508686066, + "learning_rate": 2.842101379191431e-05, + "loss": 0.0351, + "step": 57000 + }, + { + "epoch": 0.12572971089444307, + "grad_norm": 0.1277870535850525, + "learning_rate": 2.8420275272502125e-05, + "loss": 0.0367, + "step": 57010 + }, + { + "epoch": 0.12575176486934123, + "grad_norm": 0.11234618723392487, + "learning_rate": 2.8419536590020925e-05, + "loss": 0.0386, + "step": 57020 + }, + { + "epoch": 0.12577381884423938, + "grad_norm": 0.11878395825624466, + "learning_rate": 2.841879774447968e-05, + "loss": 0.039, + "step": 57030 + }, + { + "epoch": 0.12579587281913757, + "grad_norm": 0.14411400258541107, + "learning_rate": 2.8418058735887378e-05, + "loss": 0.0368, + "step": 57040 + }, + { + "epoch": 0.12581792679403572, + "grad_norm": 0.10876716673374176, + "learning_rate": 2.8417319564252995e-05, + "loss": 0.036, + "step": 57050 + }, + { + "epoch": 0.12583998076893388, + "grad_norm": 0.10560216009616852, + "learning_rate": 2.8416580229585506e-05, + "loss": 0.0373, + "step": 57060 + }, + { + "epoch": 0.12586203474383206, + "grad_norm": 0.09412431716918945, + "learning_rate": 2.8415840731893905e-05, + "loss": 0.0346, + "step": 57070 + }, + { + "epoch": 0.12588408871873022, + "grad_norm": 0.12204407900571823, + "learning_rate": 2.8415101071187167e-05, + "loss": 0.0374, + "step": 57080 + }, + { + "epoch": 0.12590614269362838, + "grad_norm": 0.11445821076631546, + "learning_rate": 2.841436124747429e-05, + "loss": 0.0367, + "step": 57090 + }, + { + "epoch": 0.12592819666852656, + "grad_norm": 0.09747416526079178, + "learning_rate": 2.841362126076426e-05, + "loss": 0.0364, + "step": 57100 + }, + { + "epoch": 0.12595025064342472, + "grad_norm": 0.11282717436552048, + "learning_rate": 2.8412881111066065e-05, + "loss": 0.0359, + "step": 57110 + }, + { + "epoch": 0.12597230461832287, + "grad_norm": 0.11607863754034042, + "learning_rate": 2.8412140798388697e-05, + "loss": 0.0358, + "step": 57120 + }, + { + "epoch": 0.12599435859322106, + "grad_norm": 0.097477987408638, + "learning_rate": 2.841140032274116e-05, + "loss": 0.0367, + "step": 57130 + }, + { + "epoch": 0.1260164125681192, + "grad_norm": 0.14643865823745728, + "learning_rate": 2.841065968413244e-05, + "loss": 0.0368, + "step": 57140 + }, + { + "epoch": 0.12603846654301737, + "grad_norm": 0.15972064435482025, + "learning_rate": 2.840991888257155e-05, + "loss": 0.0369, + "step": 57150 + }, + { + "epoch": 0.12606052051791555, + "grad_norm": 0.09846650809049606, + "learning_rate": 2.840917791806748e-05, + "loss": 0.0385, + "step": 57160 + }, + { + "epoch": 0.1260825744928137, + "grad_norm": 0.112908735871315, + "learning_rate": 2.8408436790629243e-05, + "loss": 0.0352, + "step": 57170 + }, + { + "epoch": 0.12610462846771187, + "grad_norm": 0.10782984644174576, + "learning_rate": 2.8407695500265836e-05, + "loss": 0.0356, + "step": 57180 + }, + { + "epoch": 0.12612668244261005, + "grad_norm": 0.09545673429965973, + "learning_rate": 2.8406954046986268e-05, + "loss": 0.0355, + "step": 57190 + }, + { + "epoch": 0.1261487364175082, + "grad_norm": 0.11611629277467728, + "learning_rate": 2.8406212430799553e-05, + "loss": 0.036, + "step": 57200 + }, + { + "epoch": 0.12617079039240636, + "grad_norm": 0.11395968496799469, + "learning_rate": 2.8405470651714697e-05, + "loss": 0.0398, + "step": 57210 + }, + { + "epoch": 0.12619284436730455, + "grad_norm": 0.12745167315006256, + "learning_rate": 2.840472870974072e-05, + "loss": 0.0358, + "step": 57220 + }, + { + "epoch": 0.1262148983422027, + "grad_norm": 0.10332286357879639, + "learning_rate": 2.840398660488663e-05, + "loss": 0.0386, + "step": 57230 + }, + { + "epoch": 0.12623695231710086, + "grad_norm": 0.09864386171102524, + "learning_rate": 2.8403244337161447e-05, + "loss": 0.0361, + "step": 57240 + }, + { + "epoch": 0.12625900629199904, + "grad_norm": 0.1071014553308487, + "learning_rate": 2.840250190657419e-05, + "loss": 0.0362, + "step": 57250 + }, + { + "epoch": 0.1262810602668972, + "grad_norm": 0.09413476288318634, + "learning_rate": 2.8401759313133882e-05, + "loss": 0.0358, + "step": 57260 + }, + { + "epoch": 0.12630311424179538, + "grad_norm": 0.11257371306419373, + "learning_rate": 2.8401016556849544e-05, + "loss": 0.0369, + "step": 57270 + }, + { + "epoch": 0.12632516821669354, + "grad_norm": 0.11558333039283752, + "learning_rate": 2.8400273637730204e-05, + "loss": 0.0373, + "step": 57280 + }, + { + "epoch": 0.1263472221915917, + "grad_norm": 0.1100575178861618, + "learning_rate": 2.839953055578489e-05, + "loss": 0.0371, + "step": 57290 + }, + { + "epoch": 0.12636927616648988, + "grad_norm": 0.10714678466320038, + "learning_rate": 2.8398787311022622e-05, + "loss": 0.037, + "step": 57300 + }, + { + "epoch": 0.12639133014138804, + "grad_norm": 0.14846141636371613, + "learning_rate": 2.8398043903452442e-05, + "loss": 0.0356, + "step": 57310 + }, + { + "epoch": 0.1264133841162862, + "grad_norm": 0.09654958546161652, + "learning_rate": 2.8397300333083374e-05, + "loss": 0.0368, + "step": 57320 + }, + { + "epoch": 0.12643543809118438, + "grad_norm": 0.12098868936300278, + "learning_rate": 2.8396556599924463e-05, + "loss": 0.0363, + "step": 57330 + }, + { + "epoch": 0.12645749206608253, + "grad_norm": 0.12298933416604996, + "learning_rate": 2.8395812703984742e-05, + "loss": 0.0382, + "step": 57340 + }, + { + "epoch": 0.1264795460409807, + "grad_norm": 0.12391901761293411, + "learning_rate": 2.8395068645273244e-05, + "loss": 0.0355, + "step": 57350 + }, + { + "epoch": 0.12650160001587887, + "grad_norm": 0.12422831356525421, + "learning_rate": 2.8394324423799016e-05, + "loss": 0.0371, + "step": 57360 + }, + { + "epoch": 0.12652365399077703, + "grad_norm": 0.09049713611602783, + "learning_rate": 2.83935800395711e-05, + "loss": 0.035, + "step": 57370 + }, + { + "epoch": 0.12654570796567519, + "grad_norm": 0.12188088893890381, + "learning_rate": 2.8392835492598542e-05, + "loss": 0.0384, + "step": 57380 + }, + { + "epoch": 0.12656776194057337, + "grad_norm": 0.12185811996459961, + "learning_rate": 2.8392090782890386e-05, + "loss": 0.0377, + "step": 57390 + }, + { + "epoch": 0.12658981591547153, + "grad_norm": 0.11317532509565353, + "learning_rate": 2.8391345910455685e-05, + "loss": 0.0361, + "step": 57400 + }, + { + "epoch": 0.12661186989036968, + "grad_norm": 0.137630894780159, + "learning_rate": 2.8390600875303484e-05, + "loss": 0.0366, + "step": 57410 + }, + { + "epoch": 0.12663392386526787, + "grad_norm": 0.12770560383796692, + "learning_rate": 2.8389855677442847e-05, + "loss": 0.0362, + "step": 57420 + }, + { + "epoch": 0.12665597784016602, + "grad_norm": 0.08410383015871048, + "learning_rate": 2.8389110316882815e-05, + "loss": 0.0349, + "step": 57430 + }, + { + "epoch": 0.12667803181506418, + "grad_norm": 0.18047522008419037, + "learning_rate": 2.8388364793632452e-05, + "loss": 0.0369, + "step": 57440 + }, + { + "epoch": 0.12670008578996236, + "grad_norm": 0.1019539013504982, + "learning_rate": 2.8387619107700816e-05, + "loss": 0.0365, + "step": 57450 + }, + { + "epoch": 0.12672213976486052, + "grad_norm": 0.09070602804422379, + "learning_rate": 2.838687325909697e-05, + "loss": 0.0356, + "step": 57460 + }, + { + "epoch": 0.12674419373975868, + "grad_norm": 0.10050612688064575, + "learning_rate": 2.838612724782997e-05, + "loss": 0.0347, + "step": 57470 + }, + { + "epoch": 0.12676624771465686, + "grad_norm": 0.10053854435682297, + "learning_rate": 2.8385381073908892e-05, + "loss": 0.0347, + "step": 57480 + }, + { + "epoch": 0.12678830168955502, + "grad_norm": 0.10219094902276993, + "learning_rate": 2.8384634737342793e-05, + "loss": 0.0339, + "step": 57490 + }, + { + "epoch": 0.12681035566445317, + "grad_norm": 0.12986771762371063, + "learning_rate": 2.8383888238140747e-05, + "loss": 0.0377, + "step": 57500 + }, + { + "epoch": 0.12683240963935136, + "grad_norm": 0.09943010658025742, + "learning_rate": 2.8383141576311817e-05, + "loss": 0.0344, + "step": 57510 + }, + { + "epoch": 0.1268544636142495, + "grad_norm": 0.09701768308877945, + "learning_rate": 2.838239475186508e-05, + "loss": 0.0334, + "step": 57520 + }, + { + "epoch": 0.12687651758914767, + "grad_norm": 0.17804543673992157, + "learning_rate": 2.8381647764809617e-05, + "loss": 0.0388, + "step": 57530 + }, + { + "epoch": 0.12689857156404585, + "grad_norm": 0.10035950690507889, + "learning_rate": 2.8380900615154498e-05, + "loss": 0.0375, + "step": 57540 + }, + { + "epoch": 0.126920625538944, + "grad_norm": 0.09947052597999573, + "learning_rate": 2.8380153302908805e-05, + "loss": 0.0358, + "step": 57550 + }, + { + "epoch": 0.12694267951384217, + "grad_norm": 0.1081480011343956, + "learning_rate": 2.8379405828081613e-05, + "loss": 0.0358, + "step": 57560 + }, + { + "epoch": 0.12696473348874035, + "grad_norm": 0.12896330654621124, + "learning_rate": 2.837865819068201e-05, + "loss": 0.0358, + "step": 57570 + }, + { + "epoch": 0.1269867874636385, + "grad_norm": 0.10500258207321167, + "learning_rate": 2.837791039071908e-05, + "loss": 0.0361, + "step": 57580 + }, + { + "epoch": 0.12700884143853666, + "grad_norm": 0.1255762130022049, + "learning_rate": 2.83771624282019e-05, + "loss": 0.0369, + "step": 57590 + }, + { + "epoch": 0.12703089541343485, + "grad_norm": 0.10263116657733917, + "learning_rate": 2.8376414303139576e-05, + "loss": 0.0385, + "step": 57600 + }, + { + "epoch": 0.127052949388333, + "grad_norm": 0.10006693005561829, + "learning_rate": 2.8375666015541185e-05, + "loss": 0.036, + "step": 57610 + }, + { + "epoch": 0.12707500336323116, + "grad_norm": 0.1266312152147293, + "learning_rate": 2.8374917565415822e-05, + "loss": 0.036, + "step": 57620 + }, + { + "epoch": 0.12709705733812934, + "grad_norm": 0.11599405854940414, + "learning_rate": 2.8374168952772586e-05, + "loss": 0.0361, + "step": 57630 + }, + { + "epoch": 0.1271191113130275, + "grad_norm": 0.1309226006269455, + "learning_rate": 2.8373420177620563e-05, + "loss": 0.0366, + "step": 57640 + }, + { + "epoch": 0.12714116528792566, + "grad_norm": 0.10918137431144714, + "learning_rate": 2.8372671239968863e-05, + "loss": 0.0366, + "step": 57650 + }, + { + "epoch": 0.12716321926282384, + "grad_norm": 0.14439637959003448, + "learning_rate": 2.8371922139826585e-05, + "loss": 0.0372, + "step": 57660 + }, + { + "epoch": 0.127185273237722, + "grad_norm": 0.10315529257059097, + "learning_rate": 2.837117287720282e-05, + "loss": 0.0361, + "step": 57670 + }, + { + "epoch": 0.12720732721262018, + "grad_norm": 0.09455882012844086, + "learning_rate": 2.8370423452106687e-05, + "loss": 0.0364, + "step": 57680 + }, + { + "epoch": 0.12722938118751834, + "grad_norm": 0.11262653023004532, + "learning_rate": 2.8369673864547285e-05, + "loss": 0.0355, + "step": 57690 + }, + { + "epoch": 0.1272514351624165, + "grad_norm": 0.12236980348825455, + "learning_rate": 2.8368924114533716e-05, + "loss": 0.0373, + "step": 57700 + }, + { + "epoch": 0.12727348913731468, + "grad_norm": 0.10847609490156174, + "learning_rate": 2.8368174202075098e-05, + "loss": 0.0369, + "step": 57710 + }, + { + "epoch": 0.12729554311221283, + "grad_norm": 0.13600318133831024, + "learning_rate": 2.8367424127180548e-05, + "loss": 0.0373, + "step": 57720 + }, + { + "epoch": 0.127317597087111, + "grad_norm": 0.10492756962776184, + "learning_rate": 2.836667388985917e-05, + "loss": 0.0368, + "step": 57730 + }, + { + "epoch": 0.12733965106200917, + "grad_norm": 0.13483363389968872, + "learning_rate": 2.8365923490120082e-05, + "loss": 0.0363, + "step": 57740 + }, + { + "epoch": 0.12736170503690733, + "grad_norm": 0.10146774351596832, + "learning_rate": 2.8365172927972408e-05, + "loss": 0.0376, + "step": 57750 + }, + { + "epoch": 0.1273837590118055, + "grad_norm": 0.11096564680337906, + "learning_rate": 2.836442220342526e-05, + "loss": 0.0373, + "step": 57760 + }, + { + "epoch": 0.12740581298670367, + "grad_norm": 0.15473045408725739, + "learning_rate": 2.8363671316487767e-05, + "loss": 0.0352, + "step": 57770 + }, + { + "epoch": 0.12742786696160183, + "grad_norm": 0.11697542667388916, + "learning_rate": 2.836292026716905e-05, + "loss": 0.0358, + "step": 57780 + }, + { + "epoch": 0.12744992093649998, + "grad_norm": 0.10372816026210785, + "learning_rate": 2.836216905547824e-05, + "loss": 0.0359, + "step": 57790 + }, + { + "epoch": 0.12747197491139817, + "grad_norm": 0.13352155685424805, + "learning_rate": 2.836141768142445e-05, + "loss": 0.0361, + "step": 57800 + }, + { + "epoch": 0.12749402888629632, + "grad_norm": 0.136335551738739, + "learning_rate": 2.8360666145016823e-05, + "loss": 0.0361, + "step": 57810 + }, + { + "epoch": 0.12751608286119448, + "grad_norm": 0.08510809391736984, + "learning_rate": 2.8359914446264493e-05, + "loss": 0.0349, + "step": 57820 + }, + { + "epoch": 0.12753813683609266, + "grad_norm": 0.12781351804733276, + "learning_rate": 2.8359162585176585e-05, + "loss": 0.0361, + "step": 57830 + }, + { + "epoch": 0.12756019081099082, + "grad_norm": 0.10918425023555756, + "learning_rate": 2.8358410561762236e-05, + "loss": 0.0358, + "step": 57840 + }, + { + "epoch": 0.12758224478588898, + "grad_norm": 0.10264916718006134, + "learning_rate": 2.835765837603059e-05, + "loss": 0.0337, + "step": 57850 + }, + { + "epoch": 0.12760429876078716, + "grad_norm": 0.09276304394006729, + "learning_rate": 2.8356906027990782e-05, + "loss": 0.0369, + "step": 57860 + }, + { + "epoch": 0.12762635273568532, + "grad_norm": 0.11463196575641632, + "learning_rate": 2.8356153517651954e-05, + "loss": 0.0369, + "step": 57870 + }, + { + "epoch": 0.12764840671058347, + "grad_norm": 0.11129950731992722, + "learning_rate": 2.835540084502325e-05, + "loss": 0.0372, + "step": 57880 + }, + { + "epoch": 0.12767046068548166, + "grad_norm": 0.10986983776092529, + "learning_rate": 2.8354648010113817e-05, + "loss": 0.0368, + "step": 57890 + }, + { + "epoch": 0.1276925146603798, + "grad_norm": 0.1578671634197235, + "learning_rate": 2.8353895012932803e-05, + "loss": 0.0365, + "step": 57900 + }, + { + "epoch": 0.12771456863527797, + "grad_norm": 0.12416375428438187, + "learning_rate": 2.8353141853489354e-05, + "loss": 0.0353, + "step": 57910 + }, + { + "epoch": 0.12773662261017615, + "grad_norm": 0.14870963990688324, + "learning_rate": 2.8352388531792624e-05, + "loss": 0.0378, + "step": 57920 + }, + { + "epoch": 0.1277586765850743, + "grad_norm": 0.1147906705737114, + "learning_rate": 2.835163504785177e-05, + "loss": 0.0362, + "step": 57930 + }, + { + "epoch": 0.12778073055997247, + "grad_norm": 0.1288955956697464, + "learning_rate": 2.835088140167594e-05, + "loss": 0.0383, + "step": 57940 + }, + { + "epoch": 0.12780278453487065, + "grad_norm": 0.1482740193605423, + "learning_rate": 2.83501275932743e-05, + "loss": 0.0391, + "step": 57950 + }, + { + "epoch": 0.1278248385097688, + "grad_norm": 0.11627212911844254, + "learning_rate": 2.8349373622656003e-05, + "loss": 0.0363, + "step": 57960 + }, + { + "epoch": 0.12784689248466696, + "grad_norm": 0.09093991667032242, + "learning_rate": 2.834861948983021e-05, + "loss": 0.034, + "step": 57970 + }, + { + "epoch": 0.12786894645956515, + "grad_norm": 0.11307283490896225, + "learning_rate": 2.834786519480609e-05, + "loss": 0.0369, + "step": 57980 + }, + { + "epoch": 0.1278910004344633, + "grad_norm": 0.12266476452350616, + "learning_rate": 2.8347110737592807e-05, + "loss": 0.0368, + "step": 57990 + }, + { + "epoch": 0.12791305440936146, + "grad_norm": 0.11946089565753937, + "learning_rate": 2.834635611819952e-05, + "loss": 0.0355, + "step": 58000 + }, + { + "epoch": 0.12793510838425964, + "grad_norm": 0.11042897403240204, + "learning_rate": 2.8345601336635414e-05, + "loss": 0.0363, + "step": 58010 + }, + { + "epoch": 0.1279571623591578, + "grad_norm": 0.09972494840621948, + "learning_rate": 2.8344846392909645e-05, + "loss": 0.036, + "step": 58020 + }, + { + "epoch": 0.12797921633405596, + "grad_norm": 0.12157093733549118, + "learning_rate": 2.8344091287031396e-05, + "loss": 0.0351, + "step": 58030 + }, + { + "epoch": 0.12800127030895414, + "grad_norm": 0.13589182496070862, + "learning_rate": 2.834333601900984e-05, + "loss": 0.0377, + "step": 58040 + }, + { + "epoch": 0.1280233242838523, + "grad_norm": 0.12704910337924957, + "learning_rate": 2.834258058885415e-05, + "loss": 0.0348, + "step": 58050 + }, + { + "epoch": 0.12804537825875045, + "grad_norm": 0.12659528851509094, + "learning_rate": 2.8341824996573506e-05, + "loss": 0.0366, + "step": 58060 + }, + { + "epoch": 0.12806743223364864, + "grad_norm": 0.11126130819320679, + "learning_rate": 2.8341069242177095e-05, + "loss": 0.0348, + "step": 58070 + }, + { + "epoch": 0.1280894862085468, + "grad_norm": 0.14727811515331268, + "learning_rate": 2.8340313325674098e-05, + "loss": 0.0356, + "step": 58080 + }, + { + "epoch": 0.12811154018344495, + "grad_norm": 0.141280397772789, + "learning_rate": 2.8339557247073695e-05, + "loss": 0.0373, + "step": 58090 + }, + { + "epoch": 0.12813359415834313, + "grad_norm": 0.11883734911680222, + "learning_rate": 2.833880100638508e-05, + "loss": 0.036, + "step": 58100 + }, + { + "epoch": 0.1281556481332413, + "grad_norm": 0.12017370760440826, + "learning_rate": 2.8338044603617436e-05, + "loss": 0.0382, + "step": 58110 + }, + { + "epoch": 0.12817770210813947, + "grad_norm": 0.09607219696044922, + "learning_rate": 2.833728803877996e-05, + "loss": 0.0352, + "step": 58120 + }, + { + "epoch": 0.12819975608303763, + "grad_norm": 0.10957438498735428, + "learning_rate": 2.833653131188184e-05, + "loss": 0.0337, + "step": 58130 + }, + { + "epoch": 0.1282218100579358, + "grad_norm": 0.12960341572761536, + "learning_rate": 2.8335774422932275e-05, + "loss": 0.0353, + "step": 58140 + }, + { + "epoch": 0.12824386403283397, + "grad_norm": 0.11554361879825592, + "learning_rate": 2.8335017371940457e-05, + "loss": 0.0355, + "step": 58150 + }, + { + "epoch": 0.12826591800773213, + "grad_norm": 0.16290885210037231, + "learning_rate": 2.833426015891559e-05, + "loss": 0.0356, + "step": 58160 + }, + { + "epoch": 0.12828797198263028, + "grad_norm": 0.13916653394699097, + "learning_rate": 2.8333502783866872e-05, + "loss": 0.0344, + "step": 58170 + }, + { + "epoch": 0.12831002595752847, + "grad_norm": 0.09757409989833832, + "learning_rate": 2.8332745246803507e-05, + "loss": 0.0358, + "step": 58180 + }, + { + "epoch": 0.12833207993242662, + "grad_norm": 0.10031308233737946, + "learning_rate": 2.8331987547734696e-05, + "loss": 0.0346, + "step": 58190 + }, + { + "epoch": 0.12835413390732478, + "grad_norm": 0.13292771577835083, + "learning_rate": 2.8331229686669648e-05, + "loss": 0.0356, + "step": 58200 + }, + { + "epoch": 0.12837618788222296, + "grad_norm": 0.15248499810695648, + "learning_rate": 2.8330471663617575e-05, + "loss": 0.0358, + "step": 58210 + }, + { + "epoch": 0.12839824185712112, + "grad_norm": 0.12371747940778732, + "learning_rate": 2.832971347858769e-05, + "loss": 0.0371, + "step": 58220 + }, + { + "epoch": 0.12842029583201928, + "grad_norm": 0.14691230654716492, + "learning_rate": 2.832895513158919e-05, + "loss": 0.0378, + "step": 58230 + }, + { + "epoch": 0.12844234980691746, + "grad_norm": 0.13305369019508362, + "learning_rate": 2.8328196622631307e-05, + "loss": 0.0362, + "step": 58240 + }, + { + "epoch": 0.12846440378181562, + "grad_norm": 0.10954266041517258, + "learning_rate": 2.8327437951723252e-05, + "loss": 0.0358, + "step": 58250 + }, + { + "epoch": 0.12848645775671377, + "grad_norm": 0.11779768764972687, + "learning_rate": 2.832667911887424e-05, + "loss": 0.0356, + "step": 58260 + }, + { + "epoch": 0.12850851173161196, + "grad_norm": 0.12295812368392944, + "learning_rate": 2.8325920124093495e-05, + "loss": 0.0364, + "step": 58270 + }, + { + "epoch": 0.1285305657065101, + "grad_norm": 0.11896897852420807, + "learning_rate": 2.8325160967390242e-05, + "loss": 0.0374, + "step": 58280 + }, + { + "epoch": 0.12855261968140827, + "grad_norm": 0.1267428696155548, + "learning_rate": 2.83244016487737e-05, + "loss": 0.0367, + "step": 58290 + }, + { + "epoch": 0.12857467365630645, + "grad_norm": 0.13311098515987396, + "learning_rate": 2.8323642168253096e-05, + "loss": 0.0378, + "step": 58300 + }, + { + "epoch": 0.1285967276312046, + "grad_norm": 0.11792151629924774, + "learning_rate": 2.832288252583766e-05, + "loss": 0.0357, + "step": 58310 + }, + { + "epoch": 0.12861878160610277, + "grad_norm": 0.10999142378568649, + "learning_rate": 2.8322122721536625e-05, + "loss": 0.0343, + "step": 58320 + }, + { + "epoch": 0.12864083558100095, + "grad_norm": 0.09924907237291336, + "learning_rate": 2.8321362755359217e-05, + "loss": 0.0367, + "step": 58330 + }, + { + "epoch": 0.1286628895558991, + "grad_norm": 0.10965878516435623, + "learning_rate": 2.8320602627314678e-05, + "loss": 0.0351, + "step": 58340 + }, + { + "epoch": 0.12868494353079726, + "grad_norm": 0.10719917714595795, + "learning_rate": 2.831984233741224e-05, + "loss": 0.0376, + "step": 58350 + }, + { + "epoch": 0.12870699750569545, + "grad_norm": 0.11544709652662277, + "learning_rate": 2.831908188566114e-05, + "loss": 0.037, + "step": 58360 + }, + { + "epoch": 0.1287290514805936, + "grad_norm": 0.12539081275463104, + "learning_rate": 2.831832127207062e-05, + "loss": 0.0363, + "step": 58370 + }, + { + "epoch": 0.12875110545549176, + "grad_norm": 0.09965315461158752, + "learning_rate": 2.8317560496649924e-05, + "loss": 0.0371, + "step": 58380 + }, + { + "epoch": 0.12877315943038994, + "grad_norm": 0.10027728974819183, + "learning_rate": 2.8316799559408287e-05, + "loss": 0.0339, + "step": 58390 + }, + { + "epoch": 0.1287952134052881, + "grad_norm": 0.13201746344566345, + "learning_rate": 2.8316038460354972e-05, + "loss": 0.0369, + "step": 58400 + }, + { + "epoch": 0.12881726738018626, + "grad_norm": 0.09872104227542877, + "learning_rate": 2.831527719949921e-05, + "loss": 0.0345, + "step": 58410 + }, + { + "epoch": 0.12883932135508444, + "grad_norm": 0.12481704354286194, + "learning_rate": 2.8314515776850263e-05, + "loss": 0.0366, + "step": 58420 + }, + { + "epoch": 0.1288613753299826, + "grad_norm": 0.11811795085668564, + "learning_rate": 2.831375419241738e-05, + "loss": 0.0358, + "step": 58430 + }, + { + "epoch": 0.12888342930488075, + "grad_norm": 0.1027645468711853, + "learning_rate": 2.8312992446209807e-05, + "loss": 0.0381, + "step": 58440 + }, + { + "epoch": 0.12890548327977894, + "grad_norm": 0.09869621694087982, + "learning_rate": 2.831223053823681e-05, + "loss": 0.0354, + "step": 58450 + }, + { + "epoch": 0.1289275372546771, + "grad_norm": 0.11276472359895706, + "learning_rate": 2.831146846850764e-05, + "loss": 0.0357, + "step": 58460 + }, + { + "epoch": 0.12894959122957525, + "grad_norm": 0.11287645995616913, + "learning_rate": 2.831070623703156e-05, + "loss": 0.0353, + "step": 58470 + }, + { + "epoch": 0.12897164520447343, + "grad_norm": 0.10690844804048538, + "learning_rate": 2.8309943843817836e-05, + "loss": 0.0364, + "step": 58480 + }, + { + "epoch": 0.1289936991793716, + "grad_norm": 0.11056632548570633, + "learning_rate": 2.8309181288875726e-05, + "loss": 0.0359, + "step": 58490 + }, + { + "epoch": 0.12901575315426975, + "grad_norm": 0.13420680165290833, + "learning_rate": 2.8308418572214497e-05, + "loss": 0.0358, + "step": 58500 + }, + { + "epoch": 0.12903780712916793, + "grad_norm": 0.1287621110677719, + "learning_rate": 2.8307655693843414e-05, + "loss": 0.0384, + "step": 58510 + }, + { + "epoch": 0.1290598611040661, + "grad_norm": 0.11005239188671112, + "learning_rate": 2.8306892653771755e-05, + "loss": 0.0353, + "step": 58520 + }, + { + "epoch": 0.12908191507896424, + "grad_norm": 0.08896765112876892, + "learning_rate": 2.8306129452008784e-05, + "loss": 0.0353, + "step": 58530 + }, + { + "epoch": 0.12910396905386243, + "grad_norm": 0.1068982258439064, + "learning_rate": 2.8305366088563778e-05, + "loss": 0.0359, + "step": 58540 + }, + { + "epoch": 0.12912602302876058, + "grad_norm": 0.10941393673419952, + "learning_rate": 2.830460256344601e-05, + "loss": 0.0337, + "step": 58550 + }, + { + "epoch": 0.12914807700365877, + "grad_norm": 0.09616684168577194, + "learning_rate": 2.830383887666476e-05, + "loss": 0.0362, + "step": 58560 + }, + { + "epoch": 0.12917013097855692, + "grad_norm": 0.10507305711507797, + "learning_rate": 2.8303075028229306e-05, + "loss": 0.0381, + "step": 58570 + }, + { + "epoch": 0.12919218495345508, + "grad_norm": 0.13650517165660858, + "learning_rate": 2.8302311018148933e-05, + "loss": 0.037, + "step": 58580 + }, + { + "epoch": 0.12921423892835326, + "grad_norm": 0.11685429513454437, + "learning_rate": 2.830154684643292e-05, + "loss": 0.0356, + "step": 58590 + }, + { + "epoch": 0.12923629290325142, + "grad_norm": 0.11059663444757462, + "learning_rate": 2.8300782513090553e-05, + "loss": 0.0353, + "step": 58600 + }, + { + "epoch": 0.12925834687814958, + "grad_norm": 0.13710618019104004, + "learning_rate": 2.8300018018131122e-05, + "loss": 0.0353, + "step": 58610 + }, + { + "epoch": 0.12928040085304776, + "grad_norm": 0.1031654104590416, + "learning_rate": 2.8299253361563915e-05, + "loss": 0.0349, + "step": 58620 + }, + { + "epoch": 0.12930245482794592, + "grad_norm": 0.09652478992938995, + "learning_rate": 2.829848854339822e-05, + "loss": 0.0352, + "step": 58630 + }, + { + "epoch": 0.12932450880284407, + "grad_norm": 0.12577788531780243, + "learning_rate": 2.8297723563643338e-05, + "loss": 0.037, + "step": 58640 + }, + { + "epoch": 0.12934656277774226, + "grad_norm": 0.14146888256072998, + "learning_rate": 2.8296958422308556e-05, + "loss": 0.0382, + "step": 58650 + }, + { + "epoch": 0.12936861675264041, + "grad_norm": 0.1438068002462387, + "learning_rate": 2.8296193119403178e-05, + "loss": 0.0383, + "step": 58660 + }, + { + "epoch": 0.12939067072753857, + "grad_norm": 0.10325673222541809, + "learning_rate": 2.8295427654936497e-05, + "loss": 0.0369, + "step": 58670 + }, + { + "epoch": 0.12941272470243675, + "grad_norm": 0.14273296296596527, + "learning_rate": 2.8294662028917815e-05, + "loss": 0.0348, + "step": 58680 + }, + { + "epoch": 0.1294347786773349, + "grad_norm": 0.16222256422042847, + "learning_rate": 2.829389624135644e-05, + "loss": 0.0357, + "step": 58690 + }, + { + "epoch": 0.12945683265223307, + "grad_norm": 0.1413167119026184, + "learning_rate": 2.8293130292261675e-05, + "loss": 0.0363, + "step": 58700 + }, + { + "epoch": 0.12947888662713125, + "grad_norm": 0.10658059269189835, + "learning_rate": 2.8292364181642826e-05, + "loss": 0.0351, + "step": 58710 + }, + { + "epoch": 0.1295009406020294, + "grad_norm": 0.12373147159814835, + "learning_rate": 2.8291597909509203e-05, + "loss": 0.0358, + "step": 58720 + }, + { + "epoch": 0.12952299457692756, + "grad_norm": 0.09720709919929504, + "learning_rate": 2.829083147587011e-05, + "loss": 0.0359, + "step": 58730 + }, + { + "epoch": 0.12954504855182575, + "grad_norm": 0.1234712228178978, + "learning_rate": 2.8290064880734874e-05, + "loss": 0.0364, + "step": 58740 + }, + { + "epoch": 0.1295671025267239, + "grad_norm": 0.12239201366901398, + "learning_rate": 2.82892981241128e-05, + "loss": 0.0361, + "step": 58750 + }, + { + "epoch": 0.12958915650162206, + "grad_norm": 0.11389829963445663, + "learning_rate": 2.8288531206013202e-05, + "loss": 0.0355, + "step": 58760 + }, + { + "epoch": 0.12961121047652024, + "grad_norm": 0.11686326563358307, + "learning_rate": 2.828776412644541e-05, + "loss": 0.037, + "step": 58770 + }, + { + "epoch": 0.1296332644514184, + "grad_norm": 0.08847489953041077, + "learning_rate": 2.828699688541873e-05, + "loss": 0.0362, + "step": 58780 + }, + { + "epoch": 0.12965531842631656, + "grad_norm": 0.0923423022031784, + "learning_rate": 2.82862294829425e-05, + "loss": 0.0372, + "step": 58790 + }, + { + "epoch": 0.12967737240121474, + "grad_norm": 0.15059073269367218, + "learning_rate": 2.8285461919026034e-05, + "loss": 0.0373, + "step": 58800 + }, + { + "epoch": 0.1296994263761129, + "grad_norm": 0.10858122259378433, + "learning_rate": 2.8284694193678665e-05, + "loss": 0.0339, + "step": 58810 + }, + { + "epoch": 0.12972148035101105, + "grad_norm": 0.09423382580280304, + "learning_rate": 2.8283926306909713e-05, + "loss": 0.0359, + "step": 58820 + }, + { + "epoch": 0.12974353432590924, + "grad_norm": 0.14087143540382385, + "learning_rate": 2.828315825872852e-05, + "loss": 0.0379, + "step": 58830 + }, + { + "epoch": 0.1297655883008074, + "grad_norm": 0.12265127152204514, + "learning_rate": 2.828239004914441e-05, + "loss": 0.0369, + "step": 58840 + }, + { + "epoch": 0.12978764227570555, + "grad_norm": 0.11984190344810486, + "learning_rate": 2.8281621678166722e-05, + "loss": 0.0351, + "step": 58850 + }, + { + "epoch": 0.12980969625060373, + "grad_norm": 0.0991043969988823, + "learning_rate": 2.8280853145804786e-05, + "loss": 0.0368, + "step": 58860 + }, + { + "epoch": 0.1298317502255019, + "grad_norm": 0.10860437899827957, + "learning_rate": 2.828008445206795e-05, + "loss": 0.0352, + "step": 58870 + }, + { + "epoch": 0.12985380420040005, + "grad_norm": 0.11958324164152145, + "learning_rate": 2.8279315596965548e-05, + "loss": 0.0356, + "step": 58880 + }, + { + "epoch": 0.12987585817529823, + "grad_norm": 0.12604442238807678, + "learning_rate": 2.8278546580506925e-05, + "loss": 0.0362, + "step": 58890 + }, + { + "epoch": 0.1298979121501964, + "grad_norm": 0.09846748411655426, + "learning_rate": 2.8277777402701424e-05, + "loss": 0.0352, + "step": 58900 + }, + { + "epoch": 0.12991996612509454, + "grad_norm": 0.12493867427110672, + "learning_rate": 2.8277008063558387e-05, + "loss": 0.0341, + "step": 58910 + }, + { + "epoch": 0.12994202009999273, + "grad_norm": 0.12513910233974457, + "learning_rate": 2.827623856308717e-05, + "loss": 0.0352, + "step": 58920 + }, + { + "epoch": 0.12996407407489088, + "grad_norm": 0.13376984000205994, + "learning_rate": 2.827546890129712e-05, + "loss": 0.0377, + "step": 58930 + }, + { + "epoch": 0.12998612804978904, + "grad_norm": 0.13683640956878662, + "learning_rate": 2.8274699078197586e-05, + "loss": 0.0358, + "step": 58940 + }, + { + "epoch": 0.13000818202468722, + "grad_norm": 0.1753748208284378, + "learning_rate": 2.8273929093797928e-05, + "loss": 0.0378, + "step": 58950 + }, + { + "epoch": 0.13003023599958538, + "grad_norm": 0.1576072871685028, + "learning_rate": 2.8273158948107496e-05, + "loss": 0.0358, + "step": 58960 + }, + { + "epoch": 0.13005228997448356, + "grad_norm": 0.1569526642560959, + "learning_rate": 2.827238864113565e-05, + "loss": 0.0368, + "step": 58970 + }, + { + "epoch": 0.13007434394938172, + "grad_norm": 0.10582295805215836, + "learning_rate": 2.8271618172891755e-05, + "loss": 0.0354, + "step": 58980 + }, + { + "epoch": 0.13009639792427988, + "grad_norm": 0.1469598412513733, + "learning_rate": 2.827084754338517e-05, + "loss": 0.0355, + "step": 58990 + }, + { + "epoch": 0.13011845189917806, + "grad_norm": 0.1212184727191925, + "learning_rate": 2.8270076752625254e-05, + "loss": 0.037, + "step": 59000 + }, + { + "epoch": 0.13014050587407622, + "grad_norm": 0.09052051603794098, + "learning_rate": 2.8269305800621377e-05, + "loss": 0.0361, + "step": 59010 + }, + { + "epoch": 0.13016255984897437, + "grad_norm": 0.10157757997512817, + "learning_rate": 2.8268534687382905e-05, + "loss": 0.0356, + "step": 59020 + }, + { + "epoch": 0.13018461382387256, + "grad_norm": 0.08990942686796188, + "learning_rate": 2.8267763412919213e-05, + "loss": 0.0358, + "step": 59030 + }, + { + "epoch": 0.13020666779877071, + "grad_norm": 0.10262490808963776, + "learning_rate": 2.8266991977239663e-05, + "loss": 0.0373, + "step": 59040 + }, + { + "epoch": 0.13022872177366887, + "grad_norm": 0.11352934688329697, + "learning_rate": 2.8266220380353638e-05, + "loss": 0.0355, + "step": 59050 + }, + { + "epoch": 0.13025077574856705, + "grad_norm": 0.10349183529615402, + "learning_rate": 2.826544862227051e-05, + "loss": 0.0368, + "step": 59060 + }, + { + "epoch": 0.1302728297234652, + "grad_norm": 0.07953609526157379, + "learning_rate": 2.8264676702999655e-05, + "loss": 0.0363, + "step": 59070 + }, + { + "epoch": 0.13029488369836337, + "grad_norm": 0.10195409506559372, + "learning_rate": 2.8263904622550453e-05, + "loss": 0.0351, + "step": 59080 + }, + { + "epoch": 0.13031693767326155, + "grad_norm": 0.11018253117799759, + "learning_rate": 2.826313238093229e-05, + "loss": 0.0351, + "step": 59090 + }, + { + "epoch": 0.1303389916481597, + "grad_norm": 0.14046502113342285, + "learning_rate": 2.8262359978154546e-05, + "loss": 0.0373, + "step": 59100 + }, + { + "epoch": 0.13036104562305786, + "grad_norm": 0.09909936040639877, + "learning_rate": 2.8261587414226604e-05, + "loss": 0.037, + "step": 59110 + }, + { + "epoch": 0.13038309959795605, + "grad_norm": 0.13441509008407593, + "learning_rate": 2.8260814689157855e-05, + "loss": 0.0355, + "step": 59120 + }, + { + "epoch": 0.1304051535728542, + "grad_norm": 0.12829920649528503, + "learning_rate": 2.826004180295769e-05, + "loss": 0.0378, + "step": 59130 + }, + { + "epoch": 0.13042720754775236, + "grad_norm": 0.11642026156187057, + "learning_rate": 2.825926875563549e-05, + "loss": 0.0364, + "step": 59140 + }, + { + "epoch": 0.13044926152265054, + "grad_norm": 0.14066368341445923, + "learning_rate": 2.825849554720066e-05, + "loss": 0.0346, + "step": 59150 + }, + { + "epoch": 0.1304713154975487, + "grad_norm": 0.10675607621669769, + "learning_rate": 2.825772217766259e-05, + "loss": 0.035, + "step": 59160 + }, + { + "epoch": 0.13049336947244686, + "grad_norm": 0.11667450517416, + "learning_rate": 2.8256948647030682e-05, + "loss": 0.0361, + "step": 59170 + }, + { + "epoch": 0.13051542344734504, + "grad_norm": 0.11601380258798599, + "learning_rate": 2.825617495531433e-05, + "loss": 0.0359, + "step": 59180 + }, + { + "epoch": 0.1305374774222432, + "grad_norm": 0.1320357322692871, + "learning_rate": 2.8255401102522937e-05, + "loss": 0.0367, + "step": 59190 + }, + { + "epoch": 0.13055953139714135, + "grad_norm": 0.10032481700181961, + "learning_rate": 2.82546270886659e-05, + "loss": 0.0355, + "step": 59200 + }, + { + "epoch": 0.13058158537203954, + "grad_norm": 0.10973796993494034, + "learning_rate": 2.8253852913752634e-05, + "loss": 0.0352, + "step": 59210 + }, + { + "epoch": 0.1306036393469377, + "grad_norm": 0.10572896897792816, + "learning_rate": 2.8253078577792543e-05, + "loss": 0.0366, + "step": 59220 + }, + { + "epoch": 0.13062569332183585, + "grad_norm": 0.14681372046470642, + "learning_rate": 2.8252304080795032e-05, + "loss": 0.037, + "step": 59230 + }, + { + "epoch": 0.13064774729673403, + "grad_norm": 0.15816861391067505, + "learning_rate": 2.825152942276951e-05, + "loss": 0.0337, + "step": 59240 + }, + { + "epoch": 0.1306698012716322, + "grad_norm": 0.12983009219169617, + "learning_rate": 2.82507546037254e-05, + "loss": 0.0364, + "step": 59250 + }, + { + "epoch": 0.13069185524653035, + "grad_norm": 0.1254892647266388, + "learning_rate": 2.824997962367211e-05, + "loss": 0.0346, + "step": 59260 + }, + { + "epoch": 0.13071390922142853, + "grad_norm": 0.10326465964317322, + "learning_rate": 2.8249204482619056e-05, + "loss": 0.0364, + "step": 59270 + }, + { + "epoch": 0.1307359631963267, + "grad_norm": 0.13265261054039001, + "learning_rate": 2.824842918057566e-05, + "loss": 0.0355, + "step": 59280 + }, + { + "epoch": 0.13075801717122484, + "grad_norm": 0.10126955807209015, + "learning_rate": 2.824765371755134e-05, + "loss": 0.0367, + "step": 59290 + }, + { + "epoch": 0.13078007114612303, + "grad_norm": 0.11405757069587708, + "learning_rate": 2.8246878093555516e-05, + "loss": 0.0364, + "step": 59300 + }, + { + "epoch": 0.13080212512102118, + "grad_norm": 0.132005974650383, + "learning_rate": 2.8246102308597622e-05, + "loss": 0.0354, + "step": 59310 + }, + { + "epoch": 0.13082417909591934, + "grad_norm": 0.10507769137620926, + "learning_rate": 2.8245326362687077e-05, + "loss": 0.0362, + "step": 59320 + }, + { + "epoch": 0.13084623307081752, + "grad_norm": 0.1082371175289154, + "learning_rate": 2.8244550255833305e-05, + "loss": 0.0367, + "step": 59330 + }, + { + "epoch": 0.13086828704571568, + "grad_norm": 0.0959051325917244, + "learning_rate": 2.824377398804575e-05, + "loss": 0.0357, + "step": 59340 + }, + { + "epoch": 0.13089034102061384, + "grad_norm": 0.08652139455080032, + "learning_rate": 2.824299755933383e-05, + "loss": 0.0354, + "step": 59350 + }, + { + "epoch": 0.13091239499551202, + "grad_norm": 0.10353163629770279, + "learning_rate": 2.8242220969706994e-05, + "loss": 0.035, + "step": 59360 + }, + { + "epoch": 0.13093444897041018, + "grad_norm": 0.11531035602092743, + "learning_rate": 2.8241444219174665e-05, + "loss": 0.0388, + "step": 59370 + }, + { + "epoch": 0.13095650294530833, + "grad_norm": 0.11265785247087479, + "learning_rate": 2.824066730774629e-05, + "loss": 0.0374, + "step": 59380 + }, + { + "epoch": 0.13097855692020652, + "grad_norm": 0.13735483586788177, + "learning_rate": 2.8239890235431303e-05, + "loss": 0.036, + "step": 59390 + }, + { + "epoch": 0.13100061089510467, + "grad_norm": 0.13166874647140503, + "learning_rate": 2.823911300223915e-05, + "loss": 0.0353, + "step": 59400 + }, + { + "epoch": 0.13102266487000286, + "grad_norm": 0.12139871716499329, + "learning_rate": 2.8238335608179274e-05, + "loss": 0.0346, + "step": 59410 + }, + { + "epoch": 0.13104471884490101, + "grad_norm": 0.09343321621417999, + "learning_rate": 2.8237558053261124e-05, + "loss": 0.0371, + "step": 59420 + }, + { + "epoch": 0.13106677281979917, + "grad_norm": 0.07789149880409241, + "learning_rate": 2.8236780337494144e-05, + "loss": 0.0332, + "step": 59430 + }, + { + "epoch": 0.13108882679469736, + "grad_norm": 0.10336404293775558, + "learning_rate": 2.823600246088778e-05, + "loss": 0.0363, + "step": 59440 + }, + { + "epoch": 0.1311108807695955, + "grad_norm": 0.11723770201206207, + "learning_rate": 2.82352244234515e-05, + "loss": 0.0345, + "step": 59450 + }, + { + "epoch": 0.13113293474449367, + "grad_norm": 0.1277492791414261, + "learning_rate": 2.8234446225194735e-05, + "loss": 0.0355, + "step": 59460 + }, + { + "epoch": 0.13115498871939185, + "grad_norm": 0.137007474899292, + "learning_rate": 2.8233667866126964e-05, + "loss": 0.0349, + "step": 59470 + }, + { + "epoch": 0.13117704269429, + "grad_norm": 0.12177316844463348, + "learning_rate": 2.8232889346257628e-05, + "loss": 0.0364, + "step": 59480 + }, + { + "epoch": 0.13119909666918816, + "grad_norm": 0.1149463877081871, + "learning_rate": 2.8232110665596194e-05, + "loss": 0.0349, + "step": 59490 + }, + { + "epoch": 0.13122115064408635, + "grad_norm": 0.12005865573883057, + "learning_rate": 2.8231331824152117e-05, + "loss": 0.0338, + "step": 59500 + }, + { + "epoch": 0.1312432046189845, + "grad_norm": 0.10900305211544037, + "learning_rate": 2.8230552821934872e-05, + "loss": 0.0335, + "step": 59510 + }, + { + "epoch": 0.13126525859388266, + "grad_norm": 0.10760494321584702, + "learning_rate": 2.8229773658953918e-05, + "loss": 0.036, + "step": 59520 + }, + { + "epoch": 0.13128731256878085, + "grad_norm": 0.1089722216129303, + "learning_rate": 2.8228994335218717e-05, + "loss": 0.034, + "step": 59530 + }, + { + "epoch": 0.131309366543679, + "grad_norm": 0.12511037290096283, + "learning_rate": 2.8228214850738754e-05, + "loss": 0.0385, + "step": 59540 + }, + { + "epoch": 0.13133142051857716, + "grad_norm": 0.09466671943664551, + "learning_rate": 2.8227435205523487e-05, + "loss": 0.036, + "step": 59550 + }, + { + "epoch": 0.13135347449347534, + "grad_norm": 0.13839934766292572, + "learning_rate": 2.8226655399582394e-05, + "loss": 0.0344, + "step": 59560 + }, + { + "epoch": 0.1313755284683735, + "grad_norm": 0.12393077462911606, + "learning_rate": 2.8225875432924948e-05, + "loss": 0.0374, + "step": 59570 + }, + { + "epoch": 0.13139758244327165, + "grad_norm": 0.1418551355600357, + "learning_rate": 2.8225095305560625e-05, + "loss": 0.0355, + "step": 59580 + }, + { + "epoch": 0.13141963641816984, + "grad_norm": 0.1461920589208603, + "learning_rate": 2.8224315017498916e-05, + "loss": 0.0354, + "step": 59590 + }, + { + "epoch": 0.131441690393068, + "grad_norm": 0.10638339072465897, + "learning_rate": 2.822353456874929e-05, + "loss": 0.0349, + "step": 59600 + }, + { + "epoch": 0.13146374436796615, + "grad_norm": 0.143468976020813, + "learning_rate": 2.8222753959321234e-05, + "loss": 0.0353, + "step": 59610 + }, + { + "epoch": 0.13148579834286434, + "grad_norm": 0.10815724730491638, + "learning_rate": 2.822197318922423e-05, + "loss": 0.0366, + "step": 59620 + }, + { + "epoch": 0.1315078523177625, + "grad_norm": 0.15323159098625183, + "learning_rate": 2.822119225846777e-05, + "loss": 0.037, + "step": 59630 + }, + { + "epoch": 0.13152990629266065, + "grad_norm": 0.1336684674024582, + "learning_rate": 2.8220411167061343e-05, + "loss": 0.0354, + "step": 59640 + }, + { + "epoch": 0.13155196026755883, + "grad_norm": 0.11415445059537888, + "learning_rate": 2.8219629915014435e-05, + "loss": 0.0356, + "step": 59650 + }, + { + "epoch": 0.131574014242457, + "grad_norm": 0.08481884747743607, + "learning_rate": 2.821884850233654e-05, + "loss": 0.0349, + "step": 59660 + }, + { + "epoch": 0.13159606821735514, + "grad_norm": 0.11093247681856155, + "learning_rate": 2.8218066929037162e-05, + "loss": 0.0373, + "step": 59670 + }, + { + "epoch": 0.13161812219225333, + "grad_norm": 0.10880808532238007, + "learning_rate": 2.821728519512579e-05, + "loss": 0.0342, + "step": 59680 + }, + { + "epoch": 0.13164017616715148, + "grad_norm": 0.11843429505825043, + "learning_rate": 2.821650330061192e-05, + "loss": 0.0381, + "step": 59690 + }, + { + "epoch": 0.13166223014204964, + "grad_norm": 0.10359974950551987, + "learning_rate": 2.821572124550506e-05, + "loss": 0.0366, + "step": 59700 + }, + { + "epoch": 0.13168428411694783, + "grad_norm": 0.12159271538257599, + "learning_rate": 2.8214939029814702e-05, + "loss": 0.0375, + "step": 59710 + }, + { + "epoch": 0.13170633809184598, + "grad_norm": 0.12158987671136856, + "learning_rate": 2.821415665355037e-05, + "loss": 0.0347, + "step": 59720 + }, + { + "epoch": 0.13172839206674414, + "grad_norm": 0.10580894351005554, + "learning_rate": 2.8213374116721546e-05, + "loss": 0.0344, + "step": 59730 + }, + { + "epoch": 0.13175044604164232, + "grad_norm": 0.11119816452264786, + "learning_rate": 2.821259141933776e-05, + "loss": 0.0377, + "step": 59740 + }, + { + "epoch": 0.13177250001654048, + "grad_norm": 0.13666939735412598, + "learning_rate": 2.821180856140851e-05, + "loss": 0.0359, + "step": 59750 + }, + { + "epoch": 0.13179455399143863, + "grad_norm": 0.1070839911699295, + "learning_rate": 2.821102554294331e-05, + "loss": 0.0357, + "step": 59760 + }, + { + "epoch": 0.13181660796633682, + "grad_norm": 0.09337624162435532, + "learning_rate": 2.8210242363951678e-05, + "loss": 0.0347, + "step": 59770 + }, + { + "epoch": 0.13183866194123497, + "grad_norm": 0.10267803072929382, + "learning_rate": 2.8209459024443127e-05, + "loss": 0.0362, + "step": 59780 + }, + { + "epoch": 0.13186071591613313, + "grad_norm": 0.10794032365083694, + "learning_rate": 2.820867552442718e-05, + "loss": 0.0374, + "step": 59790 + }, + { + "epoch": 0.13188276989103132, + "grad_norm": 0.13501502573490143, + "learning_rate": 2.8207891863913352e-05, + "loss": 0.0355, + "step": 59800 + }, + { + "epoch": 0.13190482386592947, + "grad_norm": 0.12941233813762665, + "learning_rate": 2.820710804291117e-05, + "loss": 0.0365, + "step": 59810 + }, + { + "epoch": 0.13192687784082763, + "grad_norm": 0.13442553579807281, + "learning_rate": 2.8206324061430157e-05, + "loss": 0.0361, + "step": 59820 + }, + { + "epoch": 0.1319489318157258, + "grad_norm": 0.1174982488155365, + "learning_rate": 2.8205539919479837e-05, + "loss": 0.0359, + "step": 59830 + }, + { + "epoch": 0.13197098579062397, + "grad_norm": 0.10199972987174988, + "learning_rate": 2.8204755617069735e-05, + "loss": 0.037, + "step": 59840 + }, + { + "epoch": 0.13199303976552215, + "grad_norm": 0.14456015825271606, + "learning_rate": 2.8203971154209386e-05, + "loss": 0.0351, + "step": 59850 + }, + { + "epoch": 0.1320150937404203, + "grad_norm": 0.12102428078651428, + "learning_rate": 2.8203186530908325e-05, + "loss": 0.0345, + "step": 59860 + }, + { + "epoch": 0.13203714771531846, + "grad_norm": 0.10930594801902771, + "learning_rate": 2.820240174717608e-05, + "loss": 0.0366, + "step": 59870 + }, + { + "epoch": 0.13205920169021665, + "grad_norm": 0.10033582895994186, + "learning_rate": 2.8201616803022184e-05, + "loss": 0.0374, + "step": 59880 + }, + { + "epoch": 0.1320812556651148, + "grad_norm": 0.15203438699245453, + "learning_rate": 2.8200831698456183e-05, + "loss": 0.0359, + "step": 59890 + }, + { + "epoch": 0.13210330964001296, + "grad_norm": 0.14537504315376282, + "learning_rate": 2.8200046433487614e-05, + "loss": 0.0356, + "step": 59900 + }, + { + "epoch": 0.13212536361491115, + "grad_norm": 0.11166395992040634, + "learning_rate": 2.8199261008126017e-05, + "loss": 0.0352, + "step": 59910 + }, + { + "epoch": 0.1321474175898093, + "grad_norm": 0.12820139527320862, + "learning_rate": 2.8198475422380937e-05, + "loss": 0.0356, + "step": 59920 + }, + { + "epoch": 0.13216947156470746, + "grad_norm": 0.15331055223941803, + "learning_rate": 2.8197689676261916e-05, + "loss": 0.0377, + "step": 59930 + }, + { + "epoch": 0.13219152553960564, + "grad_norm": 0.10402035713195801, + "learning_rate": 2.8196903769778507e-05, + "loss": 0.0347, + "step": 59940 + }, + { + "epoch": 0.1322135795145038, + "grad_norm": 0.1052124947309494, + "learning_rate": 2.8196117702940262e-05, + "loss": 0.0363, + "step": 59950 + }, + { + "epoch": 0.13223563348940195, + "grad_norm": 0.08721248805522919, + "learning_rate": 2.819533147575672e-05, + "loss": 0.036, + "step": 59960 + }, + { + "epoch": 0.13225768746430014, + "grad_norm": 0.09974782168865204, + "learning_rate": 2.8194545088237447e-05, + "loss": 0.0361, + "step": 59970 + }, + { + "epoch": 0.1322797414391983, + "grad_norm": 0.148000106215477, + "learning_rate": 2.8193758540391993e-05, + "loss": 0.0368, + "step": 59980 + }, + { + "epoch": 0.13230179541409645, + "grad_norm": 0.1165812537074089, + "learning_rate": 2.8192971832229915e-05, + "loss": 0.0337, + "step": 59990 + }, + { + "epoch": 0.13232384938899464, + "grad_norm": 0.12790308892726898, + "learning_rate": 2.8192184963760773e-05, + "loss": 0.0354, + "step": 60000 + }, + { + "epoch": 0.1323459033638928, + "grad_norm": 0.11454226821660995, + "learning_rate": 2.8191397934994125e-05, + "loss": 0.0367, + "step": 60010 + }, + { + "epoch": 0.13236795733879095, + "grad_norm": 0.09572811424732208, + "learning_rate": 2.8190610745939543e-05, + "loss": 0.0353, + "step": 60020 + }, + { + "epoch": 0.13239001131368913, + "grad_norm": 0.11666087061166763, + "learning_rate": 2.8189823396606584e-05, + "loss": 0.0346, + "step": 60030 + }, + { + "epoch": 0.1324120652885873, + "grad_norm": 0.10787174850702286, + "learning_rate": 2.8189035887004815e-05, + "loss": 0.0347, + "step": 60040 + }, + { + "epoch": 0.13243411926348544, + "grad_norm": 0.09004750847816467, + "learning_rate": 2.818824821714381e-05, + "loss": 0.0375, + "step": 60050 + }, + { + "epoch": 0.13245617323838363, + "grad_norm": 0.11686185002326965, + "learning_rate": 2.8187460387033136e-05, + "loss": 0.0352, + "step": 60060 + }, + { + "epoch": 0.13247822721328179, + "grad_norm": 0.10992228239774704, + "learning_rate": 2.818667239668237e-05, + "loss": 0.0355, + "step": 60070 + }, + { + "epoch": 0.13250028118817994, + "grad_norm": 0.14071598649024963, + "learning_rate": 2.8185884246101083e-05, + "loss": 0.034, + "step": 60080 + }, + { + "epoch": 0.13252233516307813, + "grad_norm": 0.11485955119132996, + "learning_rate": 2.818509593529885e-05, + "loss": 0.038, + "step": 60090 + }, + { + "epoch": 0.13254438913797628, + "grad_norm": 0.08921290934085846, + "learning_rate": 2.818430746428526e-05, + "loss": 0.0357, + "step": 60100 + }, + { + "epoch": 0.13256644311287444, + "grad_norm": 0.12108512222766876, + "learning_rate": 2.818351883306988e-05, + "loss": 0.0351, + "step": 60110 + }, + { + "epoch": 0.13258849708777262, + "grad_norm": 0.13207213580608368, + "learning_rate": 2.8182730041662297e-05, + "loss": 0.0361, + "step": 60120 + }, + { + "epoch": 0.13261055106267078, + "grad_norm": 0.11862488090991974, + "learning_rate": 2.8181941090072102e-05, + "loss": 0.036, + "step": 60130 + }, + { + "epoch": 0.13263260503756893, + "grad_norm": 0.12528815865516663, + "learning_rate": 2.8181151978308875e-05, + "loss": 0.0339, + "step": 60140 + }, + { + "epoch": 0.13265465901246712, + "grad_norm": 0.09317813068628311, + "learning_rate": 2.818036270638221e-05, + "loss": 0.0358, + "step": 60150 + }, + { + "epoch": 0.13267671298736528, + "grad_norm": 0.1205635666847229, + "learning_rate": 2.817957327430169e-05, + "loss": 0.0378, + "step": 60160 + }, + { + "epoch": 0.13269876696226343, + "grad_norm": 0.10238660126924515, + "learning_rate": 2.817878368207691e-05, + "loss": 0.0362, + "step": 60170 + }, + { + "epoch": 0.13272082093716162, + "grad_norm": 0.10591811686754227, + "learning_rate": 2.8177993929717465e-05, + "loss": 0.0357, + "step": 60180 + }, + { + "epoch": 0.13274287491205977, + "grad_norm": 0.12098699808120728, + "learning_rate": 2.8177204017232956e-05, + "loss": 0.0346, + "step": 60190 + }, + { + "epoch": 0.13276492888695793, + "grad_norm": 0.11089323461055756, + "learning_rate": 2.8176413944632975e-05, + "loss": 0.0353, + "step": 60200 + }, + { + "epoch": 0.1327869828618561, + "grad_norm": 0.12054183334112167, + "learning_rate": 2.8175623711927124e-05, + "loss": 0.0355, + "step": 60210 + }, + { + "epoch": 0.13280903683675427, + "grad_norm": 0.1330317109823227, + "learning_rate": 2.8174833319125006e-05, + "loss": 0.0361, + "step": 60220 + }, + { + "epoch": 0.13283109081165242, + "grad_norm": 0.1436115801334381, + "learning_rate": 2.8174042766236223e-05, + "loss": 0.0375, + "step": 60230 + }, + { + "epoch": 0.1328531447865506, + "grad_norm": 0.09073398262262344, + "learning_rate": 2.8173252053270382e-05, + "loss": 0.0341, + "step": 60240 + }, + { + "epoch": 0.13287519876144877, + "grad_norm": 0.10591087490320206, + "learning_rate": 2.8172461180237086e-05, + "loss": 0.0383, + "step": 60250 + }, + { + "epoch": 0.13289725273634695, + "grad_norm": 0.11084777861833572, + "learning_rate": 2.817167014714596e-05, + "loss": 0.0364, + "step": 60260 + }, + { + "epoch": 0.1329193067112451, + "grad_norm": 0.1282489150762558, + "learning_rate": 2.81708789540066e-05, + "loss": 0.0358, + "step": 60270 + }, + { + "epoch": 0.13294136068614326, + "grad_norm": 0.13964393734931946, + "learning_rate": 2.8170087600828622e-05, + "loss": 0.0368, + "step": 60280 + }, + { + "epoch": 0.13296341466104145, + "grad_norm": 0.12477913498878479, + "learning_rate": 2.816929608762165e-05, + "loss": 0.0352, + "step": 60290 + }, + { + "epoch": 0.1329854686359396, + "grad_norm": 0.12151318043470383, + "learning_rate": 2.8168504414395295e-05, + "loss": 0.0376, + "step": 60300 + }, + { + "epoch": 0.13300752261083776, + "grad_norm": 0.1312258392572403, + "learning_rate": 2.816771258115918e-05, + "loss": 0.0368, + "step": 60310 + }, + { + "epoch": 0.13302957658573594, + "grad_norm": 0.11260340362787247, + "learning_rate": 2.8166920587922924e-05, + "loss": 0.0375, + "step": 60320 + }, + { + "epoch": 0.1330516305606341, + "grad_norm": 0.09018636494874954, + "learning_rate": 2.8166128434696148e-05, + "loss": 0.0338, + "step": 60330 + }, + { + "epoch": 0.13307368453553226, + "grad_norm": 0.09637240320444107, + "learning_rate": 2.8165336121488487e-05, + "loss": 0.0347, + "step": 60340 + }, + { + "epoch": 0.13309573851043044, + "grad_norm": 0.11072123795747757, + "learning_rate": 2.8164543648309554e-05, + "loss": 0.0363, + "step": 60350 + }, + { + "epoch": 0.1331177924853286, + "grad_norm": 0.1461479812860489, + "learning_rate": 2.816375101516899e-05, + "loss": 0.0351, + "step": 60360 + }, + { + "epoch": 0.13313984646022675, + "grad_norm": 0.13350774347782135, + "learning_rate": 2.8162958222076423e-05, + "loss": 0.0346, + "step": 60370 + }, + { + "epoch": 0.13316190043512494, + "grad_norm": 0.12776923179626465, + "learning_rate": 2.816216526904149e-05, + "loss": 0.035, + "step": 60380 + }, + { + "epoch": 0.1331839544100231, + "grad_norm": 0.11451496183872223, + "learning_rate": 2.8161372156073814e-05, + "loss": 0.0355, + "step": 60390 + }, + { + "epoch": 0.13320600838492125, + "grad_norm": 0.107308030128479, + "learning_rate": 2.8160578883183045e-05, + "loss": 0.0355, + "step": 60400 + }, + { + "epoch": 0.13322806235981943, + "grad_norm": 0.14477676153182983, + "learning_rate": 2.8159785450378816e-05, + "loss": 0.0361, + "step": 60410 + }, + { + "epoch": 0.1332501163347176, + "grad_norm": 0.1217120885848999, + "learning_rate": 2.8158991857670768e-05, + "loss": 0.0386, + "step": 60420 + }, + { + "epoch": 0.13327217030961575, + "grad_norm": 0.12247520685195923, + "learning_rate": 2.8158198105068544e-05, + "loss": 0.0363, + "step": 60430 + }, + { + "epoch": 0.13329422428451393, + "grad_norm": 0.10292325168848038, + "learning_rate": 2.8157404192581787e-05, + "loss": 0.0363, + "step": 60440 + }, + { + "epoch": 0.13331627825941209, + "grad_norm": 0.11751733720302582, + "learning_rate": 2.8156610120220153e-05, + "loss": 0.0355, + "step": 60450 + }, + { + "epoch": 0.13333833223431024, + "grad_norm": 0.12375085055828094, + "learning_rate": 2.815581588799328e-05, + "loss": 0.0354, + "step": 60460 + }, + { + "epoch": 0.13336038620920843, + "grad_norm": 0.1126893013715744, + "learning_rate": 2.8155021495910826e-05, + "loss": 0.0379, + "step": 60470 + }, + { + "epoch": 0.13338244018410658, + "grad_norm": 0.10146524012088776, + "learning_rate": 2.8154226943982438e-05, + "loss": 0.0349, + "step": 60480 + }, + { + "epoch": 0.13340449415900474, + "grad_norm": 0.1529056429862976, + "learning_rate": 2.815343223221777e-05, + "loss": 0.0357, + "step": 60490 + }, + { + "epoch": 0.13342654813390292, + "grad_norm": 0.11232606321573257, + "learning_rate": 2.8152637360626486e-05, + "loss": 0.0344, + "step": 60500 + }, + { + "epoch": 0.13344860210880108, + "grad_norm": 0.13638578355312347, + "learning_rate": 2.815184232921824e-05, + "loss": 0.0364, + "step": 60510 + }, + { + "epoch": 0.13347065608369924, + "grad_norm": 0.11486051976680756, + "learning_rate": 2.815104713800269e-05, + "loss": 0.0347, + "step": 60520 + }, + { + "epoch": 0.13349271005859742, + "grad_norm": 0.08825099468231201, + "learning_rate": 2.8150251786989502e-05, + "loss": 0.0367, + "step": 60530 + }, + { + "epoch": 0.13351476403349558, + "grad_norm": 0.12761564552783966, + "learning_rate": 2.814945627618834e-05, + "loss": 0.0345, + "step": 60540 + }, + { + "epoch": 0.13353681800839373, + "grad_norm": 0.11698850244283676, + "learning_rate": 2.8148660605608866e-05, + "loss": 0.0337, + "step": 60550 + }, + { + "epoch": 0.13355887198329192, + "grad_norm": 0.12610378861427307, + "learning_rate": 2.8147864775260752e-05, + "loss": 0.0345, + "step": 60560 + }, + { + "epoch": 0.13358092595819007, + "grad_norm": 0.10927915573120117, + "learning_rate": 2.814706878515367e-05, + "loss": 0.0352, + "step": 60570 + }, + { + "epoch": 0.13360297993308823, + "grad_norm": 0.11694901436567307, + "learning_rate": 2.814627263529729e-05, + "loss": 0.0359, + "step": 60580 + }, + { + "epoch": 0.1336250339079864, + "grad_norm": 0.11617069691419601, + "learning_rate": 2.814547632570128e-05, + "loss": 0.0357, + "step": 60590 + }, + { + "epoch": 0.13364708788288457, + "grad_norm": 0.10220429301261902, + "learning_rate": 2.8144679856375325e-05, + "loss": 0.0357, + "step": 60600 + }, + { + "epoch": 0.13366914185778273, + "grad_norm": 0.1296122968196869, + "learning_rate": 2.8143883227329098e-05, + "loss": 0.035, + "step": 60610 + }, + { + "epoch": 0.1336911958326809, + "grad_norm": 0.11678837984800339, + "learning_rate": 2.8143086438572278e-05, + "loss": 0.0372, + "step": 60620 + }, + { + "epoch": 0.13371324980757907, + "grad_norm": 0.11048542708158493, + "learning_rate": 2.814228949011455e-05, + "loss": 0.035, + "step": 60630 + }, + { + "epoch": 0.13373530378247722, + "grad_norm": 0.13417424261569977, + "learning_rate": 2.8141492381965597e-05, + "loss": 0.0376, + "step": 60640 + }, + { + "epoch": 0.1337573577573754, + "grad_norm": 0.1197262853384018, + "learning_rate": 2.8140695114135107e-05, + "loss": 0.0366, + "step": 60650 + }, + { + "epoch": 0.13377941173227356, + "grad_norm": 0.14577415585517883, + "learning_rate": 2.813989768663276e-05, + "loss": 0.037, + "step": 60660 + }, + { + "epoch": 0.13380146570717172, + "grad_norm": 0.14911732077598572, + "learning_rate": 2.8139100099468254e-05, + "loss": 0.0373, + "step": 60670 + }, + { + "epoch": 0.1338235196820699, + "grad_norm": 0.10719096660614014, + "learning_rate": 2.8138302352651275e-05, + "loss": 0.0368, + "step": 60680 + }, + { + "epoch": 0.13384557365696806, + "grad_norm": 0.09044117480516434, + "learning_rate": 2.813750444619152e-05, + "loss": 0.0364, + "step": 60690 + }, + { + "epoch": 0.13386762763186624, + "grad_norm": 0.09977348893880844, + "learning_rate": 2.8136706380098674e-05, + "loss": 0.0378, + "step": 60700 + }, + { + "epoch": 0.1338896816067644, + "grad_norm": 0.11459968984127045, + "learning_rate": 2.813590815438245e-05, + "loss": 0.0372, + "step": 60710 + }, + { + "epoch": 0.13391173558166256, + "grad_norm": 0.09610356390476227, + "learning_rate": 2.8135109769052535e-05, + "loss": 0.0374, + "step": 60720 + }, + { + "epoch": 0.13393378955656074, + "grad_norm": 0.11666204780340195, + "learning_rate": 2.8134311224118636e-05, + "loss": 0.036, + "step": 60730 + }, + { + "epoch": 0.1339558435314589, + "grad_norm": 0.11875035613775253, + "learning_rate": 2.8133512519590454e-05, + "loss": 0.0362, + "step": 60740 + }, + { + "epoch": 0.13397789750635705, + "grad_norm": 0.10247009247541428, + "learning_rate": 2.81327136554777e-05, + "loss": 0.0365, + "step": 60750 + }, + { + "epoch": 0.13399995148125524, + "grad_norm": 0.10925475507974625, + "learning_rate": 2.8131914631790074e-05, + "loss": 0.0357, + "step": 60760 + }, + { + "epoch": 0.1340220054561534, + "grad_norm": 0.13033495843410492, + "learning_rate": 2.813111544853728e-05, + "loss": 0.038, + "step": 60770 + }, + { + "epoch": 0.13404405943105155, + "grad_norm": 0.1097671389579773, + "learning_rate": 2.813031610572904e-05, + "loss": 0.0386, + "step": 60780 + }, + { + "epoch": 0.13406611340594973, + "grad_norm": 0.14961320161819458, + "learning_rate": 2.8129516603375066e-05, + "loss": 0.04, + "step": 60790 + }, + { + "epoch": 0.1340881673808479, + "grad_norm": 0.10923852771520615, + "learning_rate": 2.812871694148506e-05, + "loss": 0.0345, + "step": 60800 + }, + { + "epoch": 0.13411022135574605, + "grad_norm": 0.12422005087137222, + "learning_rate": 2.8127917120068755e-05, + "loss": 0.0363, + "step": 60810 + }, + { + "epoch": 0.13413227533064423, + "grad_norm": 0.112846739590168, + "learning_rate": 2.8127117139135858e-05, + "loss": 0.0373, + "step": 60820 + }, + { + "epoch": 0.13415432930554239, + "grad_norm": 0.13218308985233307, + "learning_rate": 2.8126316998696096e-05, + "loss": 0.0345, + "step": 60830 + }, + { + "epoch": 0.13417638328044054, + "grad_norm": 0.12332871556282043, + "learning_rate": 2.8125516698759194e-05, + "loss": 0.035, + "step": 60840 + }, + { + "epoch": 0.13419843725533873, + "grad_norm": 0.10519351065158844, + "learning_rate": 2.8124716239334864e-05, + "loss": 0.0345, + "step": 60850 + }, + { + "epoch": 0.13422049123023688, + "grad_norm": 0.13362984359264374, + "learning_rate": 2.812391562043284e-05, + "loss": 0.0352, + "step": 60860 + }, + { + "epoch": 0.13424254520513504, + "grad_norm": 0.12102173268795013, + "learning_rate": 2.8123114842062853e-05, + "loss": 0.0364, + "step": 60870 + }, + { + "epoch": 0.13426459918003322, + "grad_norm": 0.12153875082731247, + "learning_rate": 2.8122313904234633e-05, + "loss": 0.0356, + "step": 60880 + }, + { + "epoch": 0.13428665315493138, + "grad_norm": 0.12768565118312836, + "learning_rate": 2.8121512806957905e-05, + "loss": 0.0354, + "step": 60890 + }, + { + "epoch": 0.13430870712982954, + "grad_norm": 0.09463168680667877, + "learning_rate": 2.8120711550242408e-05, + "loss": 0.0368, + "step": 60900 + }, + { + "epoch": 0.13433076110472772, + "grad_norm": 0.09372004866600037, + "learning_rate": 2.811991013409788e-05, + "loss": 0.0355, + "step": 60910 + }, + { + "epoch": 0.13435281507962588, + "grad_norm": 0.10833602398633957, + "learning_rate": 2.8119108558534056e-05, + "loss": 0.0366, + "step": 60920 + }, + { + "epoch": 0.13437486905452403, + "grad_norm": 0.09164334833621979, + "learning_rate": 2.8118306823560673e-05, + "loss": 0.0353, + "step": 60930 + }, + { + "epoch": 0.13439692302942222, + "grad_norm": 0.12431927025318146, + "learning_rate": 2.8117504929187483e-05, + "loss": 0.0338, + "step": 60940 + }, + { + "epoch": 0.13441897700432037, + "grad_norm": 0.1128590852022171, + "learning_rate": 2.8116702875424218e-05, + "loss": 0.0358, + "step": 60950 + }, + { + "epoch": 0.13444103097921853, + "grad_norm": 0.118962362408638, + "learning_rate": 2.811590066228063e-05, + "loss": 0.0352, + "step": 60960 + }, + { + "epoch": 0.1344630849541167, + "grad_norm": 0.1331261247396469, + "learning_rate": 2.8115098289766467e-05, + "loss": 0.0373, + "step": 60970 + }, + { + "epoch": 0.13448513892901487, + "grad_norm": 0.10647036135196686, + "learning_rate": 2.8114295757891473e-05, + "loss": 0.0369, + "step": 60980 + }, + { + "epoch": 0.13450719290391303, + "grad_norm": 0.0833534225821495, + "learning_rate": 2.8113493066665405e-05, + "loss": 0.0349, + "step": 60990 + }, + { + "epoch": 0.1345292468788112, + "grad_norm": 0.10990983247756958, + "learning_rate": 2.811269021609802e-05, + "loss": 0.0353, + "step": 61000 + }, + { + "epoch": 0.13455130085370937, + "grad_norm": 0.1967262476682663, + "learning_rate": 2.8111887206199063e-05, + "loss": 0.0341, + "step": 61010 + }, + { + "epoch": 0.13457335482860752, + "grad_norm": 0.10190500319004059, + "learning_rate": 2.81110840369783e-05, + "loss": 0.0335, + "step": 61020 + }, + { + "epoch": 0.1345954088035057, + "grad_norm": 0.10964377969503403, + "learning_rate": 2.8110280708445485e-05, + "loss": 0.0337, + "step": 61030 + }, + { + "epoch": 0.13461746277840386, + "grad_norm": 0.09368694573640823, + "learning_rate": 2.8109477220610385e-05, + "loss": 0.0345, + "step": 61040 + }, + { + "epoch": 0.13463951675330202, + "grad_norm": 0.09792482107877731, + "learning_rate": 2.8108673573482753e-05, + "loss": 0.0349, + "step": 61050 + }, + { + "epoch": 0.1346615707282002, + "grad_norm": 0.12309711426496506, + "learning_rate": 2.8107869767072364e-05, + "loss": 0.0364, + "step": 61060 + }, + { + "epoch": 0.13468362470309836, + "grad_norm": 0.11903275549411774, + "learning_rate": 2.810706580138898e-05, + "loss": 0.0367, + "step": 61070 + }, + { + "epoch": 0.13470567867799652, + "grad_norm": 0.11483044177293777, + "learning_rate": 2.8106261676442374e-05, + "loss": 0.0369, + "step": 61080 + }, + { + "epoch": 0.1347277326528947, + "grad_norm": 0.11257023364305496, + "learning_rate": 2.8105457392242313e-05, + "loss": 0.0349, + "step": 61090 + }, + { + "epoch": 0.13474978662779286, + "grad_norm": 0.10331299901008606, + "learning_rate": 2.8104652948798573e-05, + "loss": 0.0354, + "step": 61100 + }, + { + "epoch": 0.13477184060269104, + "grad_norm": 0.10478783398866653, + "learning_rate": 2.8103848346120923e-05, + "loss": 0.0349, + "step": 61110 + }, + { + "epoch": 0.1347938945775892, + "grad_norm": 0.11894050240516663, + "learning_rate": 2.8103043584219145e-05, + "loss": 0.0351, + "step": 61120 + }, + { + "epoch": 0.13481594855248735, + "grad_norm": 0.09570752084255219, + "learning_rate": 2.8102238663103016e-05, + "loss": 0.0359, + "step": 61130 + }, + { + "epoch": 0.13483800252738554, + "grad_norm": 0.09808364510536194, + "learning_rate": 2.810143358278232e-05, + "loss": 0.0338, + "step": 61140 + }, + { + "epoch": 0.1348600565022837, + "grad_norm": 0.09010683000087738, + "learning_rate": 2.810062834326683e-05, + "loss": 0.0345, + "step": 61150 + }, + { + "epoch": 0.13488211047718185, + "grad_norm": 0.13197043538093567, + "learning_rate": 2.8099822944566343e-05, + "loss": 0.035, + "step": 61160 + }, + { + "epoch": 0.13490416445208003, + "grad_norm": 0.1095089539885521, + "learning_rate": 2.8099017386690635e-05, + "loss": 0.0373, + "step": 61170 + }, + { + "epoch": 0.1349262184269782, + "grad_norm": 0.09648169577121735, + "learning_rate": 2.8098211669649498e-05, + "loss": 0.0375, + "step": 61180 + }, + { + "epoch": 0.13494827240187635, + "grad_norm": 0.10039477050304413, + "learning_rate": 2.8097405793452723e-05, + "loss": 0.035, + "step": 61190 + }, + { + "epoch": 0.13497032637677453, + "grad_norm": 0.10957455635070801, + "learning_rate": 2.8096599758110102e-05, + "loss": 0.0363, + "step": 61200 + }, + { + "epoch": 0.1349923803516727, + "grad_norm": 0.13618715107440948, + "learning_rate": 2.809579356363143e-05, + "loss": 0.0367, + "step": 61210 + }, + { + "epoch": 0.13501443432657084, + "grad_norm": 0.11307843774557114, + "learning_rate": 2.80949872100265e-05, + "loss": 0.0382, + "step": 61220 + }, + { + "epoch": 0.13503648830146903, + "grad_norm": 0.110359787940979, + "learning_rate": 2.809418069730511e-05, + "loss": 0.035, + "step": 61230 + }, + { + "epoch": 0.13505854227636718, + "grad_norm": 0.0999787449836731, + "learning_rate": 2.8093374025477067e-05, + "loss": 0.0353, + "step": 61240 + }, + { + "epoch": 0.13508059625126534, + "grad_norm": 0.08830071240663528, + "learning_rate": 2.8092567194552164e-05, + "loss": 0.0353, + "step": 61250 + }, + { + "epoch": 0.13510265022616352, + "grad_norm": 0.11238876730203629, + "learning_rate": 2.809176020454021e-05, + "loss": 0.035, + "step": 61260 + }, + { + "epoch": 0.13512470420106168, + "grad_norm": 0.10409335047006607, + "learning_rate": 2.8090953055451002e-05, + "loss": 0.0359, + "step": 61270 + }, + { + "epoch": 0.13514675817595984, + "grad_norm": 0.08893316984176636, + "learning_rate": 2.8090145747294364e-05, + "loss": 0.0347, + "step": 61280 + }, + { + "epoch": 0.13516881215085802, + "grad_norm": 0.0983731597661972, + "learning_rate": 2.8089338280080093e-05, + "loss": 0.0352, + "step": 61290 + }, + { + "epoch": 0.13519086612575618, + "grad_norm": 0.12762944400310516, + "learning_rate": 2.8088530653818e-05, + "loss": 0.0355, + "step": 61300 + }, + { + "epoch": 0.13521292010065433, + "grad_norm": 0.1379075050354004, + "learning_rate": 2.8087722868517903e-05, + "loss": 0.0379, + "step": 61310 + }, + { + "epoch": 0.13523497407555252, + "grad_norm": 0.09523740410804749, + "learning_rate": 2.8086914924189618e-05, + "loss": 0.034, + "step": 61320 + }, + { + "epoch": 0.13525702805045067, + "grad_norm": 0.1314850151538849, + "learning_rate": 2.808610682084296e-05, + "loss": 0.0349, + "step": 61330 + }, + { + "epoch": 0.13527908202534883, + "grad_norm": 0.10906937718391418, + "learning_rate": 2.808529855848775e-05, + "loss": 0.0344, + "step": 61340 + }, + { + "epoch": 0.135301136000247, + "grad_norm": 0.08719096332788467, + "learning_rate": 2.8084490137133805e-05, + "loss": 0.0369, + "step": 61350 + }, + { + "epoch": 0.13532318997514517, + "grad_norm": 0.11044686287641525, + "learning_rate": 2.8083681556790953e-05, + "loss": 0.0356, + "step": 61360 + }, + { + "epoch": 0.13534524395004333, + "grad_norm": 0.11666819453239441, + "learning_rate": 2.808287281746902e-05, + "loss": 0.0337, + "step": 61370 + }, + { + "epoch": 0.1353672979249415, + "grad_norm": 0.11523609608411789, + "learning_rate": 2.8082063919177822e-05, + "loss": 0.0363, + "step": 61380 + }, + { + "epoch": 0.13538935189983967, + "grad_norm": 0.10757719725370407, + "learning_rate": 2.80812548619272e-05, + "loss": 0.0356, + "step": 61390 + }, + { + "epoch": 0.13541140587473782, + "grad_norm": 0.1234234943985939, + "learning_rate": 2.8080445645726985e-05, + "loss": 0.0357, + "step": 61400 + }, + { + "epoch": 0.135433459849636, + "grad_norm": 0.10273149609565735, + "learning_rate": 2.8079636270587e-05, + "loss": 0.0357, + "step": 61410 + }, + { + "epoch": 0.13545551382453416, + "grad_norm": 0.11341648548841476, + "learning_rate": 2.8078826736517083e-05, + "loss": 0.0344, + "step": 61420 + }, + { + "epoch": 0.13547756779943232, + "grad_norm": 0.1056375652551651, + "learning_rate": 2.8078017043527083e-05, + "loss": 0.0366, + "step": 61430 + }, + { + "epoch": 0.1354996217743305, + "grad_norm": 0.12268470972776413, + "learning_rate": 2.807720719162682e-05, + "loss": 0.0351, + "step": 61440 + }, + { + "epoch": 0.13552167574922866, + "grad_norm": 0.10751069337129593, + "learning_rate": 2.8076397180826145e-05, + "loss": 0.0327, + "step": 61450 + }, + { + "epoch": 0.13554372972412682, + "grad_norm": 0.12505744397640228, + "learning_rate": 2.8075587011134896e-05, + "loss": 0.0365, + "step": 61460 + }, + { + "epoch": 0.135565783699025, + "grad_norm": 0.11880569159984589, + "learning_rate": 2.8074776682562922e-05, + "loss": 0.0366, + "step": 61470 + }, + { + "epoch": 0.13558783767392316, + "grad_norm": 0.10812395066022873, + "learning_rate": 2.8073966195120063e-05, + "loss": 0.0371, + "step": 61480 + }, + { + "epoch": 0.1356098916488213, + "grad_norm": 0.14999085664749146, + "learning_rate": 2.8073155548816175e-05, + "loss": 0.0357, + "step": 61490 + }, + { + "epoch": 0.1356319456237195, + "grad_norm": 0.1275872141122818, + "learning_rate": 2.8072344743661103e-05, + "loss": 0.0345, + "step": 61500 + }, + { + "epoch": 0.13565399959861765, + "grad_norm": 0.1309577226638794, + "learning_rate": 2.8071533779664698e-05, + "loss": 0.0377, + "step": 61510 + }, + { + "epoch": 0.1356760535735158, + "grad_norm": 0.11265879124403, + "learning_rate": 2.8070722656836817e-05, + "loss": 0.0355, + "step": 61520 + }, + { + "epoch": 0.135698107548414, + "grad_norm": 0.07922030985355377, + "learning_rate": 2.806991137518732e-05, + "loss": 0.0345, + "step": 61530 + }, + { + "epoch": 0.13572016152331215, + "grad_norm": 0.10034073144197464, + "learning_rate": 2.8069099934726054e-05, + "loss": 0.0351, + "step": 61540 + }, + { + "epoch": 0.13574221549821033, + "grad_norm": 0.1201295256614685, + "learning_rate": 2.8068288335462887e-05, + "loss": 0.0351, + "step": 61550 + }, + { + "epoch": 0.1357642694731085, + "grad_norm": 0.11267822235822678, + "learning_rate": 2.8067476577407676e-05, + "loss": 0.0355, + "step": 61560 + }, + { + "epoch": 0.13578632344800665, + "grad_norm": 0.10630660504102707, + "learning_rate": 2.8066664660570287e-05, + "loss": 0.0368, + "step": 61570 + }, + { + "epoch": 0.13580837742290483, + "grad_norm": 0.12542776763439178, + "learning_rate": 2.8065852584960585e-05, + "loss": 0.0372, + "step": 61580 + }, + { + "epoch": 0.135830431397803, + "grad_norm": 0.13810734450817108, + "learning_rate": 2.8065040350588437e-05, + "loss": 0.0336, + "step": 61590 + }, + { + "epoch": 0.13585248537270114, + "grad_norm": 0.11876502633094788, + "learning_rate": 2.806422795746372e-05, + "loss": 0.0353, + "step": 61600 + }, + { + "epoch": 0.13587453934759933, + "grad_norm": 0.11478035897016525, + "learning_rate": 2.806341540559629e-05, + "loss": 0.0369, + "step": 61610 + }, + { + "epoch": 0.13589659332249748, + "grad_norm": 0.10045893490314484, + "learning_rate": 2.8062602694996036e-05, + "loss": 0.0333, + "step": 61620 + }, + { + "epoch": 0.13591864729739564, + "grad_norm": 0.1093566045165062, + "learning_rate": 2.806178982567282e-05, + "loss": 0.0349, + "step": 61630 + }, + { + "epoch": 0.13594070127229382, + "grad_norm": 0.10527048259973526, + "learning_rate": 2.806097679763653e-05, + "loss": 0.0356, + "step": 61640 + }, + { + "epoch": 0.13596275524719198, + "grad_norm": 0.15692944824695587, + "learning_rate": 2.806016361089704e-05, + "loss": 0.0363, + "step": 61650 + }, + { + "epoch": 0.13598480922209014, + "grad_norm": 0.0957830622792244, + "learning_rate": 2.8059350265464226e-05, + "loss": 0.0349, + "step": 61660 + }, + { + "epoch": 0.13600686319698832, + "grad_norm": 0.11252623051404953, + "learning_rate": 2.8058536761347983e-05, + "loss": 0.0366, + "step": 61670 + }, + { + "epoch": 0.13602891717188648, + "grad_norm": 0.11144948750734329, + "learning_rate": 2.8057723098558186e-05, + "loss": 0.0366, + "step": 61680 + }, + { + "epoch": 0.13605097114678463, + "grad_norm": 0.1280878633260727, + "learning_rate": 2.8056909277104725e-05, + "loss": 0.0358, + "step": 61690 + }, + { + "epoch": 0.13607302512168282, + "grad_norm": 0.09423200786113739, + "learning_rate": 2.8056095296997487e-05, + "loss": 0.0338, + "step": 61700 + }, + { + "epoch": 0.13609507909658097, + "grad_norm": 0.130771666765213, + "learning_rate": 2.8055281158246365e-05, + "loss": 0.0353, + "step": 61710 + }, + { + "epoch": 0.13611713307147913, + "grad_norm": 0.11251963675022125, + "learning_rate": 2.8054466860861252e-05, + "loss": 0.0349, + "step": 61720 + }, + { + "epoch": 0.1361391870463773, + "grad_norm": 0.10532992333173752, + "learning_rate": 2.805365240485204e-05, + "loss": 0.034, + "step": 61730 + }, + { + "epoch": 0.13616124102127547, + "grad_norm": 0.10162556171417236, + "learning_rate": 2.8052837790228625e-05, + "loss": 0.0353, + "step": 61740 + }, + { + "epoch": 0.13618329499617363, + "grad_norm": 0.0831604078412056, + "learning_rate": 2.805202301700091e-05, + "loss": 0.0348, + "step": 61750 + }, + { + "epoch": 0.1362053489710718, + "grad_norm": 0.09058237075805664, + "learning_rate": 2.805120808517879e-05, + "loss": 0.0364, + "step": 61760 + }, + { + "epoch": 0.13622740294596997, + "grad_norm": 0.09828968346118927, + "learning_rate": 2.8050392994772173e-05, + "loss": 0.035, + "step": 61770 + }, + { + "epoch": 0.13624945692086812, + "grad_norm": 0.09476020932197571, + "learning_rate": 2.804957774579096e-05, + "loss": 0.0355, + "step": 61780 + }, + { + "epoch": 0.1362715108957663, + "grad_norm": 0.10190683603286743, + "learning_rate": 2.804876233824505e-05, + "loss": 0.0354, + "step": 61790 + }, + { + "epoch": 0.13629356487066446, + "grad_norm": 0.14051292836666107, + "learning_rate": 2.804794677214436e-05, + "loss": 0.0364, + "step": 61800 + }, + { + "epoch": 0.13631561884556262, + "grad_norm": 0.10897573828697205, + "learning_rate": 2.80471310474988e-05, + "loss": 0.0367, + "step": 61810 + }, + { + "epoch": 0.1363376728204608, + "grad_norm": 0.12813420593738556, + "learning_rate": 2.804631516431828e-05, + "loss": 0.0354, + "step": 61820 + }, + { + "epoch": 0.13635972679535896, + "grad_norm": 0.12811163067817688, + "learning_rate": 2.804549912261271e-05, + "loss": 0.0358, + "step": 61830 + }, + { + "epoch": 0.13638178077025712, + "grad_norm": 0.11490677297115326, + "learning_rate": 2.8044682922392005e-05, + "loss": 0.0335, + "step": 61840 + }, + { + "epoch": 0.1364038347451553, + "grad_norm": 0.12168512493371964, + "learning_rate": 2.8043866563666092e-05, + "loss": 0.0364, + "step": 61850 + }, + { + "epoch": 0.13642588872005346, + "grad_norm": 0.11417748779058456, + "learning_rate": 2.8043050046444886e-05, + "loss": 0.0338, + "step": 61860 + }, + { + "epoch": 0.1364479426949516, + "grad_norm": 0.10030084103345871, + "learning_rate": 2.8042233370738302e-05, + "loss": 0.0352, + "step": 61870 + }, + { + "epoch": 0.1364699966698498, + "grad_norm": 0.1356816440820694, + "learning_rate": 2.804141653655627e-05, + "loss": 0.0349, + "step": 61880 + }, + { + "epoch": 0.13649205064474795, + "grad_norm": 0.11700090020895004, + "learning_rate": 2.804059954390872e-05, + "loss": 0.0367, + "step": 61890 + }, + { + "epoch": 0.1365141046196461, + "grad_norm": 0.10865292698144913, + "learning_rate": 2.8039782392805566e-05, + "loss": 0.0353, + "step": 61900 + }, + { + "epoch": 0.1365361585945443, + "grad_norm": 0.11482524871826172, + "learning_rate": 2.8038965083256747e-05, + "loss": 0.0361, + "step": 61910 + }, + { + "epoch": 0.13655821256944245, + "grad_norm": 0.12762343883514404, + "learning_rate": 2.803814761527219e-05, + "loss": 0.0363, + "step": 61920 + }, + { + "epoch": 0.1365802665443406, + "grad_norm": 0.1160278245806694, + "learning_rate": 2.8037329988861834e-05, + "loss": 0.0354, + "step": 61930 + }, + { + "epoch": 0.1366023205192388, + "grad_norm": 0.09557054936885834, + "learning_rate": 2.80365122040356e-05, + "loss": 0.0363, + "step": 61940 + }, + { + "epoch": 0.13662437449413695, + "grad_norm": 0.10700196772813797, + "learning_rate": 2.8035694260803444e-05, + "loss": 0.0368, + "step": 61950 + }, + { + "epoch": 0.1366464284690351, + "grad_norm": 0.08886757493019104, + "learning_rate": 2.803487615917529e-05, + "loss": 0.034, + "step": 61960 + }, + { + "epoch": 0.1366684824439333, + "grad_norm": 0.12106812000274658, + "learning_rate": 2.8034057899161085e-05, + "loss": 0.0365, + "step": 61970 + }, + { + "epoch": 0.13669053641883144, + "grad_norm": 0.12227863818407059, + "learning_rate": 2.8033239480770772e-05, + "loss": 0.0348, + "step": 61980 + }, + { + "epoch": 0.13671259039372963, + "grad_norm": 0.10595846176147461, + "learning_rate": 2.8032420904014294e-05, + "loss": 0.0358, + "step": 61990 + }, + { + "epoch": 0.13673464436862778, + "grad_norm": 0.10755718499422073, + "learning_rate": 2.803160216890159e-05, + "loss": 0.036, + "step": 62000 + }, + { + "epoch": 0.13675669834352594, + "grad_norm": 0.12869276106357574, + "learning_rate": 2.8030783275442625e-05, + "loss": 0.0376, + "step": 62010 + }, + { + "epoch": 0.13677875231842412, + "grad_norm": 0.12166564911603928, + "learning_rate": 2.8029964223647336e-05, + "loss": 0.0353, + "step": 62020 + }, + { + "epoch": 0.13680080629332228, + "grad_norm": 0.11712963134050369, + "learning_rate": 2.802914501352568e-05, + "loss": 0.0341, + "step": 62030 + }, + { + "epoch": 0.13682286026822044, + "grad_norm": 0.10372945666313171, + "learning_rate": 2.802832564508761e-05, + "loss": 0.0347, + "step": 62040 + }, + { + "epoch": 0.13684491424311862, + "grad_norm": 0.09548506140708923, + "learning_rate": 2.8027506118343085e-05, + "loss": 0.0356, + "step": 62050 + }, + { + "epoch": 0.13686696821801678, + "grad_norm": 0.11363567411899567, + "learning_rate": 2.8026686433302058e-05, + "loss": 0.0351, + "step": 62060 + }, + { + "epoch": 0.13688902219291493, + "grad_norm": 0.1216769739985466, + "learning_rate": 2.8025866589974493e-05, + "loss": 0.0364, + "step": 62070 + }, + { + "epoch": 0.13691107616781312, + "grad_norm": 0.10611297935247421, + "learning_rate": 2.8025046588370345e-05, + "loss": 0.0355, + "step": 62080 + }, + { + "epoch": 0.13693313014271127, + "grad_norm": 0.11804179847240448, + "learning_rate": 2.802422642849959e-05, + "loss": 0.0376, + "step": 62090 + }, + { + "epoch": 0.13695518411760943, + "grad_norm": 0.10596057772636414, + "learning_rate": 2.8023406110372188e-05, + "loss": 0.0347, + "step": 62100 + }, + { + "epoch": 0.1369772380925076, + "grad_norm": 0.11134763062000275, + "learning_rate": 2.80225856339981e-05, + "loss": 0.0371, + "step": 62110 + }, + { + "epoch": 0.13699929206740577, + "grad_norm": 0.10773550719022751, + "learning_rate": 2.80217649993873e-05, + "loss": 0.0345, + "step": 62120 + }, + { + "epoch": 0.13702134604230393, + "grad_norm": 0.12269312888383865, + "learning_rate": 2.8020944206549765e-05, + "loss": 0.0354, + "step": 62130 + }, + { + "epoch": 0.1370434000172021, + "grad_norm": 0.11271357536315918, + "learning_rate": 2.802012325549547e-05, + "loss": 0.034, + "step": 62140 + }, + { + "epoch": 0.13706545399210027, + "grad_norm": 0.1298818290233612, + "learning_rate": 2.8019302146234374e-05, + "loss": 0.0365, + "step": 62150 + }, + { + "epoch": 0.13708750796699842, + "grad_norm": 0.13585056364536285, + "learning_rate": 2.801848087877647e-05, + "loss": 0.0371, + "step": 62160 + }, + { + "epoch": 0.1371095619418966, + "grad_norm": 0.14326557517051697, + "learning_rate": 2.8017659453131726e-05, + "loss": 0.0345, + "step": 62170 + }, + { + "epoch": 0.13713161591679476, + "grad_norm": 0.11183440685272217, + "learning_rate": 2.8016837869310134e-05, + "loss": 0.0344, + "step": 62180 + }, + { + "epoch": 0.13715366989169292, + "grad_norm": 0.11979501694440842, + "learning_rate": 2.8016016127321674e-05, + "loss": 0.0345, + "step": 62190 + }, + { + "epoch": 0.1371757238665911, + "grad_norm": 0.10672131180763245, + "learning_rate": 2.8015194227176326e-05, + "loss": 0.0368, + "step": 62200 + }, + { + "epoch": 0.13719777784148926, + "grad_norm": 0.12041573971509933, + "learning_rate": 2.801437216888408e-05, + "loss": 0.0339, + "step": 62210 + }, + { + "epoch": 0.13721983181638742, + "grad_norm": 0.1192198246717453, + "learning_rate": 2.8013549952454925e-05, + "loss": 0.0344, + "step": 62220 + }, + { + "epoch": 0.1372418857912856, + "grad_norm": 0.13555915653705597, + "learning_rate": 2.8012727577898848e-05, + "loss": 0.0348, + "step": 62230 + }, + { + "epoch": 0.13726393976618376, + "grad_norm": 0.09702325612306595, + "learning_rate": 2.801190504522585e-05, + "loss": 0.0345, + "step": 62240 + }, + { + "epoch": 0.1372859937410819, + "grad_norm": 0.1399402767419815, + "learning_rate": 2.8011082354445915e-05, + "loss": 0.0373, + "step": 62250 + }, + { + "epoch": 0.1373080477159801, + "grad_norm": 0.11293522268533707, + "learning_rate": 2.801025950556905e-05, + "loss": 0.0349, + "step": 62260 + }, + { + "epoch": 0.13733010169087825, + "grad_norm": 0.10097945481538773, + "learning_rate": 2.8009436498605248e-05, + "loss": 0.036, + "step": 62270 + }, + { + "epoch": 0.1373521556657764, + "grad_norm": 0.10867034643888474, + "learning_rate": 2.8008613333564506e-05, + "loss": 0.0363, + "step": 62280 + }, + { + "epoch": 0.1373742096406746, + "grad_norm": 0.1336282640695572, + "learning_rate": 2.8007790010456834e-05, + "loss": 0.035, + "step": 62290 + }, + { + "epoch": 0.13739626361557275, + "grad_norm": 0.1286008208990097, + "learning_rate": 2.800696652929223e-05, + "loss": 0.0357, + "step": 62300 + }, + { + "epoch": 0.1374183175904709, + "grad_norm": 0.14178426563739777, + "learning_rate": 2.80061428900807e-05, + "loss": 0.035, + "step": 62310 + }, + { + "epoch": 0.1374403715653691, + "grad_norm": 0.12067048996686935, + "learning_rate": 2.800531909283226e-05, + "loss": 0.0352, + "step": 62320 + }, + { + "epoch": 0.13746242554026725, + "grad_norm": 0.14712761342525482, + "learning_rate": 2.8004495137556906e-05, + "loss": 0.0347, + "step": 62330 + }, + { + "epoch": 0.1374844795151654, + "grad_norm": 0.13841643929481506, + "learning_rate": 2.800367102426466e-05, + "loss": 0.0364, + "step": 62340 + }, + { + "epoch": 0.1375065334900636, + "grad_norm": 0.13403496146202087, + "learning_rate": 2.8002846752965535e-05, + "loss": 0.0363, + "step": 62350 + }, + { + "epoch": 0.13752858746496174, + "grad_norm": 0.10780541598796844, + "learning_rate": 2.800202232366955e-05, + "loss": 0.0356, + "step": 62360 + }, + { + "epoch": 0.1375506414398599, + "grad_norm": 0.11873313039541245, + "learning_rate": 2.800119773638671e-05, + "loss": 0.0357, + "step": 62370 + }, + { + "epoch": 0.13757269541475808, + "grad_norm": 0.14402441680431366, + "learning_rate": 2.8000372991127045e-05, + "loss": 0.036, + "step": 62380 + }, + { + "epoch": 0.13759474938965624, + "grad_norm": 0.12760969996452332, + "learning_rate": 2.7999548087900573e-05, + "loss": 0.0361, + "step": 62390 + }, + { + "epoch": 0.13761680336455442, + "grad_norm": 0.12672050297260284, + "learning_rate": 2.799872302671732e-05, + "loss": 0.0378, + "step": 62400 + }, + { + "epoch": 0.13763885733945258, + "grad_norm": 0.1178295835852623, + "learning_rate": 2.799789780758731e-05, + "loss": 0.034, + "step": 62410 + }, + { + "epoch": 0.13766091131435074, + "grad_norm": 0.13577672839164734, + "learning_rate": 2.799707243052057e-05, + "loss": 0.0369, + "step": 62420 + }, + { + "epoch": 0.13768296528924892, + "grad_norm": 0.12987004220485687, + "learning_rate": 2.7996246895527126e-05, + "loss": 0.0357, + "step": 62430 + }, + { + "epoch": 0.13770501926414708, + "grad_norm": 0.1255032867193222, + "learning_rate": 2.7995421202617014e-05, + "loss": 0.0347, + "step": 62440 + }, + { + "epoch": 0.13772707323904523, + "grad_norm": 0.13088002800941467, + "learning_rate": 2.799459535180026e-05, + "loss": 0.0362, + "step": 62450 + }, + { + "epoch": 0.13774912721394342, + "grad_norm": 0.10319416224956512, + "learning_rate": 2.7993769343086905e-05, + "loss": 0.0361, + "step": 62460 + }, + { + "epoch": 0.13777118118884157, + "grad_norm": 0.10188907384872437, + "learning_rate": 2.7992943176486987e-05, + "loss": 0.0341, + "step": 62470 + }, + { + "epoch": 0.13779323516373973, + "grad_norm": 0.1123296469449997, + "learning_rate": 2.7992116852010543e-05, + "loss": 0.0342, + "step": 62480 + }, + { + "epoch": 0.13781528913863791, + "grad_norm": 0.11548826098442078, + "learning_rate": 2.799129036966761e-05, + "loss": 0.035, + "step": 62490 + }, + { + "epoch": 0.13783734311353607, + "grad_norm": 0.11524403095245361, + "learning_rate": 2.7990463729468235e-05, + "loss": 0.035, + "step": 62500 + }, + { + "epoch": 0.13785939708843423, + "grad_norm": 0.10047467052936554, + "learning_rate": 2.798963693142246e-05, + "loss": 0.0339, + "step": 62510 + }, + { + "epoch": 0.1378814510633324, + "grad_norm": 0.10921045392751694, + "learning_rate": 2.798880997554033e-05, + "loss": 0.0351, + "step": 62520 + }, + { + "epoch": 0.13790350503823057, + "grad_norm": 0.13048484921455383, + "learning_rate": 2.7987982861831898e-05, + "loss": 0.0359, + "step": 62530 + }, + { + "epoch": 0.13792555901312872, + "grad_norm": 0.11377611011266708, + "learning_rate": 2.7987155590307214e-05, + "loss": 0.0351, + "step": 62540 + }, + { + "epoch": 0.1379476129880269, + "grad_norm": 0.11262751370668411, + "learning_rate": 2.798632816097632e-05, + "loss": 0.0367, + "step": 62550 + }, + { + "epoch": 0.13796966696292506, + "grad_norm": 0.12568722665309906, + "learning_rate": 2.798550057384929e-05, + "loss": 0.0357, + "step": 62560 + }, + { + "epoch": 0.13799172093782322, + "grad_norm": 0.15131846070289612, + "learning_rate": 2.7984672828936156e-05, + "loss": 0.0381, + "step": 62570 + }, + { + "epoch": 0.1380137749127214, + "grad_norm": 0.09026581794023514, + "learning_rate": 2.7983844926246992e-05, + "loss": 0.0359, + "step": 62580 + }, + { + "epoch": 0.13803582888761956, + "grad_norm": 0.11350180208683014, + "learning_rate": 2.7983016865791854e-05, + "loss": 0.0343, + "step": 62590 + }, + { + "epoch": 0.13805788286251772, + "grad_norm": 0.1276967078447342, + "learning_rate": 2.79821886475808e-05, + "loss": 0.0378, + "step": 62600 + }, + { + "epoch": 0.1380799368374159, + "grad_norm": 0.12571406364440918, + "learning_rate": 2.7981360271623896e-05, + "loss": 0.0368, + "step": 62610 + }, + { + "epoch": 0.13810199081231406, + "grad_norm": 0.10204555094242096, + "learning_rate": 2.7980531737931213e-05, + "loss": 0.0352, + "step": 62620 + }, + { + "epoch": 0.1381240447872122, + "grad_norm": 0.0972917452454567, + "learning_rate": 2.797970304651281e-05, + "loss": 0.034, + "step": 62630 + }, + { + "epoch": 0.1381460987621104, + "grad_norm": 0.1257554590702057, + "learning_rate": 2.7978874197378765e-05, + "loss": 0.0349, + "step": 62640 + }, + { + "epoch": 0.13816815273700855, + "grad_norm": 0.10361342132091522, + "learning_rate": 2.7978045190539142e-05, + "loss": 0.0353, + "step": 62650 + }, + { + "epoch": 0.1381902067119067, + "grad_norm": 0.12130022048950195, + "learning_rate": 2.797721602600401e-05, + "loss": 0.0354, + "step": 62660 + }, + { + "epoch": 0.1382122606868049, + "grad_norm": 0.08594454079866409, + "learning_rate": 2.7976386703783457e-05, + "loss": 0.0363, + "step": 62670 + }, + { + "epoch": 0.13823431466170305, + "grad_norm": 0.09868957847356796, + "learning_rate": 2.7975557223887555e-05, + "loss": 0.0347, + "step": 62680 + }, + { + "epoch": 0.1382563686366012, + "grad_norm": 0.09038028866052628, + "learning_rate": 2.797472758632638e-05, + "loss": 0.0349, + "step": 62690 + }, + { + "epoch": 0.1382784226114994, + "grad_norm": 0.11595848947763443, + "learning_rate": 2.7973897791110014e-05, + "loss": 0.0348, + "step": 62700 + }, + { + "epoch": 0.13830047658639755, + "grad_norm": 0.12947845458984375, + "learning_rate": 2.7973067838248543e-05, + "loss": 0.0343, + "step": 62710 + }, + { + "epoch": 0.1383225305612957, + "grad_norm": 0.1021561324596405, + "learning_rate": 2.7972237727752047e-05, + "loss": 0.0354, + "step": 62720 + }, + { + "epoch": 0.1383445845361939, + "grad_norm": 0.10552036017179489, + "learning_rate": 2.7971407459630615e-05, + "loss": 0.0377, + "step": 62730 + }, + { + "epoch": 0.13836663851109204, + "grad_norm": 0.12558388710021973, + "learning_rate": 2.7970577033894335e-05, + "loss": 0.0343, + "step": 62740 + }, + { + "epoch": 0.1383886924859902, + "grad_norm": 0.075917549431324, + "learning_rate": 2.79697464505533e-05, + "loss": 0.0324, + "step": 62750 + }, + { + "epoch": 0.13841074646088838, + "grad_norm": 0.08743351697921753, + "learning_rate": 2.7968915709617596e-05, + "loss": 0.0334, + "step": 62760 + }, + { + "epoch": 0.13843280043578654, + "grad_norm": 0.12723694741725922, + "learning_rate": 2.7968084811097324e-05, + "loss": 0.0387, + "step": 62770 + }, + { + "epoch": 0.1384548544106847, + "grad_norm": 0.1253923773765564, + "learning_rate": 2.7967253755002577e-05, + "loss": 0.0367, + "step": 62780 + }, + { + "epoch": 0.13847690838558288, + "grad_norm": 0.10387177020311356, + "learning_rate": 2.7966422541343457e-05, + "loss": 0.0361, + "step": 62790 + }, + { + "epoch": 0.13849896236048104, + "grad_norm": 0.09324585646390915, + "learning_rate": 2.7965591170130058e-05, + "loss": 0.0386, + "step": 62800 + }, + { + "epoch": 0.1385210163353792, + "grad_norm": 0.1231488585472107, + "learning_rate": 2.7964759641372485e-05, + "loss": 0.0352, + "step": 62810 + }, + { + "epoch": 0.13854307031027738, + "grad_norm": 0.14865387976169586, + "learning_rate": 2.796392795508084e-05, + "loss": 0.0378, + "step": 62820 + }, + { + "epoch": 0.13856512428517553, + "grad_norm": 0.11921441555023193, + "learning_rate": 2.796309611126523e-05, + "loss": 0.0369, + "step": 62830 + }, + { + "epoch": 0.13858717826007372, + "grad_norm": 0.12149559706449509, + "learning_rate": 2.7962264109935766e-05, + "loss": 0.0338, + "step": 62840 + }, + { + "epoch": 0.13860923223497187, + "grad_norm": 0.1079297810792923, + "learning_rate": 2.796143195110255e-05, + "loss": 0.0354, + "step": 62850 + }, + { + "epoch": 0.13863128620987003, + "grad_norm": 0.1115870401263237, + "learning_rate": 2.7960599634775705e-05, + "loss": 0.0358, + "step": 62860 + }, + { + "epoch": 0.13865334018476821, + "grad_norm": 0.11086440831422806, + "learning_rate": 2.7959767160965333e-05, + "loss": 0.0343, + "step": 62870 + }, + { + "epoch": 0.13867539415966637, + "grad_norm": 0.10821733623743057, + "learning_rate": 2.7958934529681554e-05, + "loss": 0.0361, + "step": 62880 + }, + { + "epoch": 0.13869744813456453, + "grad_norm": 0.11481864750385284, + "learning_rate": 2.795810174093449e-05, + "loss": 0.0356, + "step": 62890 + }, + { + "epoch": 0.1387195021094627, + "grad_norm": 0.13161541521549225, + "learning_rate": 2.795726879473425e-05, + "loss": 0.0375, + "step": 62900 + }, + { + "epoch": 0.13874155608436087, + "grad_norm": 0.12826187908649445, + "learning_rate": 2.7956435691090964e-05, + "loss": 0.0365, + "step": 62910 + }, + { + "epoch": 0.13876361005925902, + "grad_norm": 0.13026122748851776, + "learning_rate": 2.7955602430014747e-05, + "loss": 0.0341, + "step": 62920 + }, + { + "epoch": 0.1387856640341572, + "grad_norm": 0.13962508738040924, + "learning_rate": 2.7954769011515732e-05, + "loss": 0.0385, + "step": 62930 + }, + { + "epoch": 0.13880771800905536, + "grad_norm": 0.1511842906475067, + "learning_rate": 2.7953935435604043e-05, + "loss": 0.0373, + "step": 62940 + }, + { + "epoch": 0.13882977198395352, + "grad_norm": 0.13034507632255554, + "learning_rate": 2.7953101702289804e-05, + "loss": 0.0342, + "step": 62950 + }, + { + "epoch": 0.1388518259588517, + "grad_norm": 0.10103994607925415, + "learning_rate": 2.795226781158315e-05, + "loss": 0.035, + "step": 62960 + }, + { + "epoch": 0.13887387993374986, + "grad_norm": 0.11973263323307037, + "learning_rate": 2.7951433763494216e-05, + "loss": 0.0352, + "step": 62970 + }, + { + "epoch": 0.13889593390864802, + "grad_norm": 0.10790099203586578, + "learning_rate": 2.7950599558033133e-05, + "loss": 0.0346, + "step": 62980 + }, + { + "epoch": 0.1389179878835462, + "grad_norm": 0.12818481028079987, + "learning_rate": 2.7949765195210034e-05, + "loss": 0.0371, + "step": 62990 + }, + { + "epoch": 0.13894004185844436, + "grad_norm": 0.10696565359830856, + "learning_rate": 2.794893067503507e-05, + "loss": 0.0369, + "step": 63000 + }, + { + "epoch": 0.1389620958333425, + "grad_norm": 0.11370590329170227, + "learning_rate": 2.7948095997518366e-05, + "loss": 0.0361, + "step": 63010 + }, + { + "epoch": 0.1389841498082407, + "grad_norm": 0.11419677734375, + "learning_rate": 2.794726116267007e-05, + "loss": 0.0369, + "step": 63020 + }, + { + "epoch": 0.13900620378313885, + "grad_norm": 0.08091961592435837, + "learning_rate": 2.794642617050033e-05, + "loss": 0.036, + "step": 63030 + }, + { + "epoch": 0.139028257758037, + "grad_norm": 0.11004364490509033, + "learning_rate": 2.7945591021019286e-05, + "loss": 0.034, + "step": 63040 + }, + { + "epoch": 0.1390503117329352, + "grad_norm": 0.11438800394535065, + "learning_rate": 2.7944755714237088e-05, + "loss": 0.0353, + "step": 63050 + }, + { + "epoch": 0.13907236570783335, + "grad_norm": 0.11041457951068878, + "learning_rate": 2.794392025016389e-05, + "loss": 0.0347, + "step": 63060 + }, + { + "epoch": 0.1390944196827315, + "grad_norm": 0.1133178099989891, + "learning_rate": 2.7943084628809836e-05, + "loss": 0.0333, + "step": 63070 + }, + { + "epoch": 0.1391164736576297, + "grad_norm": 0.12310270965099335, + "learning_rate": 2.7942248850185085e-05, + "loss": 0.0369, + "step": 63080 + }, + { + "epoch": 0.13913852763252785, + "grad_norm": 0.09011740237474442, + "learning_rate": 2.7941412914299794e-05, + "loss": 0.0322, + "step": 63090 + }, + { + "epoch": 0.139160581607426, + "grad_norm": 0.14694544672966003, + "learning_rate": 2.7940576821164113e-05, + "loss": 0.0354, + "step": 63100 + }, + { + "epoch": 0.1391826355823242, + "grad_norm": 0.12871143221855164, + "learning_rate": 2.7939740570788207e-05, + "loss": 0.0347, + "step": 63110 + }, + { + "epoch": 0.13920468955722234, + "grad_norm": 0.1506899893283844, + "learning_rate": 2.7938904163182234e-05, + "loss": 0.0345, + "step": 63120 + }, + { + "epoch": 0.1392267435321205, + "grad_norm": 0.09329088032245636, + "learning_rate": 2.7938067598356362e-05, + "loss": 0.0334, + "step": 63130 + }, + { + "epoch": 0.13924879750701868, + "grad_norm": 0.11861605942249298, + "learning_rate": 2.793723087632075e-05, + "loss": 0.0358, + "step": 63140 + }, + { + "epoch": 0.13927085148191684, + "grad_norm": 0.10191334784030914, + "learning_rate": 2.7936393997085572e-05, + "loss": 0.035, + "step": 63150 + }, + { + "epoch": 0.139292905456815, + "grad_norm": 0.10820747911930084, + "learning_rate": 2.7935556960660994e-05, + "loss": 0.036, + "step": 63160 + }, + { + "epoch": 0.13931495943171318, + "grad_norm": 0.13836508989334106, + "learning_rate": 2.7934719767057178e-05, + "loss": 0.0343, + "step": 63170 + }, + { + "epoch": 0.13933701340661134, + "grad_norm": 0.13352209329605103, + "learning_rate": 2.793388241628431e-05, + "loss": 0.0348, + "step": 63180 + }, + { + "epoch": 0.1393590673815095, + "grad_norm": 0.10255688428878784, + "learning_rate": 2.7933044908352558e-05, + "loss": 0.0362, + "step": 63190 + }, + { + "epoch": 0.13938112135640768, + "grad_norm": 0.1054101511836052, + "learning_rate": 2.79322072432721e-05, + "loss": 0.035, + "step": 63200 + }, + { + "epoch": 0.13940317533130583, + "grad_norm": 0.10771998018026352, + "learning_rate": 2.7931369421053112e-05, + "loss": 0.0371, + "step": 63210 + }, + { + "epoch": 0.139425229306204, + "grad_norm": 0.15292564034461975, + "learning_rate": 2.793053144170578e-05, + "loss": 0.0346, + "step": 63220 + }, + { + "epoch": 0.13944728328110217, + "grad_norm": 0.11596404016017914, + "learning_rate": 2.792969330524028e-05, + "loss": 0.0362, + "step": 63230 + }, + { + "epoch": 0.13946933725600033, + "grad_norm": 0.12117617577314377, + "learning_rate": 2.79288550116668e-05, + "loss": 0.0346, + "step": 63240 + }, + { + "epoch": 0.1394913912308985, + "grad_norm": 0.1299002468585968, + "learning_rate": 2.7928016560995523e-05, + "loss": 0.035, + "step": 63250 + }, + { + "epoch": 0.13951344520579667, + "grad_norm": 0.1335836946964264, + "learning_rate": 2.792717795323664e-05, + "loss": 0.0354, + "step": 63260 + }, + { + "epoch": 0.13953549918069483, + "grad_norm": 0.11509998142719269, + "learning_rate": 2.792633918840034e-05, + "loss": 0.0351, + "step": 63270 + }, + { + "epoch": 0.139557553155593, + "grad_norm": 0.11488305777311325, + "learning_rate": 2.7925500266496815e-05, + "loss": 0.0364, + "step": 63280 + }, + { + "epoch": 0.13957960713049117, + "grad_norm": 0.09222345799207687, + "learning_rate": 2.792466118753626e-05, + "loss": 0.035, + "step": 63290 + }, + { + "epoch": 0.13960166110538932, + "grad_norm": 0.13713304698467255, + "learning_rate": 2.7923821951528864e-05, + "loss": 0.0339, + "step": 63300 + }, + { + "epoch": 0.1396237150802875, + "grad_norm": 0.12998540699481964, + "learning_rate": 2.792298255848483e-05, + "loss": 0.0362, + "step": 63310 + }, + { + "epoch": 0.13964576905518566, + "grad_norm": 0.11870168894529343, + "learning_rate": 2.7922143008414353e-05, + "loss": 0.0358, + "step": 63320 + }, + { + "epoch": 0.13966782303008382, + "grad_norm": 0.13376475870609283, + "learning_rate": 2.7921303301327642e-05, + "loss": 0.036, + "step": 63330 + }, + { + "epoch": 0.139689877004982, + "grad_norm": 0.12472742050886154, + "learning_rate": 2.7920463437234897e-05, + "loss": 0.0335, + "step": 63340 + }, + { + "epoch": 0.13971193097988016, + "grad_norm": 0.14592376351356506, + "learning_rate": 2.7919623416146324e-05, + "loss": 0.0345, + "step": 63350 + }, + { + "epoch": 0.13973398495477832, + "grad_norm": 0.13791494071483612, + "learning_rate": 2.7918783238072123e-05, + "loss": 0.0358, + "step": 63360 + }, + { + "epoch": 0.1397560389296765, + "grad_norm": 0.12572690844535828, + "learning_rate": 2.7917942903022514e-05, + "loss": 0.0373, + "step": 63370 + }, + { + "epoch": 0.13977809290457466, + "grad_norm": 0.13061141967773438, + "learning_rate": 2.7917102411007697e-05, + "loss": 0.0346, + "step": 63380 + }, + { + "epoch": 0.13980014687947281, + "grad_norm": 0.13826745748519897, + "learning_rate": 2.7916261762037896e-05, + "loss": 0.0351, + "step": 63390 + }, + { + "epoch": 0.139822200854371, + "grad_norm": 0.10966210812330246, + "learning_rate": 2.7915420956123315e-05, + "loss": 0.0363, + "step": 63400 + }, + { + "epoch": 0.13984425482926915, + "grad_norm": 0.10565724223852158, + "learning_rate": 2.7914579993274176e-05, + "loss": 0.0359, + "step": 63410 + }, + { + "epoch": 0.1398663088041673, + "grad_norm": 0.10730832070112228, + "learning_rate": 2.7913738873500698e-05, + "loss": 0.0369, + "step": 63420 + }, + { + "epoch": 0.1398883627790655, + "grad_norm": 0.10310032963752747, + "learning_rate": 2.7912897596813096e-05, + "loss": 0.0379, + "step": 63430 + }, + { + "epoch": 0.13991041675396365, + "grad_norm": 0.11194000393152237, + "learning_rate": 2.7912056163221602e-05, + "loss": 0.0358, + "step": 63440 + }, + { + "epoch": 0.1399324707288618, + "grad_norm": 0.09621042758226395, + "learning_rate": 2.791121457273643e-05, + "loss": 0.0365, + "step": 63450 + }, + { + "epoch": 0.13995452470376, + "grad_norm": 0.11189072579145432, + "learning_rate": 2.7910372825367816e-05, + "loss": 0.0364, + "step": 63460 + }, + { + "epoch": 0.13997657867865815, + "grad_norm": 0.148777574300766, + "learning_rate": 2.790953092112598e-05, + "loss": 0.0366, + "step": 63470 + }, + { + "epoch": 0.1399986326535563, + "grad_norm": 0.0961572676897049, + "learning_rate": 2.790868886002115e-05, + "loss": 0.0352, + "step": 63480 + }, + { + "epoch": 0.1400206866284545, + "grad_norm": 0.11505009233951569, + "learning_rate": 2.7907846642063568e-05, + "loss": 0.0354, + "step": 63490 + }, + { + "epoch": 0.14004274060335264, + "grad_norm": 0.14201487600803375, + "learning_rate": 2.7907004267263462e-05, + "loss": 0.0355, + "step": 63500 + }, + { + "epoch": 0.1400647945782508, + "grad_norm": 0.1418282836675644, + "learning_rate": 2.7906161735631066e-05, + "loss": 0.037, + "step": 63510 + }, + { + "epoch": 0.14008684855314898, + "grad_norm": 0.10567077994346619, + "learning_rate": 2.7905319047176622e-05, + "loss": 0.0362, + "step": 63520 + }, + { + "epoch": 0.14010890252804714, + "grad_norm": 0.0968351662158966, + "learning_rate": 2.7904476201910362e-05, + "loss": 0.034, + "step": 63530 + }, + { + "epoch": 0.1401309565029453, + "grad_norm": 0.1448008120059967, + "learning_rate": 2.790363319984254e-05, + "loss": 0.0347, + "step": 63540 + }, + { + "epoch": 0.14015301047784348, + "grad_norm": 0.14491073787212372, + "learning_rate": 2.7902790040983387e-05, + "loss": 0.0341, + "step": 63550 + }, + { + "epoch": 0.14017506445274164, + "grad_norm": 0.12452062219381332, + "learning_rate": 2.7901946725343153e-05, + "loss": 0.0363, + "step": 63560 + }, + { + "epoch": 0.1401971184276398, + "grad_norm": 0.11531448364257812, + "learning_rate": 2.7901103252932083e-05, + "loss": 0.0374, + "step": 63570 + }, + { + "epoch": 0.14021917240253798, + "grad_norm": 0.1275016814470291, + "learning_rate": 2.7900259623760428e-05, + "loss": 0.0352, + "step": 63580 + }, + { + "epoch": 0.14024122637743613, + "grad_norm": 0.14277586340904236, + "learning_rate": 2.7899415837838442e-05, + "loss": 0.0345, + "step": 63590 + }, + { + "epoch": 0.1402632803523343, + "grad_norm": 0.14039362967014313, + "learning_rate": 2.789857189517637e-05, + "loss": 0.0345, + "step": 63600 + }, + { + "epoch": 0.14028533432723247, + "grad_norm": 0.130295068025589, + "learning_rate": 2.789772779578448e-05, + "loss": 0.0357, + "step": 63610 + }, + { + "epoch": 0.14030738830213063, + "grad_norm": 0.11410263180732727, + "learning_rate": 2.789688353967301e-05, + "loss": 0.0347, + "step": 63620 + }, + { + "epoch": 0.1403294422770288, + "grad_norm": 0.1413617730140686, + "learning_rate": 2.7896039126852232e-05, + "loss": 0.0366, + "step": 63630 + }, + { + "epoch": 0.14035149625192697, + "grad_norm": 0.11004077643156052, + "learning_rate": 2.7895194557332403e-05, + "loss": 0.0359, + "step": 63640 + }, + { + "epoch": 0.14037355022682513, + "grad_norm": 0.1089862734079361, + "learning_rate": 2.7894349831123784e-05, + "loss": 0.0348, + "step": 63650 + }, + { + "epoch": 0.14039560420172328, + "grad_norm": 0.10555942356586456, + "learning_rate": 2.7893504948236644e-05, + "loss": 0.036, + "step": 63660 + }, + { + "epoch": 0.14041765817662147, + "grad_norm": 0.12107312679290771, + "learning_rate": 2.789265990868124e-05, + "loss": 0.036, + "step": 63670 + }, + { + "epoch": 0.14043971215151962, + "grad_norm": 0.11784705519676208, + "learning_rate": 2.789181471246785e-05, + "loss": 0.0346, + "step": 63680 + }, + { + "epoch": 0.1404617661264178, + "grad_norm": 0.10915012657642365, + "learning_rate": 2.7890969359606736e-05, + "loss": 0.0358, + "step": 63690 + }, + { + "epoch": 0.14048382010131596, + "grad_norm": 0.15494860708713531, + "learning_rate": 2.7890123850108177e-05, + "loss": 0.0377, + "step": 63700 + }, + { + "epoch": 0.14050587407621412, + "grad_norm": 0.11253052949905396, + "learning_rate": 2.7889278183982435e-05, + "loss": 0.0349, + "step": 63710 + }, + { + "epoch": 0.1405279280511123, + "grad_norm": 0.10999748855829239, + "learning_rate": 2.7888432361239802e-05, + "loss": 0.0349, + "step": 63720 + }, + { + "epoch": 0.14054998202601046, + "grad_norm": 0.0977388396859169, + "learning_rate": 2.7887586381890543e-05, + "loss": 0.0374, + "step": 63730 + }, + { + "epoch": 0.14057203600090862, + "grad_norm": 0.12597325444221497, + "learning_rate": 2.7886740245944942e-05, + "loss": 0.0351, + "step": 63740 + }, + { + "epoch": 0.1405940899758068, + "grad_norm": 0.11164052784442902, + "learning_rate": 2.7885893953413283e-05, + "loss": 0.0351, + "step": 63750 + }, + { + "epoch": 0.14061614395070496, + "grad_norm": 0.1396816074848175, + "learning_rate": 2.7885047504305843e-05, + "loss": 0.0351, + "step": 63760 + }, + { + "epoch": 0.14063819792560311, + "grad_norm": 0.10915548354387283, + "learning_rate": 2.788420089863291e-05, + "loss": 0.0349, + "step": 63770 + }, + { + "epoch": 0.1406602519005013, + "grad_norm": 0.0974070131778717, + "learning_rate": 2.7883354136404775e-05, + "loss": 0.035, + "step": 63780 + }, + { + "epoch": 0.14068230587539945, + "grad_norm": 0.11873248219490051, + "learning_rate": 2.788250721763172e-05, + "loss": 0.0341, + "step": 63790 + }, + { + "epoch": 0.1407043598502976, + "grad_norm": 0.09719040244817734, + "learning_rate": 2.788166014232404e-05, + "loss": 0.036, + "step": 63800 + }, + { + "epoch": 0.1407264138251958, + "grad_norm": 0.10175250470638275, + "learning_rate": 2.7880812910492027e-05, + "loss": 0.0363, + "step": 63810 + }, + { + "epoch": 0.14074846780009395, + "grad_norm": 0.1115715503692627, + "learning_rate": 2.7879965522145978e-05, + "loss": 0.0375, + "step": 63820 + }, + { + "epoch": 0.1407705217749921, + "grad_norm": 0.10456232726573944, + "learning_rate": 2.787911797729618e-05, + "loss": 0.0375, + "step": 63830 + }, + { + "epoch": 0.1407925757498903, + "grad_norm": 0.14169862866401672, + "learning_rate": 2.7878270275952948e-05, + "loss": 0.0359, + "step": 63840 + }, + { + "epoch": 0.14081462972478845, + "grad_norm": 0.09216726571321487, + "learning_rate": 2.7877422418126564e-05, + "loss": 0.0349, + "step": 63850 + }, + { + "epoch": 0.1408366836996866, + "grad_norm": 0.10837113112211227, + "learning_rate": 2.7876574403827344e-05, + "loss": 0.0363, + "step": 63860 + }, + { + "epoch": 0.1408587376745848, + "grad_norm": 0.10106974840164185, + "learning_rate": 2.787572623306559e-05, + "loss": 0.0355, + "step": 63870 + }, + { + "epoch": 0.14088079164948294, + "grad_norm": 0.11613689363002777, + "learning_rate": 2.7874877905851603e-05, + "loss": 0.0364, + "step": 63880 + }, + { + "epoch": 0.1409028456243811, + "grad_norm": 0.12021777033805847, + "learning_rate": 2.7874029422195696e-05, + "loss": 0.0347, + "step": 63890 + }, + { + "epoch": 0.14092489959927929, + "grad_norm": 0.12697134912014008, + "learning_rate": 2.787318078210817e-05, + "loss": 0.035, + "step": 63900 + }, + { + "epoch": 0.14094695357417744, + "grad_norm": 0.11864365637302399, + "learning_rate": 2.7872331985599347e-05, + "loss": 0.0348, + "step": 63910 + }, + { + "epoch": 0.1409690075490756, + "grad_norm": 0.10201752185821533, + "learning_rate": 2.7871483032679537e-05, + "loss": 0.0354, + "step": 63920 + }, + { + "epoch": 0.14099106152397378, + "grad_norm": 0.09348032623529434, + "learning_rate": 2.7870633923359053e-05, + "loss": 0.0358, + "step": 63930 + }, + { + "epoch": 0.14101311549887194, + "grad_norm": 0.10511614382266998, + "learning_rate": 2.786978465764822e-05, + "loss": 0.0392, + "step": 63940 + }, + { + "epoch": 0.1410351694737701, + "grad_norm": 0.12389510869979858, + "learning_rate": 2.786893523555735e-05, + "loss": 0.0353, + "step": 63950 + }, + { + "epoch": 0.14105722344866828, + "grad_norm": 0.08812455832958221, + "learning_rate": 2.7868085657096763e-05, + "loss": 0.0344, + "step": 63960 + }, + { + "epoch": 0.14107927742356643, + "grad_norm": 0.11340619623661041, + "learning_rate": 2.786723592227679e-05, + "loss": 0.0351, + "step": 63970 + }, + { + "epoch": 0.1411013313984646, + "grad_norm": 0.09994979947805405, + "learning_rate": 2.7866386031107752e-05, + "loss": 0.037, + "step": 63980 + }, + { + "epoch": 0.14112338537336278, + "grad_norm": 0.11347074806690216, + "learning_rate": 2.7865535983599976e-05, + "loss": 0.0357, + "step": 63990 + }, + { + "epoch": 0.14114543934826093, + "grad_norm": 0.12136399745941162, + "learning_rate": 2.7864685779763793e-05, + "loss": 0.0347, + "step": 64000 + }, + { + "epoch": 0.1411674933231591, + "grad_norm": 0.12202303111553192, + "learning_rate": 2.7863835419609526e-05, + "loss": 0.034, + "step": 64010 + }, + { + "epoch": 0.14118954729805727, + "grad_norm": 0.11072986572980881, + "learning_rate": 2.7862984903147515e-05, + "loss": 0.0342, + "step": 64020 + }, + { + "epoch": 0.14121160127295543, + "grad_norm": 0.1268737018108368, + "learning_rate": 2.786213423038809e-05, + "loss": 0.0361, + "step": 64030 + }, + { + "epoch": 0.14123365524785358, + "grad_norm": 0.12967227399349213, + "learning_rate": 2.7861283401341595e-05, + "loss": 0.0369, + "step": 64040 + }, + { + "epoch": 0.14125570922275177, + "grad_norm": 0.08773402124643326, + "learning_rate": 2.786043241601836e-05, + "loss": 0.0356, + "step": 64050 + }, + { + "epoch": 0.14127776319764992, + "grad_norm": 0.07939876616001129, + "learning_rate": 2.7859581274428732e-05, + "loss": 0.0346, + "step": 64060 + }, + { + "epoch": 0.14129981717254808, + "grad_norm": 0.09871094673871994, + "learning_rate": 2.785872997658305e-05, + "loss": 0.034, + "step": 64070 + }, + { + "epoch": 0.14132187114744627, + "grad_norm": 0.1538752168416977, + "learning_rate": 2.7857878522491656e-05, + "loss": 0.0364, + "step": 64080 + }, + { + "epoch": 0.14134392512234442, + "grad_norm": 0.10275581479072571, + "learning_rate": 2.78570269121649e-05, + "loss": 0.0329, + "step": 64090 + }, + { + "epoch": 0.14136597909724258, + "grad_norm": 0.10281278938055038, + "learning_rate": 2.7856175145613127e-05, + "loss": 0.0336, + "step": 64100 + }, + { + "epoch": 0.14138803307214076, + "grad_norm": 0.12518717348575592, + "learning_rate": 2.7855323222846684e-05, + "loss": 0.0368, + "step": 64110 + }, + { + "epoch": 0.14141008704703892, + "grad_norm": 0.0889330506324768, + "learning_rate": 2.7854471143875932e-05, + "loss": 0.0343, + "step": 64120 + }, + { + "epoch": 0.1414321410219371, + "grad_norm": 0.11775051802396774, + "learning_rate": 2.785361890871122e-05, + "loss": 0.0367, + "step": 64130 + }, + { + "epoch": 0.14145419499683526, + "grad_norm": 0.13145804405212402, + "learning_rate": 2.7852766517362895e-05, + "loss": 0.0378, + "step": 64140 + }, + { + "epoch": 0.14147624897173341, + "grad_norm": 0.11182987689971924, + "learning_rate": 2.7851913969841326e-05, + "loss": 0.0352, + "step": 64150 + }, + { + "epoch": 0.1414983029466316, + "grad_norm": 0.1452312022447586, + "learning_rate": 2.7851061266156873e-05, + "loss": 0.0339, + "step": 64160 + }, + { + "epoch": 0.14152035692152976, + "grad_norm": 0.09772852808237076, + "learning_rate": 2.7850208406319887e-05, + "loss": 0.0345, + "step": 64170 + }, + { + "epoch": 0.1415424108964279, + "grad_norm": 0.11393485218286514, + "learning_rate": 2.7849355390340732e-05, + "loss": 0.0359, + "step": 64180 + }, + { + "epoch": 0.1415644648713261, + "grad_norm": 0.11734147369861603, + "learning_rate": 2.7848502218229787e-05, + "loss": 0.0375, + "step": 64190 + }, + { + "epoch": 0.14158651884622425, + "grad_norm": 0.10747307538986206, + "learning_rate": 2.78476488899974e-05, + "loss": 0.034, + "step": 64200 + }, + { + "epoch": 0.1416085728211224, + "grad_norm": 0.08696465194225311, + "learning_rate": 2.7846795405653954e-05, + "loss": 0.0342, + "step": 64210 + }, + { + "epoch": 0.1416306267960206, + "grad_norm": 0.12936648726463318, + "learning_rate": 2.7845941765209815e-05, + "loss": 0.0366, + "step": 64220 + }, + { + "epoch": 0.14165268077091875, + "grad_norm": 0.1304529905319214, + "learning_rate": 2.7845087968675352e-05, + "loss": 0.0338, + "step": 64230 + }, + { + "epoch": 0.1416747347458169, + "grad_norm": 0.13487175107002258, + "learning_rate": 2.7844234016060944e-05, + "loss": 0.0361, + "step": 64240 + }, + { + "epoch": 0.1416967887207151, + "grad_norm": 0.11863557249307632, + "learning_rate": 2.7843379907376965e-05, + "loss": 0.0333, + "step": 64250 + }, + { + "epoch": 0.14171884269561325, + "grad_norm": 0.09121780842542648, + "learning_rate": 2.7842525642633795e-05, + "loss": 0.0347, + "step": 64260 + }, + { + "epoch": 0.1417408966705114, + "grad_norm": 0.09755859524011612, + "learning_rate": 2.7841671221841814e-05, + "loss": 0.0367, + "step": 64270 + }, + { + "epoch": 0.14176295064540959, + "grad_norm": 0.10884477943181992, + "learning_rate": 2.78408166450114e-05, + "loss": 0.0379, + "step": 64280 + }, + { + "epoch": 0.14178500462030774, + "grad_norm": 0.11982308328151703, + "learning_rate": 2.7839961912152943e-05, + "loss": 0.0381, + "step": 64290 + }, + { + "epoch": 0.1418070585952059, + "grad_norm": 0.13125725090503693, + "learning_rate": 2.783910702327682e-05, + "loss": 0.0345, + "step": 64300 + }, + { + "epoch": 0.14182911257010408, + "grad_norm": 0.16307257115840912, + "learning_rate": 2.783825197839343e-05, + "loss": 0.0351, + "step": 64310 + }, + { + "epoch": 0.14185116654500224, + "grad_norm": 0.15183910727500916, + "learning_rate": 2.7837396777513156e-05, + "loss": 0.0349, + "step": 64320 + }, + { + "epoch": 0.1418732205199004, + "grad_norm": 0.12454871833324432, + "learning_rate": 2.7836541420646393e-05, + "loss": 0.0359, + "step": 64330 + }, + { + "epoch": 0.14189527449479858, + "grad_norm": 0.13448920845985413, + "learning_rate": 2.7835685907803527e-05, + "loss": 0.0368, + "step": 64340 + }, + { + "epoch": 0.14191732846969674, + "grad_norm": 0.1153397187590599, + "learning_rate": 2.7834830238994963e-05, + "loss": 0.0346, + "step": 64350 + }, + { + "epoch": 0.1419393824445949, + "grad_norm": 0.11874226480722427, + "learning_rate": 2.783397441423109e-05, + "loss": 0.0359, + "step": 64360 + }, + { + "epoch": 0.14196143641949308, + "grad_norm": 0.10507304221391678, + "learning_rate": 2.7833118433522317e-05, + "loss": 0.0363, + "step": 64370 + }, + { + "epoch": 0.14198349039439123, + "grad_norm": 0.12907247245311737, + "learning_rate": 2.783226229687903e-05, + "loss": 0.0355, + "step": 64380 + }, + { + "epoch": 0.1420055443692894, + "grad_norm": 0.10370578616857529, + "learning_rate": 2.7831406004311644e-05, + "loss": 0.0364, + "step": 64390 + }, + { + "epoch": 0.14202759834418757, + "grad_norm": 0.1296766996383667, + "learning_rate": 2.7830549555830563e-05, + "loss": 0.0344, + "step": 64400 + }, + { + "epoch": 0.14204965231908573, + "grad_norm": 0.13169370591640472, + "learning_rate": 2.7829692951446187e-05, + "loss": 0.0342, + "step": 64410 + }, + { + "epoch": 0.14207170629398388, + "grad_norm": 0.10861831158399582, + "learning_rate": 2.7828836191168928e-05, + "loss": 0.0367, + "step": 64420 + }, + { + "epoch": 0.14209376026888207, + "grad_norm": 0.10183802247047424, + "learning_rate": 2.78279792750092e-05, + "loss": 0.0359, + "step": 64430 + }, + { + "epoch": 0.14211581424378023, + "grad_norm": 0.11516465991735458, + "learning_rate": 2.7827122202977412e-05, + "loss": 0.0357, + "step": 64440 + }, + { + "epoch": 0.14213786821867838, + "grad_norm": 0.08851861953735352, + "learning_rate": 2.7826264975083976e-05, + "loss": 0.0337, + "step": 64450 + }, + { + "epoch": 0.14215992219357657, + "grad_norm": 0.10163614898920059, + "learning_rate": 2.782540759133931e-05, + "loss": 0.0355, + "step": 64460 + }, + { + "epoch": 0.14218197616847472, + "grad_norm": 0.0936802551150322, + "learning_rate": 2.7824550051753836e-05, + "loss": 0.036, + "step": 64470 + }, + { + "epoch": 0.14220403014337288, + "grad_norm": 0.09590543806552887, + "learning_rate": 2.782369235633797e-05, + "loss": 0.0351, + "step": 64480 + }, + { + "epoch": 0.14222608411827106, + "grad_norm": 0.12820908427238464, + "learning_rate": 2.7822834505102132e-05, + "loss": 0.0343, + "step": 64490 + }, + { + "epoch": 0.14224813809316922, + "grad_norm": 0.09484991431236267, + "learning_rate": 2.782197649805675e-05, + "loss": 0.0352, + "step": 64500 + }, + { + "epoch": 0.14227019206806737, + "grad_norm": 0.08317600190639496, + "learning_rate": 2.7821118335212245e-05, + "loss": 0.0352, + "step": 64510 + }, + { + "epoch": 0.14229224604296556, + "grad_norm": 0.1292402744293213, + "learning_rate": 2.782026001657905e-05, + "loss": 0.0342, + "step": 64520 + }, + { + "epoch": 0.14231430001786372, + "grad_norm": 0.09593328088521957, + "learning_rate": 2.781940154216759e-05, + "loss": 0.0361, + "step": 64530 + }, + { + "epoch": 0.1423363539927619, + "grad_norm": 0.11827236413955688, + "learning_rate": 2.7818542911988295e-05, + "loss": 0.0344, + "step": 64540 + }, + { + "epoch": 0.14235840796766006, + "grad_norm": 0.18373656272888184, + "learning_rate": 2.7817684126051604e-05, + "loss": 0.0365, + "step": 64550 + }, + { + "epoch": 0.1423804619425582, + "grad_norm": 0.12052694708108902, + "learning_rate": 2.7816825184367952e-05, + "loss": 0.0347, + "step": 64560 + }, + { + "epoch": 0.1424025159174564, + "grad_norm": 0.12095113098621368, + "learning_rate": 2.781596608694777e-05, + "loss": 0.0331, + "step": 64570 + }, + { + "epoch": 0.14242456989235455, + "grad_norm": 0.10401925444602966, + "learning_rate": 2.7815106833801497e-05, + "loss": 0.0367, + "step": 64580 + }, + { + "epoch": 0.1424466238672527, + "grad_norm": 0.11401911079883575, + "learning_rate": 2.7814247424939578e-05, + "loss": 0.0352, + "step": 64590 + }, + { + "epoch": 0.1424686778421509, + "grad_norm": 0.09199023991823196, + "learning_rate": 2.7813387860372454e-05, + "loss": 0.0363, + "step": 64600 + }, + { + "epoch": 0.14249073181704905, + "grad_norm": 0.09409498423337936, + "learning_rate": 2.7812528140110574e-05, + "loss": 0.0358, + "step": 64610 + }, + { + "epoch": 0.1425127857919472, + "grad_norm": 0.11196881532669067, + "learning_rate": 2.7811668264164374e-05, + "loss": 0.0368, + "step": 64620 + }, + { + "epoch": 0.1425348397668454, + "grad_norm": 0.08975254744291306, + "learning_rate": 2.7810808232544314e-05, + "loss": 0.0351, + "step": 64630 + }, + { + "epoch": 0.14255689374174355, + "grad_norm": 0.12523460388183594, + "learning_rate": 2.7809948045260834e-05, + "loss": 0.036, + "step": 64640 + }, + { + "epoch": 0.1425789477166417, + "grad_norm": 0.10930140316486359, + "learning_rate": 2.780908770232439e-05, + "loss": 0.0351, + "step": 64650 + }, + { + "epoch": 0.14260100169153989, + "grad_norm": 0.1462441235780716, + "learning_rate": 2.780822720374544e-05, + "loss": 0.0364, + "step": 64660 + }, + { + "epoch": 0.14262305566643804, + "grad_norm": 0.1060829758644104, + "learning_rate": 2.780736654953444e-05, + "loss": 0.034, + "step": 64670 + }, + { + "epoch": 0.1426451096413362, + "grad_norm": 0.11294655501842499, + "learning_rate": 2.7806505739701837e-05, + "loss": 0.0366, + "step": 64680 + }, + { + "epoch": 0.14266716361623438, + "grad_norm": 0.09725972265005112, + "learning_rate": 2.78056447742581e-05, + "loss": 0.0354, + "step": 64690 + }, + { + "epoch": 0.14268921759113254, + "grad_norm": 0.11622977256774902, + "learning_rate": 2.780478365321369e-05, + "loss": 0.0358, + "step": 64700 + }, + { + "epoch": 0.1427112715660307, + "grad_norm": 0.1368197351694107, + "learning_rate": 2.7803922376579068e-05, + "loss": 0.037, + "step": 64710 + }, + { + "epoch": 0.14273332554092888, + "grad_norm": 0.1271246075630188, + "learning_rate": 2.7803060944364702e-05, + "loss": 0.0364, + "step": 64720 + }, + { + "epoch": 0.14275537951582704, + "grad_norm": 0.10912823677062988, + "learning_rate": 2.7802199356581054e-05, + "loss": 0.0349, + "step": 64730 + }, + { + "epoch": 0.1427774334907252, + "grad_norm": 0.12499713897705078, + "learning_rate": 2.7801337613238598e-05, + "loss": 0.0357, + "step": 64740 + }, + { + "epoch": 0.14279948746562338, + "grad_norm": 0.09517369419336319, + "learning_rate": 2.7800475714347805e-05, + "loss": 0.0365, + "step": 64750 + }, + { + "epoch": 0.14282154144052153, + "grad_norm": 0.09963133186101913, + "learning_rate": 2.7799613659919144e-05, + "loss": 0.0395, + "step": 64760 + }, + { + "epoch": 0.1428435954154197, + "grad_norm": 0.11145572364330292, + "learning_rate": 2.779875144996309e-05, + "loss": 0.0342, + "step": 64770 + }, + { + "epoch": 0.14286564939031787, + "grad_norm": 0.09330754727125168, + "learning_rate": 2.7797889084490127e-05, + "loss": 0.0358, + "step": 64780 + }, + { + "epoch": 0.14288770336521603, + "grad_norm": 0.11213336139917374, + "learning_rate": 2.7797026563510728e-05, + "loss": 0.0341, + "step": 64790 + }, + { + "epoch": 0.14290975734011419, + "grad_norm": 0.1310638040304184, + "learning_rate": 2.7796163887035375e-05, + "loss": 0.0352, + "step": 64800 + }, + { + "epoch": 0.14293181131501237, + "grad_norm": 0.08437534421682358, + "learning_rate": 2.7795301055074547e-05, + "loss": 0.0361, + "step": 64810 + }, + { + "epoch": 0.14295386528991053, + "grad_norm": 0.11145509034395218, + "learning_rate": 2.779443806763873e-05, + "loss": 0.0375, + "step": 64820 + }, + { + "epoch": 0.14297591926480868, + "grad_norm": 0.10858908295631409, + "learning_rate": 2.7793574924738408e-05, + "loss": 0.0346, + "step": 64830 + }, + { + "epoch": 0.14299797323970687, + "grad_norm": 0.09694422781467438, + "learning_rate": 2.7792711626384076e-05, + "loss": 0.0343, + "step": 64840 + }, + { + "epoch": 0.14302002721460502, + "grad_norm": 0.1306678056716919, + "learning_rate": 2.7791848172586217e-05, + "loss": 0.0361, + "step": 64850 + }, + { + "epoch": 0.14304208118950318, + "grad_norm": 0.10910069197416306, + "learning_rate": 2.7790984563355326e-05, + "loss": 0.0367, + "step": 64860 + }, + { + "epoch": 0.14306413516440136, + "grad_norm": 0.11184827983379364, + "learning_rate": 2.7790120798701896e-05, + "loss": 0.0351, + "step": 64870 + }, + { + "epoch": 0.14308618913929952, + "grad_norm": 0.108062244951725, + "learning_rate": 2.778925687863642e-05, + "loss": 0.0351, + "step": 64880 + }, + { + "epoch": 0.14310824311419768, + "grad_norm": 0.09338771551847458, + "learning_rate": 2.77883928031694e-05, + "loss": 0.0341, + "step": 64890 + }, + { + "epoch": 0.14313029708909586, + "grad_norm": 0.12981031835079193, + "learning_rate": 2.7787528572311334e-05, + "loss": 0.0358, + "step": 64900 + }, + { + "epoch": 0.14315235106399402, + "grad_norm": 0.1160503625869751, + "learning_rate": 2.7786664186072723e-05, + "loss": 0.0357, + "step": 64910 + }, + { + "epoch": 0.14317440503889217, + "grad_norm": 0.11673770844936371, + "learning_rate": 2.778579964446407e-05, + "loss": 0.0353, + "step": 64920 + }, + { + "epoch": 0.14319645901379036, + "grad_norm": 0.09455927461385727, + "learning_rate": 2.7784934947495875e-05, + "loss": 0.0353, + "step": 64930 + }, + { + "epoch": 0.1432185129886885, + "grad_norm": 0.10824181884527206, + "learning_rate": 2.7784070095178655e-05, + "loss": 0.0354, + "step": 64940 + }, + { + "epoch": 0.14324056696358667, + "grad_norm": 0.11320358514785767, + "learning_rate": 2.7783205087522907e-05, + "loss": 0.0345, + "step": 64950 + }, + { + "epoch": 0.14326262093848485, + "grad_norm": 0.10611017048358917, + "learning_rate": 2.778233992453915e-05, + "loss": 0.0337, + "step": 64960 + }, + { + "epoch": 0.143284674913383, + "grad_norm": 0.1019330620765686, + "learning_rate": 2.7781474606237896e-05, + "loss": 0.0354, + "step": 64970 + }, + { + "epoch": 0.1433067288882812, + "grad_norm": 0.14892823994159698, + "learning_rate": 2.7780609132629656e-05, + "loss": 0.0365, + "step": 64980 + }, + { + "epoch": 0.14332878286317935, + "grad_norm": 0.12222550064325333, + "learning_rate": 2.7779743503724953e-05, + "loss": 0.0348, + "step": 64990 + }, + { + "epoch": 0.1433508368380775, + "grad_norm": 0.09306332468986511, + "learning_rate": 2.7778877719534292e-05, + "loss": 0.0342, + "step": 65000 + }, + { + "epoch": 0.1433728908129757, + "grad_norm": 0.10704483836889267, + "learning_rate": 2.7778011780068207e-05, + "loss": 0.0336, + "step": 65010 + }, + { + "epoch": 0.14339494478787385, + "grad_norm": 0.09943003952503204, + "learning_rate": 2.7777145685337212e-05, + "loss": 0.035, + "step": 65020 + }, + { + "epoch": 0.143416998762772, + "grad_norm": 0.12761704623699188, + "learning_rate": 2.7776279435351838e-05, + "loss": 0.0373, + "step": 65030 + }, + { + "epoch": 0.1434390527376702, + "grad_norm": 0.12486574053764343, + "learning_rate": 2.7775413030122603e-05, + "loss": 0.0342, + "step": 65040 + }, + { + "epoch": 0.14346110671256834, + "grad_norm": 0.11544851958751678, + "learning_rate": 2.7774546469660036e-05, + "loss": 0.0355, + "step": 65050 + }, + { + "epoch": 0.1434831606874665, + "grad_norm": 0.12449388206005096, + "learning_rate": 2.7773679753974676e-05, + "loss": 0.0371, + "step": 65060 + }, + { + "epoch": 0.14350521466236468, + "grad_norm": 0.1243520975112915, + "learning_rate": 2.7772812883077037e-05, + "loss": 0.0379, + "step": 65070 + }, + { + "epoch": 0.14352726863726284, + "grad_norm": 0.13904905319213867, + "learning_rate": 2.7771945856977667e-05, + "loss": 0.0359, + "step": 65080 + }, + { + "epoch": 0.143549322612161, + "grad_norm": 0.11191879957914352, + "learning_rate": 2.7771078675687092e-05, + "loss": 0.037, + "step": 65090 + }, + { + "epoch": 0.14357137658705918, + "grad_norm": 0.10138911008834839, + "learning_rate": 2.7770211339215856e-05, + "loss": 0.0328, + "step": 65100 + }, + { + "epoch": 0.14359343056195734, + "grad_norm": 0.12566111981868744, + "learning_rate": 2.7769343847574492e-05, + "loss": 0.0343, + "step": 65110 + }, + { + "epoch": 0.1436154845368555, + "grad_norm": 0.10217807441949844, + "learning_rate": 2.776847620077355e-05, + "loss": 0.0362, + "step": 65120 + }, + { + "epoch": 0.14363753851175368, + "grad_norm": 0.15793979167938232, + "learning_rate": 2.7767608398823563e-05, + "loss": 0.0348, + "step": 65130 + }, + { + "epoch": 0.14365959248665183, + "grad_norm": 0.13917531073093414, + "learning_rate": 2.7766740441735077e-05, + "loss": 0.0376, + "step": 65140 + }, + { + "epoch": 0.14368164646155, + "grad_norm": 0.1446429342031479, + "learning_rate": 2.7765872329518647e-05, + "loss": 0.0361, + "step": 65150 + }, + { + "epoch": 0.14370370043644817, + "grad_norm": 0.09879060089588165, + "learning_rate": 2.7765004062184812e-05, + "loss": 0.0351, + "step": 65160 + }, + { + "epoch": 0.14372575441134633, + "grad_norm": 0.1132749691605568, + "learning_rate": 2.7764135639744127e-05, + "loss": 0.0349, + "step": 65170 + }, + { + "epoch": 0.14374780838624449, + "grad_norm": 0.13074859976768494, + "learning_rate": 2.7763267062207142e-05, + "loss": 0.0352, + "step": 65180 + }, + { + "epoch": 0.14376986236114267, + "grad_norm": 0.12243456393480301, + "learning_rate": 2.776239832958441e-05, + "loss": 0.0363, + "step": 65190 + }, + { + "epoch": 0.14379191633604083, + "grad_norm": 0.11814375966787338, + "learning_rate": 2.7761529441886488e-05, + "loss": 0.039, + "step": 65200 + }, + { + "epoch": 0.14381397031093898, + "grad_norm": 0.12949173152446747, + "learning_rate": 2.7760660399123938e-05, + "loss": 0.0353, + "step": 65210 + }, + { + "epoch": 0.14383602428583717, + "grad_norm": 0.1654304414987564, + "learning_rate": 2.775979120130731e-05, + "loss": 0.0388, + "step": 65220 + }, + { + "epoch": 0.14385807826073532, + "grad_norm": 0.11950503289699554, + "learning_rate": 2.7758921848447177e-05, + "loss": 0.0344, + "step": 65230 + }, + { + "epoch": 0.14388013223563348, + "grad_norm": 0.10927069932222366, + "learning_rate": 2.77580523405541e-05, + "loss": 0.0352, + "step": 65240 + }, + { + "epoch": 0.14390218621053166, + "grad_norm": 0.11516951769590378, + "learning_rate": 2.7757182677638633e-05, + "loss": 0.034, + "step": 65250 + }, + { + "epoch": 0.14392424018542982, + "grad_norm": 0.11229147017002106, + "learning_rate": 2.775631285971136e-05, + "loss": 0.0338, + "step": 65260 + }, + { + "epoch": 0.14394629416032798, + "grad_norm": 0.10863380134105682, + "learning_rate": 2.7755442886782835e-05, + "loss": 0.0374, + "step": 65270 + }, + { + "epoch": 0.14396834813522616, + "grad_norm": 0.10291395336389542, + "learning_rate": 2.7754572758863638e-05, + "loss": 0.0367, + "step": 65280 + }, + { + "epoch": 0.14399040211012432, + "grad_norm": 0.09006128460168839, + "learning_rate": 2.775370247596434e-05, + "loss": 0.036, + "step": 65290 + }, + { + "epoch": 0.14401245608502247, + "grad_norm": 0.12016407400369644, + "learning_rate": 2.7752832038095517e-05, + "loss": 0.0375, + "step": 65300 + }, + { + "epoch": 0.14403451005992066, + "grad_norm": 0.10930346697568893, + "learning_rate": 2.7751961445267742e-05, + "loss": 0.0347, + "step": 65310 + }, + { + "epoch": 0.1440565640348188, + "grad_norm": 0.09431779384613037, + "learning_rate": 2.7751090697491597e-05, + "loss": 0.0336, + "step": 65320 + }, + { + "epoch": 0.14407861800971697, + "grad_norm": 0.10271823406219482, + "learning_rate": 2.775021979477766e-05, + "loss": 0.0355, + "step": 65330 + }, + { + "epoch": 0.14410067198461515, + "grad_norm": 0.10958787798881531, + "learning_rate": 2.7749348737136514e-05, + "loss": 0.0345, + "step": 65340 + }, + { + "epoch": 0.1441227259595133, + "grad_norm": 0.23876908421516418, + "learning_rate": 2.774847752457874e-05, + "loss": 0.034, + "step": 65350 + }, + { + "epoch": 0.14414477993441147, + "grad_norm": 0.09412343800067902, + "learning_rate": 2.7747606157114933e-05, + "loss": 0.034, + "step": 65360 + }, + { + "epoch": 0.14416683390930965, + "grad_norm": 0.10412168502807617, + "learning_rate": 2.774673463475568e-05, + "loss": 0.0352, + "step": 65370 + }, + { + "epoch": 0.1441888878842078, + "grad_norm": 0.10052848607301712, + "learning_rate": 2.7745862957511554e-05, + "loss": 0.0346, + "step": 65380 + }, + { + "epoch": 0.14421094185910596, + "grad_norm": 0.1024254709482193, + "learning_rate": 2.7744991125393167e-05, + "loss": 0.0345, + "step": 65390 + }, + { + "epoch": 0.14423299583400415, + "grad_norm": 0.11479194462299347, + "learning_rate": 2.7744119138411102e-05, + "loss": 0.0362, + "step": 65400 + }, + { + "epoch": 0.1442550498089023, + "grad_norm": 0.0897490531206131, + "learning_rate": 2.7743246996575958e-05, + "loss": 0.0347, + "step": 65410 + }, + { + "epoch": 0.1442771037838005, + "grad_norm": 0.10604997724294662, + "learning_rate": 2.7742374699898333e-05, + "loss": 0.0361, + "step": 65420 + }, + { + "epoch": 0.14429915775869864, + "grad_norm": 0.08713237196207047, + "learning_rate": 2.774150224838882e-05, + "loss": 0.0338, + "step": 65430 + }, + { + "epoch": 0.1443212117335968, + "grad_norm": 0.10037924349308014, + "learning_rate": 2.7740629642058027e-05, + "loss": 0.0336, + "step": 65440 + }, + { + "epoch": 0.14434326570849498, + "grad_norm": 0.08981242030858994, + "learning_rate": 2.7739756880916557e-05, + "loss": 0.0369, + "step": 65450 + }, + { + "epoch": 0.14436531968339314, + "grad_norm": 0.11231731623411179, + "learning_rate": 2.773888396497501e-05, + "loss": 0.0354, + "step": 65460 + }, + { + "epoch": 0.1443873736582913, + "grad_norm": 0.09782599657773972, + "learning_rate": 2.7738010894243997e-05, + "loss": 0.0358, + "step": 65470 + }, + { + "epoch": 0.14440942763318948, + "grad_norm": 0.09632786363363266, + "learning_rate": 2.773713766873412e-05, + "loss": 0.0358, + "step": 65480 + }, + { + "epoch": 0.14443148160808764, + "grad_norm": 0.11229363083839417, + "learning_rate": 2.7736264288456005e-05, + "loss": 0.0345, + "step": 65490 + }, + { + "epoch": 0.1444535355829858, + "grad_norm": 0.10719233006238937, + "learning_rate": 2.7735390753420243e-05, + "loss": 0.0362, + "step": 65500 + }, + { + "epoch": 0.14447558955788398, + "grad_norm": 0.09226804226636887, + "learning_rate": 2.7734517063637463e-05, + "loss": 0.0359, + "step": 65510 + }, + { + "epoch": 0.14449764353278213, + "grad_norm": 0.09564701467752457, + "learning_rate": 2.7733643219118278e-05, + "loss": 0.0358, + "step": 65520 + }, + { + "epoch": 0.1445196975076803, + "grad_norm": 0.09679652005434036, + "learning_rate": 2.773276921987331e-05, + "loss": 0.0382, + "step": 65530 + }, + { + "epoch": 0.14454175148257847, + "grad_norm": 0.12361530214548111, + "learning_rate": 2.7731895065913167e-05, + "loss": 0.036, + "step": 65540 + }, + { + "epoch": 0.14456380545747663, + "grad_norm": 0.088223896920681, + "learning_rate": 2.7731020757248482e-05, + "loss": 0.0369, + "step": 65550 + }, + { + "epoch": 0.14458585943237479, + "grad_norm": 0.11746365576982498, + "learning_rate": 2.773014629388987e-05, + "loss": 0.0358, + "step": 65560 + }, + { + "epoch": 0.14460791340727297, + "grad_norm": 0.12009452283382416, + "learning_rate": 2.7729271675847964e-05, + "loss": 0.0347, + "step": 65570 + }, + { + "epoch": 0.14462996738217113, + "grad_norm": 0.10354585945606232, + "learning_rate": 2.7728396903133392e-05, + "loss": 0.0342, + "step": 65580 + }, + { + "epoch": 0.14465202135706928, + "grad_norm": 0.10799793154001236, + "learning_rate": 2.7727521975756774e-05, + "loss": 0.0349, + "step": 65590 + }, + { + "epoch": 0.14467407533196747, + "grad_norm": 0.13393981754779816, + "learning_rate": 2.7726646893728754e-05, + "loss": 0.034, + "step": 65600 + }, + { + "epoch": 0.14469612930686562, + "grad_norm": 0.11916505545377731, + "learning_rate": 2.7725771657059956e-05, + "loss": 0.0349, + "step": 65610 + }, + { + "epoch": 0.14471818328176378, + "grad_norm": 0.16378015279769897, + "learning_rate": 2.7724896265761014e-05, + "loss": 0.0338, + "step": 65620 + }, + { + "epoch": 0.14474023725666196, + "grad_norm": 0.11041947454214096, + "learning_rate": 2.772402071984257e-05, + "loss": 0.0366, + "step": 65630 + }, + { + "epoch": 0.14476229123156012, + "grad_norm": 0.11710841953754425, + "learning_rate": 2.772314501931526e-05, + "loss": 0.0347, + "step": 65640 + }, + { + "epoch": 0.14478434520645828, + "grad_norm": 0.08389496803283691, + "learning_rate": 2.772226916418973e-05, + "loss": 0.0342, + "step": 65650 + }, + { + "epoch": 0.14480639918135646, + "grad_norm": 0.07394076138734818, + "learning_rate": 2.7721393154476613e-05, + "loss": 0.0352, + "step": 65660 + }, + { + "epoch": 0.14482845315625462, + "grad_norm": 0.13186722993850708, + "learning_rate": 2.7720516990186557e-05, + "loss": 0.0335, + "step": 65670 + }, + { + "epoch": 0.14485050713115277, + "grad_norm": 0.09153959900140762, + "learning_rate": 2.7719640671330214e-05, + "loss": 0.0332, + "step": 65680 + }, + { + "epoch": 0.14487256110605096, + "grad_norm": 0.1437126249074936, + "learning_rate": 2.7718764197918224e-05, + "loss": 0.0366, + "step": 65690 + }, + { + "epoch": 0.1448946150809491, + "grad_norm": 0.10305102169513702, + "learning_rate": 2.771788756996124e-05, + "loss": 0.0358, + "step": 65700 + }, + { + "epoch": 0.14491666905584727, + "grad_norm": 0.13736845552921295, + "learning_rate": 2.7717010787469915e-05, + "loss": 0.038, + "step": 65710 + }, + { + "epoch": 0.14493872303074545, + "grad_norm": 0.12997636198997498, + "learning_rate": 2.7716133850454907e-05, + "loss": 0.0359, + "step": 65720 + }, + { + "epoch": 0.1449607770056436, + "grad_norm": 0.1283513754606247, + "learning_rate": 2.771525675892686e-05, + "loss": 0.0358, + "step": 65730 + }, + { + "epoch": 0.14498283098054177, + "grad_norm": 0.10470257699489594, + "learning_rate": 2.7714379512896443e-05, + "loss": 0.0348, + "step": 65740 + }, + { + "epoch": 0.14500488495543995, + "grad_norm": 0.10891750454902649, + "learning_rate": 2.7713502112374306e-05, + "loss": 0.0354, + "step": 65750 + }, + { + "epoch": 0.1450269389303381, + "grad_norm": 0.09523496776819229, + "learning_rate": 2.771262455737112e-05, + "loss": 0.0357, + "step": 65760 + }, + { + "epoch": 0.14504899290523626, + "grad_norm": 0.13160598278045654, + "learning_rate": 2.7711746847897538e-05, + "loss": 0.0338, + "step": 65770 + }, + { + "epoch": 0.14507104688013445, + "grad_norm": 0.11975433677434921, + "learning_rate": 2.7710868983964234e-05, + "loss": 0.0352, + "step": 65780 + }, + { + "epoch": 0.1450931008550326, + "grad_norm": 0.10125000029802322, + "learning_rate": 2.7709990965581866e-05, + "loss": 0.0356, + "step": 65790 + }, + { + "epoch": 0.14511515482993076, + "grad_norm": 0.09682914614677429, + "learning_rate": 2.7709112792761112e-05, + "loss": 0.0365, + "step": 65800 + }, + { + "epoch": 0.14513720880482894, + "grad_norm": 0.09857108443975449, + "learning_rate": 2.7708234465512638e-05, + "loss": 0.0348, + "step": 65810 + }, + { + "epoch": 0.1451592627797271, + "grad_norm": 0.13263335824012756, + "learning_rate": 2.7707355983847114e-05, + "loss": 0.035, + "step": 65820 + }, + { + "epoch": 0.14518131675462528, + "grad_norm": 0.10266850143671036, + "learning_rate": 2.7706477347775218e-05, + "loss": 0.0367, + "step": 65830 + }, + { + "epoch": 0.14520337072952344, + "grad_norm": 0.09782896190881729, + "learning_rate": 2.7705598557307624e-05, + "loss": 0.0365, + "step": 65840 + }, + { + "epoch": 0.1452254247044216, + "grad_norm": 0.10619870573282242, + "learning_rate": 2.7704719612455014e-05, + "loss": 0.0355, + "step": 65850 + }, + { + "epoch": 0.14524747867931978, + "grad_norm": 0.1283106654882431, + "learning_rate": 2.7703840513228064e-05, + "loss": 0.0355, + "step": 65860 + }, + { + "epoch": 0.14526953265421794, + "grad_norm": 0.11570045351982117, + "learning_rate": 2.770296125963746e-05, + "loss": 0.0366, + "step": 65870 + }, + { + "epoch": 0.1452915866291161, + "grad_norm": 0.1349017322063446, + "learning_rate": 2.7702081851693883e-05, + "loss": 0.038, + "step": 65880 + }, + { + "epoch": 0.14531364060401428, + "grad_norm": 0.09497828036546707, + "learning_rate": 2.7701202289408014e-05, + "loss": 0.0364, + "step": 65890 + }, + { + "epoch": 0.14533569457891243, + "grad_norm": 0.10266490280628204, + "learning_rate": 2.770032257279055e-05, + "loss": 0.0355, + "step": 65900 + }, + { + "epoch": 0.1453577485538106, + "grad_norm": 0.11905151605606079, + "learning_rate": 2.769944270185217e-05, + "loss": 0.0351, + "step": 65910 + }, + { + "epoch": 0.14537980252870877, + "grad_norm": 0.09520477056503296, + "learning_rate": 2.7698562676603578e-05, + "loss": 0.0351, + "step": 65920 + }, + { + "epoch": 0.14540185650360693, + "grad_norm": 0.10776777565479279, + "learning_rate": 2.769768249705546e-05, + "loss": 0.0364, + "step": 65930 + }, + { + "epoch": 0.1454239104785051, + "grad_norm": 0.09353765100240707, + "learning_rate": 2.7696802163218503e-05, + "loss": 0.036, + "step": 65940 + }, + { + "epoch": 0.14544596445340327, + "grad_norm": 0.09580151736736298, + "learning_rate": 2.769592167510342e-05, + "loss": 0.0353, + "step": 65950 + }, + { + "epoch": 0.14546801842830143, + "grad_norm": 0.0934201255440712, + "learning_rate": 2.7695041032720897e-05, + "loss": 0.0335, + "step": 65960 + }, + { + "epoch": 0.14549007240319958, + "grad_norm": 0.1250818371772766, + "learning_rate": 2.769416023608164e-05, + "loss": 0.0354, + "step": 65970 + }, + { + "epoch": 0.14551212637809777, + "grad_norm": 0.09989940375089645, + "learning_rate": 2.769327928519635e-05, + "loss": 0.0351, + "step": 65980 + }, + { + "epoch": 0.14553418035299592, + "grad_norm": 0.11290758848190308, + "learning_rate": 2.7692398180075736e-05, + "loss": 0.0341, + "step": 65990 + }, + { + "epoch": 0.14555623432789408, + "grad_norm": 0.09897659718990326, + "learning_rate": 2.76915169207305e-05, + "loss": 0.0354, + "step": 66000 + }, + { + "epoch": 0.14557828830279226, + "grad_norm": 0.08663800358772278, + "learning_rate": 2.769063550717135e-05, + "loss": 0.0331, + "step": 66010 + }, + { + "epoch": 0.14560034227769042, + "grad_norm": 0.09910140931606293, + "learning_rate": 2.7689753939408994e-05, + "loss": 0.0358, + "step": 66020 + }, + { + "epoch": 0.14562239625258858, + "grad_norm": 0.12499946355819702, + "learning_rate": 2.768887221745415e-05, + "loss": 0.0342, + "step": 66030 + }, + { + "epoch": 0.14564445022748676, + "grad_norm": 0.08682747930288315, + "learning_rate": 2.7687990341317522e-05, + "loss": 0.0344, + "step": 66040 + }, + { + "epoch": 0.14566650420238492, + "grad_norm": 0.09944924712181091, + "learning_rate": 2.7687108311009837e-05, + "loss": 0.0356, + "step": 66050 + }, + { + "epoch": 0.14568855817728307, + "grad_norm": 0.10462436825037003, + "learning_rate": 2.7686226126541807e-05, + "loss": 0.0342, + "step": 66060 + }, + { + "epoch": 0.14571061215218126, + "grad_norm": 0.1022927388548851, + "learning_rate": 2.768534378792415e-05, + "loss": 0.0356, + "step": 66070 + }, + { + "epoch": 0.1457326661270794, + "grad_norm": 0.09846004843711853, + "learning_rate": 2.7684461295167593e-05, + "loss": 0.0331, + "step": 66080 + }, + { + "epoch": 0.14575472010197757, + "grad_norm": 0.08025051653385162, + "learning_rate": 2.7683578648282853e-05, + "loss": 0.0364, + "step": 66090 + }, + { + "epoch": 0.14577677407687575, + "grad_norm": 0.10325294733047485, + "learning_rate": 2.768269584728066e-05, + "loss": 0.0338, + "step": 66100 + }, + { + "epoch": 0.1457988280517739, + "grad_norm": 0.109317846596241, + "learning_rate": 2.7681812892171732e-05, + "loss": 0.0372, + "step": 66110 + }, + { + "epoch": 0.14582088202667207, + "grad_norm": 0.14498741924762726, + "learning_rate": 2.7680929782966808e-05, + "loss": 0.0386, + "step": 66120 + }, + { + "epoch": 0.14584293600157025, + "grad_norm": 0.12725545465946198, + "learning_rate": 2.768004651967661e-05, + "loss": 0.0349, + "step": 66130 + }, + { + "epoch": 0.1458649899764684, + "grad_norm": 0.10545404255390167, + "learning_rate": 2.767916310231188e-05, + "loss": 0.036, + "step": 66140 + }, + { + "epoch": 0.14588704395136656, + "grad_norm": 0.11184902489185333, + "learning_rate": 2.7678279530883343e-05, + "loss": 0.035, + "step": 66150 + }, + { + "epoch": 0.14590909792626475, + "grad_norm": 0.09443455189466476, + "learning_rate": 2.7677395805401745e-05, + "loss": 0.0347, + "step": 66160 + }, + { + "epoch": 0.1459311519011629, + "grad_norm": 0.11056601256132126, + "learning_rate": 2.767651192587781e-05, + "loss": 0.0364, + "step": 66170 + }, + { + "epoch": 0.14595320587606106, + "grad_norm": 0.10135844349861145, + "learning_rate": 2.767562789232229e-05, + "loss": 0.0335, + "step": 66180 + }, + { + "epoch": 0.14597525985095924, + "grad_norm": 0.08969362080097198, + "learning_rate": 2.767474370474593e-05, + "loss": 0.0353, + "step": 66190 + }, + { + "epoch": 0.1459973138258574, + "grad_norm": 0.07479440420866013, + "learning_rate": 2.767385936315946e-05, + "loss": 0.0347, + "step": 66200 + }, + { + "epoch": 0.14601936780075556, + "grad_norm": 0.12360812723636627, + "learning_rate": 2.7672974867573638e-05, + "loss": 0.0367, + "step": 66210 + }, + { + "epoch": 0.14604142177565374, + "grad_norm": 0.1339990794658661, + "learning_rate": 2.76720902179992e-05, + "loss": 0.0356, + "step": 66220 + }, + { + "epoch": 0.1460634757505519, + "grad_norm": 0.10555720329284668, + "learning_rate": 2.7671205414446903e-05, + "loss": 0.0353, + "step": 66230 + }, + { + "epoch": 0.14608552972545005, + "grad_norm": 0.10598532855510712, + "learning_rate": 2.7670320456927498e-05, + "loss": 0.0338, + "step": 66240 + }, + { + "epoch": 0.14610758370034824, + "grad_norm": 0.1134970486164093, + "learning_rate": 2.7669435345451737e-05, + "loss": 0.0349, + "step": 66250 + }, + { + "epoch": 0.1461296376752464, + "grad_norm": 0.09822659194469452, + "learning_rate": 2.7668550080030374e-05, + "loss": 0.0352, + "step": 66260 + }, + { + "epoch": 0.14615169165014458, + "grad_norm": 0.10979113727807999, + "learning_rate": 2.7667664660674167e-05, + "loss": 0.0322, + "step": 66270 + }, + { + "epoch": 0.14617374562504273, + "grad_norm": 0.12982992827892303, + "learning_rate": 2.7666779087393875e-05, + "loss": 0.0354, + "step": 66280 + }, + { + "epoch": 0.1461957995999409, + "grad_norm": 0.12219778448343277, + "learning_rate": 2.7665893360200252e-05, + "loss": 0.0366, + "step": 66290 + }, + { + "epoch": 0.14621785357483907, + "grad_norm": 0.10951685160398483, + "learning_rate": 2.7665007479104074e-05, + "loss": 0.0334, + "step": 66300 + }, + { + "epoch": 0.14623990754973723, + "grad_norm": 0.08292990922927856, + "learning_rate": 2.766412144411609e-05, + "loss": 0.0334, + "step": 66310 + }, + { + "epoch": 0.1462619615246354, + "grad_norm": 0.11378969997167587, + "learning_rate": 2.766323525524708e-05, + "loss": 0.0375, + "step": 66320 + }, + { + "epoch": 0.14628401549953357, + "grad_norm": 0.12136761099100113, + "learning_rate": 2.76623489125078e-05, + "loss": 0.0339, + "step": 66330 + }, + { + "epoch": 0.14630606947443173, + "grad_norm": 0.1258566677570343, + "learning_rate": 2.766146241590903e-05, + "loss": 0.0348, + "step": 66340 + }, + { + "epoch": 0.14632812344932988, + "grad_norm": 0.11484511941671371, + "learning_rate": 2.7660575765461532e-05, + "loss": 0.0348, + "step": 66350 + }, + { + "epoch": 0.14635017742422807, + "grad_norm": 0.11796794831752777, + "learning_rate": 2.765968896117609e-05, + "loss": 0.0348, + "step": 66360 + }, + { + "epoch": 0.14637223139912622, + "grad_norm": 0.11117460578680038, + "learning_rate": 2.7658802003063473e-05, + "loss": 0.0341, + "step": 66370 + }, + { + "epoch": 0.14639428537402438, + "grad_norm": 0.11859879642724991, + "learning_rate": 2.765791489113446e-05, + "loss": 0.034, + "step": 66380 + }, + { + "epoch": 0.14641633934892256, + "grad_norm": 0.09724117070436478, + "learning_rate": 2.765702762539983e-05, + "loss": 0.0343, + "step": 66390 + }, + { + "epoch": 0.14643839332382072, + "grad_norm": 0.10140082240104675, + "learning_rate": 2.7656140205870362e-05, + "loss": 0.0384, + "step": 66400 + }, + { + "epoch": 0.14646044729871888, + "grad_norm": 0.12384931743144989, + "learning_rate": 2.7655252632556844e-05, + "loss": 0.0359, + "step": 66410 + }, + { + "epoch": 0.14648250127361706, + "grad_norm": 0.1020546555519104, + "learning_rate": 2.7654364905470057e-05, + "loss": 0.0349, + "step": 66420 + }, + { + "epoch": 0.14650455524851522, + "grad_norm": 0.10753950476646423, + "learning_rate": 2.765347702462079e-05, + "loss": 0.0345, + "step": 66430 + }, + { + "epoch": 0.14652660922341337, + "grad_norm": 0.11826859414577484, + "learning_rate": 2.7652588990019824e-05, + "loss": 0.0351, + "step": 66440 + }, + { + "epoch": 0.14654866319831156, + "grad_norm": 0.1260007917881012, + "learning_rate": 2.7651700801677963e-05, + "loss": 0.0356, + "step": 66450 + }, + { + "epoch": 0.1465707171732097, + "grad_norm": 0.10491624474525452, + "learning_rate": 2.7650812459605992e-05, + "loss": 0.037, + "step": 66460 + }, + { + "epoch": 0.14659277114810787, + "grad_norm": 0.10142957419157028, + "learning_rate": 2.7649923963814698e-05, + "loss": 0.0348, + "step": 66470 + }, + { + "epoch": 0.14661482512300605, + "grad_norm": 0.11337226629257202, + "learning_rate": 2.7649035314314894e-05, + "loss": 0.0344, + "step": 66480 + }, + { + "epoch": 0.1466368790979042, + "grad_norm": 0.09340660274028778, + "learning_rate": 2.7648146511117363e-05, + "loss": 0.0352, + "step": 66490 + }, + { + "epoch": 0.14665893307280237, + "grad_norm": 0.14025503396987915, + "learning_rate": 2.7647257554232906e-05, + "loss": 0.0362, + "step": 66500 + }, + { + "epoch": 0.14668098704770055, + "grad_norm": 0.11184714734554291, + "learning_rate": 2.7646368443672337e-05, + "loss": 0.0365, + "step": 66510 + }, + { + "epoch": 0.1467030410225987, + "grad_norm": 0.10801432281732559, + "learning_rate": 2.7645479179446445e-05, + "loss": 0.0338, + "step": 66520 + }, + { + "epoch": 0.14672509499749686, + "grad_norm": 0.1165013536810875, + "learning_rate": 2.7644589761566045e-05, + "loss": 0.0334, + "step": 66530 + }, + { + "epoch": 0.14674714897239505, + "grad_norm": 0.10341613739728928, + "learning_rate": 2.764370019004194e-05, + "loss": 0.0334, + "step": 66540 + }, + { + "epoch": 0.1467692029472932, + "grad_norm": 0.08993586897850037, + "learning_rate": 2.764281046488494e-05, + "loss": 0.0352, + "step": 66550 + }, + { + "epoch": 0.14679125692219136, + "grad_norm": 0.10525200515985489, + "learning_rate": 2.7641920586105856e-05, + "loss": 0.0354, + "step": 66560 + }, + { + "epoch": 0.14681331089708954, + "grad_norm": 0.12273342907428741, + "learning_rate": 2.76410305537155e-05, + "loss": 0.0359, + "step": 66570 + }, + { + "epoch": 0.1468353648719877, + "grad_norm": 0.1244654506444931, + "learning_rate": 2.764014036772469e-05, + "loss": 0.0344, + "step": 66580 + }, + { + "epoch": 0.14685741884688586, + "grad_norm": 0.1135370284318924, + "learning_rate": 2.763925002814424e-05, + "loss": 0.0352, + "step": 66590 + }, + { + "epoch": 0.14687947282178404, + "grad_norm": 0.11065972596406937, + "learning_rate": 2.7638359534984967e-05, + "loss": 0.038, + "step": 66600 + }, + { + "epoch": 0.1469015267966822, + "grad_norm": 0.12330219149589539, + "learning_rate": 2.7637468888257695e-05, + "loss": 0.0313, + "step": 66610 + }, + { + "epoch": 0.14692358077158035, + "grad_norm": 0.10681476444005966, + "learning_rate": 2.763657808797324e-05, + "loss": 0.0358, + "step": 66620 + }, + { + "epoch": 0.14694563474647854, + "grad_norm": 0.10801240801811218, + "learning_rate": 2.7635687134142434e-05, + "loss": 0.0353, + "step": 66630 + }, + { + "epoch": 0.1469676887213767, + "grad_norm": 0.11025572568178177, + "learning_rate": 2.76347960267761e-05, + "loss": 0.0336, + "step": 66640 + }, + { + "epoch": 0.14698974269627485, + "grad_norm": 0.1261003017425537, + "learning_rate": 2.7633904765885064e-05, + "loss": 0.0382, + "step": 66650 + }, + { + "epoch": 0.14701179667117303, + "grad_norm": 0.10459514707326889, + "learning_rate": 2.7633013351480157e-05, + "loss": 0.0343, + "step": 66660 + }, + { + "epoch": 0.1470338506460712, + "grad_norm": 0.0999630019068718, + "learning_rate": 2.763212178357221e-05, + "loss": 0.0367, + "step": 66670 + }, + { + "epoch": 0.14705590462096935, + "grad_norm": 0.11716293543577194, + "learning_rate": 2.7631230062172055e-05, + "loss": 0.0367, + "step": 66680 + }, + { + "epoch": 0.14707795859586753, + "grad_norm": 0.13156066834926605, + "learning_rate": 2.763033818729053e-05, + "loss": 0.034, + "step": 66690 + }, + { + "epoch": 0.1471000125707657, + "grad_norm": 0.11206833273172379, + "learning_rate": 2.762944615893847e-05, + "loss": 0.0343, + "step": 66700 + }, + { + "epoch": 0.14712206654566387, + "grad_norm": 0.12526537477970123, + "learning_rate": 2.7628553977126718e-05, + "loss": 0.0349, + "step": 66710 + }, + { + "epoch": 0.14714412052056203, + "grad_norm": 0.11900316923856735, + "learning_rate": 2.762766164186611e-05, + "loss": 0.0326, + "step": 66720 + }, + { + "epoch": 0.14716617449546018, + "grad_norm": 0.14009803533554077, + "learning_rate": 2.7626769153167494e-05, + "loss": 0.036, + "step": 66730 + }, + { + "epoch": 0.14718822847035837, + "grad_norm": 0.1334153711795807, + "learning_rate": 2.7625876511041703e-05, + "loss": 0.0373, + "step": 66740 + }, + { + "epoch": 0.14721028244525652, + "grad_norm": 0.10855041444301605, + "learning_rate": 2.7624983715499597e-05, + "loss": 0.034, + "step": 66750 + }, + { + "epoch": 0.14723233642015468, + "grad_norm": 0.10656974464654922, + "learning_rate": 2.7624090766552018e-05, + "loss": 0.0345, + "step": 66760 + }, + { + "epoch": 0.14725439039505286, + "grad_norm": 0.15841597318649292, + "learning_rate": 2.7623197664209815e-05, + "loss": 0.0354, + "step": 66770 + }, + { + "epoch": 0.14727644436995102, + "grad_norm": 0.11902828514575958, + "learning_rate": 2.762230440848385e-05, + "loss": 0.0349, + "step": 66780 + }, + { + "epoch": 0.14729849834484918, + "grad_norm": 0.1028021052479744, + "learning_rate": 2.7621410999384962e-05, + "loss": 0.035, + "step": 66790 + }, + { + "epoch": 0.14732055231974736, + "grad_norm": 0.08846868574619293, + "learning_rate": 2.7620517436924013e-05, + "loss": 0.0365, + "step": 66800 + }, + { + "epoch": 0.14734260629464552, + "grad_norm": 0.12913326919078827, + "learning_rate": 2.7619623721111862e-05, + "loss": 0.0348, + "step": 66810 + }, + { + "epoch": 0.14736466026954367, + "grad_norm": 0.08830516785383224, + "learning_rate": 2.7618729851959372e-05, + "loss": 0.035, + "step": 66820 + }, + { + "epoch": 0.14738671424444186, + "grad_norm": 0.14706820249557495, + "learning_rate": 2.7617835829477397e-05, + "loss": 0.0347, + "step": 66830 + }, + { + "epoch": 0.14740876821934001, + "grad_norm": 0.11156068742275238, + "learning_rate": 2.7616941653676805e-05, + "loss": 0.036, + "step": 66840 + }, + { + "epoch": 0.14743082219423817, + "grad_norm": 0.15029025077819824, + "learning_rate": 2.7616047324568456e-05, + "loss": 0.036, + "step": 66850 + }, + { + "epoch": 0.14745287616913635, + "grad_norm": 0.12334999442100525, + "learning_rate": 2.7615152842163224e-05, + "loss": 0.0358, + "step": 66860 + }, + { + "epoch": 0.1474749301440345, + "grad_norm": 0.11865463107824326, + "learning_rate": 2.7614258206471975e-05, + "loss": 0.0347, + "step": 66870 + }, + { + "epoch": 0.14749698411893267, + "grad_norm": 0.10082894563674927, + "learning_rate": 2.7613363417505577e-05, + "loss": 0.0355, + "step": 66880 + }, + { + "epoch": 0.14751903809383085, + "grad_norm": 0.09552286565303802, + "learning_rate": 2.7612468475274902e-05, + "loss": 0.0339, + "step": 66890 + }, + { + "epoch": 0.147541092068729, + "grad_norm": 0.10463167726993561, + "learning_rate": 2.761157337979083e-05, + "loss": 0.0328, + "step": 66900 + }, + { + "epoch": 0.14756314604362716, + "grad_norm": 0.13163886964321136, + "learning_rate": 2.7610678131064237e-05, + "loss": 0.0339, + "step": 66910 + }, + { + "epoch": 0.14758520001852535, + "grad_norm": 0.09662777185440063, + "learning_rate": 2.7609782729105993e-05, + "loss": 0.0334, + "step": 66920 + }, + { + "epoch": 0.1476072539934235, + "grad_norm": 0.09597962349653244, + "learning_rate": 2.7608887173926988e-05, + "loss": 0.0352, + "step": 66930 + }, + { + "epoch": 0.14762930796832166, + "grad_norm": 0.13303719460964203, + "learning_rate": 2.7607991465538093e-05, + "loss": 0.0336, + "step": 66940 + }, + { + "epoch": 0.14765136194321984, + "grad_norm": 0.11221033334732056, + "learning_rate": 2.7607095603950204e-05, + "loss": 0.0356, + "step": 66950 + }, + { + "epoch": 0.147673415918118, + "grad_norm": 0.14118531346321106, + "learning_rate": 2.7606199589174198e-05, + "loss": 0.0362, + "step": 66960 + }, + { + "epoch": 0.14769546989301616, + "grad_norm": 0.1211036667227745, + "learning_rate": 2.7605303421220967e-05, + "loss": 0.0351, + "step": 66970 + }, + { + "epoch": 0.14771752386791434, + "grad_norm": 0.10286086052656174, + "learning_rate": 2.7604407100101394e-05, + "loss": 0.0385, + "step": 66980 + }, + { + "epoch": 0.1477395778428125, + "grad_norm": 0.14080344140529633, + "learning_rate": 2.7603510625826375e-05, + "loss": 0.0345, + "step": 66990 + }, + { + "epoch": 0.14776163181771065, + "grad_norm": 0.10580811649560928, + "learning_rate": 2.7602613998406803e-05, + "loss": 0.0329, + "step": 67000 + }, + { + "epoch": 0.14778368579260884, + "grad_norm": 0.11564932763576508, + "learning_rate": 2.7601717217853574e-05, + "loss": 0.0363, + "step": 67010 + }, + { + "epoch": 0.147805739767507, + "grad_norm": 0.07886020094156265, + "learning_rate": 2.760082028417758e-05, + "loss": 0.036, + "step": 67020 + }, + { + "epoch": 0.14782779374240515, + "grad_norm": 0.15565098822116852, + "learning_rate": 2.759992319738972e-05, + "loss": 0.0358, + "step": 67030 + }, + { + "epoch": 0.14784984771730333, + "grad_norm": 0.13760879635810852, + "learning_rate": 2.75990259575009e-05, + "loss": 0.0359, + "step": 67040 + }, + { + "epoch": 0.1478719016922015, + "grad_norm": 0.11538846045732498, + "learning_rate": 2.759812856452202e-05, + "loss": 0.0357, + "step": 67050 + }, + { + "epoch": 0.14789395566709965, + "grad_norm": 0.11030083149671555, + "learning_rate": 2.7597231018463985e-05, + "loss": 0.035, + "step": 67060 + }, + { + "epoch": 0.14791600964199783, + "grad_norm": 0.11426392197608948, + "learning_rate": 2.7596333319337695e-05, + "loss": 0.0333, + "step": 67070 + }, + { + "epoch": 0.147938063616896, + "grad_norm": 0.11716514080762863, + "learning_rate": 2.7595435467154062e-05, + "loss": 0.0369, + "step": 67080 + }, + { + "epoch": 0.14796011759179414, + "grad_norm": 0.10685983300209045, + "learning_rate": 2.7594537461924e-05, + "loss": 0.0352, + "step": 67090 + }, + { + "epoch": 0.14798217156669233, + "grad_norm": 0.13840922713279724, + "learning_rate": 2.7593639303658414e-05, + "loss": 0.0369, + "step": 67100 + }, + { + "epoch": 0.14800422554159048, + "grad_norm": 0.137315034866333, + "learning_rate": 2.759274099236822e-05, + "loss": 0.0364, + "step": 67110 + }, + { + "epoch": 0.14802627951648867, + "grad_norm": 0.12781085073947906, + "learning_rate": 2.7591842528064335e-05, + "loss": 0.0355, + "step": 67120 + }, + { + "epoch": 0.14804833349138682, + "grad_norm": 0.12642110884189606, + "learning_rate": 2.7590943910757677e-05, + "loss": 0.0328, + "step": 67130 + }, + { + "epoch": 0.14807038746628498, + "grad_norm": 0.1063193827867508, + "learning_rate": 2.7590045140459155e-05, + "loss": 0.0356, + "step": 67140 + }, + { + "epoch": 0.14809244144118316, + "grad_norm": 0.09888342022895813, + "learning_rate": 2.7589146217179704e-05, + "loss": 0.0338, + "step": 67150 + }, + { + "epoch": 0.14811449541608132, + "grad_norm": 0.14291124045848846, + "learning_rate": 2.7588247140930236e-05, + "loss": 0.0367, + "step": 67160 + }, + { + "epoch": 0.14813654939097948, + "grad_norm": 0.14869041740894318, + "learning_rate": 2.7587347911721685e-05, + "loss": 0.0358, + "step": 67170 + }, + { + "epoch": 0.14815860336587766, + "grad_norm": 0.12125317007303238, + "learning_rate": 2.758644852956497e-05, + "loss": 0.0343, + "step": 67180 + }, + { + "epoch": 0.14818065734077582, + "grad_norm": 0.10936233401298523, + "learning_rate": 2.758554899447102e-05, + "loss": 0.0357, + "step": 67190 + }, + { + "epoch": 0.14820271131567397, + "grad_norm": 0.14880920946598053, + "learning_rate": 2.7584649306450768e-05, + "loss": 0.0364, + "step": 67200 + }, + { + "epoch": 0.14822476529057216, + "grad_norm": 0.1381206065416336, + "learning_rate": 2.7583749465515148e-05, + "loss": 0.0354, + "step": 67210 + }, + { + "epoch": 0.14824681926547031, + "grad_norm": 0.11102499812841415, + "learning_rate": 2.7582849471675088e-05, + "loss": 0.0355, + "step": 67220 + }, + { + "epoch": 0.14826887324036847, + "grad_norm": 0.10661297291517258, + "learning_rate": 2.7581949324941525e-05, + "loss": 0.0368, + "step": 67230 + }, + { + "epoch": 0.14829092721526665, + "grad_norm": 0.10627691447734833, + "learning_rate": 2.7581049025325404e-05, + "loss": 0.0341, + "step": 67240 + }, + { + "epoch": 0.1483129811901648, + "grad_norm": 0.11068715155124664, + "learning_rate": 2.7580148572837654e-05, + "loss": 0.035, + "step": 67250 + }, + { + "epoch": 0.14833503516506297, + "grad_norm": 0.12533169984817505, + "learning_rate": 2.7579247967489222e-05, + "loss": 0.0362, + "step": 67260 + }, + { + "epoch": 0.14835708913996115, + "grad_norm": 0.11262578517198563, + "learning_rate": 2.7578347209291052e-05, + "loss": 0.0348, + "step": 67270 + }, + { + "epoch": 0.1483791431148593, + "grad_norm": 0.1116073876619339, + "learning_rate": 2.7577446298254087e-05, + "loss": 0.0355, + "step": 67280 + }, + { + "epoch": 0.14840119708975746, + "grad_norm": 0.11636022478342056, + "learning_rate": 2.7576545234389272e-05, + "loss": 0.0364, + "step": 67290 + }, + { + "epoch": 0.14842325106465565, + "grad_norm": 0.13961896300315857, + "learning_rate": 2.7575644017707564e-05, + "loss": 0.0344, + "step": 67300 + }, + { + "epoch": 0.1484453050395538, + "grad_norm": 0.10557372868061066, + "learning_rate": 2.7574742648219902e-05, + "loss": 0.037, + "step": 67310 + }, + { + "epoch": 0.14846735901445196, + "grad_norm": 0.08964300900697708, + "learning_rate": 2.757384112593725e-05, + "loss": 0.0336, + "step": 67320 + }, + { + "epoch": 0.14848941298935014, + "grad_norm": 0.09809272736310959, + "learning_rate": 2.757293945087055e-05, + "loss": 0.0358, + "step": 67330 + }, + { + "epoch": 0.1485114669642483, + "grad_norm": 0.10166579484939575, + "learning_rate": 2.7572037623030765e-05, + "loss": 0.0358, + "step": 67340 + }, + { + "epoch": 0.14853352093914646, + "grad_norm": 0.10248525440692902, + "learning_rate": 2.7571135642428856e-05, + "loss": 0.0337, + "step": 67350 + }, + { + "epoch": 0.14855557491404464, + "grad_norm": 0.08517774194478989, + "learning_rate": 2.7570233509075783e-05, + "loss": 0.0343, + "step": 67360 + }, + { + "epoch": 0.1485776288889428, + "grad_norm": 0.0929982140660286, + "learning_rate": 2.7569331222982503e-05, + "loss": 0.0339, + "step": 67370 + }, + { + "epoch": 0.14859968286384095, + "grad_norm": 0.13064688444137573, + "learning_rate": 2.7568428784159977e-05, + "loss": 0.0361, + "step": 67380 + }, + { + "epoch": 0.14862173683873914, + "grad_norm": 0.11313134431838989, + "learning_rate": 2.756752619261918e-05, + "loss": 0.0343, + "step": 67390 + }, + { + "epoch": 0.1486437908136373, + "grad_norm": 0.11403539031744003, + "learning_rate": 2.756662344837107e-05, + "loss": 0.0348, + "step": 67400 + }, + { + "epoch": 0.14866584478853545, + "grad_norm": 0.18443454802036285, + "learning_rate": 2.756572055142662e-05, + "loss": 0.0357, + "step": 67410 + }, + { + "epoch": 0.14868789876343363, + "grad_norm": 0.115865558385849, + "learning_rate": 2.7564817501796803e-05, + "loss": 0.0356, + "step": 67420 + }, + { + "epoch": 0.1487099527383318, + "grad_norm": 0.1287527084350586, + "learning_rate": 2.7563914299492592e-05, + "loss": 0.0357, + "step": 67430 + }, + { + "epoch": 0.14873200671322995, + "grad_norm": 0.12168057262897491, + "learning_rate": 2.7563010944524956e-05, + "loss": 0.0338, + "step": 67440 + }, + { + "epoch": 0.14875406068812813, + "grad_norm": 0.1599196195602417, + "learning_rate": 2.7562107436904874e-05, + "loss": 0.0362, + "step": 67450 + }, + { + "epoch": 0.1487761146630263, + "grad_norm": 0.11257527023553848, + "learning_rate": 2.7561203776643335e-05, + "loss": 0.0351, + "step": 67460 + }, + { + "epoch": 0.14879816863792444, + "grad_norm": 0.11504779756069183, + "learning_rate": 2.75602999637513e-05, + "loss": 0.0357, + "step": 67470 + }, + { + "epoch": 0.14882022261282263, + "grad_norm": 0.12497954070568085, + "learning_rate": 2.755939599823977e-05, + "loss": 0.0367, + "step": 67480 + }, + { + "epoch": 0.14884227658772078, + "grad_norm": 0.11153342574834824, + "learning_rate": 2.7558491880119717e-05, + "loss": 0.0349, + "step": 67490 + }, + { + "epoch": 0.14886433056261894, + "grad_norm": 0.14052416384220123, + "learning_rate": 2.755758760940213e-05, + "loss": 0.0335, + "step": 67500 + }, + { + "epoch": 0.14888638453751712, + "grad_norm": 0.1218561977148056, + "learning_rate": 2.7556683186097997e-05, + "loss": 0.0347, + "step": 67510 + }, + { + "epoch": 0.14890843851241528, + "grad_norm": 0.110286645591259, + "learning_rate": 2.7555778610218308e-05, + "loss": 0.0344, + "step": 67520 + }, + { + "epoch": 0.14893049248731344, + "grad_norm": 0.12201327085494995, + "learning_rate": 2.7554873881774053e-05, + "loss": 0.0347, + "step": 67530 + }, + { + "epoch": 0.14895254646221162, + "grad_norm": 0.1012614518404007, + "learning_rate": 2.755396900077623e-05, + "loss": 0.0333, + "step": 67540 + }, + { + "epoch": 0.14897460043710978, + "grad_norm": 0.10524201393127441, + "learning_rate": 2.755306396723583e-05, + "loss": 0.0356, + "step": 67550 + }, + { + "epoch": 0.14899665441200796, + "grad_norm": 0.09142734855413437, + "learning_rate": 2.755215878116385e-05, + "loss": 0.036, + "step": 67560 + }, + { + "epoch": 0.14901870838690612, + "grad_norm": 0.10131889581680298, + "learning_rate": 2.7551253442571286e-05, + "loss": 0.0337, + "step": 67570 + }, + { + "epoch": 0.14904076236180427, + "grad_norm": 0.10754085332155228, + "learning_rate": 2.7550347951469146e-05, + "loss": 0.0344, + "step": 67580 + }, + { + "epoch": 0.14906281633670246, + "grad_norm": 0.10672584176063538, + "learning_rate": 2.7549442307868432e-05, + "loss": 0.0352, + "step": 67590 + }, + { + "epoch": 0.14908487031160061, + "grad_norm": 0.09205631166696548, + "learning_rate": 2.754853651178014e-05, + "loss": 0.0364, + "step": 67600 + }, + { + "epoch": 0.14910692428649877, + "grad_norm": 0.08031889796257019, + "learning_rate": 2.7547630563215286e-05, + "loss": 0.0342, + "step": 67610 + }, + { + "epoch": 0.14912897826139696, + "grad_norm": 0.10321412235498428, + "learning_rate": 2.7546724462184873e-05, + "loss": 0.0353, + "step": 67620 + }, + { + "epoch": 0.1491510322362951, + "grad_norm": 0.10925037413835526, + "learning_rate": 2.7545818208699908e-05, + "loss": 0.0358, + "step": 67630 + }, + { + "epoch": 0.14917308621119327, + "grad_norm": 0.1088804230093956, + "learning_rate": 2.754491180277141e-05, + "loss": 0.0338, + "step": 67640 + }, + { + "epoch": 0.14919514018609145, + "grad_norm": 0.09020394086837769, + "learning_rate": 2.7544005244410384e-05, + "loss": 0.0343, + "step": 67650 + }, + { + "epoch": 0.1492171941609896, + "grad_norm": 0.10383043438196182, + "learning_rate": 2.7543098533627855e-05, + "loss": 0.0332, + "step": 67660 + }, + { + "epoch": 0.14923924813588776, + "grad_norm": 0.09611826390028, + "learning_rate": 2.7542191670434835e-05, + "loss": 0.0343, + "step": 67670 + }, + { + "epoch": 0.14926130211078595, + "grad_norm": 0.10140174627304077, + "learning_rate": 2.754128465484235e-05, + "loss": 0.0352, + "step": 67680 + }, + { + "epoch": 0.1492833560856841, + "grad_norm": 0.12357384711503983, + "learning_rate": 2.7540377486861408e-05, + "loss": 0.0347, + "step": 67690 + }, + { + "epoch": 0.14930541006058226, + "grad_norm": 0.11677414178848267, + "learning_rate": 2.7539470166503043e-05, + "loss": 0.0359, + "step": 67700 + }, + { + "epoch": 0.14932746403548045, + "grad_norm": 0.13934753835201263, + "learning_rate": 2.7538562693778277e-05, + "loss": 0.036, + "step": 67710 + }, + { + "epoch": 0.1493495180103786, + "grad_norm": 0.11586692184209824, + "learning_rate": 2.7537655068698132e-05, + "loss": 0.0352, + "step": 67720 + }, + { + "epoch": 0.14937157198527676, + "grad_norm": 0.11959785968065262, + "learning_rate": 2.7536747291273643e-05, + "loss": 0.0364, + "step": 67730 + }, + { + "epoch": 0.14939362596017494, + "grad_norm": 0.0858684703707695, + "learning_rate": 2.753583936151584e-05, + "loss": 0.0349, + "step": 67740 + }, + { + "epoch": 0.1494156799350731, + "grad_norm": 0.11245410144329071, + "learning_rate": 2.7534931279435747e-05, + "loss": 0.0364, + "step": 67750 + }, + { + "epoch": 0.14943773390997125, + "grad_norm": 0.1108332946896553, + "learning_rate": 2.753402304504441e-05, + "loss": 0.0337, + "step": 67760 + }, + { + "epoch": 0.14945978788486944, + "grad_norm": 0.1205742284655571, + "learning_rate": 2.7533114658352856e-05, + "loss": 0.0354, + "step": 67770 + }, + { + "epoch": 0.1494818418597676, + "grad_norm": 0.10462427884340286, + "learning_rate": 2.7532206119372126e-05, + "loss": 0.0355, + "step": 67780 + }, + { + "epoch": 0.14950389583466575, + "grad_norm": 0.120069220662117, + "learning_rate": 2.7531297428113257e-05, + "loss": 0.0336, + "step": 67790 + }, + { + "epoch": 0.14952594980956394, + "grad_norm": 0.13254910707473755, + "learning_rate": 2.7530388584587297e-05, + "loss": 0.0347, + "step": 67800 + }, + { + "epoch": 0.1495480037844621, + "grad_norm": 0.11954551935195923, + "learning_rate": 2.752947958880528e-05, + "loss": 0.0359, + "step": 67810 + }, + { + "epoch": 0.14957005775936025, + "grad_norm": 0.12266398966312408, + "learning_rate": 2.752857044077826e-05, + "loss": 0.0356, + "step": 67820 + }, + { + "epoch": 0.14959211173425843, + "grad_norm": 0.0934847742319107, + "learning_rate": 2.7527661140517278e-05, + "loss": 0.0355, + "step": 67830 + }, + { + "epoch": 0.1496141657091566, + "grad_norm": 0.09708775579929352, + "learning_rate": 2.7526751688033392e-05, + "loss": 0.0325, + "step": 67840 + }, + { + "epoch": 0.14963621968405474, + "grad_norm": 0.09730854630470276, + "learning_rate": 2.752584208333764e-05, + "loss": 0.0354, + "step": 67850 + }, + { + "epoch": 0.14965827365895293, + "grad_norm": 0.1427128165960312, + "learning_rate": 2.752493232644108e-05, + "loss": 0.0354, + "step": 67860 + }, + { + "epoch": 0.14968032763385108, + "grad_norm": 0.13662299513816833, + "learning_rate": 2.7524022417354766e-05, + "loss": 0.0352, + "step": 67870 + }, + { + "epoch": 0.14970238160874924, + "grad_norm": 0.13656744360923767, + "learning_rate": 2.7523112356089758e-05, + "loss": 0.0357, + "step": 67880 + }, + { + "epoch": 0.14972443558364742, + "grad_norm": 0.10377758741378784, + "learning_rate": 2.7522202142657107e-05, + "loss": 0.0327, + "step": 67890 + }, + { + "epoch": 0.14974648955854558, + "grad_norm": 0.12375647574663162, + "learning_rate": 2.752129177706788e-05, + "loss": 0.0371, + "step": 67900 + }, + { + "epoch": 0.14976854353344374, + "grad_norm": 0.08920098841190338, + "learning_rate": 2.752038125933314e-05, + "loss": 0.0349, + "step": 67910 + }, + { + "epoch": 0.14979059750834192, + "grad_norm": 0.10376131534576416, + "learning_rate": 2.7519470589463937e-05, + "loss": 0.0341, + "step": 67920 + }, + { + "epoch": 0.14981265148324008, + "grad_norm": 0.11750829964876175, + "learning_rate": 2.751855976747135e-05, + "loss": 0.0359, + "step": 67930 + }, + { + "epoch": 0.14983470545813823, + "grad_norm": 0.1151033565402031, + "learning_rate": 2.7517648793366444e-05, + "loss": 0.0349, + "step": 67940 + }, + { + "epoch": 0.14985675943303642, + "grad_norm": 0.12231042236089706, + "learning_rate": 2.751673766716029e-05, + "loss": 0.035, + "step": 67950 + }, + { + "epoch": 0.14987881340793457, + "grad_norm": 0.07876439392566681, + "learning_rate": 2.7515826388863952e-05, + "loss": 0.0367, + "step": 67960 + }, + { + "epoch": 0.14990086738283276, + "grad_norm": 0.1561659276485443, + "learning_rate": 2.7514914958488502e-05, + "loss": 0.035, + "step": 67970 + }, + { + "epoch": 0.14992292135773091, + "grad_norm": 0.10587462037801743, + "learning_rate": 2.7514003376045026e-05, + "loss": 0.0359, + "step": 67980 + }, + { + "epoch": 0.14994497533262907, + "grad_norm": 0.0995045155286789, + "learning_rate": 2.751309164154459e-05, + "loss": 0.0357, + "step": 67990 + }, + { + "epoch": 0.14996702930752726, + "grad_norm": 0.09819064289331436, + "learning_rate": 2.7512179754998274e-05, + "loss": 0.0364, + "step": 68000 + }, + { + "epoch": 0.1499890832824254, + "grad_norm": 0.11304926127195358, + "learning_rate": 2.7511267716417165e-05, + "loss": 0.0352, + "step": 68010 + }, + { + "epoch": 0.15001113725732357, + "grad_norm": 0.16601350903511047, + "learning_rate": 2.751035552581234e-05, + "loss": 0.0352, + "step": 68020 + }, + { + "epoch": 0.15003319123222175, + "grad_norm": 0.14215530455112457, + "learning_rate": 2.7509443183194885e-05, + "loss": 0.0344, + "step": 68030 + }, + { + "epoch": 0.1500552452071199, + "grad_norm": 0.13696721196174622, + "learning_rate": 2.7508530688575883e-05, + "loss": 0.036, + "step": 68040 + }, + { + "epoch": 0.15007729918201806, + "grad_norm": 0.11636161804199219, + "learning_rate": 2.750761804196642e-05, + "loss": 0.0346, + "step": 68050 + }, + { + "epoch": 0.15009935315691625, + "grad_norm": 0.1094699278473854, + "learning_rate": 2.7506705243377593e-05, + "loss": 0.0346, + "step": 68060 + }, + { + "epoch": 0.1501214071318144, + "grad_norm": 0.10982026904821396, + "learning_rate": 2.7505792292820487e-05, + "loss": 0.0333, + "step": 68070 + }, + { + "epoch": 0.15014346110671256, + "grad_norm": 0.1336430460214615, + "learning_rate": 2.7504879190306196e-05, + "loss": 0.0329, + "step": 68080 + }, + { + "epoch": 0.15016551508161075, + "grad_norm": 0.11453108489513397, + "learning_rate": 2.750396593584582e-05, + "loss": 0.0351, + "step": 68090 + }, + { + "epoch": 0.1501875690565089, + "grad_norm": 0.14127327501773834, + "learning_rate": 2.750305252945045e-05, + "loss": 0.0342, + "step": 68100 + }, + { + "epoch": 0.15020962303140706, + "grad_norm": 0.1448332816362381, + "learning_rate": 2.7502138971131185e-05, + "loss": 0.0343, + "step": 68110 + }, + { + "epoch": 0.15023167700630524, + "grad_norm": 0.15279130637645721, + "learning_rate": 2.750122526089913e-05, + "loss": 0.0351, + "step": 68120 + }, + { + "epoch": 0.1502537309812034, + "grad_norm": 0.10951629281044006, + "learning_rate": 2.7500311398765383e-05, + "loss": 0.0358, + "step": 68130 + }, + { + "epoch": 0.15027578495610155, + "grad_norm": 0.12206241488456726, + "learning_rate": 2.749939738474105e-05, + "loss": 0.0344, + "step": 68140 + }, + { + "epoch": 0.15029783893099974, + "grad_norm": 0.10170548409223557, + "learning_rate": 2.749848321883724e-05, + "loss": 0.035, + "step": 68150 + }, + { + "epoch": 0.1503198929058979, + "grad_norm": 0.11223313957452774, + "learning_rate": 2.749756890106506e-05, + "loss": 0.0365, + "step": 68160 + }, + { + "epoch": 0.15034194688079605, + "grad_norm": 0.10550401359796524, + "learning_rate": 2.7496654431435613e-05, + "loss": 0.0343, + "step": 68170 + }, + { + "epoch": 0.15036400085569424, + "grad_norm": 0.1083599254488945, + "learning_rate": 2.749573980996002e-05, + "loss": 0.0343, + "step": 68180 + }, + { + "epoch": 0.1503860548305924, + "grad_norm": 0.11248525977134705, + "learning_rate": 2.749482503664939e-05, + "loss": 0.0356, + "step": 68190 + }, + { + "epoch": 0.15040810880549055, + "grad_norm": 0.12323611974716187, + "learning_rate": 2.7493910111514834e-05, + "loss": 0.0349, + "step": 68200 + }, + { + "epoch": 0.15043016278038873, + "grad_norm": 0.11245016008615494, + "learning_rate": 2.7492995034567478e-05, + "loss": 0.0368, + "step": 68210 + }, + { + "epoch": 0.1504522167552869, + "grad_norm": 0.1620461642742157, + "learning_rate": 2.7492079805818438e-05, + "loss": 0.0351, + "step": 68220 + }, + { + "epoch": 0.15047427073018504, + "grad_norm": 0.12335574626922607, + "learning_rate": 2.7491164425278834e-05, + "loss": 0.0355, + "step": 68230 + }, + { + "epoch": 0.15049632470508323, + "grad_norm": 0.09356191009283066, + "learning_rate": 2.7490248892959786e-05, + "loss": 0.035, + "step": 68240 + }, + { + "epoch": 0.15051837867998138, + "grad_norm": 0.134588822722435, + "learning_rate": 2.748933320887242e-05, + "loss": 0.0347, + "step": 68250 + }, + { + "epoch": 0.15054043265487954, + "grad_norm": 0.11192977428436279, + "learning_rate": 2.7488417373027868e-05, + "loss": 0.034, + "step": 68260 + }, + { + "epoch": 0.15056248662977773, + "grad_norm": 0.08183426409959793, + "learning_rate": 2.7487501385437254e-05, + "loss": 0.0348, + "step": 68270 + }, + { + "epoch": 0.15058454060467588, + "grad_norm": 0.0983477383852005, + "learning_rate": 2.7486585246111705e-05, + "loss": 0.0345, + "step": 68280 + }, + { + "epoch": 0.15060659457957404, + "grad_norm": 0.09364777058362961, + "learning_rate": 2.7485668955062356e-05, + "loss": 0.0351, + "step": 68290 + }, + { + "epoch": 0.15062864855447222, + "grad_norm": 0.10505513846874237, + "learning_rate": 2.7484752512300336e-05, + "loss": 0.0366, + "step": 68300 + }, + { + "epoch": 0.15065070252937038, + "grad_norm": 0.11453955620527267, + "learning_rate": 2.7483835917836796e-05, + "loss": 0.0357, + "step": 68310 + }, + { + "epoch": 0.15067275650426853, + "grad_norm": 0.12192096561193466, + "learning_rate": 2.7482919171682854e-05, + "loss": 0.034, + "step": 68320 + }, + { + "epoch": 0.15069481047916672, + "grad_norm": 0.13233889639377594, + "learning_rate": 2.7482002273849663e-05, + "loss": 0.036, + "step": 68330 + }, + { + "epoch": 0.15071686445406487, + "grad_norm": 0.1255640685558319, + "learning_rate": 2.7481085224348358e-05, + "loss": 0.0343, + "step": 68340 + }, + { + "epoch": 0.15073891842896303, + "grad_norm": 0.15608304738998413, + "learning_rate": 2.7480168023190086e-05, + "loss": 0.0351, + "step": 68350 + }, + { + "epoch": 0.15076097240386122, + "grad_norm": 0.09688019752502441, + "learning_rate": 2.7479250670385987e-05, + "loss": 0.0339, + "step": 68360 + }, + { + "epoch": 0.15078302637875937, + "grad_norm": 0.1401233822107315, + "learning_rate": 2.747833316594721e-05, + "loss": 0.0348, + "step": 68370 + }, + { + "epoch": 0.15080508035365753, + "grad_norm": 0.10225903242826462, + "learning_rate": 2.7477415509884903e-05, + "loss": 0.0371, + "step": 68380 + }, + { + "epoch": 0.1508271343285557, + "grad_norm": 0.09308847784996033, + "learning_rate": 2.7476497702210217e-05, + "loss": 0.0343, + "step": 68390 + }, + { + "epoch": 0.15084918830345387, + "grad_norm": 0.1402275711297989, + "learning_rate": 2.7475579742934303e-05, + "loss": 0.0358, + "step": 68400 + }, + { + "epoch": 0.15087124227835205, + "grad_norm": 0.10931404680013657, + "learning_rate": 2.7474661632068318e-05, + "loss": 0.035, + "step": 68410 + }, + { + "epoch": 0.1508932962532502, + "grad_norm": 0.11337912082672119, + "learning_rate": 2.7473743369623415e-05, + "loss": 0.0345, + "step": 68420 + }, + { + "epoch": 0.15091535022814836, + "grad_norm": 0.13118164241313934, + "learning_rate": 2.7472824955610755e-05, + "loss": 0.0359, + "step": 68430 + }, + { + "epoch": 0.15093740420304655, + "grad_norm": 0.0986342504620552, + "learning_rate": 2.7471906390041494e-05, + "loss": 0.0343, + "step": 68440 + }, + { + "epoch": 0.1509594581779447, + "grad_norm": 0.13352757692337036, + "learning_rate": 2.7470987672926798e-05, + "loss": 0.0382, + "step": 68450 + }, + { + "epoch": 0.15098151215284286, + "grad_norm": 0.11898722499608994, + "learning_rate": 2.7470068804277823e-05, + "loss": 0.036, + "step": 68460 + }, + { + "epoch": 0.15100356612774105, + "grad_norm": 0.12621699273586273, + "learning_rate": 2.7469149784105743e-05, + "loss": 0.0381, + "step": 68470 + }, + { + "epoch": 0.1510256201026392, + "grad_norm": 0.11496277153491974, + "learning_rate": 2.7468230612421715e-05, + "loss": 0.0343, + "step": 68480 + }, + { + "epoch": 0.15104767407753736, + "grad_norm": 0.10673949867486954, + "learning_rate": 2.7467311289236914e-05, + "loss": 0.0346, + "step": 68490 + }, + { + "epoch": 0.15106972805243554, + "grad_norm": 0.11176979541778564, + "learning_rate": 2.7466391814562516e-05, + "loss": 0.0355, + "step": 68500 + }, + { + "epoch": 0.1510917820273337, + "grad_norm": 0.10286864638328552, + "learning_rate": 2.746547218840968e-05, + "loss": 0.0353, + "step": 68510 + }, + { + "epoch": 0.15111383600223185, + "grad_norm": 0.14057159423828125, + "learning_rate": 2.746455241078959e-05, + "loss": 0.0337, + "step": 68520 + }, + { + "epoch": 0.15113588997713004, + "grad_norm": 0.1143825426697731, + "learning_rate": 2.7463632481713417e-05, + "loss": 0.0331, + "step": 68530 + }, + { + "epoch": 0.1511579439520282, + "grad_norm": 0.10540696233510971, + "learning_rate": 2.7462712401192344e-05, + "loss": 0.0349, + "step": 68540 + }, + { + "epoch": 0.15117999792692635, + "grad_norm": 0.09448612481355667, + "learning_rate": 2.7461792169237552e-05, + "loss": 0.0339, + "step": 68550 + }, + { + "epoch": 0.15120205190182454, + "grad_norm": 0.09039876610040665, + "learning_rate": 2.7460871785860215e-05, + "loss": 0.0351, + "step": 68560 + }, + { + "epoch": 0.1512241058767227, + "grad_norm": 0.12142950296401978, + "learning_rate": 2.745995125107152e-05, + "loss": 0.0359, + "step": 68570 + }, + { + "epoch": 0.15124615985162085, + "grad_norm": 0.1287918984889984, + "learning_rate": 2.7459030564882655e-05, + "loss": 0.0349, + "step": 68580 + }, + { + "epoch": 0.15126821382651903, + "grad_norm": 0.14077752828598022, + "learning_rate": 2.7458109727304808e-05, + "loss": 0.0358, + "step": 68590 + }, + { + "epoch": 0.1512902678014172, + "grad_norm": 0.11832300573587418, + "learning_rate": 2.745718873834916e-05, + "loss": 0.0366, + "step": 68600 + }, + { + "epoch": 0.15131232177631534, + "grad_norm": 0.09998463839292526, + "learning_rate": 2.745626759802691e-05, + "loss": 0.035, + "step": 68610 + }, + { + "epoch": 0.15133437575121353, + "grad_norm": 0.11354837566614151, + "learning_rate": 2.745534630634925e-05, + "loss": 0.0351, + "step": 68620 + }, + { + "epoch": 0.15135642972611169, + "grad_norm": 0.1362159699201584, + "learning_rate": 2.745442486332737e-05, + "loss": 0.0358, + "step": 68630 + }, + { + "epoch": 0.15137848370100984, + "grad_norm": 0.10926862061023712, + "learning_rate": 2.7453503268972472e-05, + "loss": 0.0336, + "step": 68640 + }, + { + "epoch": 0.15140053767590803, + "grad_norm": 0.09285053610801697, + "learning_rate": 2.7452581523295745e-05, + "loss": 0.0352, + "step": 68650 + }, + { + "epoch": 0.15142259165080618, + "grad_norm": 0.12557213008403778, + "learning_rate": 2.74516596263084e-05, + "loss": 0.0357, + "step": 68660 + }, + { + "epoch": 0.15144464562570434, + "grad_norm": 0.11643527448177338, + "learning_rate": 2.7450737578021635e-05, + "loss": 0.0364, + "step": 68670 + }, + { + "epoch": 0.15146669960060252, + "grad_norm": 0.11587820202112198, + "learning_rate": 2.744981537844665e-05, + "loss": 0.0364, + "step": 68680 + }, + { + "epoch": 0.15148875357550068, + "grad_norm": 0.10437579452991486, + "learning_rate": 2.7448893027594655e-05, + "loss": 0.0347, + "step": 68690 + }, + { + "epoch": 0.15151080755039883, + "grad_norm": 0.10666213929653168, + "learning_rate": 2.7447970525476855e-05, + "loss": 0.0367, + "step": 68700 + }, + { + "epoch": 0.15153286152529702, + "grad_norm": 0.14000073075294495, + "learning_rate": 2.7447047872104464e-05, + "loss": 0.0342, + "step": 68710 + }, + { + "epoch": 0.15155491550019518, + "grad_norm": 0.08730536699295044, + "learning_rate": 2.7446125067488685e-05, + "loss": 0.0344, + "step": 68720 + }, + { + "epoch": 0.15157696947509333, + "grad_norm": 0.10735584795475006, + "learning_rate": 2.744520211164074e-05, + "loss": 0.035, + "step": 68730 + }, + { + "epoch": 0.15159902344999152, + "grad_norm": 0.0960569977760315, + "learning_rate": 2.7444279004571837e-05, + "loss": 0.0354, + "step": 68740 + }, + { + "epoch": 0.15162107742488967, + "grad_norm": 0.0879802480340004, + "learning_rate": 2.7443355746293194e-05, + "loss": 0.0359, + "step": 68750 + }, + { + "epoch": 0.15164313139978783, + "grad_norm": 0.13787730038166046, + "learning_rate": 2.7442432336816025e-05, + "loss": 0.0366, + "step": 68760 + }, + { + "epoch": 0.151665185374686, + "grad_norm": 0.14402925968170166, + "learning_rate": 2.7441508776151563e-05, + "loss": 0.0378, + "step": 68770 + }, + { + "epoch": 0.15168723934958417, + "grad_norm": 0.10817926377058029, + "learning_rate": 2.7440585064311022e-05, + "loss": 0.0342, + "step": 68780 + }, + { + "epoch": 0.15170929332448232, + "grad_norm": 0.09521342813968658, + "learning_rate": 2.7439661201305624e-05, + "loss": 0.0361, + "step": 68790 + }, + { + "epoch": 0.1517313472993805, + "grad_norm": 0.1258607804775238, + "learning_rate": 2.7438737187146597e-05, + "loss": 0.0345, + "step": 68800 + }, + { + "epoch": 0.15175340127427867, + "grad_norm": 0.10321543365716934, + "learning_rate": 2.7437813021845172e-05, + "loss": 0.0371, + "step": 68810 + }, + { + "epoch": 0.15177545524917682, + "grad_norm": 0.11509211361408234, + "learning_rate": 2.743688870541257e-05, + "loss": 0.035, + "step": 68820 + }, + { + "epoch": 0.151797509224075, + "grad_norm": 0.12103613466024399, + "learning_rate": 2.7435964237860033e-05, + "loss": 0.0376, + "step": 68830 + }, + { + "epoch": 0.15181956319897316, + "grad_norm": 0.1191381961107254, + "learning_rate": 2.7435039619198788e-05, + "loss": 0.0362, + "step": 68840 + }, + { + "epoch": 0.15184161717387135, + "grad_norm": 0.12080475687980652, + "learning_rate": 2.7434114849440073e-05, + "loss": 0.035, + "step": 68850 + }, + { + "epoch": 0.1518636711487695, + "grad_norm": 0.12312014400959015, + "learning_rate": 2.743318992859512e-05, + "loss": 0.0336, + "step": 68860 + }, + { + "epoch": 0.15188572512366766, + "grad_norm": 0.1263076812028885, + "learning_rate": 2.743226485667517e-05, + "loss": 0.034, + "step": 68870 + }, + { + "epoch": 0.15190777909856584, + "grad_norm": 0.10893841087818146, + "learning_rate": 2.7431339633691463e-05, + "loss": 0.0346, + "step": 68880 + }, + { + "epoch": 0.151929833073464, + "grad_norm": 0.15592941641807556, + "learning_rate": 2.7430414259655245e-05, + "loss": 0.0358, + "step": 68890 + }, + { + "epoch": 0.15195188704836216, + "grad_norm": 0.10929703712463379, + "learning_rate": 2.742948873457776e-05, + "loss": 0.0358, + "step": 68900 + }, + { + "epoch": 0.15197394102326034, + "grad_norm": 0.10637789964675903, + "learning_rate": 2.7428563058470246e-05, + "loss": 0.036, + "step": 68910 + }, + { + "epoch": 0.1519959949981585, + "grad_norm": 0.10992871969938278, + "learning_rate": 2.742763723134396e-05, + "loss": 0.0343, + "step": 68920 + }, + { + "epoch": 0.15201804897305665, + "grad_norm": 0.11234638839960098, + "learning_rate": 2.7426711253210145e-05, + "loss": 0.0359, + "step": 68930 + }, + { + "epoch": 0.15204010294795484, + "grad_norm": 0.09089113026857376, + "learning_rate": 2.7425785124080057e-05, + "loss": 0.0352, + "step": 68940 + }, + { + "epoch": 0.152062156922853, + "grad_norm": 0.1379413604736328, + "learning_rate": 2.742485884396495e-05, + "loss": 0.0357, + "step": 68950 + }, + { + "epoch": 0.15208421089775115, + "grad_norm": 0.10038939863443375, + "learning_rate": 2.7423932412876074e-05, + "loss": 0.0352, + "step": 68960 + }, + { + "epoch": 0.15210626487264933, + "grad_norm": 0.14745865762233734, + "learning_rate": 2.742300583082469e-05, + "loss": 0.0359, + "step": 68970 + }, + { + "epoch": 0.1521283188475475, + "grad_norm": 0.10115434974431992, + "learning_rate": 2.742207909782206e-05, + "loss": 0.0339, + "step": 68980 + }, + { + "epoch": 0.15215037282244565, + "grad_norm": 0.10157597064971924, + "learning_rate": 2.7421152213879434e-05, + "loss": 0.0359, + "step": 68990 + }, + { + "epoch": 0.15217242679734383, + "grad_norm": 0.08252786099910736, + "learning_rate": 2.7420225179008085e-05, + "loss": 0.0359, + "step": 69000 + }, + { + "epoch": 0.15219448077224199, + "grad_norm": 0.1071147471666336, + "learning_rate": 2.7419297993219272e-05, + "loss": 0.0329, + "step": 69010 + }, + { + "epoch": 0.15221653474714014, + "grad_norm": 0.10935798287391663, + "learning_rate": 2.741837065652426e-05, + "loss": 0.0358, + "step": 69020 + }, + { + "epoch": 0.15223858872203833, + "grad_norm": 0.11916395276784897, + "learning_rate": 2.741744316893432e-05, + "loss": 0.0351, + "step": 69030 + }, + { + "epoch": 0.15226064269693648, + "grad_norm": 0.14532214403152466, + "learning_rate": 2.7416515530460728e-05, + "loss": 0.0347, + "step": 69040 + }, + { + "epoch": 0.15228269667183464, + "grad_norm": 0.11050083488225937, + "learning_rate": 2.7415587741114743e-05, + "loss": 0.0348, + "step": 69050 + }, + { + "epoch": 0.15230475064673282, + "grad_norm": 0.1075870469212532, + "learning_rate": 2.7414659800907648e-05, + "loss": 0.0332, + "step": 69060 + }, + { + "epoch": 0.15232680462163098, + "grad_norm": 0.10428957641124725, + "learning_rate": 2.7413731709850716e-05, + "loss": 0.0355, + "step": 69070 + }, + { + "epoch": 0.15234885859652914, + "grad_norm": 0.119698166847229, + "learning_rate": 2.741280346795522e-05, + "loss": 0.0351, + "step": 69080 + }, + { + "epoch": 0.15237091257142732, + "grad_norm": 0.11367451399564743, + "learning_rate": 2.7411875075232442e-05, + "loss": 0.0361, + "step": 69090 + }, + { + "epoch": 0.15239296654632548, + "grad_norm": 0.1235976591706276, + "learning_rate": 2.7410946531693665e-05, + "loss": 0.034, + "step": 69100 + }, + { + "epoch": 0.15241502052122363, + "grad_norm": 0.11354267597198486, + "learning_rate": 2.741001783735017e-05, + "loss": 0.0339, + "step": 69110 + }, + { + "epoch": 0.15243707449612182, + "grad_norm": 0.15339292585849762, + "learning_rate": 2.7409088992213238e-05, + "loss": 0.0367, + "step": 69120 + }, + { + "epoch": 0.15245912847101997, + "grad_norm": 0.1340147852897644, + "learning_rate": 2.740815999629416e-05, + "loss": 0.0363, + "step": 69130 + }, + { + "epoch": 0.15248118244591813, + "grad_norm": 0.12681083381175995, + "learning_rate": 2.7407230849604227e-05, + "loss": 0.0343, + "step": 69140 + }, + { + "epoch": 0.1525032364208163, + "grad_norm": 0.11852356046438217, + "learning_rate": 2.740630155215472e-05, + "loss": 0.0324, + "step": 69150 + }, + { + "epoch": 0.15252529039571447, + "grad_norm": 0.11179198324680328, + "learning_rate": 2.740537210395694e-05, + "loss": 0.037, + "step": 69160 + }, + { + "epoch": 0.15254734437061263, + "grad_norm": 0.11664335429668427, + "learning_rate": 2.740444250502217e-05, + "loss": 0.0338, + "step": 69170 + }, + { + "epoch": 0.1525693983455108, + "grad_norm": 0.13034987449645996, + "learning_rate": 2.7403512755361718e-05, + "loss": 0.0365, + "step": 69180 + }, + { + "epoch": 0.15259145232040897, + "grad_norm": 0.13932736217975616, + "learning_rate": 2.7402582854986873e-05, + "loss": 0.0333, + "step": 69190 + }, + { + "epoch": 0.15261350629530712, + "grad_norm": 0.12442830950021744, + "learning_rate": 2.7401652803908936e-05, + "loss": 0.0346, + "step": 69200 + }, + { + "epoch": 0.1526355602702053, + "grad_norm": 0.11089073866605759, + "learning_rate": 2.7400722602139205e-05, + "loss": 0.0343, + "step": 69210 + }, + { + "epoch": 0.15265761424510346, + "grad_norm": 0.11691591888666153, + "learning_rate": 2.7399792249688992e-05, + "loss": 0.0342, + "step": 69220 + }, + { + "epoch": 0.15267966822000162, + "grad_norm": 0.13695134222507477, + "learning_rate": 2.7398861746569594e-05, + "loss": 0.0356, + "step": 69230 + }, + { + "epoch": 0.1527017221948998, + "grad_norm": 0.10117494314908981, + "learning_rate": 2.739793109279232e-05, + "loss": 0.0349, + "step": 69240 + }, + { + "epoch": 0.15272377616979796, + "grad_norm": 0.14164471626281738, + "learning_rate": 2.7397000288368473e-05, + "loss": 0.0353, + "step": 69250 + }, + { + "epoch": 0.15274583014469614, + "grad_norm": 0.14844241738319397, + "learning_rate": 2.739606933330937e-05, + "loss": 0.0358, + "step": 69260 + }, + { + "epoch": 0.1527678841195943, + "grad_norm": 0.12782077491283417, + "learning_rate": 2.7395138227626324e-05, + "loss": 0.0342, + "step": 69270 + }, + { + "epoch": 0.15278993809449246, + "grad_norm": 0.1251014769077301, + "learning_rate": 2.739420697133064e-05, + "loss": 0.035, + "step": 69280 + }, + { + "epoch": 0.15281199206939064, + "grad_norm": 0.10733821243047714, + "learning_rate": 2.7393275564433648e-05, + "loss": 0.0342, + "step": 69290 + }, + { + "epoch": 0.1528340460442888, + "grad_norm": 0.1188613772392273, + "learning_rate": 2.7392344006946652e-05, + "loss": 0.0335, + "step": 69300 + }, + { + "epoch": 0.15285610001918695, + "grad_norm": 0.11998144537210464, + "learning_rate": 2.7391412298880972e-05, + "loss": 0.0358, + "step": 69310 + }, + { + "epoch": 0.15287815399408514, + "grad_norm": 0.09959378093481064, + "learning_rate": 2.7390480440247932e-05, + "loss": 0.0344, + "step": 69320 + }, + { + "epoch": 0.1529002079689833, + "grad_norm": 0.11575840413570404, + "learning_rate": 2.7389548431058862e-05, + "loss": 0.0345, + "step": 69330 + }, + { + "epoch": 0.15292226194388145, + "grad_norm": 0.09712696820497513, + "learning_rate": 2.738861627132508e-05, + "loss": 0.0368, + "step": 69340 + }, + { + "epoch": 0.15294431591877963, + "grad_norm": 0.11303149163722992, + "learning_rate": 2.7387683961057913e-05, + "loss": 0.0353, + "step": 69350 + }, + { + "epoch": 0.1529663698936778, + "grad_norm": 0.14427173137664795, + "learning_rate": 2.738675150026869e-05, + "loss": 0.0385, + "step": 69360 + }, + { + "epoch": 0.15298842386857595, + "grad_norm": 0.12448830157518387, + "learning_rate": 2.7385818888968743e-05, + "loss": 0.0346, + "step": 69370 + }, + { + "epoch": 0.15301047784347413, + "grad_norm": 0.12218768149614334, + "learning_rate": 2.7384886127169398e-05, + "loss": 0.0365, + "step": 69380 + }, + { + "epoch": 0.15303253181837229, + "grad_norm": 0.11290305107831955, + "learning_rate": 2.7383953214881995e-05, + "loss": 0.0341, + "step": 69390 + }, + { + "epoch": 0.15305458579327044, + "grad_norm": 0.08560696244239807, + "learning_rate": 2.738302015211787e-05, + "loss": 0.0367, + "step": 69400 + }, + { + "epoch": 0.15307663976816863, + "grad_norm": 0.11080530285835266, + "learning_rate": 2.7382086938888356e-05, + "loss": 0.0325, + "step": 69410 + }, + { + "epoch": 0.15309869374306678, + "grad_norm": 0.10230516642332077, + "learning_rate": 2.73811535752048e-05, + "loss": 0.0341, + "step": 69420 + }, + { + "epoch": 0.15312074771796494, + "grad_norm": 0.10449487715959549, + "learning_rate": 2.7380220061078537e-05, + "loss": 0.0336, + "step": 69430 + }, + { + "epoch": 0.15314280169286312, + "grad_norm": 0.0958237424492836, + "learning_rate": 2.7379286396520906e-05, + "loss": 0.0331, + "step": 69440 + }, + { + "epoch": 0.15316485566776128, + "grad_norm": 0.10758619755506516, + "learning_rate": 2.7378352581543257e-05, + "loss": 0.0345, + "step": 69450 + }, + { + "epoch": 0.15318690964265944, + "grad_norm": 0.1266203075647354, + "learning_rate": 2.7377418616156942e-05, + "loss": 0.036, + "step": 69460 + }, + { + "epoch": 0.15320896361755762, + "grad_norm": 0.10789960622787476, + "learning_rate": 2.7376484500373303e-05, + "loss": 0.0361, + "step": 69470 + }, + { + "epoch": 0.15323101759245578, + "grad_norm": 0.10032138973474503, + "learning_rate": 2.7375550234203698e-05, + "loss": 0.0355, + "step": 69480 + }, + { + "epoch": 0.15325307156735393, + "grad_norm": 0.13082213699817657, + "learning_rate": 2.7374615817659462e-05, + "loss": 0.0373, + "step": 69490 + }, + { + "epoch": 0.15327512554225212, + "grad_norm": 0.11622358858585358, + "learning_rate": 2.7373681250751968e-05, + "loss": 0.0349, + "step": 69500 + }, + { + "epoch": 0.15329717951715027, + "grad_norm": 0.10190194845199585, + "learning_rate": 2.737274653349256e-05, + "loss": 0.0355, + "step": 69510 + }, + { + "epoch": 0.15331923349204843, + "grad_norm": 0.1260388195514679, + "learning_rate": 2.73718116658926e-05, + "loss": 0.035, + "step": 69520 + }, + { + "epoch": 0.1533412874669466, + "grad_norm": 0.09717079252004623, + "learning_rate": 2.737087664796345e-05, + "loss": 0.0336, + "step": 69530 + }, + { + "epoch": 0.15336334144184477, + "grad_norm": 0.1412764936685562, + "learning_rate": 2.736994147971647e-05, + "loss": 0.0345, + "step": 69540 + }, + { + "epoch": 0.15338539541674293, + "grad_norm": 0.11385070532560349, + "learning_rate": 2.7369006161163016e-05, + "loss": 0.0347, + "step": 69550 + }, + { + "epoch": 0.1534074493916411, + "grad_norm": 0.09894060343503952, + "learning_rate": 2.7368070692314466e-05, + "loss": 0.0339, + "step": 69560 + }, + { + "epoch": 0.15342950336653927, + "grad_norm": 0.1263551115989685, + "learning_rate": 2.7367135073182176e-05, + "loss": 0.0355, + "step": 69570 + }, + { + "epoch": 0.15345155734143742, + "grad_norm": 0.12519770860671997, + "learning_rate": 2.7366199303777517e-05, + "loss": 0.0353, + "step": 69580 + }, + { + "epoch": 0.1534736113163356, + "grad_norm": 0.13839957118034363, + "learning_rate": 2.7365263384111863e-05, + "loss": 0.0346, + "step": 69590 + }, + { + "epoch": 0.15349566529123376, + "grad_norm": 0.08919138461351395, + "learning_rate": 2.736432731419658e-05, + "loss": 0.0363, + "step": 69600 + }, + { + "epoch": 0.15351771926613192, + "grad_norm": 0.09272410720586777, + "learning_rate": 2.7363391094043048e-05, + "loss": 0.034, + "step": 69610 + }, + { + "epoch": 0.1535397732410301, + "grad_norm": 0.10315505415201187, + "learning_rate": 2.7362454723662644e-05, + "loss": 0.0341, + "step": 69620 + }, + { + "epoch": 0.15356182721592826, + "grad_norm": 0.10028444230556488, + "learning_rate": 2.7361518203066742e-05, + "loss": 0.034, + "step": 69630 + }, + { + "epoch": 0.15358388119082642, + "grad_norm": 0.09687095880508423, + "learning_rate": 2.7360581532266724e-05, + "loss": 0.0364, + "step": 69640 + }, + { + "epoch": 0.1536059351657246, + "grad_norm": 0.15945200622081757, + "learning_rate": 2.7359644711273964e-05, + "loss": 0.0352, + "step": 69650 + }, + { + "epoch": 0.15362798914062276, + "grad_norm": 0.10211202502250671, + "learning_rate": 2.7358707740099858e-05, + "loss": 0.0343, + "step": 69660 + }, + { + "epoch": 0.1536500431155209, + "grad_norm": 0.10642051696777344, + "learning_rate": 2.7357770618755785e-05, + "loss": 0.0354, + "step": 69670 + }, + { + "epoch": 0.1536720970904191, + "grad_norm": 0.10268079489469528, + "learning_rate": 2.735683334725313e-05, + "loss": 0.0373, + "step": 69680 + }, + { + "epoch": 0.15369415106531725, + "grad_norm": 0.10466935485601425, + "learning_rate": 2.735589592560328e-05, + "loss": 0.0339, + "step": 69690 + }, + { + "epoch": 0.15371620504021544, + "grad_norm": 0.11488748341798782, + "learning_rate": 2.735495835381763e-05, + "loss": 0.0347, + "step": 69700 + }, + { + "epoch": 0.1537382590151136, + "grad_norm": 0.12875553965568542, + "learning_rate": 2.7354020631907572e-05, + "loss": 0.0354, + "step": 69710 + }, + { + "epoch": 0.15376031299001175, + "grad_norm": 0.11928165704011917, + "learning_rate": 2.7353082759884497e-05, + "loss": 0.0341, + "step": 69720 + }, + { + "epoch": 0.15378236696490993, + "grad_norm": 0.13828809559345245, + "learning_rate": 2.7352144737759804e-05, + "loss": 0.0341, + "step": 69730 + }, + { + "epoch": 0.1538044209398081, + "grad_norm": 0.10778825730085373, + "learning_rate": 2.7351206565544888e-05, + "loss": 0.0358, + "step": 69740 + }, + { + "epoch": 0.15382647491470625, + "grad_norm": 0.10911334306001663, + "learning_rate": 2.7350268243251156e-05, + "loss": 0.0341, + "step": 69750 + }, + { + "epoch": 0.15384852888960443, + "grad_norm": 0.10369371622800827, + "learning_rate": 2.734932977089e-05, + "loss": 0.0338, + "step": 69760 + }, + { + "epoch": 0.1538705828645026, + "grad_norm": 0.10786005854606628, + "learning_rate": 2.734839114847283e-05, + "loss": 0.0345, + "step": 69770 + }, + { + "epoch": 0.15389263683940074, + "grad_norm": 0.10298171639442444, + "learning_rate": 2.734745237601105e-05, + "loss": 0.0348, + "step": 69780 + }, + { + "epoch": 0.15391469081429893, + "grad_norm": 0.12235280126333237, + "learning_rate": 2.7346513453516065e-05, + "loss": 0.033, + "step": 69790 + }, + { + "epoch": 0.15393674478919708, + "grad_norm": 0.12406347692012787, + "learning_rate": 2.7345574380999282e-05, + "loss": 0.0353, + "step": 69800 + }, + { + "epoch": 0.15395879876409524, + "grad_norm": 0.10435754060745239, + "learning_rate": 2.7344635158472114e-05, + "loss": 0.0341, + "step": 69810 + }, + { + "epoch": 0.15398085273899342, + "grad_norm": 0.091422438621521, + "learning_rate": 2.7343695785945975e-05, + "loss": 0.0358, + "step": 69820 + }, + { + "epoch": 0.15400290671389158, + "grad_norm": 0.11334411054849625, + "learning_rate": 2.734275626343228e-05, + "loss": 0.0339, + "step": 69830 + }, + { + "epoch": 0.15402496068878974, + "grad_norm": 0.10858696699142456, + "learning_rate": 2.734181659094244e-05, + "loss": 0.0351, + "step": 69840 + }, + { + "epoch": 0.15404701466368792, + "grad_norm": 0.11708606034517288, + "learning_rate": 2.7340876768487872e-05, + "loss": 0.0352, + "step": 69850 + }, + { + "epoch": 0.15406906863858608, + "grad_norm": 0.13904887437820435, + "learning_rate": 2.7339936796080006e-05, + "loss": 0.0336, + "step": 69860 + }, + { + "epoch": 0.15409112261348423, + "grad_norm": 0.11847659945487976, + "learning_rate": 2.7338996673730253e-05, + "loss": 0.0351, + "step": 69870 + }, + { + "epoch": 0.15411317658838242, + "grad_norm": 0.12097229808568954, + "learning_rate": 2.7338056401450044e-05, + "loss": 0.0373, + "step": 69880 + }, + { + "epoch": 0.15413523056328057, + "grad_norm": 0.10203062742948532, + "learning_rate": 2.73371159792508e-05, + "loss": 0.036, + "step": 69890 + }, + { + "epoch": 0.15415728453817873, + "grad_norm": 0.11809439957141876, + "learning_rate": 2.7336175407143946e-05, + "loss": 0.0334, + "step": 69900 + }, + { + "epoch": 0.1541793385130769, + "grad_norm": 0.11477752774953842, + "learning_rate": 2.7335234685140914e-05, + "loss": 0.0324, + "step": 69910 + }, + { + "epoch": 0.15420139248797507, + "grad_norm": 0.07688542455434799, + "learning_rate": 2.733429381325314e-05, + "loss": 0.033, + "step": 69920 + }, + { + "epoch": 0.15422344646287323, + "grad_norm": 0.11808191239833832, + "learning_rate": 2.7333352791492045e-05, + "loss": 0.0348, + "step": 69930 + }, + { + "epoch": 0.1542455004377714, + "grad_norm": 0.09500274807214737, + "learning_rate": 2.733241161986907e-05, + "loss": 0.0361, + "step": 69940 + }, + { + "epoch": 0.15426755441266957, + "grad_norm": 0.14579610526561737, + "learning_rate": 2.733147029839565e-05, + "loss": 0.0347, + "step": 69950 + }, + { + "epoch": 0.15428960838756772, + "grad_norm": 0.09403255581855774, + "learning_rate": 2.7330528827083226e-05, + "loss": 0.0346, + "step": 69960 + }, + { + "epoch": 0.1543116623624659, + "grad_norm": 0.13784676790237427, + "learning_rate": 2.732958720594323e-05, + "loss": 0.0363, + "step": 69970 + }, + { + "epoch": 0.15433371633736406, + "grad_norm": 0.12974897027015686, + "learning_rate": 2.7328645434987113e-05, + "loss": 0.0351, + "step": 69980 + }, + { + "epoch": 0.15435577031226222, + "grad_norm": 0.11019710451364517, + "learning_rate": 2.732770351422631e-05, + "loss": 0.0342, + "step": 69990 + }, + { + "epoch": 0.1543778242871604, + "grad_norm": 0.15334153175354004, + "learning_rate": 2.732676144367227e-05, + "loss": 0.0336, + "step": 70000 + }, + { + "epoch": 0.15439987826205856, + "grad_norm": 0.11849468946456909, + "learning_rate": 2.7325819223336442e-05, + "loss": 0.0365, + "step": 70010 + }, + { + "epoch": 0.15442193223695672, + "grad_norm": 0.10279576480388641, + "learning_rate": 2.732487685323027e-05, + "loss": 0.0369, + "step": 70020 + }, + { + "epoch": 0.1544439862118549, + "grad_norm": 0.11565304547548294, + "learning_rate": 2.7323934333365213e-05, + "loss": 0.0363, + "step": 70030 + }, + { + "epoch": 0.15446604018675306, + "grad_norm": 0.12206649780273438, + "learning_rate": 2.7322991663752715e-05, + "loss": 0.0341, + "step": 70040 + }, + { + "epoch": 0.1544880941616512, + "grad_norm": 0.11591152101755142, + "learning_rate": 2.732204884440423e-05, + "loss": 0.0342, + "step": 70050 + }, + { + "epoch": 0.1545101481365494, + "grad_norm": 0.10106994211673737, + "learning_rate": 2.7321105875331226e-05, + "loss": 0.0354, + "step": 70060 + }, + { + "epoch": 0.15453220211144755, + "grad_norm": 0.12361317127943039, + "learning_rate": 2.7320162756545145e-05, + "loss": 0.0336, + "step": 70070 + }, + { + "epoch": 0.1545542560863457, + "grad_norm": 0.13801918923854828, + "learning_rate": 2.7319219488057456e-05, + "loss": 0.0363, + "step": 70080 + }, + { + "epoch": 0.1545763100612439, + "grad_norm": 0.15593908727169037, + "learning_rate": 2.731827606987962e-05, + "loss": 0.0351, + "step": 70090 + }, + { + "epoch": 0.15459836403614205, + "grad_norm": 0.09833718836307526, + "learning_rate": 2.7317332502023097e-05, + "loss": 0.0349, + "step": 70100 + }, + { + "epoch": 0.1546204180110402, + "grad_norm": 0.10835815966129303, + "learning_rate": 2.7316388784499356e-05, + "loss": 0.0355, + "step": 70110 + }, + { + "epoch": 0.1546424719859384, + "grad_norm": 0.1030033677816391, + "learning_rate": 2.731544491731987e-05, + "loss": 0.0347, + "step": 70120 + }, + { + "epoch": 0.15466452596083655, + "grad_norm": 0.12271866947412491, + "learning_rate": 2.7314500900496092e-05, + "loss": 0.0342, + "step": 70130 + }, + { + "epoch": 0.15468657993573473, + "grad_norm": 0.12368637323379517, + "learning_rate": 2.7313556734039503e-05, + "loss": 0.0322, + "step": 70140 + }, + { + "epoch": 0.1547086339106329, + "grad_norm": 0.10331259667873383, + "learning_rate": 2.7312612417961573e-05, + "loss": 0.0356, + "step": 70150 + }, + { + "epoch": 0.15473068788553104, + "grad_norm": 0.0839386060833931, + "learning_rate": 2.731166795227378e-05, + "loss": 0.0344, + "step": 70160 + }, + { + "epoch": 0.15475274186042923, + "grad_norm": 0.09369631856679916, + "learning_rate": 2.7310723336987595e-05, + "loss": 0.034, + "step": 70170 + }, + { + "epoch": 0.15477479583532738, + "grad_norm": 0.09013018757104874, + "learning_rate": 2.7309778572114496e-05, + "loss": 0.0361, + "step": 70180 + }, + { + "epoch": 0.15479684981022554, + "grad_norm": 0.0872434452176094, + "learning_rate": 2.730883365766597e-05, + "loss": 0.0339, + "step": 70190 + }, + { + "epoch": 0.15481890378512372, + "grad_norm": 0.12079546600580215, + "learning_rate": 2.730788859365349e-05, + "loss": 0.0342, + "step": 70200 + }, + { + "epoch": 0.15484095776002188, + "grad_norm": 0.10378814488649368, + "learning_rate": 2.7306943380088544e-05, + "loss": 0.0332, + "step": 70210 + }, + { + "epoch": 0.15486301173492004, + "grad_norm": 0.12514661252498627, + "learning_rate": 2.730599801698262e-05, + "loss": 0.0342, + "step": 70220 + }, + { + "epoch": 0.15488506570981822, + "grad_norm": 0.12824185192584991, + "learning_rate": 2.7305052504347194e-05, + "loss": 0.037, + "step": 70230 + }, + { + "epoch": 0.15490711968471638, + "grad_norm": 0.10267851501703262, + "learning_rate": 2.730410684219377e-05, + "loss": 0.0339, + "step": 70240 + }, + { + "epoch": 0.15492917365961453, + "grad_norm": 0.11860992014408112, + "learning_rate": 2.7303161030533826e-05, + "loss": 0.0327, + "step": 70250 + }, + { + "epoch": 0.15495122763451272, + "grad_norm": 0.11500384658575058, + "learning_rate": 2.7302215069378858e-05, + "loss": 0.0342, + "step": 70260 + }, + { + "epoch": 0.15497328160941087, + "grad_norm": 0.0979299321770668, + "learning_rate": 2.7301268958740365e-05, + "loss": 0.0349, + "step": 70270 + }, + { + "epoch": 0.15499533558430903, + "grad_norm": 0.0956253856420517, + "learning_rate": 2.7300322698629838e-05, + "loss": 0.0333, + "step": 70280 + }, + { + "epoch": 0.1550173895592072, + "grad_norm": 0.08495601266622543, + "learning_rate": 2.7299376289058778e-05, + "loss": 0.0349, + "step": 70290 + }, + { + "epoch": 0.15503944353410537, + "grad_norm": 0.12573260068893433, + "learning_rate": 2.7298429730038685e-05, + "loss": 0.0357, + "step": 70300 + }, + { + "epoch": 0.15506149750900353, + "grad_norm": 0.10975603759288788, + "learning_rate": 2.729748302158106e-05, + "loss": 0.0347, + "step": 70310 + }, + { + "epoch": 0.1550835514839017, + "grad_norm": 0.11694534868001938, + "learning_rate": 2.7296536163697398e-05, + "loss": 0.0354, + "step": 70320 + }, + { + "epoch": 0.15510560545879987, + "grad_norm": 0.10999235510826111, + "learning_rate": 2.729558915639922e-05, + "loss": 0.0338, + "step": 70330 + }, + { + "epoch": 0.15512765943369802, + "grad_norm": 0.11193699389696121, + "learning_rate": 2.729464199969802e-05, + "loss": 0.0346, + "step": 70340 + }, + { + "epoch": 0.1551497134085962, + "grad_norm": 0.1069992184638977, + "learning_rate": 2.7293694693605312e-05, + "loss": 0.0339, + "step": 70350 + }, + { + "epoch": 0.15517176738349436, + "grad_norm": 0.1669016182422638, + "learning_rate": 2.7292747238132606e-05, + "loss": 0.0347, + "step": 70360 + }, + { + "epoch": 0.15519382135839252, + "grad_norm": 0.1077481061220169, + "learning_rate": 2.7291799633291416e-05, + "loss": 0.0334, + "step": 70370 + }, + { + "epoch": 0.1552158753332907, + "grad_norm": 0.12229958921670914, + "learning_rate": 2.7290851879093255e-05, + "loss": 0.0326, + "step": 70380 + }, + { + "epoch": 0.15523792930818886, + "grad_norm": 0.11196546256542206, + "learning_rate": 2.728990397554964e-05, + "loss": 0.0356, + "step": 70390 + }, + { + "epoch": 0.15525998328308702, + "grad_norm": 0.1337067186832428, + "learning_rate": 2.7288955922672084e-05, + "loss": 0.0348, + "step": 70400 + }, + { + "epoch": 0.1552820372579852, + "grad_norm": 0.10547921061515808, + "learning_rate": 2.728800772047212e-05, + "loss": 0.0344, + "step": 70410 + }, + { + "epoch": 0.15530409123288336, + "grad_norm": 0.11779747158288956, + "learning_rate": 2.7287059368961253e-05, + "loss": 0.0336, + "step": 70420 + }, + { + "epoch": 0.1553261452077815, + "grad_norm": 0.1097370907664299, + "learning_rate": 2.7286110868151014e-05, + "loss": 0.035, + "step": 70430 + }, + { + "epoch": 0.1553481991826797, + "grad_norm": 0.08850926160812378, + "learning_rate": 2.7285162218052932e-05, + "loss": 0.0337, + "step": 70440 + }, + { + "epoch": 0.15537025315757785, + "grad_norm": 0.10947660356760025, + "learning_rate": 2.7284213418678527e-05, + "loss": 0.0356, + "step": 70450 + }, + { + "epoch": 0.155392307132476, + "grad_norm": 0.09777608513832092, + "learning_rate": 2.728326447003933e-05, + "loss": 0.0372, + "step": 70460 + }, + { + "epoch": 0.1554143611073742, + "grad_norm": 0.1129453107714653, + "learning_rate": 2.7282315372146872e-05, + "loss": 0.0354, + "step": 70470 + }, + { + "epoch": 0.15543641508227235, + "grad_norm": 0.12790551781654358, + "learning_rate": 2.7281366125012688e-05, + "loss": 0.0347, + "step": 70480 + }, + { + "epoch": 0.1554584690571705, + "grad_norm": 0.11064569652080536, + "learning_rate": 2.728041672864831e-05, + "loss": 0.0334, + "step": 70490 + }, + { + "epoch": 0.1554805230320687, + "grad_norm": 0.10545699298381805, + "learning_rate": 2.7279467183065275e-05, + "loss": 0.0374, + "step": 70500 + }, + { + "epoch": 0.15550257700696685, + "grad_norm": 0.1290944218635559, + "learning_rate": 2.7278517488275117e-05, + "loss": 0.0355, + "step": 70510 + }, + { + "epoch": 0.155524630981865, + "grad_norm": 0.07330048829317093, + "learning_rate": 2.727756764428938e-05, + "loss": 0.0334, + "step": 70520 + }, + { + "epoch": 0.1555466849567632, + "grad_norm": 0.10493189096450806, + "learning_rate": 2.7276617651119606e-05, + "loss": 0.0328, + "step": 70530 + }, + { + "epoch": 0.15556873893166134, + "grad_norm": 0.12447945773601532, + "learning_rate": 2.7275667508777333e-05, + "loss": 0.0342, + "step": 70540 + }, + { + "epoch": 0.15559079290655953, + "grad_norm": 0.11420506983995438, + "learning_rate": 2.7274717217274113e-05, + "loss": 0.0366, + "step": 70550 + }, + { + "epoch": 0.15561284688145768, + "grad_norm": 0.10589373856782913, + "learning_rate": 2.7273766776621484e-05, + "loss": 0.0365, + "step": 70560 + }, + { + "epoch": 0.15563490085635584, + "grad_norm": 0.12768810987472534, + "learning_rate": 2.7272816186831007e-05, + "loss": 0.0331, + "step": 70570 + }, + { + "epoch": 0.15565695483125402, + "grad_norm": 0.09784634411334991, + "learning_rate": 2.727186544791422e-05, + "loss": 0.0353, + "step": 70580 + }, + { + "epoch": 0.15567900880615218, + "grad_norm": 0.12810292840003967, + "learning_rate": 2.7270914559882688e-05, + "loss": 0.0364, + "step": 70590 + }, + { + "epoch": 0.15570106278105034, + "grad_norm": 0.09009446203708649, + "learning_rate": 2.7269963522747952e-05, + "loss": 0.0324, + "step": 70600 + }, + { + "epoch": 0.15572311675594852, + "grad_norm": 0.10951623320579529, + "learning_rate": 2.726901233652158e-05, + "loss": 0.0343, + "step": 70610 + }, + { + "epoch": 0.15574517073084668, + "grad_norm": 0.12027130275964737, + "learning_rate": 2.726806100121512e-05, + "loss": 0.0349, + "step": 70620 + }, + { + "epoch": 0.15576722470574483, + "grad_norm": 0.12617933750152588, + "learning_rate": 2.7267109516840138e-05, + "loss": 0.037, + "step": 70630 + }, + { + "epoch": 0.15578927868064302, + "grad_norm": 0.11918928474187851, + "learning_rate": 2.726615788340819e-05, + "loss": 0.0336, + "step": 70640 + }, + { + "epoch": 0.15581133265554117, + "grad_norm": 0.10998764634132385, + "learning_rate": 2.7265206100930844e-05, + "loss": 0.0345, + "step": 70650 + }, + { + "epoch": 0.15583338663043933, + "grad_norm": 0.123602956533432, + "learning_rate": 2.7264254169419668e-05, + "loss": 0.034, + "step": 70660 + }, + { + "epoch": 0.15585544060533751, + "grad_norm": 0.09741422533988953, + "learning_rate": 2.726330208888622e-05, + "loss": 0.0334, + "step": 70670 + }, + { + "epoch": 0.15587749458023567, + "grad_norm": 0.08980292081832886, + "learning_rate": 2.7262349859342073e-05, + "loss": 0.0356, + "step": 70680 + }, + { + "epoch": 0.15589954855513383, + "grad_norm": 0.10531866550445557, + "learning_rate": 2.72613974807988e-05, + "loss": 0.0359, + "step": 70690 + }, + { + "epoch": 0.155921602530032, + "grad_norm": 0.09965700656175613, + "learning_rate": 2.726044495326797e-05, + "loss": 0.0346, + "step": 70700 + }, + { + "epoch": 0.15594365650493017, + "grad_norm": 0.1250828057527542, + "learning_rate": 2.7259492276761158e-05, + "loss": 0.0339, + "step": 70710 + }, + { + "epoch": 0.15596571047982832, + "grad_norm": 0.09011407941579819, + "learning_rate": 2.7258539451289942e-05, + "loss": 0.0336, + "step": 70720 + }, + { + "epoch": 0.1559877644547265, + "grad_norm": 0.09783818572759628, + "learning_rate": 2.7257586476865895e-05, + "loss": 0.0305, + "step": 70730 + }, + { + "epoch": 0.15600981842962466, + "grad_norm": 0.09479641914367676, + "learning_rate": 2.7256633353500598e-05, + "loss": 0.0354, + "step": 70740 + }, + { + "epoch": 0.15603187240452282, + "grad_norm": 0.12178314477205276, + "learning_rate": 2.7255680081205634e-05, + "loss": 0.034, + "step": 70750 + }, + { + "epoch": 0.156053926379421, + "grad_norm": 0.11390098184347153, + "learning_rate": 2.725472665999259e-05, + "loss": 0.0353, + "step": 70760 + }, + { + "epoch": 0.15607598035431916, + "grad_norm": 0.10608606040477753, + "learning_rate": 2.725377308987304e-05, + "loss": 0.0349, + "step": 70770 + }, + { + "epoch": 0.15609803432921732, + "grad_norm": 0.08490367233753204, + "learning_rate": 2.7252819370858586e-05, + "loss": 0.0351, + "step": 70780 + }, + { + "epoch": 0.1561200883041155, + "grad_norm": 0.09945554286241531, + "learning_rate": 2.72518655029608e-05, + "loss": 0.0352, + "step": 70790 + }, + { + "epoch": 0.15614214227901366, + "grad_norm": 0.11798721551895142, + "learning_rate": 2.7250911486191284e-05, + "loss": 0.035, + "step": 70800 + }, + { + "epoch": 0.1561641962539118, + "grad_norm": 0.1254078596830368, + "learning_rate": 2.7249957320561625e-05, + "loss": 0.0344, + "step": 70810 + }, + { + "epoch": 0.15618625022881, + "grad_norm": 0.1338387429714203, + "learning_rate": 2.7249003006083422e-05, + "loss": 0.0336, + "step": 70820 + }, + { + "epoch": 0.15620830420370815, + "grad_norm": 0.11204489320516586, + "learning_rate": 2.7248048542768266e-05, + "loss": 0.0351, + "step": 70830 + }, + { + "epoch": 0.1562303581786063, + "grad_norm": 0.137992724776268, + "learning_rate": 2.7247093930627757e-05, + "loss": 0.0339, + "step": 70840 + }, + { + "epoch": 0.1562524121535045, + "grad_norm": 0.09429015219211578, + "learning_rate": 2.724613916967349e-05, + "loss": 0.0351, + "step": 70850 + }, + { + "epoch": 0.15627446612840265, + "grad_norm": 0.10995589196681976, + "learning_rate": 2.7245184259917075e-05, + "loss": 0.0356, + "step": 70860 + }, + { + "epoch": 0.1562965201033008, + "grad_norm": 0.07471102476119995, + "learning_rate": 2.7244229201370108e-05, + "loss": 0.0344, + "step": 70870 + }, + { + "epoch": 0.156318574078199, + "grad_norm": 0.17620007693767548, + "learning_rate": 2.7243273994044192e-05, + "loss": 0.0341, + "step": 70880 + }, + { + "epoch": 0.15634062805309715, + "grad_norm": 0.10960465669631958, + "learning_rate": 2.7242318637950942e-05, + "loss": 0.0355, + "step": 70890 + }, + { + "epoch": 0.1563626820279953, + "grad_norm": 0.10261081904172897, + "learning_rate": 2.724136313310196e-05, + "loss": 0.035, + "step": 70900 + }, + { + "epoch": 0.1563847360028935, + "grad_norm": 0.08503681421279907, + "learning_rate": 2.7240407479508855e-05, + "loss": 0.0349, + "step": 70910 + }, + { + "epoch": 0.15640678997779164, + "grad_norm": 0.1285223513841629, + "learning_rate": 2.7239451677183246e-05, + "loss": 0.0368, + "step": 70920 + }, + { + "epoch": 0.1564288439526898, + "grad_norm": 0.08087620139122009, + "learning_rate": 2.7238495726136744e-05, + "loss": 0.0338, + "step": 70930 + }, + { + "epoch": 0.15645089792758798, + "grad_norm": 0.09200000017881393, + "learning_rate": 2.7237539626380964e-05, + "loss": 0.0357, + "step": 70940 + }, + { + "epoch": 0.15647295190248614, + "grad_norm": 0.11293846368789673, + "learning_rate": 2.723658337792752e-05, + "loss": 0.0344, + "step": 70950 + }, + { + "epoch": 0.1564950058773843, + "grad_norm": 0.12042492628097534, + "learning_rate": 2.723562698078804e-05, + "loss": 0.0349, + "step": 70960 + }, + { + "epoch": 0.15651705985228248, + "grad_norm": 0.08053500950336456, + "learning_rate": 2.7234670434974136e-05, + "loss": 0.0351, + "step": 70970 + }, + { + "epoch": 0.15653911382718064, + "grad_norm": 0.10612960904836655, + "learning_rate": 2.7233713740497435e-05, + "loss": 0.0341, + "step": 70980 + }, + { + "epoch": 0.15656116780207882, + "grad_norm": 0.11943769454956055, + "learning_rate": 2.7232756897369558e-05, + "loss": 0.0369, + "step": 70990 + }, + { + "epoch": 0.15658322177697698, + "grad_norm": 0.12871341407299042, + "learning_rate": 2.723179990560214e-05, + "loss": 0.0358, + "step": 71000 + }, + { + "epoch": 0.15660527575187513, + "grad_norm": 0.09308838099241257, + "learning_rate": 2.7230842765206805e-05, + "loss": 0.0346, + "step": 71010 + }, + { + "epoch": 0.15662732972677332, + "grad_norm": 0.12980230152606964, + "learning_rate": 2.722988547619518e-05, + "loss": 0.0358, + "step": 71020 + }, + { + "epoch": 0.15664938370167147, + "grad_norm": 0.11669239401817322, + "learning_rate": 2.72289280385789e-05, + "loss": 0.0359, + "step": 71030 + }, + { + "epoch": 0.15667143767656963, + "grad_norm": 0.12726029753684998, + "learning_rate": 2.7227970452369596e-05, + "loss": 0.0329, + "step": 71040 + }, + { + "epoch": 0.15669349165146781, + "grad_norm": 0.08878193795681, + "learning_rate": 2.722701271757891e-05, + "loss": 0.0348, + "step": 71050 + }, + { + "epoch": 0.15671554562636597, + "grad_norm": 0.08408694714307785, + "learning_rate": 2.7226054834218475e-05, + "loss": 0.0341, + "step": 71060 + }, + { + "epoch": 0.15673759960126413, + "grad_norm": 0.10686423629522324, + "learning_rate": 2.7225096802299928e-05, + "loss": 0.038, + "step": 71070 + }, + { + "epoch": 0.1567596535761623, + "grad_norm": 0.12824413180351257, + "learning_rate": 2.7224138621834913e-05, + "loss": 0.0333, + "step": 71080 + }, + { + "epoch": 0.15678170755106047, + "grad_norm": 0.1648043543100357, + "learning_rate": 2.7223180292835074e-05, + "loss": 0.0348, + "step": 71090 + }, + { + "epoch": 0.15680376152595862, + "grad_norm": 0.12807145714759827, + "learning_rate": 2.722222181531205e-05, + "loss": 0.035, + "step": 71100 + }, + { + "epoch": 0.1568258155008568, + "grad_norm": 0.10168123990297318, + "learning_rate": 2.72212631892775e-05, + "loss": 0.0353, + "step": 71110 + }, + { + "epoch": 0.15684786947575496, + "grad_norm": 0.09302928298711777, + "learning_rate": 2.7220304414743054e-05, + "loss": 0.0364, + "step": 71120 + }, + { + "epoch": 0.15686992345065312, + "grad_norm": 0.1029636487364769, + "learning_rate": 2.7219345491720378e-05, + "loss": 0.0342, + "step": 71130 + }, + { + "epoch": 0.1568919774255513, + "grad_norm": 0.12153955549001694, + "learning_rate": 2.721838642022111e-05, + "loss": 0.0338, + "step": 71140 + }, + { + "epoch": 0.15691403140044946, + "grad_norm": 0.11414725333452225, + "learning_rate": 2.7217427200256917e-05, + "loss": 0.0345, + "step": 71150 + }, + { + "epoch": 0.15693608537534762, + "grad_norm": 0.12094694375991821, + "learning_rate": 2.721646783183945e-05, + "loss": 0.034, + "step": 71160 + }, + { + "epoch": 0.1569581393502458, + "grad_norm": 0.1414400190114975, + "learning_rate": 2.7215508314980358e-05, + "loss": 0.0373, + "step": 71170 + }, + { + "epoch": 0.15698019332514396, + "grad_norm": 0.10882598906755447, + "learning_rate": 2.721454864969131e-05, + "loss": 0.036, + "step": 71180 + }, + { + "epoch": 0.1570022473000421, + "grad_norm": 0.09618157148361206, + "learning_rate": 2.7213588835983965e-05, + "loss": 0.0344, + "step": 71190 + }, + { + "epoch": 0.1570243012749403, + "grad_norm": 0.09337477385997772, + "learning_rate": 2.721262887386998e-05, + "loss": 0.0338, + "step": 71200 + }, + { + "epoch": 0.15704635524983845, + "grad_norm": 0.09333585947751999, + "learning_rate": 2.721166876336103e-05, + "loss": 0.0321, + "step": 71210 + }, + { + "epoch": 0.1570684092247366, + "grad_norm": 0.0896916314959526, + "learning_rate": 2.721070850446877e-05, + "loss": 0.036, + "step": 71220 + }, + { + "epoch": 0.1570904631996348, + "grad_norm": 0.09710909426212311, + "learning_rate": 2.720974809720487e-05, + "loss": 0.034, + "step": 71230 + }, + { + "epoch": 0.15711251717453295, + "grad_norm": 0.10307887196540833, + "learning_rate": 2.720878754158101e-05, + "loss": 0.0334, + "step": 71240 + }, + { + "epoch": 0.1571345711494311, + "grad_norm": 0.10704322159290314, + "learning_rate": 2.720782683760885e-05, + "loss": 0.034, + "step": 71250 + }, + { + "epoch": 0.1571566251243293, + "grad_norm": 0.13623206317424774, + "learning_rate": 2.7206865985300064e-05, + "loss": 0.0347, + "step": 71260 + }, + { + "epoch": 0.15717867909922745, + "grad_norm": 0.08489133417606354, + "learning_rate": 2.720590498466634e-05, + "loss": 0.0358, + "step": 71270 + }, + { + "epoch": 0.1572007330741256, + "grad_norm": 0.11204022169113159, + "learning_rate": 2.720494383571934e-05, + "loss": 0.0354, + "step": 71280 + }, + { + "epoch": 0.1572227870490238, + "grad_norm": 0.10729837417602539, + "learning_rate": 2.7203982538470754e-05, + "loss": 0.0338, + "step": 71290 + }, + { + "epoch": 0.15724484102392194, + "grad_norm": 0.11492694169282913, + "learning_rate": 2.720302109293225e-05, + "loss": 0.0326, + "step": 71300 + }, + { + "epoch": 0.1572668949988201, + "grad_norm": 0.12293446809053421, + "learning_rate": 2.7202059499115522e-05, + "loss": 0.0358, + "step": 71310 + }, + { + "epoch": 0.15728894897371828, + "grad_norm": 0.11113762110471725, + "learning_rate": 2.7201097757032252e-05, + "loss": 0.0371, + "step": 71320 + }, + { + "epoch": 0.15731100294861644, + "grad_norm": 0.11051609367132187, + "learning_rate": 2.7200135866694123e-05, + "loss": 0.0349, + "step": 71330 + }, + { + "epoch": 0.1573330569235146, + "grad_norm": 0.15473529696464539, + "learning_rate": 2.7199173828112828e-05, + "loss": 0.0343, + "step": 71340 + }, + { + "epoch": 0.15735511089841278, + "grad_norm": 0.12097834050655365, + "learning_rate": 2.719821164130005e-05, + "loss": 0.0355, + "step": 71350 + }, + { + "epoch": 0.15737716487331094, + "grad_norm": 0.1099608764052391, + "learning_rate": 2.719724930626748e-05, + "loss": 0.0351, + "step": 71360 + }, + { + "epoch": 0.1573992188482091, + "grad_norm": 0.13075357675552368, + "learning_rate": 2.7196286823026818e-05, + "loss": 0.0351, + "step": 71370 + }, + { + "epoch": 0.15742127282310728, + "grad_norm": 0.10358599573373795, + "learning_rate": 2.7195324191589758e-05, + "loss": 0.0338, + "step": 71380 + }, + { + "epoch": 0.15744332679800543, + "grad_norm": 0.10173764824867249, + "learning_rate": 2.7194361411967994e-05, + "loss": 0.0336, + "step": 71390 + }, + { + "epoch": 0.15746538077290362, + "grad_norm": 0.12738527357578278, + "learning_rate": 2.7193398484173222e-05, + "loss": 0.0353, + "step": 71400 + }, + { + "epoch": 0.15748743474780177, + "grad_norm": 0.12172399461269379, + "learning_rate": 2.719243540821715e-05, + "loss": 0.034, + "step": 71410 + }, + { + "epoch": 0.15750948872269993, + "grad_norm": 0.11353709548711777, + "learning_rate": 2.7191472184111473e-05, + "loss": 0.0354, + "step": 71420 + }, + { + "epoch": 0.15753154269759811, + "grad_norm": 0.1178874522447586, + "learning_rate": 2.7190508811867902e-05, + "loss": 0.0343, + "step": 71430 + }, + { + "epoch": 0.15755359667249627, + "grad_norm": 0.11269327998161316, + "learning_rate": 2.718954529149814e-05, + "loss": 0.0356, + "step": 71440 + }, + { + "epoch": 0.15757565064739443, + "grad_norm": 0.0826556533575058, + "learning_rate": 2.718858162301389e-05, + "loss": 0.0348, + "step": 71450 + }, + { + "epoch": 0.1575977046222926, + "grad_norm": 0.1203269511461258, + "learning_rate": 2.7187617806426866e-05, + "loss": 0.0338, + "step": 71460 + }, + { + "epoch": 0.15761975859719077, + "grad_norm": 0.12406830489635468, + "learning_rate": 2.7186653841748773e-05, + "loss": 0.0358, + "step": 71470 + }, + { + "epoch": 0.15764181257208892, + "grad_norm": 0.11545898020267487, + "learning_rate": 2.718568972899134e-05, + "loss": 0.0349, + "step": 71480 + }, + { + "epoch": 0.1576638665469871, + "grad_norm": 0.12286783754825592, + "learning_rate": 2.7184725468166266e-05, + "loss": 0.0324, + "step": 71490 + }, + { + "epoch": 0.15768592052188526, + "grad_norm": 0.10973326861858368, + "learning_rate": 2.7183761059285278e-05, + "loss": 0.033, + "step": 71500 + }, + { + "epoch": 0.15770797449678342, + "grad_norm": 0.09375379979610443, + "learning_rate": 2.7182796502360085e-05, + "loss": 0.0339, + "step": 71510 + }, + { + "epoch": 0.1577300284716816, + "grad_norm": 0.11373241990804672, + "learning_rate": 2.718183179740241e-05, + "loss": 0.0321, + "step": 71520 + }, + { + "epoch": 0.15775208244657976, + "grad_norm": 0.09669342637062073, + "learning_rate": 2.7180866944423982e-05, + "loss": 0.0348, + "step": 71530 + }, + { + "epoch": 0.15777413642147792, + "grad_norm": 0.09361975640058517, + "learning_rate": 2.7179901943436515e-05, + "loss": 0.0342, + "step": 71540 + }, + { + "epoch": 0.1577961903963761, + "grad_norm": 0.10330890119075775, + "learning_rate": 2.7178936794451747e-05, + "loss": 0.0346, + "step": 71550 + }, + { + "epoch": 0.15781824437127426, + "grad_norm": 0.09753713756799698, + "learning_rate": 2.7177971497481393e-05, + "loss": 0.0331, + "step": 71560 + }, + { + "epoch": 0.15784029834617241, + "grad_norm": 0.11603765189647675, + "learning_rate": 2.717700605253719e-05, + "loss": 0.0343, + "step": 71570 + }, + { + "epoch": 0.1578623523210706, + "grad_norm": 0.12768249213695526, + "learning_rate": 2.7176040459630863e-05, + "loss": 0.0345, + "step": 71580 + }, + { + "epoch": 0.15788440629596875, + "grad_norm": 0.09113751351833344, + "learning_rate": 2.7175074718774153e-05, + "loss": 0.0348, + "step": 71590 + }, + { + "epoch": 0.1579064602708669, + "grad_norm": 0.1032782644033432, + "learning_rate": 2.717410882997879e-05, + "loss": 0.0347, + "step": 71600 + }, + { + "epoch": 0.1579285142457651, + "grad_norm": 0.10799141228199005, + "learning_rate": 2.7173142793256503e-05, + "loss": 0.0347, + "step": 71610 + }, + { + "epoch": 0.15795056822066325, + "grad_norm": 0.12016697973012924, + "learning_rate": 2.7172176608619043e-05, + "loss": 0.0352, + "step": 71620 + }, + { + "epoch": 0.1579726221955614, + "grad_norm": 0.13146527111530304, + "learning_rate": 2.7171210276078143e-05, + "loss": 0.0351, + "step": 71630 + }, + { + "epoch": 0.1579946761704596, + "grad_norm": 0.10640336573123932, + "learning_rate": 2.717024379564555e-05, + "loss": 0.0363, + "step": 71640 + }, + { + "epoch": 0.15801673014535775, + "grad_norm": 0.08607114851474762, + "learning_rate": 2.7169277167333003e-05, + "loss": 0.033, + "step": 71650 + }, + { + "epoch": 0.1580387841202559, + "grad_norm": 0.11009997129440308, + "learning_rate": 2.7168310391152245e-05, + "loss": 0.0366, + "step": 71660 + }, + { + "epoch": 0.1580608380951541, + "grad_norm": 0.11447685956954956, + "learning_rate": 2.7167343467115032e-05, + "loss": 0.0367, + "step": 71670 + }, + { + "epoch": 0.15808289207005224, + "grad_norm": 0.10317868739366531, + "learning_rate": 2.7166376395233104e-05, + "loss": 0.0348, + "step": 71680 + }, + { + "epoch": 0.1581049460449504, + "grad_norm": 0.1326722353696823, + "learning_rate": 2.716540917551822e-05, + "loss": 0.0369, + "step": 71690 + }, + { + "epoch": 0.15812700001984858, + "grad_norm": 0.12330678850412369, + "learning_rate": 2.7164441807982125e-05, + "loss": 0.0338, + "step": 71700 + }, + { + "epoch": 0.15814905399474674, + "grad_norm": 0.14370475709438324, + "learning_rate": 2.716347429263658e-05, + "loss": 0.0348, + "step": 71710 + }, + { + "epoch": 0.1581711079696449, + "grad_norm": 0.12314590066671371, + "learning_rate": 2.7162506629493334e-05, + "loss": 0.0332, + "step": 71720 + }, + { + "epoch": 0.15819316194454308, + "grad_norm": 0.12391234934329987, + "learning_rate": 2.7161538818564154e-05, + "loss": 0.0353, + "step": 71730 + }, + { + "epoch": 0.15821521591944124, + "grad_norm": 0.10397467762231827, + "learning_rate": 2.716057085986079e-05, + "loss": 0.0345, + "step": 71740 + }, + { + "epoch": 0.1582372698943394, + "grad_norm": 0.09410285949707031, + "learning_rate": 2.715960275339501e-05, + "loss": 0.0358, + "step": 71750 + }, + { + "epoch": 0.15825932386923758, + "grad_norm": 0.13208924233913422, + "learning_rate": 2.7158634499178577e-05, + "loss": 0.0333, + "step": 71760 + }, + { + "epoch": 0.15828137784413573, + "grad_norm": 0.11517110466957092, + "learning_rate": 2.7157666097223254e-05, + "loss": 0.0373, + "step": 71770 + }, + { + "epoch": 0.1583034318190339, + "grad_norm": 0.11687799543142319, + "learning_rate": 2.715669754754081e-05, + "loss": 0.0348, + "step": 71780 + }, + { + "epoch": 0.15832548579393207, + "grad_norm": 0.1152515783905983, + "learning_rate": 2.7155728850143013e-05, + "loss": 0.0333, + "step": 71790 + }, + { + "epoch": 0.15834753976883023, + "grad_norm": 0.1051568016409874, + "learning_rate": 2.7154760005041635e-05, + "loss": 0.0359, + "step": 71800 + }, + { + "epoch": 0.1583695937437284, + "grad_norm": 0.09379662573337555, + "learning_rate": 2.7153791012248448e-05, + "loss": 0.0342, + "step": 71810 + }, + { + "epoch": 0.15839164771862657, + "grad_norm": 0.09355053305625916, + "learning_rate": 2.715282187177522e-05, + "loss": 0.0336, + "step": 71820 + }, + { + "epoch": 0.15841370169352473, + "grad_norm": 0.1308518350124359, + "learning_rate": 2.7151852583633733e-05, + "loss": 0.0349, + "step": 71830 + }, + { + "epoch": 0.1584357556684229, + "grad_norm": 0.09880587458610535, + "learning_rate": 2.715088314783577e-05, + "loss": 0.0344, + "step": 71840 + }, + { + "epoch": 0.15845780964332107, + "grad_norm": 0.11253199726343155, + "learning_rate": 2.7149913564393097e-05, + "loss": 0.0357, + "step": 71850 + }, + { + "epoch": 0.15847986361821922, + "grad_norm": 0.1139967292547226, + "learning_rate": 2.7148943833317506e-05, + "loss": 0.0341, + "step": 71860 + }, + { + "epoch": 0.1585019175931174, + "grad_norm": 0.12717226147651672, + "learning_rate": 2.714797395462078e-05, + "loss": 0.0358, + "step": 71870 + }, + { + "epoch": 0.15852397156801556, + "grad_norm": 0.0950872153043747, + "learning_rate": 2.7147003928314694e-05, + "loss": 0.0361, + "step": 71880 + }, + { + "epoch": 0.15854602554291372, + "grad_norm": 0.10960142314434052, + "learning_rate": 2.7146033754411047e-05, + "loss": 0.0341, + "step": 71890 + }, + { + "epoch": 0.1585680795178119, + "grad_norm": 0.10270346701145172, + "learning_rate": 2.714506343292162e-05, + "loss": 0.0332, + "step": 71900 + }, + { + "epoch": 0.15859013349271006, + "grad_norm": 0.1354086697101593, + "learning_rate": 2.7144092963858206e-05, + "loss": 0.0351, + "step": 71910 + }, + { + "epoch": 0.15861218746760822, + "grad_norm": 0.15805105865001678, + "learning_rate": 2.7143122347232596e-05, + "loss": 0.0355, + "step": 71920 + }, + { + "epoch": 0.1586342414425064, + "grad_norm": 0.12302804738283157, + "learning_rate": 2.7142151583056585e-05, + "loss": 0.0329, + "step": 71930 + }, + { + "epoch": 0.15865629541740456, + "grad_norm": 0.11894836276769638, + "learning_rate": 2.714118067134197e-05, + "loss": 0.0345, + "step": 71940 + }, + { + "epoch": 0.15867834939230271, + "grad_norm": 0.10926016420125961, + "learning_rate": 2.7140209612100542e-05, + "loss": 0.0339, + "step": 71950 + }, + { + "epoch": 0.1587004033672009, + "grad_norm": 0.10906877368688583, + "learning_rate": 2.713923840534411e-05, + "loss": 0.0328, + "step": 71960 + }, + { + "epoch": 0.15872245734209905, + "grad_norm": 0.1357090026140213, + "learning_rate": 2.713826705108447e-05, + "loss": 0.034, + "step": 71970 + }, + { + "epoch": 0.1587445113169972, + "grad_norm": 0.09813100099563599, + "learning_rate": 2.713729554933342e-05, + "loss": 0.0355, + "step": 71980 + }, + { + "epoch": 0.1587665652918954, + "grad_norm": 0.0917910635471344, + "learning_rate": 2.7136323900102773e-05, + "loss": 0.0344, + "step": 71990 + }, + { + "epoch": 0.15878861926679355, + "grad_norm": 0.11234128475189209, + "learning_rate": 2.713535210340433e-05, + "loss": 0.0333, + "step": 72000 + }, + { + "epoch": 0.1588106732416917, + "grad_norm": 0.10172750800848007, + "learning_rate": 2.7134380159249903e-05, + "loss": 0.0329, + "step": 72010 + }, + { + "epoch": 0.1588327272165899, + "grad_norm": 0.12351201474666595, + "learning_rate": 2.71334080676513e-05, + "loss": 0.0364, + "step": 72020 + }, + { + "epoch": 0.15885478119148805, + "grad_norm": 0.10579660534858704, + "learning_rate": 2.7132435828620332e-05, + "loss": 0.0337, + "step": 72030 + }, + { + "epoch": 0.1588768351663862, + "grad_norm": 0.11741070449352264, + "learning_rate": 2.7131463442168815e-05, + "loss": 0.0334, + "step": 72040 + }, + { + "epoch": 0.1588988891412844, + "grad_norm": 0.10534447431564331, + "learning_rate": 2.7130490908308564e-05, + "loss": 0.0349, + "step": 72050 + }, + { + "epoch": 0.15892094311618254, + "grad_norm": 0.12504343688488007, + "learning_rate": 2.7129518227051394e-05, + "loss": 0.0337, + "step": 72060 + }, + { + "epoch": 0.1589429970910807, + "grad_norm": 0.10856258869171143, + "learning_rate": 2.7128545398409125e-05, + "loss": 0.0357, + "step": 72070 + }, + { + "epoch": 0.15896505106597889, + "grad_norm": 0.2029557079076767, + "learning_rate": 2.712757242239358e-05, + "loss": 0.0372, + "step": 72080 + }, + { + "epoch": 0.15898710504087704, + "grad_norm": 0.10466787964105606, + "learning_rate": 2.7126599299016575e-05, + "loss": 0.0335, + "step": 72090 + }, + { + "epoch": 0.1590091590157752, + "grad_norm": 0.11760266870260239, + "learning_rate": 2.7125626028289946e-05, + "loss": 0.0359, + "step": 72100 + }, + { + "epoch": 0.15903121299067338, + "grad_norm": 0.09375713765621185, + "learning_rate": 2.712465261022551e-05, + "loss": 0.036, + "step": 72110 + }, + { + "epoch": 0.15905326696557154, + "grad_norm": 0.0975378155708313, + "learning_rate": 2.7123679044835092e-05, + "loss": 0.0334, + "step": 72120 + }, + { + "epoch": 0.1590753209404697, + "grad_norm": 0.10741408169269562, + "learning_rate": 2.7122705332130534e-05, + "loss": 0.0335, + "step": 72130 + }, + { + "epoch": 0.15909737491536788, + "grad_norm": 0.1359291672706604, + "learning_rate": 2.7121731472123655e-05, + "loss": 0.0358, + "step": 72140 + }, + { + "epoch": 0.15911942889026603, + "grad_norm": 0.11491671949625015, + "learning_rate": 2.71207574648263e-05, + "loss": 0.035, + "step": 72150 + }, + { + "epoch": 0.1591414828651642, + "grad_norm": 0.10066662728786469, + "learning_rate": 2.7119783310250294e-05, + "loss": 0.0349, + "step": 72160 + }, + { + "epoch": 0.15916353684006238, + "grad_norm": 0.09987779706716537, + "learning_rate": 2.7118809008407476e-05, + "loss": 0.0334, + "step": 72170 + }, + { + "epoch": 0.15918559081496053, + "grad_norm": 0.11174491047859192, + "learning_rate": 2.711783455930969e-05, + "loss": 0.0359, + "step": 72180 + }, + { + "epoch": 0.1592076447898587, + "grad_norm": 0.09814456105232239, + "learning_rate": 2.7116859962968775e-05, + "loss": 0.0342, + "step": 72190 + }, + { + "epoch": 0.15922969876475687, + "grad_norm": 0.10072379559278488, + "learning_rate": 2.7115885219396564e-05, + "loss": 0.0335, + "step": 72200 + }, + { + "epoch": 0.15925175273965503, + "grad_norm": 0.12521201372146606, + "learning_rate": 2.711491032860491e-05, + "loss": 0.033, + "step": 72210 + }, + { + "epoch": 0.15927380671455318, + "grad_norm": 0.11610157042741776, + "learning_rate": 2.711393529060566e-05, + "loss": 0.034, + "step": 72220 + }, + { + "epoch": 0.15929586068945137, + "grad_norm": 0.10790865123271942, + "learning_rate": 2.7112960105410662e-05, + "loss": 0.0365, + "step": 72230 + }, + { + "epoch": 0.15931791466434952, + "grad_norm": 0.09577221423387527, + "learning_rate": 2.7111984773031755e-05, + "loss": 0.0344, + "step": 72240 + }, + { + "epoch": 0.15933996863924768, + "grad_norm": 0.13264481723308563, + "learning_rate": 2.7111009293480805e-05, + "loss": 0.0329, + "step": 72250 + }, + { + "epoch": 0.15936202261414587, + "grad_norm": 0.10126502811908722, + "learning_rate": 2.7110033666769652e-05, + "loss": 0.0349, + "step": 72260 + }, + { + "epoch": 0.15938407658904402, + "grad_norm": 0.10578197985887527, + "learning_rate": 2.7109057892910156e-05, + "loss": 0.0346, + "step": 72270 + }, + { + "epoch": 0.1594061305639422, + "grad_norm": 0.10587505251169205, + "learning_rate": 2.7108081971914176e-05, + "loss": 0.0352, + "step": 72280 + }, + { + "epoch": 0.15942818453884036, + "grad_norm": 0.1195639818906784, + "learning_rate": 2.7107105903793564e-05, + "loss": 0.0347, + "step": 72290 + }, + { + "epoch": 0.15945023851373852, + "grad_norm": 0.10874050855636597, + "learning_rate": 2.710612968856019e-05, + "loss": 0.0349, + "step": 72300 + }, + { + "epoch": 0.1594722924886367, + "grad_norm": 0.09689616411924362, + "learning_rate": 2.7105153326225904e-05, + "loss": 0.0343, + "step": 72310 + }, + { + "epoch": 0.15949434646353486, + "grad_norm": 0.10998082906007767, + "learning_rate": 2.710417681680258e-05, + "loss": 0.035, + "step": 72320 + }, + { + "epoch": 0.15951640043843301, + "grad_norm": 0.0897393673658371, + "learning_rate": 2.7103200160302078e-05, + "loss": 0.032, + "step": 72330 + }, + { + "epoch": 0.1595384544133312, + "grad_norm": 0.08919364213943481, + "learning_rate": 2.710222335673627e-05, + "loss": 0.0337, + "step": 72340 + }, + { + "epoch": 0.15956050838822936, + "grad_norm": 0.0959555134177208, + "learning_rate": 2.7101246406117017e-05, + "loss": 0.0337, + "step": 72350 + }, + { + "epoch": 0.1595825623631275, + "grad_norm": 0.09785816073417664, + "learning_rate": 2.71002693084562e-05, + "loss": 0.0352, + "step": 72360 + }, + { + "epoch": 0.1596046163380257, + "grad_norm": 0.11396950483322144, + "learning_rate": 2.7099292063765682e-05, + "loss": 0.0331, + "step": 72370 + }, + { + "epoch": 0.15962667031292385, + "grad_norm": 0.09802858531475067, + "learning_rate": 2.7098314672057346e-05, + "loss": 0.0339, + "step": 72380 + }, + { + "epoch": 0.159648724287822, + "grad_norm": 0.10161197930574417, + "learning_rate": 2.709733713334306e-05, + "loss": 0.0347, + "step": 72390 + }, + { + "epoch": 0.1596707782627202, + "grad_norm": 0.10779589414596558, + "learning_rate": 2.709635944763471e-05, + "loss": 0.0352, + "step": 72400 + }, + { + "epoch": 0.15969283223761835, + "grad_norm": 0.1488965004682541, + "learning_rate": 2.7095381614944164e-05, + "loss": 0.035, + "step": 72410 + }, + { + "epoch": 0.1597148862125165, + "grad_norm": 0.12488969415426254, + "learning_rate": 2.7094403635283317e-05, + "loss": 0.0372, + "step": 72420 + }, + { + "epoch": 0.1597369401874147, + "grad_norm": 0.12374703586101532, + "learning_rate": 2.709342550866405e-05, + "loss": 0.0335, + "step": 72430 + }, + { + "epoch": 0.15975899416231285, + "grad_norm": 0.13430391252040863, + "learning_rate": 2.709244723509824e-05, + "loss": 0.0338, + "step": 72440 + }, + { + "epoch": 0.159781048137211, + "grad_norm": 0.17413276433944702, + "learning_rate": 2.709146881459778e-05, + "loss": 0.0368, + "step": 72450 + }, + { + "epoch": 0.15980310211210919, + "grad_norm": 0.16518525779247284, + "learning_rate": 2.7090490247174558e-05, + "loss": 0.0346, + "step": 72460 + }, + { + "epoch": 0.15982515608700734, + "grad_norm": 0.10436784476041794, + "learning_rate": 2.7089511532840467e-05, + "loss": 0.0328, + "step": 72470 + }, + { + "epoch": 0.1598472100619055, + "grad_norm": 0.08402729034423828, + "learning_rate": 2.708853267160739e-05, + "loss": 0.0342, + "step": 72480 + }, + { + "epoch": 0.15986926403680368, + "grad_norm": 0.13456566631793976, + "learning_rate": 2.7087553663487233e-05, + "loss": 0.0352, + "step": 72490 + }, + { + "epoch": 0.15989131801170184, + "grad_norm": 0.11109591275453568, + "learning_rate": 2.7086574508491884e-05, + "loss": 0.0332, + "step": 72500 + }, + { + "epoch": 0.1599133719866, + "grad_norm": 0.12451514601707458, + "learning_rate": 2.7085595206633247e-05, + "loss": 0.0363, + "step": 72510 + }, + { + "epoch": 0.15993542596149818, + "grad_norm": 0.19107408821582794, + "learning_rate": 2.7084615757923214e-05, + "loss": 0.0334, + "step": 72520 + }, + { + "epoch": 0.15995747993639634, + "grad_norm": 0.11482764780521393, + "learning_rate": 2.708363616237369e-05, + "loss": 0.0333, + "step": 72530 + }, + { + "epoch": 0.1599795339112945, + "grad_norm": 0.1661963164806366, + "learning_rate": 2.7082656419996578e-05, + "loss": 0.0353, + "step": 72540 + }, + { + "epoch": 0.16000158788619268, + "grad_norm": 0.0914597287774086, + "learning_rate": 2.708167653080378e-05, + "loss": 0.0352, + "step": 72550 + }, + { + "epoch": 0.16002364186109083, + "grad_norm": 0.09670959413051605, + "learning_rate": 2.708069649480721e-05, + "loss": 0.0345, + "step": 72560 + }, + { + "epoch": 0.160045695835989, + "grad_norm": 0.14200137555599213, + "learning_rate": 2.707971631201877e-05, + "loss": 0.0338, + "step": 72570 + }, + { + "epoch": 0.16006774981088717, + "grad_norm": 0.1199120283126831, + "learning_rate": 2.7078735982450366e-05, + "loss": 0.0329, + "step": 72580 + }, + { + "epoch": 0.16008980378578533, + "grad_norm": 0.12556667625904083, + "learning_rate": 2.7077755506113923e-05, + "loss": 0.035, + "step": 72590 + }, + { + "epoch": 0.16011185776068348, + "grad_norm": 0.11863306164741516, + "learning_rate": 2.7076774883021345e-05, + "loss": 0.0361, + "step": 72600 + }, + { + "epoch": 0.16013391173558167, + "grad_norm": 0.09745801985263824, + "learning_rate": 2.707579411318455e-05, + "loss": 0.0346, + "step": 72610 + }, + { + "epoch": 0.16015596571047983, + "grad_norm": 0.10627980530261993, + "learning_rate": 2.7074813196615455e-05, + "loss": 0.0341, + "step": 72620 + }, + { + "epoch": 0.16017801968537798, + "grad_norm": 0.10762105137109756, + "learning_rate": 2.707383213332598e-05, + "loss": 0.0347, + "step": 72630 + }, + { + "epoch": 0.16020007366027617, + "grad_norm": 0.10718121379613876, + "learning_rate": 2.7072850923328044e-05, + "loss": 0.036, + "step": 72640 + }, + { + "epoch": 0.16022212763517432, + "grad_norm": 0.0885409563779831, + "learning_rate": 2.707186956663357e-05, + "loss": 0.034, + "step": 72650 + }, + { + "epoch": 0.16024418161007248, + "grad_norm": 0.1173781007528305, + "learning_rate": 2.7070888063254485e-05, + "loss": 0.0342, + "step": 72660 + }, + { + "epoch": 0.16026623558497066, + "grad_norm": 0.09810566157102585, + "learning_rate": 2.7069906413202712e-05, + "loss": 0.0332, + "step": 72670 + }, + { + "epoch": 0.16028828955986882, + "grad_norm": 0.11609824746847153, + "learning_rate": 2.7068924616490176e-05, + "loss": 0.0359, + "step": 72680 + }, + { + "epoch": 0.160310343534767, + "grad_norm": 0.09495258331298828, + "learning_rate": 2.7067942673128813e-05, + "loss": 0.0327, + "step": 72690 + }, + { + "epoch": 0.16033239750966516, + "grad_norm": 0.12752002477645874, + "learning_rate": 2.706696058313056e-05, + "loss": 0.0325, + "step": 72700 + }, + { + "epoch": 0.16035445148456332, + "grad_norm": 0.14352114498615265, + "learning_rate": 2.7065978346507336e-05, + "loss": 0.0362, + "step": 72710 + }, + { + "epoch": 0.1603765054594615, + "grad_norm": 0.10968535393476486, + "learning_rate": 2.7064995963271084e-05, + "loss": 0.0328, + "step": 72720 + }, + { + "epoch": 0.16039855943435966, + "grad_norm": 0.09395747631788254, + "learning_rate": 2.706401343343374e-05, + "loss": 0.0331, + "step": 72730 + }, + { + "epoch": 0.1604206134092578, + "grad_norm": 0.09507733583450317, + "learning_rate": 2.7063030757007243e-05, + "loss": 0.0329, + "step": 72740 + }, + { + "epoch": 0.160442667384156, + "grad_norm": 0.09774181246757507, + "learning_rate": 2.706204793400353e-05, + "loss": 0.0351, + "step": 72750 + }, + { + "epoch": 0.16046472135905415, + "grad_norm": 0.11725907027721405, + "learning_rate": 2.706106496443455e-05, + "loss": 0.0332, + "step": 72760 + }, + { + "epoch": 0.1604867753339523, + "grad_norm": 0.11427333205938339, + "learning_rate": 2.7060081848312238e-05, + "loss": 0.0363, + "step": 72770 + }, + { + "epoch": 0.1605088293088505, + "grad_norm": 0.0897153913974762, + "learning_rate": 2.7059098585648552e-05, + "loss": 0.0362, + "step": 72780 + }, + { + "epoch": 0.16053088328374865, + "grad_norm": 0.11825350672006607, + "learning_rate": 2.705811517645543e-05, + "loss": 0.0355, + "step": 72790 + }, + { + "epoch": 0.1605529372586468, + "grad_norm": 0.12504906952381134, + "learning_rate": 2.7057131620744823e-05, + "loss": 0.0332, + "step": 72800 + }, + { + "epoch": 0.160574991233545, + "grad_norm": 0.1503501683473587, + "learning_rate": 2.7056147918528683e-05, + "loss": 0.0357, + "step": 72810 + }, + { + "epoch": 0.16059704520844315, + "grad_norm": 0.15375925600528717, + "learning_rate": 2.7055164069818964e-05, + "loss": 0.0356, + "step": 72820 + }, + { + "epoch": 0.1606190991833413, + "grad_norm": 0.11215860396623611, + "learning_rate": 2.7054180074627615e-05, + "loss": 0.035, + "step": 72830 + }, + { + "epoch": 0.16064115315823949, + "grad_norm": 0.13772627711296082, + "learning_rate": 2.7053195932966605e-05, + "loss": 0.0357, + "step": 72840 + }, + { + "epoch": 0.16066320713313764, + "grad_norm": 0.10212312638759613, + "learning_rate": 2.7052211644847875e-05, + "loss": 0.0337, + "step": 72850 + }, + { + "epoch": 0.1606852611080358, + "grad_norm": 0.1020142138004303, + "learning_rate": 2.70512272102834e-05, + "loss": 0.0336, + "step": 72860 + }, + { + "epoch": 0.16070731508293398, + "grad_norm": 0.0902508869767189, + "learning_rate": 2.7050242629285136e-05, + "loss": 0.0358, + "step": 72870 + }, + { + "epoch": 0.16072936905783214, + "grad_norm": 0.1195671409368515, + "learning_rate": 2.7049257901865047e-05, + "loss": 0.0344, + "step": 72880 + }, + { + "epoch": 0.1607514230327303, + "grad_norm": 0.11314436793327332, + "learning_rate": 2.7048273028035096e-05, + "loss": 0.0352, + "step": 72890 + }, + { + "epoch": 0.16077347700762848, + "grad_norm": 0.10133223980665207, + "learning_rate": 2.7047288007807253e-05, + "loss": 0.0338, + "step": 72900 + }, + { + "epoch": 0.16079553098252664, + "grad_norm": 0.14487475156784058, + "learning_rate": 2.704630284119349e-05, + "loss": 0.0329, + "step": 72910 + }, + { + "epoch": 0.1608175849574248, + "grad_norm": 0.15455478429794312, + "learning_rate": 2.7045317528205768e-05, + "loss": 0.0335, + "step": 72920 + }, + { + "epoch": 0.16083963893232298, + "grad_norm": 0.11531016230583191, + "learning_rate": 2.7044332068856066e-05, + "loss": 0.037, + "step": 72930 + }, + { + "epoch": 0.16086169290722113, + "grad_norm": 0.14374564588069916, + "learning_rate": 2.7043346463156357e-05, + "loss": 0.0341, + "step": 72940 + }, + { + "epoch": 0.1608837468821193, + "grad_norm": 0.12997902929782867, + "learning_rate": 2.704236071111862e-05, + "loss": 0.0347, + "step": 72950 + }, + { + "epoch": 0.16090580085701747, + "grad_norm": 0.1667083501815796, + "learning_rate": 2.7041374812754832e-05, + "loss": 0.0346, + "step": 72960 + }, + { + "epoch": 0.16092785483191563, + "grad_norm": 0.14572212100028992, + "learning_rate": 2.704038876807697e-05, + "loss": 0.0345, + "step": 72970 + }, + { + "epoch": 0.16094990880681379, + "grad_norm": 0.09880132228136063, + "learning_rate": 2.7039402577097018e-05, + "loss": 0.0354, + "step": 72980 + }, + { + "epoch": 0.16097196278171197, + "grad_norm": 0.11394190788269043, + "learning_rate": 2.7038416239826953e-05, + "loss": 0.0345, + "step": 72990 + }, + { + "epoch": 0.16099401675661013, + "grad_norm": 0.10563881695270538, + "learning_rate": 2.703742975627877e-05, + "loss": 0.0334, + "step": 73000 + }, + { + "epoch": 0.16101607073150828, + "grad_norm": 0.11825470626354218, + "learning_rate": 2.7036443126464447e-05, + "loss": 0.0356, + "step": 73010 + }, + { + "epoch": 0.16103812470640647, + "grad_norm": 0.11403477936983109, + "learning_rate": 2.7035456350395977e-05, + "loss": 0.0337, + "step": 73020 + }, + { + "epoch": 0.16106017868130462, + "grad_norm": 0.10850467532873154, + "learning_rate": 2.7034469428085348e-05, + "loss": 0.0334, + "step": 73030 + }, + { + "epoch": 0.16108223265620278, + "grad_norm": 0.10905670374631882, + "learning_rate": 2.7033482359544553e-05, + "loss": 0.0335, + "step": 73040 + }, + { + "epoch": 0.16110428663110096, + "grad_norm": 0.136845201253891, + "learning_rate": 2.7032495144785587e-05, + "loss": 0.0343, + "step": 73050 + }, + { + "epoch": 0.16112634060599912, + "grad_norm": 0.09108198434114456, + "learning_rate": 2.7031507783820443e-05, + "loss": 0.0349, + "step": 73060 + }, + { + "epoch": 0.16114839458089728, + "grad_norm": 0.14242511987686157, + "learning_rate": 2.7030520276661123e-05, + "loss": 0.0346, + "step": 73070 + }, + { + "epoch": 0.16117044855579546, + "grad_norm": 0.12279261648654938, + "learning_rate": 2.7029532623319616e-05, + "loss": 0.0349, + "step": 73080 + }, + { + "epoch": 0.16119250253069362, + "grad_norm": 0.1453283280134201, + "learning_rate": 2.7028544823807938e-05, + "loss": 0.0362, + "step": 73090 + }, + { + "epoch": 0.16121455650559177, + "grad_norm": 0.11862461268901825, + "learning_rate": 2.702755687813808e-05, + "loss": 0.0364, + "step": 73100 + }, + { + "epoch": 0.16123661048048996, + "grad_norm": 0.11991846561431885, + "learning_rate": 2.7026568786322046e-05, + "loss": 0.0351, + "step": 73110 + }, + { + "epoch": 0.1612586644553881, + "grad_norm": 0.13261263072490692, + "learning_rate": 2.702558054837185e-05, + "loss": 0.0331, + "step": 73120 + }, + { + "epoch": 0.1612807184302863, + "grad_norm": 0.1387288123369217, + "learning_rate": 2.7024592164299496e-05, + "loss": 0.0336, + "step": 73130 + }, + { + "epoch": 0.16130277240518445, + "grad_norm": 0.13992920517921448, + "learning_rate": 2.7023603634116994e-05, + "loss": 0.0355, + "step": 73140 + }, + { + "epoch": 0.1613248263800826, + "grad_norm": 0.13491323590278625, + "learning_rate": 2.7022614957836355e-05, + "loss": 0.0332, + "step": 73150 + }, + { + "epoch": 0.1613468803549808, + "grad_norm": 0.10989052057266235, + "learning_rate": 2.7021626135469594e-05, + "loss": 0.0353, + "step": 73160 + }, + { + "epoch": 0.16136893432987895, + "grad_norm": 0.0971142128109932, + "learning_rate": 2.702063716702873e-05, + "loss": 0.0334, + "step": 73170 + }, + { + "epoch": 0.1613909883047771, + "grad_norm": 0.1075836718082428, + "learning_rate": 2.701964805252577e-05, + "loss": 0.0337, + "step": 73180 + }, + { + "epoch": 0.1614130422796753, + "grad_norm": 0.09157904982566833, + "learning_rate": 2.7018658791972737e-05, + "loss": 0.033, + "step": 73190 + }, + { + "epoch": 0.16143509625457345, + "grad_norm": 0.11289658397436142, + "learning_rate": 2.7017669385381653e-05, + "loss": 0.0348, + "step": 73200 + }, + { + "epoch": 0.1614571502294716, + "grad_norm": 0.11340529471635818, + "learning_rate": 2.701667983276454e-05, + "loss": 0.0336, + "step": 73210 + }, + { + "epoch": 0.1614792042043698, + "grad_norm": 0.12492895126342773, + "learning_rate": 2.701569013413342e-05, + "loss": 0.035, + "step": 73220 + }, + { + "epoch": 0.16150125817926794, + "grad_norm": 0.10965250432491302, + "learning_rate": 2.701470028950032e-05, + "loss": 0.0347, + "step": 73230 + }, + { + "epoch": 0.1615233121541661, + "grad_norm": 0.09488356113433838, + "learning_rate": 2.7013710298877272e-05, + "loss": 0.0346, + "step": 73240 + }, + { + "epoch": 0.16154536612906428, + "grad_norm": 0.1274593621492386, + "learning_rate": 2.7012720162276298e-05, + "loss": 0.0333, + "step": 73250 + }, + { + "epoch": 0.16156742010396244, + "grad_norm": 0.10935542732477188, + "learning_rate": 2.7011729879709432e-05, + "loss": 0.034, + "step": 73260 + }, + { + "epoch": 0.1615894740788606, + "grad_norm": 0.10354842245578766, + "learning_rate": 2.7010739451188703e-05, + "loss": 0.0344, + "step": 73270 + }, + { + "epoch": 0.16161152805375878, + "grad_norm": 0.1552705317735672, + "learning_rate": 2.7009748876726156e-05, + "loss": 0.0344, + "step": 73280 + }, + { + "epoch": 0.16163358202865694, + "grad_norm": 0.10772038996219635, + "learning_rate": 2.700875815633382e-05, + "loss": 0.0335, + "step": 73290 + }, + { + "epoch": 0.1616556360035551, + "grad_norm": 0.0994667187333107, + "learning_rate": 2.7007767290023734e-05, + "loss": 0.0359, + "step": 73300 + }, + { + "epoch": 0.16167768997845328, + "grad_norm": 0.11474242061376572, + "learning_rate": 2.7006776277807936e-05, + "loss": 0.0356, + "step": 73310 + }, + { + "epoch": 0.16169974395335143, + "grad_norm": 0.1448652595281601, + "learning_rate": 2.700578511969847e-05, + "loss": 0.0367, + "step": 73320 + }, + { + "epoch": 0.1617217979282496, + "grad_norm": 0.10543984174728394, + "learning_rate": 2.7004793815707382e-05, + "loss": 0.0343, + "step": 73330 + }, + { + "epoch": 0.16174385190314777, + "grad_norm": 0.10339187830686569, + "learning_rate": 2.7003802365846716e-05, + "loss": 0.0358, + "step": 73340 + }, + { + "epoch": 0.16176590587804593, + "grad_norm": 0.1226215735077858, + "learning_rate": 2.700281077012851e-05, + "loss": 0.0352, + "step": 73350 + }, + { + "epoch": 0.16178795985294409, + "grad_norm": 0.10597534477710724, + "learning_rate": 2.7001819028564826e-05, + "loss": 0.036, + "step": 73360 + }, + { + "epoch": 0.16181001382784227, + "grad_norm": 0.12306219339370728, + "learning_rate": 2.7000827141167708e-05, + "loss": 0.0341, + "step": 73370 + }, + { + "epoch": 0.16183206780274043, + "grad_norm": 0.11466153711080551, + "learning_rate": 2.6999835107949214e-05, + "loss": 0.0351, + "step": 73380 + }, + { + "epoch": 0.16185412177763858, + "grad_norm": 0.11993078887462616, + "learning_rate": 2.699884292892139e-05, + "loss": 0.0338, + "step": 73390 + }, + { + "epoch": 0.16187617575253677, + "grad_norm": 0.134256511926651, + "learning_rate": 2.6997850604096296e-05, + "loss": 0.0339, + "step": 73400 + }, + { + "epoch": 0.16189822972743492, + "grad_norm": 0.10642699152231216, + "learning_rate": 2.699685813348599e-05, + "loss": 0.0347, + "step": 73410 + }, + { + "epoch": 0.16192028370233308, + "grad_norm": 0.11831071227788925, + "learning_rate": 2.699586551710253e-05, + "loss": 0.0349, + "step": 73420 + }, + { + "epoch": 0.16194233767723126, + "grad_norm": 0.1491701304912567, + "learning_rate": 2.699487275495798e-05, + "loss": 0.0346, + "step": 73430 + }, + { + "epoch": 0.16196439165212942, + "grad_norm": 0.12384794652462006, + "learning_rate": 2.6993879847064398e-05, + "loss": 0.0322, + "step": 73440 + }, + { + "epoch": 0.16198644562702758, + "grad_norm": 0.13518023490905762, + "learning_rate": 2.6992886793433853e-05, + "loss": 0.0347, + "step": 73450 + }, + { + "epoch": 0.16200849960192576, + "grad_norm": 0.13085943460464478, + "learning_rate": 2.699189359407841e-05, + "loss": 0.0349, + "step": 73460 + }, + { + "epoch": 0.16203055357682392, + "grad_norm": 0.12801986932754517, + "learning_rate": 2.699090024901014e-05, + "loss": 0.0324, + "step": 73470 + }, + { + "epoch": 0.16205260755172207, + "grad_norm": 0.1566208302974701, + "learning_rate": 2.6989906758241106e-05, + "loss": 0.0328, + "step": 73480 + }, + { + "epoch": 0.16207466152662026, + "grad_norm": 0.08459730446338654, + "learning_rate": 2.6988913121783386e-05, + "loss": 0.0339, + "step": 73490 + }, + { + "epoch": 0.1620967155015184, + "grad_norm": 0.10079425573348999, + "learning_rate": 2.6987919339649053e-05, + "loss": 0.0376, + "step": 73500 + }, + { + "epoch": 0.16211876947641657, + "grad_norm": 0.10090307146310806, + "learning_rate": 2.6986925411850183e-05, + "loss": 0.0341, + "step": 73510 + }, + { + "epoch": 0.16214082345131475, + "grad_norm": 0.12453029304742813, + "learning_rate": 2.698593133839885e-05, + "loss": 0.0355, + "step": 73520 + }, + { + "epoch": 0.1621628774262129, + "grad_norm": 0.12095820158720016, + "learning_rate": 2.6984937119307136e-05, + "loss": 0.0343, + "step": 73530 + }, + { + "epoch": 0.16218493140111107, + "grad_norm": 0.138739213347435, + "learning_rate": 2.698394275458712e-05, + "loss": 0.034, + "step": 73540 + }, + { + "epoch": 0.16220698537600925, + "grad_norm": 0.17385219037532806, + "learning_rate": 2.6982948244250885e-05, + "loss": 0.0349, + "step": 73550 + }, + { + "epoch": 0.1622290393509074, + "grad_norm": 0.11371545493602753, + "learning_rate": 2.6981953588310516e-05, + "loss": 0.0365, + "step": 73560 + }, + { + "epoch": 0.1622510933258056, + "grad_norm": 0.16215401887893677, + "learning_rate": 2.6980958786778093e-05, + "loss": 0.0327, + "step": 73570 + }, + { + "epoch": 0.16227314730070375, + "grad_norm": 0.14245416224002838, + "learning_rate": 2.6979963839665714e-05, + "loss": 0.0344, + "step": 73580 + }, + { + "epoch": 0.1622952012756019, + "grad_norm": 0.1561661958694458, + "learning_rate": 2.6978968746985457e-05, + "loss": 0.0361, + "step": 73590 + }, + { + "epoch": 0.1623172552505001, + "grad_norm": 0.11083947122097015, + "learning_rate": 2.6977973508749426e-05, + "loss": 0.0337, + "step": 73600 + }, + { + "epoch": 0.16233930922539824, + "grad_norm": 0.12718498706817627, + "learning_rate": 2.6976978124969705e-05, + "loss": 0.0333, + "step": 73610 + }, + { + "epoch": 0.1623613632002964, + "grad_norm": 0.12012968212366104, + "learning_rate": 2.697598259565839e-05, + "loss": 0.0345, + "step": 73620 + }, + { + "epoch": 0.16238341717519458, + "grad_norm": 0.13048119843006134, + "learning_rate": 2.6974986920827586e-05, + "loss": 0.0342, + "step": 73630 + }, + { + "epoch": 0.16240547115009274, + "grad_norm": 0.12072217464447021, + "learning_rate": 2.6973991100489378e-05, + "loss": 0.0366, + "step": 73640 + }, + { + "epoch": 0.1624275251249909, + "grad_norm": 0.11661141365766525, + "learning_rate": 2.697299513465588e-05, + "loss": 0.0369, + "step": 73650 + }, + { + "epoch": 0.16244957909988908, + "grad_norm": 0.1056012213230133, + "learning_rate": 2.6971999023339182e-05, + "loss": 0.0353, + "step": 73660 + }, + { + "epoch": 0.16247163307478724, + "grad_norm": 0.08527525514364243, + "learning_rate": 2.6971002766551392e-05, + "loss": 0.0368, + "step": 73670 + }, + { + "epoch": 0.1624936870496854, + "grad_norm": 0.08945655077695847, + "learning_rate": 2.697000636430462e-05, + "loss": 0.0345, + "step": 73680 + }, + { + "epoch": 0.16251574102458358, + "grad_norm": 0.09485647082328796, + "learning_rate": 2.6969009816610963e-05, + "loss": 0.0346, + "step": 73690 + }, + { + "epoch": 0.16253779499948173, + "grad_norm": 0.09964332729578018, + "learning_rate": 2.6968013123482535e-05, + "loss": 0.0335, + "step": 73700 + }, + { + "epoch": 0.1625598489743799, + "grad_norm": 0.13457275927066803, + "learning_rate": 2.6967016284931453e-05, + "loss": 0.0344, + "step": 73710 + }, + { + "epoch": 0.16258190294927807, + "grad_norm": 0.11913967877626419, + "learning_rate": 2.6966019300969825e-05, + "loss": 0.0349, + "step": 73720 + }, + { + "epoch": 0.16260395692417623, + "grad_norm": 0.1255825012922287, + "learning_rate": 2.6965022171609763e-05, + "loss": 0.0351, + "step": 73730 + }, + { + "epoch": 0.16262601089907439, + "grad_norm": 0.10434188693761826, + "learning_rate": 2.6964024896863384e-05, + "loss": 0.0363, + "step": 73740 + }, + { + "epoch": 0.16264806487397257, + "grad_norm": 0.13394920527935028, + "learning_rate": 2.6963027476742807e-05, + "loss": 0.0346, + "step": 73750 + }, + { + "epoch": 0.16267011884887073, + "grad_norm": 0.10676833987236023, + "learning_rate": 2.6962029911260152e-05, + "loss": 0.0333, + "step": 73760 + }, + { + "epoch": 0.16269217282376888, + "grad_norm": 0.11688393354415894, + "learning_rate": 2.6961032200427537e-05, + "loss": 0.0327, + "step": 73770 + }, + { + "epoch": 0.16271422679866707, + "grad_norm": 0.11404851078987122, + "learning_rate": 2.6960034344257086e-05, + "loss": 0.0347, + "step": 73780 + }, + { + "epoch": 0.16273628077356522, + "grad_norm": 0.12741626799106598, + "learning_rate": 2.6959036342760934e-05, + "loss": 0.0364, + "step": 73790 + }, + { + "epoch": 0.16275833474846338, + "grad_norm": 0.15832892060279846, + "learning_rate": 2.695803819595119e-05, + "loss": 0.0353, + "step": 73800 + }, + { + "epoch": 0.16278038872336156, + "grad_norm": 0.09670252352952957, + "learning_rate": 2.6957039903839997e-05, + "loss": 0.0315, + "step": 73810 + }, + { + "epoch": 0.16280244269825972, + "grad_norm": 0.12537911534309387, + "learning_rate": 2.6956041466439478e-05, + "loss": 0.0364, + "step": 73820 + }, + { + "epoch": 0.16282449667315788, + "grad_norm": 0.11529925465583801, + "learning_rate": 2.695504288376177e-05, + "loss": 0.035, + "step": 73830 + }, + { + "epoch": 0.16284655064805606, + "grad_norm": 0.11830377578735352, + "learning_rate": 2.6954044155819002e-05, + "loss": 0.0348, + "step": 73840 + }, + { + "epoch": 0.16286860462295422, + "grad_norm": 0.13044747710227966, + "learning_rate": 2.6953045282623308e-05, + "loss": 0.0354, + "step": 73850 + }, + { + "epoch": 0.16289065859785237, + "grad_norm": 0.0925721526145935, + "learning_rate": 2.6952046264186833e-05, + "loss": 0.033, + "step": 73860 + }, + { + "epoch": 0.16291271257275056, + "grad_norm": 0.09674543142318726, + "learning_rate": 2.6951047100521705e-05, + "loss": 0.0346, + "step": 73870 + }, + { + "epoch": 0.1629347665476487, + "grad_norm": 0.10149041563272476, + "learning_rate": 2.6950047791640075e-05, + "loss": 0.0358, + "step": 73880 + }, + { + "epoch": 0.16295682052254687, + "grad_norm": 0.12318553030490875, + "learning_rate": 2.694904833755408e-05, + "loss": 0.0338, + "step": 73890 + }, + { + "epoch": 0.16297887449744505, + "grad_norm": 0.10457500070333481, + "learning_rate": 2.6948048738275873e-05, + "loss": 0.0335, + "step": 73900 + }, + { + "epoch": 0.1630009284723432, + "grad_norm": 0.09393446147441864, + "learning_rate": 2.6947048993817587e-05, + "loss": 0.0336, + "step": 73910 + }, + { + "epoch": 0.16302298244724137, + "grad_norm": 0.11354640871286392, + "learning_rate": 2.6946049104191378e-05, + "loss": 0.0344, + "step": 73920 + }, + { + "epoch": 0.16304503642213955, + "grad_norm": 0.09673232585191727, + "learning_rate": 2.6945049069409387e-05, + "loss": 0.0346, + "step": 73930 + }, + { + "epoch": 0.1630670903970377, + "grad_norm": 0.09128539264202118, + "learning_rate": 2.6944048889483778e-05, + "loss": 0.0334, + "step": 73940 + }, + { + "epoch": 0.16308914437193586, + "grad_norm": 0.1253860592842102, + "learning_rate": 2.6943048564426697e-05, + "loss": 0.0354, + "step": 73950 + }, + { + "epoch": 0.16311119834683405, + "grad_norm": 0.13215720653533936, + "learning_rate": 2.69420480942503e-05, + "loss": 0.0364, + "step": 73960 + }, + { + "epoch": 0.1631332523217322, + "grad_norm": 0.08220607787370682, + "learning_rate": 2.6941047478966744e-05, + "loss": 0.0341, + "step": 73970 + }, + { + "epoch": 0.1631553062966304, + "grad_norm": 0.11067324876785278, + "learning_rate": 2.6940046718588182e-05, + "loss": 0.0346, + "step": 73980 + }, + { + "epoch": 0.16317736027152854, + "grad_norm": 0.12709596753120422, + "learning_rate": 2.6939045813126785e-05, + "loss": 0.0352, + "step": 73990 + }, + { + "epoch": 0.1631994142464267, + "grad_norm": 0.11474885046482086, + "learning_rate": 2.69380447625947e-05, + "loss": 0.0319, + "step": 74000 + }, + { + "epoch": 0.16322146822132488, + "grad_norm": 0.12034787982702255, + "learning_rate": 2.693704356700411e-05, + "loss": 0.0338, + "step": 74010 + }, + { + "epoch": 0.16324352219622304, + "grad_norm": 0.11657480895519257, + "learning_rate": 2.693604222636717e-05, + "loss": 0.037, + "step": 74020 + }, + { + "epoch": 0.1632655761711212, + "grad_norm": 0.10070475935935974, + "learning_rate": 2.6935040740696043e-05, + "loss": 0.0336, + "step": 74030 + }, + { + "epoch": 0.16328763014601938, + "grad_norm": 0.12538884580135345, + "learning_rate": 2.69340391100029e-05, + "loss": 0.0357, + "step": 74040 + }, + { + "epoch": 0.16330968412091754, + "grad_norm": 0.1296541690826416, + "learning_rate": 2.693303733429992e-05, + "loss": 0.0352, + "step": 74050 + }, + { + "epoch": 0.1633317380958157, + "grad_norm": 0.10365688055753708, + "learning_rate": 2.6932035413599264e-05, + "loss": 0.0346, + "step": 74060 + }, + { + "epoch": 0.16335379207071388, + "grad_norm": 0.07812512665987015, + "learning_rate": 2.693103334791312e-05, + "loss": 0.0336, + "step": 74070 + }, + { + "epoch": 0.16337584604561203, + "grad_norm": 0.11677464842796326, + "learning_rate": 2.6930031137253652e-05, + "loss": 0.0331, + "step": 74080 + }, + { + "epoch": 0.1633979000205102, + "grad_norm": 0.10664515197277069, + "learning_rate": 2.692902878163304e-05, + "loss": 0.0343, + "step": 74090 + }, + { + "epoch": 0.16341995399540837, + "grad_norm": 0.13378790020942688, + "learning_rate": 2.6928026281063463e-05, + "loss": 0.0347, + "step": 74100 + }, + { + "epoch": 0.16344200797030653, + "grad_norm": 0.11107845604419708, + "learning_rate": 2.6927023635557112e-05, + "loss": 0.0337, + "step": 74110 + }, + { + "epoch": 0.1634640619452047, + "grad_norm": 0.10611458867788315, + "learning_rate": 2.6926020845126158e-05, + "loss": 0.0368, + "step": 74120 + }, + { + "epoch": 0.16348611592010287, + "grad_norm": 0.11074993014335632, + "learning_rate": 2.6925017909782795e-05, + "loss": 0.0363, + "step": 74130 + }, + { + "epoch": 0.16350816989500103, + "grad_norm": 0.10944310575723648, + "learning_rate": 2.6924014829539203e-05, + "loss": 0.0341, + "step": 74140 + }, + { + "epoch": 0.16353022386989918, + "grad_norm": 0.11680235713720322, + "learning_rate": 2.6923011604407572e-05, + "loss": 0.0354, + "step": 74150 + }, + { + "epoch": 0.16355227784479737, + "grad_norm": 0.10250644385814667, + "learning_rate": 2.6922008234400094e-05, + "loss": 0.0353, + "step": 74160 + }, + { + "epoch": 0.16357433181969552, + "grad_norm": 0.09266235679388046, + "learning_rate": 2.692100471952896e-05, + "loss": 0.0366, + "step": 74170 + }, + { + "epoch": 0.16359638579459368, + "grad_norm": 0.09964916110038757, + "learning_rate": 2.692000105980636e-05, + "loss": 0.0339, + "step": 74180 + }, + { + "epoch": 0.16361843976949186, + "grad_norm": 0.1047181561589241, + "learning_rate": 2.6918997255244494e-05, + "loss": 0.0339, + "step": 74190 + }, + { + "epoch": 0.16364049374439002, + "grad_norm": 0.09838228672742844, + "learning_rate": 2.691799330585556e-05, + "loss": 0.0341, + "step": 74200 + }, + { + "epoch": 0.16366254771928818, + "grad_norm": 0.10366807132959366, + "learning_rate": 2.691698921165175e-05, + "loss": 0.0344, + "step": 74210 + }, + { + "epoch": 0.16368460169418636, + "grad_norm": 0.08768599480390549, + "learning_rate": 2.6915984972645278e-05, + "loss": 0.0344, + "step": 74220 + }, + { + "epoch": 0.16370665566908452, + "grad_norm": 0.11190599948167801, + "learning_rate": 2.6914980588848334e-05, + "loss": 0.0341, + "step": 74230 + }, + { + "epoch": 0.16372870964398267, + "grad_norm": 0.10724382102489471, + "learning_rate": 2.6913976060273124e-05, + "loss": 0.033, + "step": 74240 + }, + { + "epoch": 0.16375076361888086, + "grad_norm": 0.11872603744268417, + "learning_rate": 2.691297138693186e-05, + "loss": 0.0336, + "step": 74250 + }, + { + "epoch": 0.163772817593779, + "grad_norm": 0.10616189986467361, + "learning_rate": 2.6911966568836742e-05, + "loss": 0.0343, + "step": 74260 + }, + { + "epoch": 0.16379487156867717, + "grad_norm": 0.13596129417419434, + "learning_rate": 2.691096160599998e-05, + "loss": 0.0336, + "step": 74270 + }, + { + "epoch": 0.16381692554357535, + "grad_norm": 0.12287753820419312, + "learning_rate": 2.6909956498433798e-05, + "loss": 0.0347, + "step": 74280 + }, + { + "epoch": 0.1638389795184735, + "grad_norm": 0.09542059898376465, + "learning_rate": 2.69089512461504e-05, + "loss": 0.0358, + "step": 74290 + }, + { + "epoch": 0.16386103349337167, + "grad_norm": 0.10663482546806335, + "learning_rate": 2.6907945849161992e-05, + "loss": 0.0342, + "step": 74300 + }, + { + "epoch": 0.16388308746826985, + "grad_norm": 0.11571594327688217, + "learning_rate": 2.69069403074808e-05, + "loss": 0.0345, + "step": 74310 + }, + { + "epoch": 0.163905141443168, + "grad_norm": 0.110112264752388, + "learning_rate": 2.6905934621119047e-05, + "loss": 0.0358, + "step": 74320 + }, + { + "epoch": 0.16392719541806616, + "grad_norm": 0.10821487009525299, + "learning_rate": 2.6904928790088945e-05, + "loss": 0.0348, + "step": 74330 + }, + { + "epoch": 0.16394924939296435, + "grad_norm": 0.10628573596477509, + "learning_rate": 2.6903922814402716e-05, + "loss": 0.0353, + "step": 74340 + }, + { + "epoch": 0.1639713033678625, + "grad_norm": 0.12507542967796326, + "learning_rate": 2.6902916694072587e-05, + "loss": 0.0337, + "step": 74350 + }, + { + "epoch": 0.16399335734276066, + "grad_norm": 0.10098560154438019, + "learning_rate": 2.6901910429110782e-05, + "loss": 0.034, + "step": 74360 + }, + { + "epoch": 0.16401541131765884, + "grad_norm": 0.11002005636692047, + "learning_rate": 2.690090401952953e-05, + "loss": 0.0325, + "step": 74370 + }, + { + "epoch": 0.164037465292557, + "grad_norm": 0.101382777094841, + "learning_rate": 2.689989746534105e-05, + "loss": 0.0341, + "step": 74380 + }, + { + "epoch": 0.16405951926745516, + "grad_norm": 0.08498819172382355, + "learning_rate": 2.689889076655759e-05, + "loss": 0.0345, + "step": 74390 + }, + { + "epoch": 0.16408157324235334, + "grad_norm": 0.10785850137472153, + "learning_rate": 2.6897883923191373e-05, + "loss": 0.0355, + "step": 74400 + }, + { + "epoch": 0.1641036272172515, + "grad_norm": 0.13379734754562378, + "learning_rate": 2.6896876935254628e-05, + "loss": 0.0356, + "step": 74410 + }, + { + "epoch": 0.16412568119214968, + "grad_norm": 0.12070869654417038, + "learning_rate": 2.6895869802759595e-05, + "loss": 0.0349, + "step": 74420 + }, + { + "epoch": 0.16414773516704784, + "grad_norm": 0.10717897117137909, + "learning_rate": 2.6894862525718515e-05, + "loss": 0.0359, + "step": 74430 + }, + { + "epoch": 0.164169789141946, + "grad_norm": 0.12057067453861237, + "learning_rate": 2.6893855104143623e-05, + "loss": 0.0355, + "step": 74440 + }, + { + "epoch": 0.16419184311684418, + "grad_norm": 0.09162268787622452, + "learning_rate": 2.689284753804716e-05, + "loss": 0.0356, + "step": 74450 + }, + { + "epoch": 0.16421389709174233, + "grad_norm": 0.11800924688577652, + "learning_rate": 2.6891839827441376e-05, + "loss": 0.0366, + "step": 74460 + }, + { + "epoch": 0.1642359510666405, + "grad_norm": 0.11040183901786804, + "learning_rate": 2.6890831972338507e-05, + "loss": 0.0348, + "step": 74470 + }, + { + "epoch": 0.16425800504153867, + "grad_norm": 0.10705482959747314, + "learning_rate": 2.6889823972750808e-05, + "loss": 0.0353, + "step": 74480 + }, + { + "epoch": 0.16428005901643683, + "grad_norm": 0.11649520695209503, + "learning_rate": 2.6888815828690513e-05, + "loss": 0.0342, + "step": 74490 + }, + { + "epoch": 0.164302112991335, + "grad_norm": 0.09236778318881989, + "learning_rate": 2.6887807540169884e-05, + "loss": 0.0351, + "step": 74500 + }, + { + "epoch": 0.16432416696623317, + "grad_norm": 0.09460470825433731, + "learning_rate": 2.6886799107201173e-05, + "loss": 0.0333, + "step": 74510 + }, + { + "epoch": 0.16434622094113133, + "grad_norm": 0.18851348757743835, + "learning_rate": 2.6885790529796627e-05, + "loss": 0.0358, + "step": 74520 + }, + { + "epoch": 0.16436827491602948, + "grad_norm": 0.10355675965547562, + "learning_rate": 2.6884781807968505e-05, + "loss": 0.0329, + "step": 74530 + }, + { + "epoch": 0.16439032889092767, + "grad_norm": 0.11502845585346222, + "learning_rate": 2.688377294172906e-05, + "loss": 0.0342, + "step": 74540 + }, + { + "epoch": 0.16441238286582582, + "grad_norm": 0.11401163041591644, + "learning_rate": 2.6882763931090557e-05, + "loss": 0.0356, + "step": 74550 + }, + { + "epoch": 0.16443443684072398, + "grad_norm": 0.10560581088066101, + "learning_rate": 2.6881754776065252e-05, + "loss": 0.0332, + "step": 74560 + }, + { + "epoch": 0.16445649081562216, + "grad_norm": 0.13255326449871063, + "learning_rate": 2.688074547666541e-05, + "loss": 0.036, + "step": 74570 + }, + { + "epoch": 0.16447854479052032, + "grad_norm": 0.11478979885578156, + "learning_rate": 2.687973603290329e-05, + "loss": 0.0346, + "step": 74580 + }, + { + "epoch": 0.16450059876541848, + "grad_norm": 0.12988336384296417, + "learning_rate": 2.687872644479116e-05, + "loss": 0.0346, + "step": 74590 + }, + { + "epoch": 0.16452265274031666, + "grad_norm": 0.10894661396741867, + "learning_rate": 2.6877716712341294e-05, + "loss": 0.0339, + "step": 74600 + }, + { + "epoch": 0.16454470671521482, + "grad_norm": 0.1343277543783188, + "learning_rate": 2.687670683556595e-05, + "loss": 0.0335, + "step": 74610 + }, + { + "epoch": 0.16456676069011297, + "grad_norm": 0.11845221370458603, + "learning_rate": 2.687569681447741e-05, + "loss": 0.0329, + "step": 74620 + }, + { + "epoch": 0.16458881466501116, + "grad_norm": 0.10828351974487305, + "learning_rate": 2.6874686649087934e-05, + "loss": 0.0339, + "step": 74630 + }, + { + "epoch": 0.1646108686399093, + "grad_norm": 0.10279036313295364, + "learning_rate": 2.687367633940981e-05, + "loss": 0.0332, + "step": 74640 + }, + { + "epoch": 0.16463292261480747, + "grad_norm": 0.12254020571708679, + "learning_rate": 2.687266588545531e-05, + "loss": 0.0341, + "step": 74650 + }, + { + "epoch": 0.16465497658970565, + "grad_norm": 0.11695867031812668, + "learning_rate": 2.6871655287236698e-05, + "loss": 0.034, + "step": 74660 + }, + { + "epoch": 0.1646770305646038, + "grad_norm": 0.12932036817073822, + "learning_rate": 2.6870644544766276e-05, + "loss": 0.0342, + "step": 74670 + }, + { + "epoch": 0.16469908453950197, + "grad_norm": 0.11057016998529434, + "learning_rate": 2.6869633658056313e-05, + "loss": 0.0356, + "step": 74680 + }, + { + "epoch": 0.16472113851440015, + "grad_norm": 0.09353917837142944, + "learning_rate": 2.6868622627119087e-05, + "loss": 0.0345, + "step": 74690 + }, + { + "epoch": 0.1647431924892983, + "grad_norm": 0.11880657076835632, + "learning_rate": 2.6867611451966898e-05, + "loss": 0.0344, + "step": 74700 + }, + { + "epoch": 0.16476524646419646, + "grad_norm": 0.08985203504562378, + "learning_rate": 2.6866600132612025e-05, + "loss": 0.033, + "step": 74710 + }, + { + "epoch": 0.16478730043909465, + "grad_norm": 0.12881286442279816, + "learning_rate": 2.6865588669066754e-05, + "loss": 0.0342, + "step": 74720 + }, + { + "epoch": 0.1648093544139928, + "grad_norm": 0.1088448017835617, + "learning_rate": 2.686457706134338e-05, + "loss": 0.0349, + "step": 74730 + }, + { + "epoch": 0.16483140838889096, + "grad_norm": 0.09177844971418381, + "learning_rate": 2.686356530945419e-05, + "loss": 0.0345, + "step": 74740 + }, + { + "epoch": 0.16485346236378914, + "grad_norm": 0.09124225378036499, + "learning_rate": 2.6862553413411483e-05, + "loss": 0.0328, + "step": 74750 + }, + { + "epoch": 0.1648755163386873, + "grad_norm": 0.10486242175102234, + "learning_rate": 2.6861541373227547e-05, + "loss": 0.034, + "step": 74760 + }, + { + "epoch": 0.16489757031358546, + "grad_norm": 0.11886871606111526, + "learning_rate": 2.6860529188914687e-05, + "loss": 0.0348, + "step": 74770 + }, + { + "epoch": 0.16491962428848364, + "grad_norm": 0.10553038865327835, + "learning_rate": 2.6859516860485202e-05, + "loss": 0.0327, + "step": 74780 + }, + { + "epoch": 0.1649416782633818, + "grad_norm": 0.13886858522891998, + "learning_rate": 2.6858504387951384e-05, + "loss": 0.0356, + "step": 74790 + }, + { + "epoch": 0.16496373223827995, + "grad_norm": 0.09602636843919754, + "learning_rate": 2.685749177132555e-05, + "loss": 0.0353, + "step": 74800 + }, + { + "epoch": 0.16498578621317814, + "grad_norm": 0.12335468828678131, + "learning_rate": 2.685647901061999e-05, + "loss": 0.0344, + "step": 74810 + }, + { + "epoch": 0.1650078401880763, + "grad_norm": 0.0995018482208252, + "learning_rate": 2.6855466105847017e-05, + "loss": 0.0335, + "step": 74820 + }, + { + "epoch": 0.16502989416297448, + "grad_norm": 0.09733335673809052, + "learning_rate": 2.6854453057018934e-05, + "loss": 0.0356, + "step": 74830 + }, + { + "epoch": 0.16505194813787263, + "grad_norm": 0.09985069185495377, + "learning_rate": 2.685343986414806e-05, + "loss": 0.0352, + "step": 74840 + }, + { + "epoch": 0.1650740021127708, + "grad_norm": 0.10824207961559296, + "learning_rate": 2.6852426527246695e-05, + "loss": 0.0317, + "step": 74850 + }, + { + "epoch": 0.16509605608766897, + "grad_norm": 0.09647326916456223, + "learning_rate": 2.685141304632716e-05, + "loss": 0.0339, + "step": 74860 + }, + { + "epoch": 0.16511811006256713, + "grad_norm": 0.09729236364364624, + "learning_rate": 2.6850399421401765e-05, + "loss": 0.0344, + "step": 74870 + }, + { + "epoch": 0.1651401640374653, + "grad_norm": 0.12656578421592712, + "learning_rate": 2.6849385652482828e-05, + "loss": 0.0325, + "step": 74880 + }, + { + "epoch": 0.16516221801236347, + "grad_norm": 0.10333958268165588, + "learning_rate": 2.684837173958267e-05, + "loss": 0.0345, + "step": 74890 + }, + { + "epoch": 0.16518427198726163, + "grad_norm": 0.10655205696821213, + "learning_rate": 2.684735768271361e-05, + "loss": 0.031, + "step": 74900 + }, + { + "epoch": 0.16520632596215978, + "grad_norm": 0.1098477691411972, + "learning_rate": 2.6846343481887965e-05, + "loss": 0.037, + "step": 74910 + }, + { + "epoch": 0.16522837993705797, + "grad_norm": 0.10930754244327545, + "learning_rate": 2.684532913711806e-05, + "loss": 0.0342, + "step": 74920 + }, + { + "epoch": 0.16525043391195612, + "grad_norm": 0.12088797986507416, + "learning_rate": 2.6844314648416228e-05, + "loss": 0.0351, + "step": 74930 + }, + { + "epoch": 0.16527248788685428, + "grad_norm": 0.1350221335887909, + "learning_rate": 2.6843300015794787e-05, + "loss": 0.0354, + "step": 74940 + }, + { + "epoch": 0.16529454186175246, + "grad_norm": 0.11273011565208435, + "learning_rate": 2.684228523926607e-05, + "loss": 0.0352, + "step": 74950 + }, + { + "epoch": 0.16531659583665062, + "grad_norm": 0.1216055154800415, + "learning_rate": 2.6841270318842407e-05, + "loss": 0.035, + "step": 74960 + }, + { + "epoch": 0.16533864981154878, + "grad_norm": 0.15390320122241974, + "learning_rate": 2.6840255254536123e-05, + "loss": 0.0318, + "step": 74970 + }, + { + "epoch": 0.16536070378644696, + "grad_norm": 0.10596555471420288, + "learning_rate": 2.6839240046359565e-05, + "loss": 0.0347, + "step": 74980 + }, + { + "epoch": 0.16538275776134512, + "grad_norm": 0.14249537885189056, + "learning_rate": 2.6838224694325062e-05, + "loss": 0.0358, + "step": 74990 + }, + { + "epoch": 0.16540481173624327, + "grad_norm": 0.09516648948192596, + "learning_rate": 2.6837209198444948e-05, + "loss": 0.0344, + "step": 75000 + }, + { + "epoch": 0.16542686571114146, + "grad_norm": 0.1320265531539917, + "learning_rate": 2.6836193558731572e-05, + "loss": 0.0341, + "step": 75010 + }, + { + "epoch": 0.1654489196860396, + "grad_norm": 0.10827714949846268, + "learning_rate": 2.6835177775197266e-05, + "loss": 0.035, + "step": 75020 + }, + { + "epoch": 0.16547097366093777, + "grad_norm": 0.10376626253128052, + "learning_rate": 2.6834161847854374e-05, + "loss": 0.0343, + "step": 75030 + }, + { + "epoch": 0.16549302763583595, + "grad_norm": 0.10210752487182617, + "learning_rate": 2.6833145776715242e-05, + "loss": 0.0326, + "step": 75040 + }, + { + "epoch": 0.1655150816107341, + "grad_norm": 0.11370886117219925, + "learning_rate": 2.6832129561792216e-05, + "loss": 0.0326, + "step": 75050 + }, + { + "epoch": 0.16553713558563227, + "grad_norm": 0.12472925335168839, + "learning_rate": 2.6831113203097654e-05, + "loss": 0.0354, + "step": 75060 + }, + { + "epoch": 0.16555918956053045, + "grad_norm": 0.15527482330799103, + "learning_rate": 2.6830096700643887e-05, + "loss": 0.0353, + "step": 75070 + }, + { + "epoch": 0.1655812435354286, + "grad_norm": 0.13759154081344604, + "learning_rate": 2.682908005444328e-05, + "loss": 0.0351, + "step": 75080 + }, + { + "epoch": 0.16560329751032676, + "grad_norm": 0.10750874876976013, + "learning_rate": 2.6828063264508176e-05, + "loss": 0.0332, + "step": 75090 + }, + { + "epoch": 0.16562535148522495, + "grad_norm": 0.09729194641113281, + "learning_rate": 2.682704633085094e-05, + "loss": 0.0344, + "step": 75100 + }, + { + "epoch": 0.1656474054601231, + "grad_norm": 0.09896860271692276, + "learning_rate": 2.6826029253483923e-05, + "loss": 0.0337, + "step": 75110 + }, + { + "epoch": 0.16566945943502126, + "grad_norm": 0.10511229187250137, + "learning_rate": 2.6825012032419486e-05, + "loss": 0.0332, + "step": 75120 + }, + { + "epoch": 0.16569151340991944, + "grad_norm": 0.09591428935527802, + "learning_rate": 2.6823994667669987e-05, + "loss": 0.036, + "step": 75130 + }, + { + "epoch": 0.1657135673848176, + "grad_norm": 0.09432950615882874, + "learning_rate": 2.682297715924779e-05, + "loss": 0.0341, + "step": 75140 + }, + { + "epoch": 0.16573562135971576, + "grad_norm": 0.12132391333580017, + "learning_rate": 2.682195950716526e-05, + "loss": 0.0329, + "step": 75150 + }, + { + "epoch": 0.16575767533461394, + "grad_norm": 0.0992090255022049, + "learning_rate": 2.6820941711434758e-05, + "loss": 0.0347, + "step": 75160 + }, + { + "epoch": 0.1657797293095121, + "grad_norm": 0.10331864655017853, + "learning_rate": 2.681992377206865e-05, + "loss": 0.0353, + "step": 75170 + }, + { + "epoch": 0.16580178328441025, + "grad_norm": 0.11602826416492462, + "learning_rate": 2.6818905689079313e-05, + "loss": 0.0312, + "step": 75180 + }, + { + "epoch": 0.16582383725930844, + "grad_norm": 0.14065569639205933, + "learning_rate": 2.681788746247911e-05, + "loss": 0.0331, + "step": 75190 + }, + { + "epoch": 0.1658458912342066, + "grad_norm": 0.1481083184480667, + "learning_rate": 2.681686909228042e-05, + "loss": 0.033, + "step": 75200 + }, + { + "epoch": 0.16586794520910475, + "grad_norm": 0.1073915883898735, + "learning_rate": 2.681585057849561e-05, + "loss": 0.0329, + "step": 75210 + }, + { + "epoch": 0.16588999918400293, + "grad_norm": 0.10923118889331818, + "learning_rate": 2.6814831921137062e-05, + "loss": 0.0364, + "step": 75220 + }, + { + "epoch": 0.1659120531589011, + "grad_norm": 0.13927970826625824, + "learning_rate": 2.681381312021715e-05, + "loss": 0.0335, + "step": 75230 + }, + { + "epoch": 0.16593410713379925, + "grad_norm": 0.12057141214609146, + "learning_rate": 2.6812794175748253e-05, + "loss": 0.0374, + "step": 75240 + }, + { + "epoch": 0.16595616110869743, + "grad_norm": 0.12308388948440552, + "learning_rate": 2.6811775087742755e-05, + "loss": 0.0353, + "step": 75250 + }, + { + "epoch": 0.1659782150835956, + "grad_norm": 0.13333231210708618, + "learning_rate": 2.6810755856213037e-05, + "loss": 0.0362, + "step": 75260 + }, + { + "epoch": 0.16600026905849377, + "grad_norm": 0.1577335149049759, + "learning_rate": 2.6809736481171486e-05, + "loss": 0.0359, + "step": 75270 + }, + { + "epoch": 0.16602232303339193, + "grad_norm": 0.10155249387025833, + "learning_rate": 2.6808716962630488e-05, + "loss": 0.0354, + "step": 75280 + }, + { + "epoch": 0.16604437700829008, + "grad_norm": 0.101931631565094, + "learning_rate": 2.6807697300602424e-05, + "loss": 0.0348, + "step": 75290 + }, + { + "epoch": 0.16606643098318827, + "grad_norm": 0.10797058790922165, + "learning_rate": 2.6806677495099695e-05, + "loss": 0.0359, + "step": 75300 + }, + { + "epoch": 0.16608848495808642, + "grad_norm": 0.1440293937921524, + "learning_rate": 2.680565754613468e-05, + "loss": 0.0336, + "step": 75310 + }, + { + "epoch": 0.16611053893298458, + "grad_norm": 0.0954408198595047, + "learning_rate": 2.680463745371979e-05, + "loss": 0.0335, + "step": 75320 + }, + { + "epoch": 0.16613259290788276, + "grad_norm": 0.11502159386873245, + "learning_rate": 2.6803617217867403e-05, + "loss": 0.0338, + "step": 75330 + }, + { + "epoch": 0.16615464688278092, + "grad_norm": 0.12745238840579987, + "learning_rate": 2.680259683858992e-05, + "loss": 0.0349, + "step": 75340 + }, + { + "epoch": 0.16617670085767908, + "grad_norm": 0.1107691079378128, + "learning_rate": 2.6801576315899744e-05, + "loss": 0.0336, + "step": 75350 + }, + { + "epoch": 0.16619875483257726, + "grad_norm": 0.12033537775278091, + "learning_rate": 2.6800555649809277e-05, + "loss": 0.0359, + "step": 75360 + }, + { + "epoch": 0.16622080880747542, + "grad_norm": 0.10076960176229477, + "learning_rate": 2.6799534840330914e-05, + "loss": 0.0338, + "step": 75370 + }, + { + "epoch": 0.16624286278237357, + "grad_norm": 0.1120896264910698, + "learning_rate": 2.6798513887477063e-05, + "loss": 0.0343, + "step": 75380 + }, + { + "epoch": 0.16626491675727176, + "grad_norm": 0.1391092836856842, + "learning_rate": 2.6797492791260124e-05, + "loss": 0.036, + "step": 75390 + }, + { + "epoch": 0.16628697073216991, + "grad_norm": 0.11389380693435669, + "learning_rate": 2.6796471551692515e-05, + "loss": 0.0346, + "step": 75400 + }, + { + "epoch": 0.16630902470706807, + "grad_norm": 0.1070532500743866, + "learning_rate": 2.6795450168786634e-05, + "loss": 0.0329, + "step": 75410 + }, + { + "epoch": 0.16633107868196625, + "grad_norm": 0.11205891519784927, + "learning_rate": 2.6794428642554903e-05, + "loss": 0.0355, + "step": 75420 + }, + { + "epoch": 0.1663531326568644, + "grad_norm": 0.10419946908950806, + "learning_rate": 2.6793406973009728e-05, + "loss": 0.0334, + "step": 75430 + }, + { + "epoch": 0.16637518663176257, + "grad_norm": 0.10474658757448196, + "learning_rate": 2.679238516016352e-05, + "loss": 0.0342, + "step": 75440 + }, + { + "epoch": 0.16639724060666075, + "grad_norm": 0.10215763002634048, + "learning_rate": 2.6791363204028693e-05, + "loss": 0.0363, + "step": 75450 + }, + { + "epoch": 0.1664192945815589, + "grad_norm": 0.09990071505308151, + "learning_rate": 2.679034110461768e-05, + "loss": 0.0344, + "step": 75460 + }, + { + "epoch": 0.16644134855645706, + "grad_norm": 0.12217818200588226, + "learning_rate": 2.6789318861942886e-05, + "loss": 0.0347, + "step": 75470 + }, + { + "epoch": 0.16646340253135525, + "grad_norm": 0.10923565179109573, + "learning_rate": 2.6788296476016736e-05, + "loss": 0.0335, + "step": 75480 + }, + { + "epoch": 0.1664854565062534, + "grad_norm": 0.10792343318462372, + "learning_rate": 2.6787273946851656e-05, + "loss": 0.0356, + "step": 75490 + }, + { + "epoch": 0.16650751048115156, + "grad_norm": 0.10489644855260849, + "learning_rate": 2.6786251274460068e-05, + "loss": 0.0363, + "step": 75500 + }, + { + "epoch": 0.16652956445604974, + "grad_norm": 0.11364581435918808, + "learning_rate": 2.6785228458854396e-05, + "loss": 0.0356, + "step": 75510 + }, + { + "epoch": 0.1665516184309479, + "grad_norm": 0.12081853300333023, + "learning_rate": 2.6784205500047073e-05, + "loss": 0.0357, + "step": 75520 + }, + { + "epoch": 0.16657367240584606, + "grad_norm": 0.14453187584877014, + "learning_rate": 2.678318239805053e-05, + "loss": 0.0355, + "step": 75530 + }, + { + "epoch": 0.16659572638074424, + "grad_norm": 0.09244607388973236, + "learning_rate": 2.6782159152877192e-05, + "loss": 0.0345, + "step": 75540 + }, + { + "epoch": 0.1666177803556424, + "grad_norm": 0.13586260378360748, + "learning_rate": 2.67811357645395e-05, + "loss": 0.0334, + "step": 75550 + }, + { + "epoch": 0.16663983433054055, + "grad_norm": 0.12256332486867905, + "learning_rate": 2.678011223304988e-05, + "loss": 0.0368, + "step": 75560 + }, + { + "epoch": 0.16666188830543874, + "grad_norm": 0.08382165431976318, + "learning_rate": 2.6779088558420778e-05, + "loss": 0.0346, + "step": 75570 + }, + { + "epoch": 0.1666839422803369, + "grad_norm": 0.10271003097295761, + "learning_rate": 2.6778064740664625e-05, + "loss": 0.0341, + "step": 75580 + }, + { + "epoch": 0.16670599625523505, + "grad_norm": 0.08939483016729355, + "learning_rate": 2.677704077979387e-05, + "loss": 0.0349, + "step": 75590 + }, + { + "epoch": 0.16672805023013323, + "grad_norm": 0.10506007075309753, + "learning_rate": 2.6776016675820947e-05, + "loss": 0.0324, + "step": 75600 + }, + { + "epoch": 0.1667501042050314, + "grad_norm": 0.12602940201759338, + "learning_rate": 2.6774992428758305e-05, + "loss": 0.0355, + "step": 75610 + }, + { + "epoch": 0.16677215817992955, + "grad_norm": 0.13685353100299835, + "learning_rate": 2.6773968038618388e-05, + "loss": 0.034, + "step": 75620 + }, + { + "epoch": 0.16679421215482773, + "grad_norm": 0.1012098640203476, + "learning_rate": 2.677294350541364e-05, + "loss": 0.0337, + "step": 75630 + }, + { + "epoch": 0.1668162661297259, + "grad_norm": 0.08631070703268051, + "learning_rate": 2.6771918829156513e-05, + "loss": 0.0368, + "step": 75640 + }, + { + "epoch": 0.16683832010462404, + "grad_norm": 0.11265826225280762, + "learning_rate": 2.677089400985946e-05, + "loss": 0.0347, + "step": 75650 + }, + { + "epoch": 0.16686037407952223, + "grad_norm": 0.13816829025745392, + "learning_rate": 2.676986904753493e-05, + "loss": 0.0368, + "step": 75660 + }, + { + "epoch": 0.16688242805442038, + "grad_norm": 0.09127304702997208, + "learning_rate": 2.676884394219538e-05, + "loss": 0.0355, + "step": 75670 + }, + { + "epoch": 0.16690448202931854, + "grad_norm": 0.09418071061372757, + "learning_rate": 2.676781869385326e-05, + "loss": 0.0364, + "step": 75680 + }, + { + "epoch": 0.16692653600421672, + "grad_norm": 0.10212782025337219, + "learning_rate": 2.6766793302521036e-05, + "loss": 0.0345, + "step": 75690 + }, + { + "epoch": 0.16694858997911488, + "grad_norm": 0.1407444328069687, + "learning_rate": 2.6765767768211165e-05, + "loss": 0.034, + "step": 75700 + }, + { + "epoch": 0.16697064395401306, + "grad_norm": 0.12145689874887466, + "learning_rate": 2.6764742090936105e-05, + "loss": 0.0345, + "step": 75710 + }, + { + "epoch": 0.16699269792891122, + "grad_norm": 0.13918070495128632, + "learning_rate": 2.6763716270708318e-05, + "loss": 0.0336, + "step": 75720 + }, + { + "epoch": 0.16701475190380938, + "grad_norm": 0.10537014901638031, + "learning_rate": 2.6762690307540275e-05, + "loss": 0.0334, + "step": 75730 + }, + { + "epoch": 0.16703680587870756, + "grad_norm": 0.10934901982545853, + "learning_rate": 2.6761664201444435e-05, + "loss": 0.0335, + "step": 75740 + }, + { + "epoch": 0.16705885985360572, + "grad_norm": 0.12830963730812073, + "learning_rate": 2.676063795243328e-05, + "loss": 0.0342, + "step": 75750 + }, + { + "epoch": 0.16708091382850387, + "grad_norm": 0.10057773441076279, + "learning_rate": 2.6759611560519256e-05, + "loss": 0.0352, + "step": 75760 + }, + { + "epoch": 0.16710296780340206, + "grad_norm": 0.10749910026788712, + "learning_rate": 2.6758585025714857e-05, + "loss": 0.0325, + "step": 75770 + }, + { + "epoch": 0.16712502177830021, + "grad_norm": 0.18418274819850922, + "learning_rate": 2.6757558348032548e-05, + "loss": 0.0344, + "step": 75780 + }, + { + "epoch": 0.16714707575319837, + "grad_norm": 0.09158485382795334, + "learning_rate": 2.6756531527484803e-05, + "loss": 0.0326, + "step": 75790 + }, + { + "epoch": 0.16716912972809655, + "grad_norm": 0.11842810362577438, + "learning_rate": 2.6755504564084098e-05, + "loss": 0.034, + "step": 75800 + }, + { + "epoch": 0.1671911837029947, + "grad_norm": 0.1118403822183609, + "learning_rate": 2.6754477457842917e-05, + "loss": 0.0348, + "step": 75810 + }, + { + "epoch": 0.16721323767789287, + "grad_norm": 0.09595313668251038, + "learning_rate": 2.6753450208773734e-05, + "loss": 0.0333, + "step": 75820 + }, + { + "epoch": 0.16723529165279105, + "grad_norm": 0.10784897208213806, + "learning_rate": 2.6752422816889035e-05, + "loss": 0.0335, + "step": 75830 + }, + { + "epoch": 0.1672573456276892, + "grad_norm": 0.12808758020401, + "learning_rate": 2.6751395282201304e-05, + "loss": 0.0354, + "step": 75840 + }, + { + "epoch": 0.16727939960258736, + "grad_norm": 0.10697917640209198, + "learning_rate": 2.6750367604723025e-05, + "loss": 0.034, + "step": 75850 + }, + { + "epoch": 0.16730145357748555, + "grad_norm": 0.11967422068119049, + "learning_rate": 2.674933978446668e-05, + "loss": 0.0344, + "step": 75860 + }, + { + "epoch": 0.1673235075523837, + "grad_norm": 0.11443404108285904, + "learning_rate": 2.674831182144477e-05, + "loss": 0.0342, + "step": 75870 + }, + { + "epoch": 0.16734556152728186, + "grad_norm": 0.13523827493190765, + "learning_rate": 2.6747283715669776e-05, + "loss": 0.0362, + "step": 75880 + }, + { + "epoch": 0.16736761550218004, + "grad_norm": 0.11615776270627975, + "learning_rate": 2.6746255467154197e-05, + "loss": 0.0343, + "step": 75890 + }, + { + "epoch": 0.1673896694770782, + "grad_norm": 0.14947544038295746, + "learning_rate": 2.674522707591052e-05, + "loss": 0.0326, + "step": 75900 + }, + { + "epoch": 0.16741172345197636, + "grad_norm": 0.09616191685199738, + "learning_rate": 2.6744198541951244e-05, + "loss": 0.0338, + "step": 75910 + }, + { + "epoch": 0.16743377742687454, + "grad_norm": 0.09501350671052933, + "learning_rate": 2.6743169865288872e-05, + "loss": 0.0342, + "step": 75920 + }, + { + "epoch": 0.1674558314017727, + "grad_norm": 0.10536991059780121, + "learning_rate": 2.6742141045935896e-05, + "loss": 0.0359, + "step": 75930 + }, + { + "epoch": 0.16747788537667085, + "grad_norm": 0.12359856814146042, + "learning_rate": 2.6741112083904817e-05, + "loss": 0.0348, + "step": 75940 + }, + { + "epoch": 0.16749993935156904, + "grad_norm": 0.15049272775650024, + "learning_rate": 2.6740082979208145e-05, + "loss": 0.0349, + "step": 75950 + }, + { + "epoch": 0.1675219933264672, + "grad_norm": 0.12159136682748795, + "learning_rate": 2.673905373185838e-05, + "loss": 0.0317, + "step": 75960 + }, + { + "epoch": 0.16754404730136535, + "grad_norm": 0.0882176011800766, + "learning_rate": 2.673802434186803e-05, + "loss": 0.0356, + "step": 75970 + }, + { + "epoch": 0.16756610127626353, + "grad_norm": 0.08316049724817276, + "learning_rate": 2.6736994809249598e-05, + "loss": 0.0339, + "step": 75980 + }, + { + "epoch": 0.1675881552511617, + "grad_norm": 0.13801918923854828, + "learning_rate": 2.67359651340156e-05, + "loss": 0.0355, + "step": 75990 + }, + { + "epoch": 0.16761020922605985, + "grad_norm": 0.09494804590940475, + "learning_rate": 2.673493531617854e-05, + "loss": 0.0339, + "step": 76000 + }, + { + "epoch": 0.16763226320095803, + "grad_norm": 0.1024220734834671, + "learning_rate": 2.6733905355750943e-05, + "loss": 0.0347, + "step": 76010 + }, + { + "epoch": 0.1676543171758562, + "grad_norm": 0.09783333539962769, + "learning_rate": 2.6732875252745315e-05, + "loss": 0.0332, + "step": 76020 + }, + { + "epoch": 0.16767637115075434, + "grad_norm": 0.11248227953910828, + "learning_rate": 2.6731845007174173e-05, + "loss": 0.0349, + "step": 76030 + }, + { + "epoch": 0.16769842512565253, + "grad_norm": 0.10788309574127197, + "learning_rate": 2.6730814619050038e-05, + "loss": 0.0327, + "step": 76040 + }, + { + "epoch": 0.16772047910055068, + "grad_norm": 0.13717283308506012, + "learning_rate": 2.672978408838543e-05, + "loss": 0.0327, + "step": 76050 + }, + { + "epoch": 0.16774253307544884, + "grad_norm": 0.12324196845293045, + "learning_rate": 2.6728753415192873e-05, + "loss": 0.0357, + "step": 76060 + }, + { + "epoch": 0.16776458705034702, + "grad_norm": 0.10321437567472458, + "learning_rate": 2.6727722599484883e-05, + "loss": 0.0337, + "step": 76070 + }, + { + "epoch": 0.16778664102524518, + "grad_norm": 0.13554269075393677, + "learning_rate": 2.6726691641273996e-05, + "loss": 0.0347, + "step": 76080 + }, + { + "epoch": 0.16780869500014334, + "grad_norm": 0.12773042917251587, + "learning_rate": 2.6725660540572735e-05, + "loss": 0.0358, + "step": 76090 + }, + { + "epoch": 0.16783074897504152, + "grad_norm": 0.13258260488510132, + "learning_rate": 2.672462929739362e-05, + "loss": 0.033, + "step": 76100 + }, + { + "epoch": 0.16785280294993968, + "grad_norm": 0.09504686295986176, + "learning_rate": 2.6723597911749193e-05, + "loss": 0.0347, + "step": 76110 + }, + { + "epoch": 0.16787485692483786, + "grad_norm": 0.07639602571725845, + "learning_rate": 2.6722566383651984e-05, + "loss": 0.035, + "step": 76120 + }, + { + "epoch": 0.16789691089973602, + "grad_norm": 0.10175598412752151, + "learning_rate": 2.6721534713114526e-05, + "loss": 0.033, + "step": 76130 + }, + { + "epoch": 0.16791896487463417, + "grad_norm": 0.11727286130189896, + "learning_rate": 2.672050290014935e-05, + "loss": 0.0346, + "step": 76140 + }, + { + "epoch": 0.16794101884953236, + "grad_norm": 0.12429173290729523, + "learning_rate": 2.6719470944769e-05, + "loss": 0.0345, + "step": 76150 + }, + { + "epoch": 0.16796307282443051, + "grad_norm": 0.10252893716096878, + "learning_rate": 2.6718438846986014e-05, + "loss": 0.035, + "step": 76160 + }, + { + "epoch": 0.16798512679932867, + "grad_norm": 0.10461138933897018, + "learning_rate": 2.6717406606812932e-05, + "loss": 0.0349, + "step": 76170 + }, + { + "epoch": 0.16800718077422686, + "grad_norm": 0.09778036922216415, + "learning_rate": 2.6716374224262296e-05, + "loss": 0.0338, + "step": 76180 + }, + { + "epoch": 0.168029234749125, + "grad_norm": 0.09967268258333206, + "learning_rate": 2.671534169934665e-05, + "loss": 0.0336, + "step": 76190 + }, + { + "epoch": 0.16805128872402317, + "grad_norm": 0.09313733130693436, + "learning_rate": 2.6714309032078544e-05, + "loss": 0.0341, + "step": 76200 + }, + { + "epoch": 0.16807334269892135, + "grad_norm": 0.10764195770025253, + "learning_rate": 2.671327622247052e-05, + "loss": 0.0326, + "step": 76210 + }, + { + "epoch": 0.1680953966738195, + "grad_norm": 0.10467347502708435, + "learning_rate": 2.6712243270535133e-05, + "loss": 0.0345, + "step": 76220 + }, + { + "epoch": 0.16811745064871766, + "grad_norm": 0.11403192579746246, + "learning_rate": 2.671121017628493e-05, + "loss": 0.0342, + "step": 76230 + }, + { + "epoch": 0.16813950462361585, + "grad_norm": 0.09961669147014618, + "learning_rate": 2.671017693973247e-05, + "loss": 0.0372, + "step": 76240 + }, + { + "epoch": 0.168161558598514, + "grad_norm": 0.12246309220790863, + "learning_rate": 2.67091435608903e-05, + "loss": 0.0362, + "step": 76250 + }, + { + "epoch": 0.16818361257341216, + "grad_norm": 0.15653979778289795, + "learning_rate": 2.670811003977098e-05, + "loss": 0.0354, + "step": 76260 + }, + { + "epoch": 0.16820566654831035, + "grad_norm": 0.1224665641784668, + "learning_rate": 2.670707637638707e-05, + "loss": 0.0346, + "step": 76270 + }, + { + "epoch": 0.1682277205232085, + "grad_norm": 0.10991312563419342, + "learning_rate": 2.6706042570751132e-05, + "loss": 0.0347, + "step": 76280 + }, + { + "epoch": 0.16824977449810666, + "grad_norm": 0.0968884527683258, + "learning_rate": 2.670500862287572e-05, + "loss": 0.0336, + "step": 76290 + }, + { + "epoch": 0.16827182847300484, + "grad_norm": 0.10374508798122406, + "learning_rate": 2.6703974532773403e-05, + "loss": 0.0342, + "step": 76300 + }, + { + "epoch": 0.168293882447903, + "grad_norm": 0.12656700611114502, + "learning_rate": 2.670294030045675e-05, + "loss": 0.0335, + "step": 76310 + }, + { + "epoch": 0.16831593642280115, + "grad_norm": 0.10204464197158813, + "learning_rate": 2.6701905925938318e-05, + "loss": 0.0334, + "step": 76320 + }, + { + "epoch": 0.16833799039769934, + "grad_norm": 0.10383204370737076, + "learning_rate": 2.6700871409230678e-05, + "loss": 0.0342, + "step": 76330 + }, + { + "epoch": 0.1683600443725975, + "grad_norm": 0.13860289752483368, + "learning_rate": 2.669983675034641e-05, + "loss": 0.0329, + "step": 76340 + }, + { + "epoch": 0.16838209834749565, + "grad_norm": 0.12938757240772247, + "learning_rate": 2.6698801949298073e-05, + "loss": 0.0353, + "step": 76350 + }, + { + "epoch": 0.16840415232239384, + "grad_norm": 0.1142125055193901, + "learning_rate": 2.6697767006098244e-05, + "loss": 0.034, + "step": 76360 + }, + { + "epoch": 0.168426206297292, + "grad_norm": 0.08222886174917221, + "learning_rate": 2.669673192075951e-05, + "loss": 0.0345, + "step": 76370 + }, + { + "epoch": 0.16844826027219015, + "grad_norm": 0.12280363589525223, + "learning_rate": 2.6695696693294433e-05, + "loss": 0.0365, + "step": 76380 + }, + { + "epoch": 0.16847031424708833, + "grad_norm": 0.12272661924362183, + "learning_rate": 2.66946613237156e-05, + "loss": 0.0345, + "step": 76390 + }, + { + "epoch": 0.1684923682219865, + "grad_norm": 0.1167541965842247, + "learning_rate": 2.6693625812035595e-05, + "loss": 0.0342, + "step": 76400 + }, + { + "epoch": 0.16851442219688464, + "grad_norm": 0.09430060535669327, + "learning_rate": 2.6692590158266993e-05, + "loss": 0.0348, + "step": 76410 + }, + { + "epoch": 0.16853647617178283, + "grad_norm": 0.12299767881631851, + "learning_rate": 2.669155436242238e-05, + "loss": 0.0325, + "step": 76420 + }, + { + "epoch": 0.16855853014668098, + "grad_norm": 0.12741073966026306, + "learning_rate": 2.6690518424514343e-05, + "loss": 0.034, + "step": 76430 + }, + { + "epoch": 0.16858058412157914, + "grad_norm": 0.10522301495075226, + "learning_rate": 2.668948234455547e-05, + "loss": 0.0352, + "step": 76440 + }, + { + "epoch": 0.16860263809647733, + "grad_norm": 0.1128687784075737, + "learning_rate": 2.6688446122558352e-05, + "loss": 0.0341, + "step": 76450 + }, + { + "epoch": 0.16862469207137548, + "grad_norm": 0.13717347383499146, + "learning_rate": 2.6687409758535574e-05, + "loss": 0.0348, + "step": 76460 + }, + { + "epoch": 0.16864674604627364, + "grad_norm": 0.1250213086605072, + "learning_rate": 2.6686373252499734e-05, + "loss": 0.0333, + "step": 76470 + }, + { + "epoch": 0.16866880002117182, + "grad_norm": 0.10465829074382782, + "learning_rate": 2.6685336604463425e-05, + "loss": 0.0345, + "step": 76480 + }, + { + "epoch": 0.16869085399606998, + "grad_norm": 0.11019481718540192, + "learning_rate": 2.668429981443924e-05, + "loss": 0.033, + "step": 76490 + }, + { + "epoch": 0.16871290797096813, + "grad_norm": 0.09504888206720352, + "learning_rate": 2.668326288243979e-05, + "loss": 0.0339, + "step": 76500 + }, + { + "epoch": 0.16873496194586632, + "grad_norm": 0.1081528440117836, + "learning_rate": 2.6682225808477656e-05, + "loss": 0.0332, + "step": 76510 + }, + { + "epoch": 0.16875701592076447, + "grad_norm": 0.11445757001638412, + "learning_rate": 2.668118859256545e-05, + "loss": 0.0335, + "step": 76520 + }, + { + "epoch": 0.16877906989566263, + "grad_norm": 0.16544899344444275, + "learning_rate": 2.6680151234715776e-05, + "loss": 0.0332, + "step": 76530 + }, + { + "epoch": 0.16880112387056082, + "grad_norm": 0.09909923374652863, + "learning_rate": 2.6679113734941238e-05, + "loss": 0.0329, + "step": 76540 + }, + { + "epoch": 0.16882317784545897, + "grad_norm": 0.11611049622297287, + "learning_rate": 2.667807609325444e-05, + "loss": 0.0334, + "step": 76550 + }, + { + "epoch": 0.16884523182035716, + "grad_norm": 0.09366883337497711, + "learning_rate": 2.667703830966799e-05, + "loss": 0.0335, + "step": 76560 + }, + { + "epoch": 0.1688672857952553, + "grad_norm": 0.12153278291225433, + "learning_rate": 2.6676000384194504e-05, + "loss": 0.0338, + "step": 76570 + }, + { + "epoch": 0.16888933977015347, + "grad_norm": 0.12643752992153168, + "learning_rate": 2.6674962316846583e-05, + "loss": 0.0341, + "step": 76580 + }, + { + "epoch": 0.16891139374505165, + "grad_norm": 0.08965237438678741, + "learning_rate": 2.6673924107636852e-05, + "loss": 0.0349, + "step": 76590 + }, + { + "epoch": 0.1689334477199498, + "grad_norm": 0.11718756705522537, + "learning_rate": 2.667288575657792e-05, + "loss": 0.0344, + "step": 76600 + }, + { + "epoch": 0.16895550169484796, + "grad_norm": 0.13062842190265656, + "learning_rate": 2.66718472636824e-05, + "loss": 0.0334, + "step": 76610 + }, + { + "epoch": 0.16897755566974615, + "grad_norm": 0.11371520906686783, + "learning_rate": 2.6670808628962923e-05, + "loss": 0.0356, + "step": 76620 + }, + { + "epoch": 0.1689996096446443, + "grad_norm": 0.10166312754154205, + "learning_rate": 2.6669769852432096e-05, + "loss": 0.0342, + "step": 76630 + }, + { + "epoch": 0.16902166361954246, + "grad_norm": 0.117136150598526, + "learning_rate": 2.6668730934102554e-05, + "loss": 0.0347, + "step": 76640 + }, + { + "epoch": 0.16904371759444065, + "grad_norm": 0.13133756816387177, + "learning_rate": 2.666769187398691e-05, + "loss": 0.0352, + "step": 76650 + }, + { + "epoch": 0.1690657715693388, + "grad_norm": 0.1352774202823639, + "learning_rate": 2.6666652672097793e-05, + "loss": 0.0354, + "step": 76660 + }, + { + "epoch": 0.16908782554423696, + "grad_norm": 0.12781965732574463, + "learning_rate": 2.666561332844783e-05, + "loss": 0.0339, + "step": 76670 + }, + { + "epoch": 0.16910987951913514, + "grad_norm": 0.11427021771669388, + "learning_rate": 2.6664573843049654e-05, + "loss": 0.0354, + "step": 76680 + }, + { + "epoch": 0.1691319334940333, + "grad_norm": 0.11788105964660645, + "learning_rate": 2.666353421591589e-05, + "loss": 0.035, + "step": 76690 + }, + { + "epoch": 0.16915398746893145, + "grad_norm": 0.08773475140333176, + "learning_rate": 2.6662494447059176e-05, + "loss": 0.0326, + "step": 76700 + }, + { + "epoch": 0.16917604144382964, + "grad_norm": 0.10023439675569534, + "learning_rate": 2.666145453649214e-05, + "loss": 0.0343, + "step": 76710 + }, + { + "epoch": 0.1691980954187278, + "grad_norm": 0.10237698256969452, + "learning_rate": 2.6660414484227423e-05, + "loss": 0.034, + "step": 76720 + }, + { + "epoch": 0.16922014939362595, + "grad_norm": 0.112942174077034, + "learning_rate": 2.665937429027766e-05, + "loss": 0.0327, + "step": 76730 + }, + { + "epoch": 0.16924220336852414, + "grad_norm": 0.0877213180065155, + "learning_rate": 2.6658333954655492e-05, + "loss": 0.034, + "step": 76740 + }, + { + "epoch": 0.1692642573434223, + "grad_norm": 0.08379985392093658, + "learning_rate": 2.665729347737356e-05, + "loss": 0.0358, + "step": 76750 + }, + { + "epoch": 0.16928631131832045, + "grad_norm": 0.15830694139003754, + "learning_rate": 2.6656252858444503e-05, + "loss": 0.0342, + "step": 76760 + }, + { + "epoch": 0.16930836529321863, + "grad_norm": 0.10732251405715942, + "learning_rate": 2.6655212097880967e-05, + "loss": 0.0336, + "step": 76770 + }, + { + "epoch": 0.1693304192681168, + "grad_norm": 0.11377808451652527, + "learning_rate": 2.6654171195695606e-05, + "loss": 0.0321, + "step": 76780 + }, + { + "epoch": 0.16935247324301494, + "grad_norm": 0.10122912377119064, + "learning_rate": 2.665313015190106e-05, + "loss": 0.0321, + "step": 76790 + }, + { + "epoch": 0.16937452721791313, + "grad_norm": 0.09778108447790146, + "learning_rate": 2.6652088966509976e-05, + "loss": 0.0325, + "step": 76800 + }, + { + "epoch": 0.16939658119281129, + "grad_norm": 0.1648470014333725, + "learning_rate": 2.6651047639535013e-05, + "loss": 0.0317, + "step": 76810 + }, + { + "epoch": 0.16941863516770944, + "grad_norm": 0.10736332088708878, + "learning_rate": 2.665000617098882e-05, + "loss": 0.0348, + "step": 76820 + }, + { + "epoch": 0.16944068914260763, + "grad_norm": 0.10346661508083344, + "learning_rate": 2.6648964560884052e-05, + "loss": 0.0345, + "step": 76830 + }, + { + "epoch": 0.16946274311750578, + "grad_norm": 0.1028214618563652, + "learning_rate": 2.6647922809233366e-05, + "loss": 0.0346, + "step": 76840 + }, + { + "epoch": 0.16948479709240394, + "grad_norm": 0.1024368479847908, + "learning_rate": 2.6646880916049425e-05, + "loss": 0.0322, + "step": 76850 + }, + { + "epoch": 0.16950685106730212, + "grad_norm": 0.11297327280044556, + "learning_rate": 2.6645838881344875e-05, + "loss": 0.0334, + "step": 76860 + }, + { + "epoch": 0.16952890504220028, + "grad_norm": 0.0941099300980568, + "learning_rate": 2.6644796705132393e-05, + "loss": 0.033, + "step": 76870 + }, + { + "epoch": 0.16955095901709843, + "grad_norm": 0.10197602212429047, + "learning_rate": 2.664375438742464e-05, + "loss": 0.0311, + "step": 76880 + }, + { + "epoch": 0.16957301299199662, + "grad_norm": 0.11671138554811478, + "learning_rate": 2.664271192823427e-05, + "loss": 0.032, + "step": 76890 + }, + { + "epoch": 0.16959506696689478, + "grad_norm": 0.11740759015083313, + "learning_rate": 2.6641669327573964e-05, + "loss": 0.0343, + "step": 76900 + }, + { + "epoch": 0.16961712094179293, + "grad_norm": 0.09380542486906052, + "learning_rate": 2.6640626585456378e-05, + "loss": 0.0363, + "step": 76910 + }, + { + "epoch": 0.16963917491669112, + "grad_norm": 0.09618857502937317, + "learning_rate": 2.6639583701894192e-05, + "loss": 0.033, + "step": 76920 + }, + { + "epoch": 0.16966122889158927, + "grad_norm": 0.10085486620664597, + "learning_rate": 2.6638540676900076e-05, + "loss": 0.0334, + "step": 76930 + }, + { + "epoch": 0.16968328286648743, + "grad_norm": 0.13022328913211823, + "learning_rate": 2.66374975104867e-05, + "loss": 0.0352, + "step": 76940 + }, + { + "epoch": 0.1697053368413856, + "grad_norm": 0.12165369838476181, + "learning_rate": 2.6636454202666743e-05, + "loss": 0.0362, + "step": 76950 + }, + { + "epoch": 0.16972739081628377, + "grad_norm": 0.11294874548912048, + "learning_rate": 2.6635410753452876e-05, + "loss": 0.0343, + "step": 76960 + }, + { + "epoch": 0.16974944479118192, + "grad_norm": 0.11600980907678604, + "learning_rate": 2.6634367162857788e-05, + "loss": 0.0335, + "step": 76970 + }, + { + "epoch": 0.1697714987660801, + "grad_norm": 0.10373510420322418, + "learning_rate": 2.6633323430894152e-05, + "loss": 0.0332, + "step": 76980 + }, + { + "epoch": 0.16979355274097827, + "grad_norm": 0.1213439330458641, + "learning_rate": 2.6632279557574655e-05, + "loss": 0.0355, + "step": 76990 + }, + { + "epoch": 0.16981560671587645, + "grad_norm": 0.13254790008068085, + "learning_rate": 2.6631235542911977e-05, + "loss": 0.0341, + "step": 77000 + }, + { + "epoch": 0.1698376606907746, + "grad_norm": 0.09063161909580231, + "learning_rate": 2.6630191386918806e-05, + "loss": 0.0352, + "step": 77010 + }, + { + "epoch": 0.16985971466567276, + "grad_norm": 0.09927710890769958, + "learning_rate": 2.662914708960783e-05, + "loss": 0.0339, + "step": 77020 + }, + { + "epoch": 0.16988176864057095, + "grad_norm": 0.0952000543475151, + "learning_rate": 2.662810265099173e-05, + "loss": 0.034, + "step": 77030 + }, + { + "epoch": 0.1699038226154691, + "grad_norm": 0.09943322092294693, + "learning_rate": 2.6627058071083215e-05, + "loss": 0.0338, + "step": 77040 + }, + { + "epoch": 0.16992587659036726, + "grad_norm": 0.14233626425266266, + "learning_rate": 2.6626013349894963e-05, + "loss": 0.0336, + "step": 77050 + }, + { + "epoch": 0.16994793056526544, + "grad_norm": 0.10801058262586594, + "learning_rate": 2.6624968487439667e-05, + "loss": 0.033, + "step": 77060 + }, + { + "epoch": 0.1699699845401636, + "grad_norm": 0.11266505718231201, + "learning_rate": 2.6623923483730034e-05, + "loss": 0.0357, + "step": 77070 + }, + { + "epoch": 0.16999203851506176, + "grad_norm": 0.09873943030834198, + "learning_rate": 2.662287833877875e-05, + "loss": 0.035, + "step": 77080 + }, + { + "epoch": 0.17001409248995994, + "grad_norm": 0.14204448461532593, + "learning_rate": 2.6621833052598527e-05, + "loss": 0.0347, + "step": 77090 + }, + { + "epoch": 0.1700361464648581, + "grad_norm": 0.11217791587114334, + "learning_rate": 2.6620787625202054e-05, + "loss": 0.0362, + "step": 77100 + }, + { + "epoch": 0.17005820043975625, + "grad_norm": 0.1231984868645668, + "learning_rate": 2.6619742056602042e-05, + "loss": 0.0324, + "step": 77110 + }, + { + "epoch": 0.17008025441465444, + "grad_norm": 0.10964511334896088, + "learning_rate": 2.6618696346811195e-05, + "loss": 0.0324, + "step": 77120 + }, + { + "epoch": 0.1701023083895526, + "grad_norm": 0.11496507376432419, + "learning_rate": 2.6617650495842212e-05, + "loss": 0.0356, + "step": 77130 + }, + { + "epoch": 0.17012436236445075, + "grad_norm": 0.08254452794790268, + "learning_rate": 2.661660450370781e-05, + "loss": 0.0329, + "step": 77140 + }, + { + "epoch": 0.17014641633934893, + "grad_norm": 0.10579877346754074, + "learning_rate": 2.66155583704207e-05, + "loss": 0.0344, + "step": 77150 + }, + { + "epoch": 0.1701684703142471, + "grad_norm": 0.13031621277332306, + "learning_rate": 2.6614512095993583e-05, + "loss": 0.0355, + "step": 77160 + }, + { + "epoch": 0.17019052428914525, + "grad_norm": 0.09963130205869675, + "learning_rate": 2.6613465680439178e-05, + "loss": 0.0341, + "step": 77170 + }, + { + "epoch": 0.17021257826404343, + "grad_norm": 0.09032832086086273, + "learning_rate": 2.6612419123770204e-05, + "loss": 0.0348, + "step": 77180 + }, + { + "epoch": 0.17023463223894159, + "grad_norm": 0.08525047451257706, + "learning_rate": 2.661137242599937e-05, + "loss": 0.0358, + "step": 77190 + }, + { + "epoch": 0.17025668621383974, + "grad_norm": 0.1241353303194046, + "learning_rate": 2.6610325587139397e-05, + "loss": 0.0333, + "step": 77200 + }, + { + "epoch": 0.17027874018873793, + "grad_norm": 0.11446458846330643, + "learning_rate": 2.6609278607203008e-05, + "loss": 0.0359, + "step": 77210 + }, + { + "epoch": 0.17030079416363608, + "grad_norm": 0.12005987763404846, + "learning_rate": 2.6608231486202928e-05, + "loss": 0.0337, + "step": 77220 + }, + { + "epoch": 0.17032284813853424, + "grad_norm": 0.1187741830945015, + "learning_rate": 2.6607184224151868e-05, + "loss": 0.0332, + "step": 77230 + }, + { + "epoch": 0.17034490211343242, + "grad_norm": 0.12756583094596863, + "learning_rate": 2.6606136821062566e-05, + "loss": 0.0328, + "step": 77240 + }, + { + "epoch": 0.17036695608833058, + "grad_norm": 0.12674374878406525, + "learning_rate": 2.660508927694774e-05, + "loss": 0.0361, + "step": 77250 + }, + { + "epoch": 0.17038901006322874, + "grad_norm": 0.11260160058736801, + "learning_rate": 2.660404159182012e-05, + "loss": 0.0376, + "step": 77260 + }, + { + "epoch": 0.17041106403812692, + "grad_norm": 0.14954717457294464, + "learning_rate": 2.6602993765692445e-05, + "loss": 0.0334, + "step": 77270 + }, + { + "epoch": 0.17043311801302508, + "grad_norm": 0.0970519557595253, + "learning_rate": 2.6601945798577438e-05, + "loss": 0.0372, + "step": 77280 + }, + { + "epoch": 0.17045517198792323, + "grad_norm": 0.12712831795215607, + "learning_rate": 2.6600897690487835e-05, + "loss": 0.0324, + "step": 77290 + }, + { + "epoch": 0.17047722596282142, + "grad_norm": 0.09296029806137085, + "learning_rate": 2.6599849441436374e-05, + "loss": 0.0333, + "step": 77300 + }, + { + "epoch": 0.17049927993771957, + "grad_norm": 0.09979081153869629, + "learning_rate": 2.6598801051435787e-05, + "loss": 0.0332, + "step": 77310 + }, + { + "epoch": 0.17052133391261773, + "grad_norm": 0.14419357478618622, + "learning_rate": 2.6597752520498815e-05, + "loss": 0.0361, + "step": 77320 + }, + { + "epoch": 0.1705433878875159, + "grad_norm": 0.12037581205368042, + "learning_rate": 2.6596703848638204e-05, + "loss": 0.0361, + "step": 77330 + }, + { + "epoch": 0.17056544186241407, + "grad_norm": 0.1052107885479927, + "learning_rate": 2.659565503586669e-05, + "loss": 0.0349, + "step": 77340 + }, + { + "epoch": 0.17058749583731223, + "grad_norm": 0.12207616865634918, + "learning_rate": 2.659460608219702e-05, + "loss": 0.0342, + "step": 77350 + }, + { + "epoch": 0.1706095498122104, + "grad_norm": 0.14071781933307648, + "learning_rate": 2.659355698764194e-05, + "loss": 0.0348, + "step": 77360 + }, + { + "epoch": 0.17063160378710857, + "grad_norm": 0.08524421602487564, + "learning_rate": 2.6592507752214194e-05, + "loss": 0.0347, + "step": 77370 + }, + { + "epoch": 0.17065365776200672, + "grad_norm": 0.11192633956670761, + "learning_rate": 2.6591458375926537e-05, + "loss": 0.0341, + "step": 77380 + }, + { + "epoch": 0.1706757117369049, + "grad_norm": 0.1315147429704666, + "learning_rate": 2.6590408858791713e-05, + "loss": 0.033, + "step": 77390 + }, + { + "epoch": 0.17069776571180306, + "grad_norm": 0.15226209163665771, + "learning_rate": 2.658935920082248e-05, + "loss": 0.0353, + "step": 77400 + }, + { + "epoch": 0.17071981968670125, + "grad_norm": 0.09028134495019913, + "learning_rate": 2.6588309402031594e-05, + "loss": 0.0346, + "step": 77410 + }, + { + "epoch": 0.1707418736615994, + "grad_norm": 0.17993444204330444, + "learning_rate": 2.65872594624318e-05, + "loss": 0.0352, + "step": 77420 + }, + { + "epoch": 0.17076392763649756, + "grad_norm": 0.12217986583709717, + "learning_rate": 2.6586209382035865e-05, + "loss": 0.0352, + "step": 77430 + }, + { + "epoch": 0.17078598161139574, + "grad_norm": 0.10543391853570938, + "learning_rate": 2.6585159160856553e-05, + "loss": 0.0336, + "step": 77440 + }, + { + "epoch": 0.1708080355862939, + "grad_norm": 0.11131499707698822, + "learning_rate": 2.6584108798906616e-05, + "loss": 0.034, + "step": 77450 + }, + { + "epoch": 0.17083008956119206, + "grad_norm": 0.13426567614078522, + "learning_rate": 2.6583058296198817e-05, + "loss": 0.0343, + "step": 77460 + }, + { + "epoch": 0.17085214353609024, + "grad_norm": 0.1649017333984375, + "learning_rate": 2.6582007652745923e-05, + "loss": 0.0364, + "step": 77470 + }, + { + "epoch": 0.1708741975109884, + "grad_norm": 0.08714628964662552, + "learning_rate": 2.6580956868560704e-05, + "loss": 0.0338, + "step": 77480 + }, + { + "epoch": 0.17089625148588655, + "grad_norm": 0.10557325929403305, + "learning_rate": 2.6579905943655923e-05, + "loss": 0.0336, + "step": 77490 + }, + { + "epoch": 0.17091830546078474, + "grad_norm": 0.09558634459972382, + "learning_rate": 2.6578854878044353e-05, + "loss": 0.0336, + "step": 77500 + }, + { + "epoch": 0.1709403594356829, + "grad_norm": 0.1642952412366867, + "learning_rate": 2.6577803671738764e-05, + "loss": 0.033, + "step": 77510 + }, + { + "epoch": 0.17096241341058105, + "grad_norm": 0.10586492717266083, + "learning_rate": 2.657675232475193e-05, + "loss": 0.0362, + "step": 77520 + }, + { + "epoch": 0.17098446738547923, + "grad_norm": 0.09499961137771606, + "learning_rate": 2.657570083709662e-05, + "loss": 0.0366, + "step": 77530 + }, + { + "epoch": 0.1710065213603774, + "grad_norm": 0.14460167288780212, + "learning_rate": 2.657464920878562e-05, + "loss": 0.0321, + "step": 77540 + }, + { + "epoch": 0.17102857533527555, + "grad_norm": 0.11081955581903458, + "learning_rate": 2.6573597439831705e-05, + "loss": 0.0346, + "step": 77550 + }, + { + "epoch": 0.17105062931017373, + "grad_norm": 0.10786391794681549, + "learning_rate": 2.657254553024765e-05, + "loss": 0.0343, + "step": 77560 + }, + { + "epoch": 0.17107268328507189, + "grad_norm": 0.11419349163770676, + "learning_rate": 2.6571493480046245e-05, + "loss": 0.0349, + "step": 77570 + }, + { + "epoch": 0.17109473725997004, + "grad_norm": 0.13428859412670135, + "learning_rate": 2.6570441289240265e-05, + "loss": 0.0349, + "step": 77580 + }, + { + "epoch": 0.17111679123486823, + "grad_norm": 0.12377936393022537, + "learning_rate": 2.65693889578425e-05, + "loss": 0.0328, + "step": 77590 + }, + { + "epoch": 0.17113884520976638, + "grad_norm": 0.10585872828960419, + "learning_rate": 2.6568336485865735e-05, + "loss": 0.0332, + "step": 77600 + }, + { + "epoch": 0.17116089918466454, + "grad_norm": 0.09972579032182693, + "learning_rate": 2.6567283873322762e-05, + "loss": 0.0345, + "step": 77610 + }, + { + "epoch": 0.17118295315956272, + "grad_norm": 0.1489674597978592, + "learning_rate": 2.6566231120226367e-05, + "loss": 0.0359, + "step": 77620 + }, + { + "epoch": 0.17120500713446088, + "grad_norm": 0.10846253484487534, + "learning_rate": 2.6565178226589344e-05, + "loss": 0.0346, + "step": 77630 + }, + { + "epoch": 0.17122706110935904, + "grad_norm": 0.11342019587755203, + "learning_rate": 2.6564125192424483e-05, + "loss": 0.0335, + "step": 77640 + }, + { + "epoch": 0.17124911508425722, + "grad_norm": 0.11514155566692352, + "learning_rate": 2.6563072017744588e-05, + "loss": 0.0341, + "step": 77650 + }, + { + "epoch": 0.17127116905915538, + "grad_norm": 0.09143530577421188, + "learning_rate": 2.6562018702562445e-05, + "loss": 0.0336, + "step": 77660 + }, + { + "epoch": 0.17129322303405353, + "grad_norm": 0.11076731234788895, + "learning_rate": 2.6560965246890862e-05, + "loss": 0.0341, + "step": 77670 + }, + { + "epoch": 0.17131527700895172, + "grad_norm": 0.10573431104421616, + "learning_rate": 2.6559911650742636e-05, + "loss": 0.0316, + "step": 77680 + }, + { + "epoch": 0.17133733098384987, + "grad_norm": 0.10658387094736099, + "learning_rate": 2.655885791413057e-05, + "loss": 0.0347, + "step": 77690 + }, + { + "epoch": 0.17135938495874803, + "grad_norm": 0.09126414358615875, + "learning_rate": 2.6557804037067466e-05, + "loss": 0.0337, + "step": 77700 + }, + { + "epoch": 0.1713814389336462, + "grad_norm": 0.09636466950178146, + "learning_rate": 2.655675001956613e-05, + "loss": 0.0357, + "step": 77710 + }, + { + "epoch": 0.17140349290854437, + "grad_norm": 0.11749514937400818, + "learning_rate": 2.6555695861639367e-05, + "loss": 0.0337, + "step": 77720 + }, + { + "epoch": 0.17142554688344253, + "grad_norm": 0.09171566367149353, + "learning_rate": 2.6554641563299994e-05, + "loss": 0.0325, + "step": 77730 + }, + { + "epoch": 0.1714476008583407, + "grad_norm": 0.09874816238880157, + "learning_rate": 2.6553587124560813e-05, + "loss": 0.0339, + "step": 77740 + }, + { + "epoch": 0.17146965483323887, + "grad_norm": 0.12584948539733887, + "learning_rate": 2.655253254543464e-05, + "loss": 0.0345, + "step": 77750 + }, + { + "epoch": 0.17149170880813702, + "grad_norm": 0.1458227038383484, + "learning_rate": 2.655147782593429e-05, + "loss": 0.0348, + "step": 77760 + }, + { + "epoch": 0.1715137627830352, + "grad_norm": 0.11259448528289795, + "learning_rate": 2.6550422966072582e-05, + "loss": 0.0342, + "step": 77770 + }, + { + "epoch": 0.17153581675793336, + "grad_norm": 0.11540747433900833, + "learning_rate": 2.6549367965862325e-05, + "loss": 0.0353, + "step": 77780 + }, + { + "epoch": 0.17155787073283152, + "grad_norm": 0.11699727177619934, + "learning_rate": 2.6548312825316345e-05, + "loss": 0.0334, + "step": 77790 + }, + { + "epoch": 0.1715799247077297, + "grad_norm": 0.11536174267530441, + "learning_rate": 2.6547257544447457e-05, + "loss": 0.0335, + "step": 77800 + }, + { + "epoch": 0.17160197868262786, + "grad_norm": 0.12487979978322983, + "learning_rate": 2.654620212326849e-05, + "loss": 0.0315, + "step": 77810 + }, + { + "epoch": 0.17162403265752602, + "grad_norm": 0.1145954355597496, + "learning_rate": 2.6545146561792268e-05, + "loss": 0.0331, + "step": 77820 + }, + { + "epoch": 0.1716460866324242, + "grad_norm": 0.11070026457309723, + "learning_rate": 2.6544090860031615e-05, + "loss": 0.0329, + "step": 77830 + }, + { + "epoch": 0.17166814060732236, + "grad_norm": 0.1338651329278946, + "learning_rate": 2.6543035017999354e-05, + "loss": 0.0349, + "step": 77840 + }, + { + "epoch": 0.17169019458222054, + "grad_norm": 0.10003060847520828, + "learning_rate": 2.654197903570832e-05, + "loss": 0.0345, + "step": 77850 + }, + { + "epoch": 0.1717122485571187, + "grad_norm": 0.10598380118608475, + "learning_rate": 2.654092291317135e-05, + "loss": 0.0349, + "step": 77860 + }, + { + "epoch": 0.17173430253201685, + "grad_norm": 0.10082856565713882, + "learning_rate": 2.653986665040126e-05, + "loss": 0.0342, + "step": 77870 + }, + { + "epoch": 0.17175635650691504, + "grad_norm": 0.09192851930856705, + "learning_rate": 2.6538810247410902e-05, + "loss": 0.0349, + "step": 77880 + }, + { + "epoch": 0.1717784104818132, + "grad_norm": 0.1090332493185997, + "learning_rate": 2.65377537042131e-05, + "loss": 0.0331, + "step": 77890 + }, + { + "epoch": 0.17180046445671135, + "grad_norm": 0.09700191766023636, + "learning_rate": 2.65366970208207e-05, + "loss": 0.0336, + "step": 77900 + }, + { + "epoch": 0.17182251843160953, + "grad_norm": 0.17649778723716736, + "learning_rate": 2.6535640197246536e-05, + "loss": 0.0354, + "step": 77910 + }, + { + "epoch": 0.1718445724065077, + "grad_norm": 0.11243823170661926, + "learning_rate": 2.6534583233503455e-05, + "loss": 0.0333, + "step": 77920 + }, + { + "epoch": 0.17186662638140585, + "grad_norm": 0.12805481255054474, + "learning_rate": 2.6533526129604298e-05, + "loss": 0.0347, + "step": 77930 + }, + { + "epoch": 0.17188868035630403, + "grad_norm": 0.10707032680511475, + "learning_rate": 2.6532468885561907e-05, + "loss": 0.0336, + "step": 77940 + }, + { + "epoch": 0.1719107343312022, + "grad_norm": 0.09066890925168991, + "learning_rate": 2.6531411501389134e-05, + "loss": 0.0339, + "step": 77950 + }, + { + "epoch": 0.17193278830610034, + "grad_norm": 0.0950128585100174, + "learning_rate": 2.653035397709882e-05, + "loss": 0.0338, + "step": 77960 + }, + { + "epoch": 0.17195484228099853, + "grad_norm": 0.15077567100524902, + "learning_rate": 2.652929631270382e-05, + "loss": 0.0361, + "step": 77970 + }, + { + "epoch": 0.17197689625589668, + "grad_norm": 0.12296158820390701, + "learning_rate": 2.6528238508216985e-05, + "loss": 0.0335, + "step": 77980 + }, + { + "epoch": 0.17199895023079484, + "grad_norm": 0.14943210780620575, + "learning_rate": 2.6527180563651167e-05, + "loss": 0.0336, + "step": 77990 + }, + { + "epoch": 0.17202100420569302, + "grad_norm": 0.08719337731599808, + "learning_rate": 2.6526122479019222e-05, + "loss": 0.0348, + "step": 78000 + }, + { + "epoch": 0.17204305818059118, + "grad_norm": 0.10825609415769577, + "learning_rate": 2.6525064254334005e-05, + "loss": 0.0362, + "step": 78010 + }, + { + "epoch": 0.17206511215548934, + "grad_norm": 0.0908409059047699, + "learning_rate": 2.6524005889608376e-05, + "loss": 0.035, + "step": 78020 + }, + { + "epoch": 0.17208716613038752, + "grad_norm": 0.11116078495979309, + "learning_rate": 2.6522947384855198e-05, + "loss": 0.0342, + "step": 78030 + }, + { + "epoch": 0.17210922010528568, + "grad_norm": 0.09658624976873398, + "learning_rate": 2.6521888740087328e-05, + "loss": 0.0333, + "step": 78040 + }, + { + "epoch": 0.17213127408018383, + "grad_norm": 0.0990118533372879, + "learning_rate": 2.6520829955317634e-05, + "loss": 0.0317, + "step": 78050 + }, + { + "epoch": 0.17215332805508202, + "grad_norm": 0.103375643491745, + "learning_rate": 2.6519771030558974e-05, + "loss": 0.0353, + "step": 78060 + }, + { + "epoch": 0.17217538202998017, + "grad_norm": 0.11891826242208481, + "learning_rate": 2.651871196582422e-05, + "loss": 0.0332, + "step": 78070 + }, + { + "epoch": 0.17219743600487833, + "grad_norm": 0.09411726891994476, + "learning_rate": 2.6517652761126243e-05, + "loss": 0.0334, + "step": 78080 + }, + { + "epoch": 0.1722194899797765, + "grad_norm": 0.12133363634347916, + "learning_rate": 2.6516593416477908e-05, + "loss": 0.0331, + "step": 78090 + }, + { + "epoch": 0.17224154395467467, + "grad_norm": 0.10777264088392258, + "learning_rate": 2.651553393189209e-05, + "loss": 0.0328, + "step": 78100 + }, + { + "epoch": 0.17226359792957283, + "grad_norm": 0.0961076095700264, + "learning_rate": 2.651447430738166e-05, + "loss": 0.0344, + "step": 78110 + }, + { + "epoch": 0.172285651904471, + "grad_norm": 0.1288350224494934, + "learning_rate": 2.6513414542959504e-05, + "loss": 0.0338, + "step": 78120 + }, + { + "epoch": 0.17230770587936917, + "grad_norm": 0.10749933868646622, + "learning_rate": 2.651235463863848e-05, + "loss": 0.0342, + "step": 78130 + }, + { + "epoch": 0.17232975985426732, + "grad_norm": 0.11437828093767166, + "learning_rate": 2.6511294594431488e-05, + "loss": 0.0365, + "step": 78140 + }, + { + "epoch": 0.1723518138291655, + "grad_norm": 0.10072217136621475, + "learning_rate": 2.651023441035139e-05, + "loss": 0.0358, + "step": 78150 + }, + { + "epoch": 0.17237386780406366, + "grad_norm": 0.12018758058547974, + "learning_rate": 2.6509174086411078e-05, + "loss": 0.0346, + "step": 78160 + }, + { + "epoch": 0.17239592177896182, + "grad_norm": 0.0937550887465477, + "learning_rate": 2.650811362262344e-05, + "loss": 0.0341, + "step": 78170 + }, + { + "epoch": 0.17241797575386, + "grad_norm": 0.15398310124874115, + "learning_rate": 2.6507053019001348e-05, + "loss": 0.0332, + "step": 78180 + }, + { + "epoch": 0.17244002972875816, + "grad_norm": 0.13052289187908173, + "learning_rate": 2.6505992275557705e-05, + "loss": 0.0355, + "step": 78190 + }, + { + "epoch": 0.17246208370365632, + "grad_norm": 0.10852804034948349, + "learning_rate": 2.6504931392305386e-05, + "loss": 0.0358, + "step": 78200 + }, + { + "epoch": 0.1724841376785545, + "grad_norm": 0.09547806531190872, + "learning_rate": 2.650387036925729e-05, + "loss": 0.0334, + "step": 78210 + }, + { + "epoch": 0.17250619165345266, + "grad_norm": 0.12337370961904526, + "learning_rate": 2.650280920642631e-05, + "loss": 0.0344, + "step": 78220 + }, + { + "epoch": 0.1725282456283508, + "grad_norm": 0.10518646240234375, + "learning_rate": 2.6501747903825337e-05, + "loss": 0.036, + "step": 78230 + }, + { + "epoch": 0.172550299603249, + "grad_norm": 0.11037071794271469, + "learning_rate": 2.6500686461467267e-05, + "loss": 0.034, + "step": 78240 + }, + { + "epoch": 0.17257235357814715, + "grad_norm": 0.14341720938682556, + "learning_rate": 2.6499624879365e-05, + "loss": 0.0324, + "step": 78250 + }, + { + "epoch": 0.17259440755304534, + "grad_norm": 0.12580430507659912, + "learning_rate": 2.6498563157531428e-05, + "loss": 0.0338, + "step": 78260 + }, + { + "epoch": 0.1726164615279435, + "grad_norm": 0.0975944921374321, + "learning_rate": 2.649750129597946e-05, + "loss": 0.0339, + "step": 78270 + }, + { + "epoch": 0.17263851550284165, + "grad_norm": 0.1100861206650734, + "learning_rate": 2.6496439294721996e-05, + "loss": 0.0332, + "step": 78280 + }, + { + "epoch": 0.17266056947773983, + "grad_norm": 0.10738243162631989, + "learning_rate": 2.649537715377194e-05, + "loss": 0.0347, + "step": 78290 + }, + { + "epoch": 0.172682623452638, + "grad_norm": 0.11012589186429977, + "learning_rate": 2.64943148731422e-05, + "loss": 0.0339, + "step": 78300 + }, + { + "epoch": 0.17270467742753615, + "grad_norm": 0.10360430926084518, + "learning_rate": 2.649325245284568e-05, + "loss": 0.0331, + "step": 78310 + }, + { + "epoch": 0.17272673140243433, + "grad_norm": 0.13603098690509796, + "learning_rate": 2.649218989289529e-05, + "loss": 0.0342, + "step": 78320 + }, + { + "epoch": 0.1727487853773325, + "grad_norm": 0.09044921398162842, + "learning_rate": 2.6491127193303942e-05, + "loss": 0.0335, + "step": 78330 + }, + { + "epoch": 0.17277083935223064, + "grad_norm": 0.0979008823633194, + "learning_rate": 2.6490064354084553e-05, + "loss": 0.033, + "step": 78340 + }, + { + "epoch": 0.17279289332712883, + "grad_norm": 0.10020241886377335, + "learning_rate": 2.6489001375250032e-05, + "loss": 0.0334, + "step": 78350 + }, + { + "epoch": 0.17281494730202698, + "grad_norm": 0.10320480912923813, + "learning_rate": 2.6487938256813296e-05, + "loss": 0.0345, + "step": 78360 + }, + { + "epoch": 0.17283700127692514, + "grad_norm": 0.1176048219203949, + "learning_rate": 2.6486874998787263e-05, + "loss": 0.0344, + "step": 78370 + }, + { + "epoch": 0.17285905525182332, + "grad_norm": 0.12531225383281708, + "learning_rate": 2.6485811601184856e-05, + "loss": 0.0333, + "step": 78380 + }, + { + "epoch": 0.17288110922672148, + "grad_norm": 0.11799202859401703, + "learning_rate": 2.648474806401899e-05, + "loss": 0.0362, + "step": 78390 + }, + { + "epoch": 0.17290316320161964, + "grad_norm": 0.10613789409399033, + "learning_rate": 2.6483684387302598e-05, + "loss": 0.0346, + "step": 78400 + }, + { + "epoch": 0.17292521717651782, + "grad_norm": 0.09189815074205399, + "learning_rate": 2.648262057104859e-05, + "loss": 0.0338, + "step": 78410 + }, + { + "epoch": 0.17294727115141598, + "grad_norm": 0.10610567778348923, + "learning_rate": 2.6481556615269903e-05, + "loss": 0.0334, + "step": 78420 + }, + { + "epoch": 0.17296932512631413, + "grad_norm": 0.09490883350372314, + "learning_rate": 2.648049251997947e-05, + "loss": 0.0353, + "step": 78430 + }, + { + "epoch": 0.17299137910121232, + "grad_norm": 0.11022987961769104, + "learning_rate": 2.647942828519021e-05, + "loss": 0.0335, + "step": 78440 + }, + { + "epoch": 0.17301343307611047, + "grad_norm": 0.10499945282936096, + "learning_rate": 2.647836391091505e-05, + "loss": 0.0352, + "step": 78450 + }, + { + "epoch": 0.17303548705100863, + "grad_norm": 0.12968005239963531, + "learning_rate": 2.647729939716694e-05, + "loss": 0.0345, + "step": 78460 + }, + { + "epoch": 0.1730575410259068, + "grad_norm": 0.09137418866157532, + "learning_rate": 2.6476234743958804e-05, + "loss": 0.0319, + "step": 78470 + }, + { + "epoch": 0.17307959500080497, + "grad_norm": 0.09111639857292175, + "learning_rate": 2.647516995130358e-05, + "loss": 0.0335, + "step": 78480 + }, + { + "epoch": 0.17310164897570313, + "grad_norm": 0.11619142442941666, + "learning_rate": 2.6474105019214205e-05, + "loss": 0.0359, + "step": 78490 + }, + { + "epoch": 0.1731237029506013, + "grad_norm": 0.09762230515480042, + "learning_rate": 2.647303994770362e-05, + "loss": 0.035, + "step": 78500 + }, + { + "epoch": 0.17314575692549947, + "grad_norm": 0.12430970370769501, + "learning_rate": 2.647197473678477e-05, + "loss": 0.0334, + "step": 78510 + }, + { + "epoch": 0.17316781090039762, + "grad_norm": 0.11506832391023636, + "learning_rate": 2.6470909386470595e-05, + "loss": 0.0342, + "step": 78520 + }, + { + "epoch": 0.1731898648752958, + "grad_norm": 0.13518095016479492, + "learning_rate": 2.6469843896774045e-05, + "loss": 0.0332, + "step": 78530 + }, + { + "epoch": 0.17321191885019396, + "grad_norm": 0.12810778617858887, + "learning_rate": 2.6468778267708056e-05, + "loss": 0.0331, + "step": 78540 + }, + { + "epoch": 0.17323397282509212, + "grad_norm": 0.1607973873615265, + "learning_rate": 2.646771249928558e-05, + "loss": 0.0355, + "step": 78550 + }, + { + "epoch": 0.1732560267999903, + "grad_norm": 0.11700958013534546, + "learning_rate": 2.6466646591519578e-05, + "loss": 0.0331, + "step": 78560 + }, + { + "epoch": 0.17327808077488846, + "grad_norm": 0.1184816062450409, + "learning_rate": 2.646558054442299e-05, + "loss": 0.0347, + "step": 78570 + }, + { + "epoch": 0.17330013474978662, + "grad_norm": 0.12453948706388474, + "learning_rate": 2.6464514358008774e-05, + "loss": 0.0337, + "step": 78580 + }, + { + "epoch": 0.1733221887246848, + "grad_norm": 0.1262449026107788, + "learning_rate": 2.6463448032289885e-05, + "loss": 0.0337, + "step": 78590 + }, + { + "epoch": 0.17334424269958296, + "grad_norm": 0.09261927753686905, + "learning_rate": 2.6462381567279277e-05, + "loss": 0.0329, + "step": 78600 + }, + { + "epoch": 0.1733662966744811, + "grad_norm": 0.09921973943710327, + "learning_rate": 2.646131496298991e-05, + "loss": 0.0345, + "step": 78610 + }, + { + "epoch": 0.1733883506493793, + "grad_norm": 0.10055813938379288, + "learning_rate": 2.6460248219434743e-05, + "loss": 0.033, + "step": 78620 + }, + { + "epoch": 0.17341040462427745, + "grad_norm": 0.13429895043373108, + "learning_rate": 2.6459181336626747e-05, + "loss": 0.0348, + "step": 78630 + }, + { + "epoch": 0.1734324585991756, + "grad_norm": 0.0891297236084938, + "learning_rate": 2.6458114314578873e-05, + "loss": 0.0359, + "step": 78640 + }, + { + "epoch": 0.1734545125740738, + "grad_norm": 0.12549135088920593, + "learning_rate": 2.6457047153304093e-05, + "loss": 0.0325, + "step": 78650 + }, + { + "epoch": 0.17347656654897195, + "grad_norm": 0.12739445269107819, + "learning_rate": 2.645597985281537e-05, + "loss": 0.0359, + "step": 78660 + }, + { + "epoch": 0.1734986205238701, + "grad_norm": 0.1167592704296112, + "learning_rate": 2.6454912413125683e-05, + "loss": 0.0333, + "step": 78670 + }, + { + "epoch": 0.1735206744987683, + "grad_norm": 0.0858575850725174, + "learning_rate": 2.645384483424799e-05, + "loss": 0.0315, + "step": 78680 + }, + { + "epoch": 0.17354272847366645, + "grad_norm": 0.1357692927122116, + "learning_rate": 2.6452777116195267e-05, + "loss": 0.0337, + "step": 78690 + }, + { + "epoch": 0.17356478244856463, + "grad_norm": 0.10148968547582626, + "learning_rate": 2.645170925898049e-05, + "loss": 0.0348, + "step": 78700 + }, + { + "epoch": 0.1735868364234628, + "grad_norm": 0.12914523482322693, + "learning_rate": 2.6450641262616632e-05, + "loss": 0.0368, + "step": 78710 + }, + { + "epoch": 0.17360889039836094, + "grad_norm": 0.10862816125154495, + "learning_rate": 2.644957312711668e-05, + "loss": 0.0356, + "step": 78720 + }, + { + "epoch": 0.17363094437325913, + "grad_norm": 0.12245381623506546, + "learning_rate": 2.6448504852493593e-05, + "loss": 0.0333, + "step": 78730 + }, + { + "epoch": 0.17365299834815728, + "grad_norm": 0.10318715125322342, + "learning_rate": 2.644743643876037e-05, + "loss": 0.0357, + "step": 78740 + }, + { + "epoch": 0.17367505232305544, + "grad_norm": 0.10759209096431732, + "learning_rate": 2.6446367885929982e-05, + "loss": 0.0344, + "step": 78750 + }, + { + "epoch": 0.17369710629795362, + "grad_norm": 0.12851160764694214, + "learning_rate": 2.6445299194015417e-05, + "loss": 0.0329, + "step": 78760 + }, + { + "epoch": 0.17371916027285178, + "grad_norm": 0.12976934015750885, + "learning_rate": 2.6444230363029663e-05, + "loss": 0.0345, + "step": 78770 + }, + { + "epoch": 0.17374121424774994, + "grad_norm": 0.141787588596344, + "learning_rate": 2.64431613929857e-05, + "loss": 0.0359, + "step": 78780 + }, + { + "epoch": 0.17376326822264812, + "grad_norm": 0.12094247341156006, + "learning_rate": 2.644209228389653e-05, + "loss": 0.0325, + "step": 78790 + }, + { + "epoch": 0.17378532219754628, + "grad_norm": 0.0837375596165657, + "learning_rate": 2.6441023035775134e-05, + "loss": 0.0344, + "step": 78800 + }, + { + "epoch": 0.17380737617244443, + "grad_norm": 0.11965108662843704, + "learning_rate": 2.64399536486345e-05, + "loss": 0.0359, + "step": 78810 + }, + { + "epoch": 0.17382943014734262, + "grad_norm": 0.12553450465202332, + "learning_rate": 2.6438884122487634e-05, + "loss": 0.0336, + "step": 78820 + }, + { + "epoch": 0.17385148412224077, + "grad_norm": 0.12445592880249023, + "learning_rate": 2.6437814457347523e-05, + "loss": 0.0355, + "step": 78830 + }, + { + "epoch": 0.17387353809713893, + "grad_norm": 0.10470307618379593, + "learning_rate": 2.6436744653227168e-05, + "loss": 0.0348, + "step": 78840 + }, + { + "epoch": 0.17389559207203711, + "grad_norm": 0.08337708562612534, + "learning_rate": 2.6435674710139566e-05, + "loss": 0.0333, + "step": 78850 + }, + { + "epoch": 0.17391764604693527, + "grad_norm": 0.09671106934547424, + "learning_rate": 2.6434604628097723e-05, + "loss": 0.034, + "step": 78860 + }, + { + "epoch": 0.17393970002183343, + "grad_norm": 0.11712606996297836, + "learning_rate": 2.6433534407114636e-05, + "loss": 0.0353, + "step": 78870 + }, + { + "epoch": 0.1739617539967316, + "grad_norm": 0.12104599177837372, + "learning_rate": 2.6432464047203306e-05, + "loss": 0.0348, + "step": 78880 + }, + { + "epoch": 0.17398380797162977, + "grad_norm": 0.08536224067211151, + "learning_rate": 2.6431393548376746e-05, + "loss": 0.0343, + "step": 78890 + }, + { + "epoch": 0.17400586194652792, + "grad_norm": 0.12656091153621674, + "learning_rate": 2.6430322910647968e-05, + "loss": 0.0343, + "step": 78900 + }, + { + "epoch": 0.1740279159214261, + "grad_norm": 0.09327376633882523, + "learning_rate": 2.642925213402997e-05, + "loss": 0.0319, + "step": 78910 + }, + { + "epoch": 0.17404996989632426, + "grad_norm": 0.09815673530101776, + "learning_rate": 2.6428181218535764e-05, + "loss": 0.0342, + "step": 78920 + }, + { + "epoch": 0.17407202387122242, + "grad_norm": 0.09598527103662491, + "learning_rate": 2.642711016417837e-05, + "loss": 0.0347, + "step": 78930 + }, + { + "epoch": 0.1740940778461206, + "grad_norm": 0.11518321186304092, + "learning_rate": 2.64260389709708e-05, + "loss": 0.035, + "step": 78940 + }, + { + "epoch": 0.17411613182101876, + "grad_norm": 0.12969547510147095, + "learning_rate": 2.6424967638926068e-05, + "loss": 0.0338, + "step": 78950 + }, + { + "epoch": 0.17413818579591692, + "grad_norm": 0.14210976660251617, + "learning_rate": 2.642389616805719e-05, + "loss": 0.0334, + "step": 78960 + }, + { + "epoch": 0.1741602397708151, + "grad_norm": 0.10619927197694778, + "learning_rate": 2.6422824558377186e-05, + "loss": 0.0349, + "step": 78970 + }, + { + "epoch": 0.17418229374571326, + "grad_norm": 0.11363699287176132, + "learning_rate": 2.642175280989908e-05, + "loss": 0.0361, + "step": 78980 + }, + { + "epoch": 0.1742043477206114, + "grad_norm": 0.11772635579109192, + "learning_rate": 2.6420680922635897e-05, + "loss": 0.0334, + "step": 78990 + }, + { + "epoch": 0.1742264016955096, + "grad_norm": 0.10478346049785614, + "learning_rate": 2.6419608896600657e-05, + "loss": 0.0346, + "step": 79000 + }, + { + "epoch": 0.17424845567040775, + "grad_norm": 0.12716557085514069, + "learning_rate": 2.6418536731806383e-05, + "loss": 0.0356, + "step": 79010 + }, + { + "epoch": 0.1742705096453059, + "grad_norm": 0.1014481782913208, + "learning_rate": 2.641746442826611e-05, + "loss": 0.0342, + "step": 79020 + }, + { + "epoch": 0.1742925636202041, + "grad_norm": 0.1378704160451889, + "learning_rate": 2.6416391985992865e-05, + "loss": 0.036, + "step": 79030 + }, + { + "epoch": 0.17431461759510225, + "grad_norm": 0.10049188882112503, + "learning_rate": 2.6415319404999676e-05, + "loss": 0.0327, + "step": 79040 + }, + { + "epoch": 0.1743366715700004, + "grad_norm": 0.1336345225572586, + "learning_rate": 2.6414246685299576e-05, + "loss": 0.0356, + "step": 79050 + }, + { + "epoch": 0.1743587255448986, + "grad_norm": 0.12441429495811462, + "learning_rate": 2.6413173826905604e-05, + "loss": 0.035, + "step": 79060 + }, + { + "epoch": 0.17438077951979675, + "grad_norm": 0.23723240196704865, + "learning_rate": 2.64121008298308e-05, + "loss": 0.0328, + "step": 79070 + }, + { + "epoch": 0.1744028334946949, + "grad_norm": 0.10791955143213272, + "learning_rate": 2.641102769408819e-05, + "loss": 0.0334, + "step": 79080 + }, + { + "epoch": 0.1744248874695931, + "grad_norm": 0.09682276844978333, + "learning_rate": 2.6409954419690823e-05, + "loss": 0.0333, + "step": 79090 + }, + { + "epoch": 0.17444694144449124, + "grad_norm": 0.11671031266450882, + "learning_rate": 2.6408881006651735e-05, + "loss": 0.0334, + "step": 79100 + }, + { + "epoch": 0.1744689954193894, + "grad_norm": 0.11970341950654984, + "learning_rate": 2.6407807454983972e-05, + "loss": 0.0357, + "step": 79110 + }, + { + "epoch": 0.17449104939428758, + "grad_norm": 0.11913912743330002, + "learning_rate": 2.640673376470058e-05, + "loss": 0.0359, + "step": 79120 + }, + { + "epoch": 0.17451310336918574, + "grad_norm": 0.11984169483184814, + "learning_rate": 2.64056599358146e-05, + "loss": 0.0326, + "step": 79130 + }, + { + "epoch": 0.17453515734408392, + "grad_norm": 0.11330563575029373, + "learning_rate": 2.640458596833908e-05, + "loss": 0.0329, + "step": 79140 + }, + { + "epoch": 0.17455721131898208, + "grad_norm": 0.17086713016033173, + "learning_rate": 2.6403511862287078e-05, + "loss": 0.0359, + "step": 79150 + }, + { + "epoch": 0.17457926529388024, + "grad_norm": 0.15143953263759613, + "learning_rate": 2.640243761767164e-05, + "loss": 0.0352, + "step": 79160 + }, + { + "epoch": 0.17460131926877842, + "grad_norm": 0.151935875415802, + "learning_rate": 2.6401363234505815e-05, + "loss": 0.0336, + "step": 79170 + }, + { + "epoch": 0.17462337324367658, + "grad_norm": 0.10976028442382812, + "learning_rate": 2.6400288712802663e-05, + "loss": 0.0368, + "step": 79180 + }, + { + "epoch": 0.17464542721857473, + "grad_norm": 0.10005610436201096, + "learning_rate": 2.639921405257524e-05, + "loss": 0.0333, + "step": 79190 + }, + { + "epoch": 0.17466748119347292, + "grad_norm": 0.10109196603298187, + "learning_rate": 2.6398139253836605e-05, + "loss": 0.0326, + "step": 79200 + }, + { + "epoch": 0.17468953516837107, + "grad_norm": 0.11252646148204803, + "learning_rate": 2.6397064316599818e-05, + "loss": 0.0348, + "step": 79210 + }, + { + "epoch": 0.17471158914326923, + "grad_norm": 0.09697330743074417, + "learning_rate": 2.6395989240877938e-05, + "loss": 0.0343, + "step": 79220 + }, + { + "epoch": 0.17473364311816741, + "grad_norm": 0.09445816278457642, + "learning_rate": 2.6394914026684023e-05, + "loss": 0.033, + "step": 79230 + }, + { + "epoch": 0.17475569709306557, + "grad_norm": 0.13309185206890106, + "learning_rate": 2.6393838674031155e-05, + "loss": 0.0344, + "step": 79240 + }, + { + "epoch": 0.17477775106796373, + "grad_norm": 0.11267639696598053, + "learning_rate": 2.6392763182932382e-05, + "loss": 0.033, + "step": 79250 + }, + { + "epoch": 0.1747998050428619, + "grad_norm": 0.12204017490148544, + "learning_rate": 2.639168755340078e-05, + "loss": 0.0347, + "step": 79260 + }, + { + "epoch": 0.17482185901776007, + "grad_norm": 0.09930377453565598, + "learning_rate": 2.6390611785449418e-05, + "loss": 0.035, + "step": 79270 + }, + { + "epoch": 0.17484391299265822, + "grad_norm": 0.09898297488689423, + "learning_rate": 2.6389535879091373e-05, + "loss": 0.0361, + "step": 79280 + }, + { + "epoch": 0.1748659669675564, + "grad_norm": 0.11895376443862915, + "learning_rate": 2.6388459834339712e-05, + "loss": 0.035, + "step": 79290 + }, + { + "epoch": 0.17488802094245456, + "grad_norm": 0.1436195820569992, + "learning_rate": 2.638738365120751e-05, + "loss": 0.0342, + "step": 79300 + }, + { + "epoch": 0.17491007491735272, + "grad_norm": 0.1205332800745964, + "learning_rate": 2.6386307329707843e-05, + "loss": 0.0353, + "step": 79310 + }, + { + "epoch": 0.1749321288922509, + "grad_norm": 0.08565572649240494, + "learning_rate": 2.63852308698538e-05, + "loss": 0.0334, + "step": 79320 + }, + { + "epoch": 0.17495418286714906, + "grad_norm": 0.08868764340877533, + "learning_rate": 2.6384154271658443e-05, + "loss": 0.0342, + "step": 79330 + }, + { + "epoch": 0.17497623684204722, + "grad_norm": 0.11344410479068756, + "learning_rate": 2.6383077535134866e-05, + "loss": 0.0347, + "step": 79340 + }, + { + "epoch": 0.1749982908169454, + "grad_norm": 0.1127864345908165, + "learning_rate": 2.638200066029615e-05, + "loss": 0.035, + "step": 79350 + }, + { + "epoch": 0.17502034479184356, + "grad_norm": 0.09999383240938187, + "learning_rate": 2.638092364715538e-05, + "loss": 0.035, + "step": 79360 + }, + { + "epoch": 0.1750423987667417, + "grad_norm": 0.13233111798763275, + "learning_rate": 2.6379846495725643e-05, + "loss": 0.0313, + "step": 79370 + }, + { + "epoch": 0.1750644527416399, + "grad_norm": 0.11003951728343964, + "learning_rate": 2.6378769206020026e-05, + "loss": 0.0327, + "step": 79380 + }, + { + "epoch": 0.17508650671653805, + "grad_norm": 0.09635666012763977, + "learning_rate": 2.637769177805162e-05, + "loss": 0.0341, + "step": 79390 + }, + { + "epoch": 0.1751085606914362, + "grad_norm": 0.09955016523599625, + "learning_rate": 2.6376614211833513e-05, + "loss": 0.0338, + "step": 79400 + }, + { + "epoch": 0.1751306146663344, + "grad_norm": 0.12150856107473373, + "learning_rate": 2.637553650737881e-05, + "loss": 0.0328, + "step": 79410 + }, + { + "epoch": 0.17515266864123255, + "grad_norm": 0.13764037191867828, + "learning_rate": 2.6374458664700592e-05, + "loss": 0.0345, + "step": 79420 + }, + { + "epoch": 0.1751747226161307, + "grad_norm": 0.12750911712646484, + "learning_rate": 2.637338068381196e-05, + "loss": 0.035, + "step": 79430 + }, + { + "epoch": 0.1751967765910289, + "grad_norm": 0.10658229142427444, + "learning_rate": 2.637230256472602e-05, + "loss": 0.0341, + "step": 79440 + }, + { + "epoch": 0.17521883056592705, + "grad_norm": 0.09709744900465012, + "learning_rate": 2.6371224307455865e-05, + "loss": 0.0335, + "step": 79450 + }, + { + "epoch": 0.1752408845408252, + "grad_norm": 0.08607342094182968, + "learning_rate": 2.6370145912014598e-05, + "loss": 0.0338, + "step": 79460 + }, + { + "epoch": 0.1752629385157234, + "grad_norm": 0.10080964118242264, + "learning_rate": 2.636906737841532e-05, + "loss": 0.0359, + "step": 79470 + }, + { + "epoch": 0.17528499249062154, + "grad_norm": 0.12497622519731522, + "learning_rate": 2.6367988706671144e-05, + "loss": 0.0355, + "step": 79480 + }, + { + "epoch": 0.1753070464655197, + "grad_norm": 0.11719848215579987, + "learning_rate": 2.6366909896795172e-05, + "loss": 0.0325, + "step": 79490 + }, + { + "epoch": 0.17532910044041788, + "grad_norm": 0.10579817742109299, + "learning_rate": 2.636583094880051e-05, + "loss": 0.0326, + "step": 79500 + }, + { + "epoch": 0.17535115441531604, + "grad_norm": 0.1366070806980133, + "learning_rate": 2.6364751862700273e-05, + "loss": 0.0336, + "step": 79510 + }, + { + "epoch": 0.1753732083902142, + "grad_norm": 0.09711354970932007, + "learning_rate": 2.6363672638507567e-05, + "loss": 0.0333, + "step": 79520 + }, + { + "epoch": 0.17539526236511238, + "grad_norm": 0.10581652075052261, + "learning_rate": 2.6362593276235515e-05, + "loss": 0.0348, + "step": 79530 + }, + { + "epoch": 0.17541731634001054, + "grad_norm": 0.11473263800144196, + "learning_rate": 2.636151377589722e-05, + "loss": 0.0344, + "step": 79540 + }, + { + "epoch": 0.17543937031490872, + "grad_norm": 0.09011591970920563, + "learning_rate": 2.636043413750581e-05, + "loss": 0.0354, + "step": 79550 + }, + { + "epoch": 0.17546142428980688, + "grad_norm": 0.13156290352344513, + "learning_rate": 2.63593543610744e-05, + "loss": 0.0339, + "step": 79560 + }, + { + "epoch": 0.17548347826470503, + "grad_norm": 0.10931235551834106, + "learning_rate": 2.635827444661611e-05, + "loss": 0.0343, + "step": 79570 + }, + { + "epoch": 0.17550553223960322, + "grad_norm": 0.11697704344987869, + "learning_rate": 2.6357194394144057e-05, + "loss": 0.0334, + "step": 79580 + }, + { + "epoch": 0.17552758621450137, + "grad_norm": 0.1075698733329773, + "learning_rate": 2.6356114203671368e-05, + "loss": 0.034, + "step": 79590 + }, + { + "epoch": 0.17554964018939953, + "grad_norm": 0.09624119848012924, + "learning_rate": 2.6355033875211177e-05, + "loss": 0.0338, + "step": 79600 + }, + { + "epoch": 0.17557169416429771, + "grad_norm": 0.07960798591375351, + "learning_rate": 2.6353953408776596e-05, + "loss": 0.0323, + "step": 79610 + }, + { + "epoch": 0.17559374813919587, + "grad_norm": 0.09473370760679245, + "learning_rate": 2.6352872804380766e-05, + "loss": 0.0333, + "step": 79620 + }, + { + "epoch": 0.17561580211409403, + "grad_norm": 0.09332026541233063, + "learning_rate": 2.6351792062036807e-05, + "loss": 0.0342, + "step": 79630 + }, + { + "epoch": 0.1756378560889922, + "grad_norm": 0.11133435368537903, + "learning_rate": 2.6350711181757863e-05, + "loss": 0.0341, + "step": 79640 + }, + { + "epoch": 0.17565991006389037, + "grad_norm": 0.12985488772392273, + "learning_rate": 2.634963016355706e-05, + "loss": 0.0341, + "step": 79650 + }, + { + "epoch": 0.17568196403878852, + "grad_norm": 0.10465561598539352, + "learning_rate": 2.6348549007447535e-05, + "loss": 0.0346, + "step": 79660 + }, + { + "epoch": 0.1757040180136867, + "grad_norm": 0.1664150357246399, + "learning_rate": 2.634746771344242e-05, + "loss": 0.0357, + "step": 79670 + }, + { + "epoch": 0.17572607198858486, + "grad_norm": 0.1436089426279068, + "learning_rate": 2.6346386281554864e-05, + "loss": 0.0332, + "step": 79680 + }, + { + "epoch": 0.17574812596348302, + "grad_norm": 0.1326940804719925, + "learning_rate": 2.6345304711798e-05, + "loss": 0.0353, + "step": 79690 + }, + { + "epoch": 0.1757701799383812, + "grad_norm": 0.10795413702726364, + "learning_rate": 2.634422300418497e-05, + "loss": 0.0347, + "step": 79700 + }, + { + "epoch": 0.17579223391327936, + "grad_norm": 0.10067503154277802, + "learning_rate": 2.6343141158728923e-05, + "loss": 0.0321, + "step": 79710 + }, + { + "epoch": 0.17581428788817752, + "grad_norm": 0.11218520253896713, + "learning_rate": 2.6342059175442998e-05, + "loss": 0.0355, + "step": 79720 + }, + { + "epoch": 0.1758363418630757, + "grad_norm": 0.09575521945953369, + "learning_rate": 2.6340977054340346e-05, + "loss": 0.0348, + "step": 79730 + }, + { + "epoch": 0.17585839583797386, + "grad_norm": 0.12354429811239243, + "learning_rate": 2.6339894795434118e-05, + "loss": 0.0366, + "step": 79740 + }, + { + "epoch": 0.17588044981287201, + "grad_norm": 0.11458428204059601, + "learning_rate": 2.633881239873746e-05, + "loss": 0.0348, + "step": 79750 + }, + { + "epoch": 0.1759025037877702, + "grad_norm": 0.11048001796007156, + "learning_rate": 2.6337729864263525e-05, + "loss": 0.0336, + "step": 79760 + }, + { + "epoch": 0.17592455776266835, + "grad_norm": 0.13395468890666962, + "learning_rate": 2.633664719202547e-05, + "loss": 0.0334, + "step": 79770 + }, + { + "epoch": 0.1759466117375665, + "grad_norm": 0.10707229375839233, + "learning_rate": 2.6335564382036444e-05, + "loss": 0.0322, + "step": 79780 + }, + { + "epoch": 0.1759686657124647, + "grad_norm": 0.0961730107665062, + "learning_rate": 2.633448143430961e-05, + "loss": 0.033, + "step": 79790 + }, + { + "epoch": 0.17599071968736285, + "grad_norm": 0.09544569253921509, + "learning_rate": 2.6333398348858125e-05, + "loss": 0.0347, + "step": 79800 + }, + { + "epoch": 0.176012773662261, + "grad_norm": 0.09643170982599258, + "learning_rate": 2.633231512569515e-05, + "loss": 0.0331, + "step": 79810 + }, + { + "epoch": 0.1760348276371592, + "grad_norm": 0.13926072418689728, + "learning_rate": 2.6331231764833846e-05, + "loss": 0.0327, + "step": 79820 + }, + { + "epoch": 0.17605688161205735, + "grad_norm": 0.10272475332021713, + "learning_rate": 2.633014826628738e-05, + "loss": 0.0347, + "step": 79830 + }, + { + "epoch": 0.1760789355869555, + "grad_norm": 0.1092589721083641, + "learning_rate": 2.6329064630068914e-05, + "loss": 0.036, + "step": 79840 + }, + { + "epoch": 0.1761009895618537, + "grad_norm": 0.13113106787204742, + "learning_rate": 2.6327980856191614e-05, + "loss": 0.0345, + "step": 79850 + }, + { + "epoch": 0.17612304353675184, + "grad_norm": 0.12277953326702118, + "learning_rate": 2.6326896944668652e-05, + "loss": 0.0338, + "step": 79860 + }, + { + "epoch": 0.17614509751165, + "grad_norm": 0.14196696877479553, + "learning_rate": 2.63258128955132e-05, + "loss": 0.034, + "step": 79870 + }, + { + "epoch": 0.17616715148654818, + "grad_norm": 0.09951598197221756, + "learning_rate": 2.6324728708738426e-05, + "loss": 0.0315, + "step": 79880 + }, + { + "epoch": 0.17618920546144634, + "grad_norm": 0.09155093133449554, + "learning_rate": 2.6323644384357507e-05, + "loss": 0.0329, + "step": 79890 + }, + { + "epoch": 0.1762112594363445, + "grad_norm": 0.13724155724048615, + "learning_rate": 2.6322559922383618e-05, + "loss": 0.0341, + "step": 79900 + }, + { + "epoch": 0.17623331341124268, + "grad_norm": 0.11932964622974396, + "learning_rate": 2.6321475322829935e-05, + "loss": 0.0365, + "step": 79910 + }, + { + "epoch": 0.17625536738614084, + "grad_norm": 0.14473029971122742, + "learning_rate": 2.6320390585709635e-05, + "loss": 0.0326, + "step": 79920 + }, + { + "epoch": 0.176277421361039, + "grad_norm": 0.12683290243148804, + "learning_rate": 2.6319305711035906e-05, + "loss": 0.0328, + "step": 79930 + }, + { + "epoch": 0.17629947533593718, + "grad_norm": 0.12439320981502533, + "learning_rate": 2.631822069882192e-05, + "loss": 0.0334, + "step": 79940 + }, + { + "epoch": 0.17632152931083533, + "grad_norm": 0.11650607734918594, + "learning_rate": 2.631713554908087e-05, + "loss": 0.0337, + "step": 79950 + }, + { + "epoch": 0.1763435832857335, + "grad_norm": 0.09921195358037949, + "learning_rate": 2.6316050261825938e-05, + "loss": 0.0345, + "step": 79960 + }, + { + "epoch": 0.17636563726063167, + "grad_norm": 0.09389341622591019, + "learning_rate": 2.631496483707031e-05, + "loss": 0.0336, + "step": 79970 + }, + { + "epoch": 0.17638769123552983, + "grad_norm": 0.13311021029949188, + "learning_rate": 2.6313879274827177e-05, + "loss": 0.0322, + "step": 79980 + }, + { + "epoch": 0.17640974521042802, + "grad_norm": 0.10540739446878433, + "learning_rate": 2.6312793575109727e-05, + "loss": 0.0331, + "step": 79990 + }, + { + "epoch": 0.17643179918532617, + "grad_norm": 0.1247425228357315, + "learning_rate": 2.6311707737931153e-05, + "loss": 0.0329, + "step": 80000 + }, + { + "epoch": 0.17645385316022433, + "grad_norm": 0.09515587240457535, + "learning_rate": 2.6310621763304658e-05, + "loss": 0.0327, + "step": 80010 + }, + { + "epoch": 0.1764759071351225, + "grad_norm": 0.11094260215759277, + "learning_rate": 2.630953565124342e-05, + "loss": 0.0343, + "step": 80020 + }, + { + "epoch": 0.17649796111002067, + "grad_norm": 0.11980947852134705, + "learning_rate": 2.6308449401760654e-05, + "loss": 0.0356, + "step": 80030 + }, + { + "epoch": 0.17652001508491882, + "grad_norm": 0.11105991154909134, + "learning_rate": 2.6307363014869545e-05, + "loss": 0.0343, + "step": 80040 + }, + { + "epoch": 0.176542069059817, + "grad_norm": 0.12939339876174927, + "learning_rate": 2.63062764905833e-05, + "loss": 0.0334, + "step": 80050 + }, + { + "epoch": 0.17656412303471516, + "grad_norm": 0.11005281656980515, + "learning_rate": 2.6305189828915123e-05, + "loss": 0.0355, + "step": 80060 + }, + { + "epoch": 0.17658617700961332, + "grad_norm": 0.11087290942668915, + "learning_rate": 2.630410302987821e-05, + "loss": 0.0359, + "step": 80070 + }, + { + "epoch": 0.1766082309845115, + "grad_norm": 0.09602313488721848, + "learning_rate": 2.630301609348578e-05, + "loss": 0.033, + "step": 80080 + }, + { + "epoch": 0.17663028495940966, + "grad_norm": 0.10010555386543274, + "learning_rate": 2.630192901975103e-05, + "loss": 0.0346, + "step": 80090 + }, + { + "epoch": 0.17665233893430782, + "grad_norm": 0.13239921629428864, + "learning_rate": 2.630084180868717e-05, + "loss": 0.0341, + "step": 80100 + }, + { + "epoch": 0.176674392909206, + "grad_norm": 0.13260102272033691, + "learning_rate": 2.629975446030741e-05, + "loss": 0.0347, + "step": 80110 + }, + { + "epoch": 0.17669644688410416, + "grad_norm": 0.11789039522409439, + "learning_rate": 2.629866697462497e-05, + "loss": 0.032, + "step": 80120 + }, + { + "epoch": 0.17671850085900231, + "grad_norm": 0.10832143574953079, + "learning_rate": 2.6297579351653053e-05, + "loss": 0.0337, + "step": 80130 + }, + { + "epoch": 0.1767405548339005, + "grad_norm": 0.121601901948452, + "learning_rate": 2.6296491591404885e-05, + "loss": 0.0342, + "step": 80140 + }, + { + "epoch": 0.17676260880879865, + "grad_norm": 0.10934046655893326, + "learning_rate": 2.6295403693893677e-05, + "loss": 0.0338, + "step": 80150 + }, + { + "epoch": 0.1767846627836968, + "grad_norm": 0.11558603495359421, + "learning_rate": 2.6294315659132648e-05, + "loss": 0.0334, + "step": 80160 + }, + { + "epoch": 0.176806716758595, + "grad_norm": 0.09508434683084488, + "learning_rate": 2.6293227487135026e-05, + "loss": 0.034, + "step": 80170 + }, + { + "epoch": 0.17682877073349315, + "grad_norm": 0.10804201662540436, + "learning_rate": 2.6292139177914024e-05, + "loss": 0.0349, + "step": 80180 + }, + { + "epoch": 0.1768508247083913, + "grad_norm": 0.11852184683084488, + "learning_rate": 2.6291050731482864e-05, + "loss": 0.032, + "step": 80190 + }, + { + "epoch": 0.1768728786832895, + "grad_norm": 0.10806804150342941, + "learning_rate": 2.6289962147854785e-05, + "loss": 0.0341, + "step": 80200 + }, + { + "epoch": 0.17689493265818765, + "grad_norm": 0.10181527584791183, + "learning_rate": 2.6288873427043e-05, + "loss": 0.0332, + "step": 80210 + }, + { + "epoch": 0.1769169866330858, + "grad_norm": 0.12155644595623016, + "learning_rate": 2.6287784569060748e-05, + "loss": 0.0364, + "step": 80220 + }, + { + "epoch": 0.176939040607984, + "grad_norm": 0.10539204627275467, + "learning_rate": 2.6286695573921258e-05, + "loss": 0.0348, + "step": 80230 + }, + { + "epoch": 0.17696109458288214, + "grad_norm": 0.11168795078992844, + "learning_rate": 2.628560644163776e-05, + "loss": 0.0329, + "step": 80240 + }, + { + "epoch": 0.1769831485577803, + "grad_norm": 0.11386749893426895, + "learning_rate": 2.6284517172223484e-05, + "loss": 0.0356, + "step": 80250 + }, + { + "epoch": 0.17700520253267849, + "grad_norm": 0.1013840064406395, + "learning_rate": 2.6283427765691672e-05, + "loss": 0.0336, + "step": 80260 + }, + { + "epoch": 0.17702725650757664, + "grad_norm": 0.13780640065670013, + "learning_rate": 2.628233822205556e-05, + "loss": 0.0333, + "step": 80270 + }, + { + "epoch": 0.1770493104824748, + "grad_norm": 0.12549299001693726, + "learning_rate": 2.6281248541328385e-05, + "loss": 0.0339, + "step": 80280 + }, + { + "epoch": 0.17707136445737298, + "grad_norm": 0.08543501049280167, + "learning_rate": 2.6280158723523388e-05, + "loss": 0.035, + "step": 80290 + }, + { + "epoch": 0.17709341843227114, + "grad_norm": 0.13673287630081177, + "learning_rate": 2.6279068768653817e-05, + "loss": 0.0346, + "step": 80300 + }, + { + "epoch": 0.1771154724071693, + "grad_norm": 0.12055087089538574, + "learning_rate": 2.6277978676732905e-05, + "loss": 0.0329, + "step": 80310 + }, + { + "epoch": 0.17713752638206748, + "grad_norm": 0.09200971573591232, + "learning_rate": 2.6276888447773908e-05, + "loss": 0.0337, + "step": 80320 + }, + { + "epoch": 0.17715958035696563, + "grad_norm": 0.13415758311748505, + "learning_rate": 2.627579808179007e-05, + "loss": 0.0353, + "step": 80330 + }, + { + "epoch": 0.1771816343318638, + "grad_norm": 0.13450752198696136, + "learning_rate": 2.6274707578794635e-05, + "loss": 0.033, + "step": 80340 + }, + { + "epoch": 0.17720368830676198, + "grad_norm": 0.08605362474918365, + "learning_rate": 2.6273616938800865e-05, + "loss": 0.0339, + "step": 80350 + }, + { + "epoch": 0.17722574228166013, + "grad_norm": 0.1353718340396881, + "learning_rate": 2.6272526161821996e-05, + "loss": 0.0341, + "step": 80360 + }, + { + "epoch": 0.1772477962565583, + "grad_norm": 0.10374848544597626, + "learning_rate": 2.6271435247871296e-05, + "loss": 0.034, + "step": 80370 + }, + { + "epoch": 0.17726985023145647, + "grad_norm": 0.110003262758255, + "learning_rate": 2.6270344196962016e-05, + "loss": 0.0337, + "step": 80380 + }, + { + "epoch": 0.17729190420635463, + "grad_norm": 0.08841440826654434, + "learning_rate": 2.6269253009107414e-05, + "loss": 0.0355, + "step": 80390 + }, + { + "epoch": 0.17731395818125278, + "grad_norm": 0.08318808674812317, + "learning_rate": 2.6268161684320747e-05, + "loss": 0.0335, + "step": 80400 + }, + { + "epoch": 0.17733601215615097, + "grad_norm": 0.10301690548658371, + "learning_rate": 2.6267070222615274e-05, + "loss": 0.0333, + "step": 80410 + }, + { + "epoch": 0.17735806613104912, + "grad_norm": 0.1377577930688858, + "learning_rate": 2.6265978624004265e-05, + "loss": 0.0327, + "step": 80420 + }, + { + "epoch": 0.1773801201059473, + "grad_norm": 0.11820466071367264, + "learning_rate": 2.6264886888500975e-05, + "loss": 0.0345, + "step": 80430 + }, + { + "epoch": 0.17740217408084547, + "grad_norm": 0.10646762698888779, + "learning_rate": 2.6263795016118675e-05, + "loss": 0.0327, + "step": 80440 + }, + { + "epoch": 0.17742422805574362, + "grad_norm": 0.11710527539253235, + "learning_rate": 2.626270300687063e-05, + "loss": 0.0329, + "step": 80450 + }, + { + "epoch": 0.1774462820306418, + "grad_norm": 0.10878198593854904, + "learning_rate": 2.6261610860770107e-05, + "loss": 0.0344, + "step": 80460 + }, + { + "epoch": 0.17746833600553996, + "grad_norm": 0.10570286959409714, + "learning_rate": 2.6260518577830382e-05, + "loss": 0.0351, + "step": 80470 + }, + { + "epoch": 0.17749038998043812, + "grad_norm": 0.1039157435297966, + "learning_rate": 2.6259426158064726e-05, + "loss": 0.0358, + "step": 80480 + }, + { + "epoch": 0.1775124439553363, + "grad_norm": 0.09063820540904999, + "learning_rate": 2.625833360148641e-05, + "loss": 0.036, + "step": 80490 + }, + { + "epoch": 0.17753449793023446, + "grad_norm": 0.12129848450422287, + "learning_rate": 2.625724090810871e-05, + "loss": 0.0344, + "step": 80500 + }, + { + "epoch": 0.17755655190513261, + "grad_norm": 0.09772200137376785, + "learning_rate": 2.6256148077944906e-05, + "loss": 0.0344, + "step": 80510 + }, + { + "epoch": 0.1775786058800308, + "grad_norm": 0.09818032383918762, + "learning_rate": 2.6255055111008278e-05, + "loss": 0.0339, + "step": 80520 + }, + { + "epoch": 0.17760065985492896, + "grad_norm": 0.17282582819461823, + "learning_rate": 2.6253962007312102e-05, + "loss": 0.0355, + "step": 80530 + }, + { + "epoch": 0.1776227138298271, + "grad_norm": 0.15003888309001923, + "learning_rate": 2.6252868766869657e-05, + "loss": 0.0345, + "step": 80540 + }, + { + "epoch": 0.1776447678047253, + "grad_norm": 0.08426853269338608, + "learning_rate": 2.6251775389694238e-05, + "loss": 0.0326, + "step": 80550 + }, + { + "epoch": 0.17766682177962345, + "grad_norm": 0.10343106091022491, + "learning_rate": 2.6250681875799125e-05, + "loss": 0.0341, + "step": 80560 + }, + { + "epoch": 0.1776888757545216, + "grad_norm": 0.09123922139406204, + "learning_rate": 2.6249588225197598e-05, + "loss": 0.0347, + "step": 80570 + }, + { + "epoch": 0.1777109297294198, + "grad_norm": 0.1024993285536766, + "learning_rate": 2.624849443790296e-05, + "loss": 0.0341, + "step": 80580 + }, + { + "epoch": 0.17773298370431795, + "grad_norm": 0.1078987792134285, + "learning_rate": 2.624740051392849e-05, + "loss": 0.0336, + "step": 80590 + }, + { + "epoch": 0.1777550376792161, + "grad_norm": 0.11178848147392273, + "learning_rate": 2.624630645328749e-05, + "loss": 0.0342, + "step": 80600 + }, + { + "epoch": 0.1777770916541143, + "grad_norm": 0.10760153830051422, + "learning_rate": 2.6245212255993242e-05, + "loss": 0.0325, + "step": 80610 + }, + { + "epoch": 0.17779914562901245, + "grad_norm": 0.12101267278194427, + "learning_rate": 2.6244117922059048e-05, + "loss": 0.0329, + "step": 80620 + }, + { + "epoch": 0.1778211996039106, + "grad_norm": 0.12184153497219086, + "learning_rate": 2.624302345149821e-05, + "loss": 0.0341, + "step": 80630 + }, + { + "epoch": 0.17784325357880879, + "grad_norm": 0.10885797441005707, + "learning_rate": 2.6241928844324017e-05, + "loss": 0.0341, + "step": 80640 + }, + { + "epoch": 0.17786530755370694, + "grad_norm": 0.12194360047578812, + "learning_rate": 2.6240834100549778e-05, + "loss": 0.0344, + "step": 80650 + }, + { + "epoch": 0.1778873615286051, + "grad_norm": 0.08258727937936783, + "learning_rate": 2.623973922018879e-05, + "loss": 0.0337, + "step": 80660 + }, + { + "epoch": 0.17790941550350328, + "grad_norm": 0.1154400035738945, + "learning_rate": 2.6238644203254355e-05, + "loss": 0.0345, + "step": 80670 + }, + { + "epoch": 0.17793146947840144, + "grad_norm": 0.12313898652791977, + "learning_rate": 2.6237549049759785e-05, + "loss": 0.0343, + "step": 80680 + }, + { + "epoch": 0.1779535234532996, + "grad_norm": 0.11480017006397247, + "learning_rate": 2.623645375971839e-05, + "loss": 0.0327, + "step": 80690 + }, + { + "epoch": 0.17797557742819778, + "grad_norm": 0.12225263565778732, + "learning_rate": 2.6235358333143464e-05, + "loss": 0.0324, + "step": 80700 + }, + { + "epoch": 0.17799763140309594, + "grad_norm": 0.11209148913621902, + "learning_rate": 2.6234262770048328e-05, + "loss": 0.0319, + "step": 80710 + }, + { + "epoch": 0.1780196853779941, + "grad_norm": 0.12509852647781372, + "learning_rate": 2.6233167070446295e-05, + "loss": 0.0334, + "step": 80720 + }, + { + "epoch": 0.17804173935289228, + "grad_norm": 0.09314320981502533, + "learning_rate": 2.6232071234350675e-05, + "loss": 0.0367, + "step": 80730 + }, + { + "epoch": 0.17806379332779043, + "grad_norm": 0.11623342335224152, + "learning_rate": 2.6230975261774785e-05, + "loss": 0.0348, + "step": 80740 + }, + { + "epoch": 0.1780858473026886, + "grad_norm": 0.0952492207288742, + "learning_rate": 2.622987915273194e-05, + "loss": 0.0341, + "step": 80750 + }, + { + "epoch": 0.17810790127758677, + "grad_norm": 0.12672609090805054, + "learning_rate": 2.6228782907235464e-05, + "loss": 0.0346, + "step": 80760 + }, + { + "epoch": 0.17812995525248493, + "grad_norm": 0.10014280676841736, + "learning_rate": 2.6227686525298675e-05, + "loss": 0.0336, + "step": 80770 + }, + { + "epoch": 0.17815200922738308, + "grad_norm": 0.1140255331993103, + "learning_rate": 2.622659000693489e-05, + "loss": 0.0352, + "step": 80780 + }, + { + "epoch": 0.17817406320228127, + "grad_norm": 0.11514496058225632, + "learning_rate": 2.6225493352157437e-05, + "loss": 0.0339, + "step": 80790 + }, + { + "epoch": 0.17819611717717943, + "grad_norm": 0.13414783775806427, + "learning_rate": 2.622439656097964e-05, + "loss": 0.0327, + "step": 80800 + }, + { + "epoch": 0.17821817115207758, + "grad_norm": 0.1164945513010025, + "learning_rate": 2.622329963341483e-05, + "loss": 0.0337, + "step": 80810 + }, + { + "epoch": 0.17824022512697577, + "grad_norm": 0.11296474933624268, + "learning_rate": 2.622220256947633e-05, + "loss": 0.0327, + "step": 80820 + }, + { + "epoch": 0.17826227910187392, + "grad_norm": 0.0935077890753746, + "learning_rate": 2.6221105369177476e-05, + "loss": 0.0329, + "step": 80830 + }, + { + "epoch": 0.1782843330767721, + "grad_norm": 0.10247011482715607, + "learning_rate": 2.6220008032531596e-05, + "loss": 0.0332, + "step": 80840 + }, + { + "epoch": 0.17830638705167026, + "grad_norm": 0.09068536013364792, + "learning_rate": 2.6218910559552026e-05, + "loss": 0.0343, + "step": 80850 + }, + { + "epoch": 0.17832844102656842, + "grad_norm": 0.1166442260146141, + "learning_rate": 2.62178129502521e-05, + "loss": 0.032, + "step": 80860 + }, + { + "epoch": 0.1783504950014666, + "grad_norm": 0.09492149204015732, + "learning_rate": 2.6216715204645156e-05, + "loss": 0.0336, + "step": 80870 + }, + { + "epoch": 0.17837254897636476, + "grad_norm": 0.11777923256158829, + "learning_rate": 2.621561732274453e-05, + "loss": 0.0349, + "step": 80880 + }, + { + "epoch": 0.17839460295126292, + "grad_norm": 0.11115796118974686, + "learning_rate": 2.6214519304563564e-05, + "loss": 0.0336, + "step": 80890 + }, + { + "epoch": 0.1784166569261611, + "grad_norm": 0.14520591497421265, + "learning_rate": 2.6213421150115603e-05, + "loss": 0.0327, + "step": 80900 + }, + { + "epoch": 0.17843871090105926, + "grad_norm": 0.1124822273850441, + "learning_rate": 2.6212322859413987e-05, + "loss": 0.0322, + "step": 80910 + }, + { + "epoch": 0.1784607648759574, + "grad_norm": 0.10378704220056534, + "learning_rate": 2.6211224432472058e-05, + "loss": 0.0338, + "step": 80920 + }, + { + "epoch": 0.1784828188508556, + "grad_norm": 0.09676551818847656, + "learning_rate": 2.621012586930317e-05, + "loss": 0.0344, + "step": 80930 + }, + { + "epoch": 0.17850487282575375, + "grad_norm": 0.1609153300523758, + "learning_rate": 2.620902716992067e-05, + "loss": 0.0357, + "step": 80940 + }, + { + "epoch": 0.1785269268006519, + "grad_norm": 0.08713880181312561, + "learning_rate": 2.6207928334337908e-05, + "loss": 0.0343, + "step": 80950 + }, + { + "epoch": 0.1785489807755501, + "grad_norm": 0.08967182040214539, + "learning_rate": 2.6206829362568235e-05, + "loss": 0.0339, + "step": 80960 + }, + { + "epoch": 0.17857103475044825, + "grad_norm": 0.1224733218550682, + "learning_rate": 2.6205730254625e-05, + "loss": 0.034, + "step": 80970 + }, + { + "epoch": 0.1785930887253464, + "grad_norm": 0.10429622232913971, + "learning_rate": 2.6204631010521563e-05, + "loss": 0.034, + "step": 80980 + }, + { + "epoch": 0.1786151427002446, + "grad_norm": 0.12252474576234818, + "learning_rate": 2.6203531630271282e-05, + "loss": 0.0342, + "step": 80990 + }, + { + "epoch": 0.17863719667514275, + "grad_norm": 0.11299707740545273, + "learning_rate": 2.6202432113887513e-05, + "loss": 0.0329, + "step": 81000 + }, + { + "epoch": 0.1786592506500409, + "grad_norm": 0.10908157378435135, + "learning_rate": 2.6201332461383617e-05, + "loss": 0.0356, + "step": 81010 + }, + { + "epoch": 0.17868130462493909, + "grad_norm": 0.1438678652048111, + "learning_rate": 2.6200232672772955e-05, + "loss": 0.0326, + "step": 81020 + }, + { + "epoch": 0.17870335859983724, + "grad_norm": 0.11878713220357895, + "learning_rate": 2.6199132748068893e-05, + "loss": 0.034, + "step": 81030 + }, + { + "epoch": 0.1787254125747354, + "grad_norm": 0.1330304741859436, + "learning_rate": 2.6198032687284793e-05, + "loss": 0.0319, + "step": 81040 + }, + { + "epoch": 0.17874746654963358, + "grad_norm": 0.14500920474529266, + "learning_rate": 2.6196932490434025e-05, + "loss": 0.0333, + "step": 81050 + }, + { + "epoch": 0.17876952052453174, + "grad_norm": 0.0983443558216095, + "learning_rate": 2.6195832157529953e-05, + "loss": 0.0338, + "step": 81060 + }, + { + "epoch": 0.1787915744994299, + "grad_norm": 0.12141285091638565, + "learning_rate": 2.6194731688585954e-05, + "loss": 0.0334, + "step": 81070 + }, + { + "epoch": 0.17881362847432808, + "grad_norm": 0.12519094347953796, + "learning_rate": 2.6193631083615394e-05, + "loss": 0.0348, + "step": 81080 + }, + { + "epoch": 0.17883568244922624, + "grad_norm": 0.11144696176052094, + "learning_rate": 2.6192530342631643e-05, + "loss": 0.0348, + "step": 81090 + }, + { + "epoch": 0.1788577364241244, + "grad_norm": 0.10937860608100891, + "learning_rate": 2.6191429465648086e-05, + "loss": 0.0349, + "step": 81100 + }, + { + "epoch": 0.17887979039902258, + "grad_norm": 0.09959160536527634, + "learning_rate": 2.619032845267809e-05, + "loss": 0.0325, + "step": 81110 + }, + { + "epoch": 0.17890184437392073, + "grad_norm": 0.08530048280954361, + "learning_rate": 2.618922730373504e-05, + "loss": 0.0332, + "step": 81120 + }, + { + "epoch": 0.1789238983488189, + "grad_norm": 0.0883168950676918, + "learning_rate": 2.6188126018832316e-05, + "loss": 0.0344, + "step": 81130 + }, + { + "epoch": 0.17894595232371707, + "grad_norm": 0.11294975131750107, + "learning_rate": 2.6187024597983296e-05, + "loss": 0.0346, + "step": 81140 + }, + { + "epoch": 0.17896800629861523, + "grad_norm": 0.07996410131454468, + "learning_rate": 2.6185923041201365e-05, + "loss": 0.0326, + "step": 81150 + }, + { + "epoch": 0.17899006027351338, + "grad_norm": 0.13250397145748138, + "learning_rate": 2.6184821348499907e-05, + "loss": 0.0339, + "step": 81160 + }, + { + "epoch": 0.17901211424841157, + "grad_norm": 0.11112861335277557, + "learning_rate": 2.618371951989231e-05, + "loss": 0.0336, + "step": 81170 + }, + { + "epoch": 0.17903416822330973, + "grad_norm": 0.13848662376403809, + "learning_rate": 2.618261755539196e-05, + "loss": 0.0314, + "step": 81180 + }, + { + "epoch": 0.17905622219820788, + "grad_norm": 0.09378832578659058, + "learning_rate": 2.618151545501225e-05, + "loss": 0.0328, + "step": 81190 + }, + { + "epoch": 0.17907827617310607, + "grad_norm": 0.09824726730585098, + "learning_rate": 2.6180413218766574e-05, + "loss": 0.0339, + "step": 81200 + }, + { + "epoch": 0.17910033014800422, + "grad_norm": 0.11246819794178009, + "learning_rate": 2.6179310846668312e-05, + "loss": 0.033, + "step": 81210 + }, + { + "epoch": 0.17912238412290238, + "grad_norm": 0.09536783397197723, + "learning_rate": 2.6178208338730876e-05, + "loss": 0.0338, + "step": 81220 + }, + { + "epoch": 0.17914443809780056, + "grad_norm": 0.09507259726524353, + "learning_rate": 2.6177105694967654e-05, + "loss": 0.033, + "step": 81230 + }, + { + "epoch": 0.17916649207269872, + "grad_norm": 0.0952252745628357, + "learning_rate": 2.6176002915392044e-05, + "loss": 0.0328, + "step": 81240 + }, + { + "epoch": 0.17918854604759687, + "grad_norm": 0.11935427784919739, + "learning_rate": 2.6174900000017445e-05, + "loss": 0.0335, + "step": 81250 + }, + { + "epoch": 0.17921060002249506, + "grad_norm": 0.09860285371541977, + "learning_rate": 2.6173796948857264e-05, + "loss": 0.0332, + "step": 81260 + }, + { + "epoch": 0.17923265399739322, + "grad_norm": 0.11885834485292435, + "learning_rate": 2.6172693761924895e-05, + "loss": 0.0312, + "step": 81270 + }, + { + "epoch": 0.1792547079722914, + "grad_norm": 0.11076441407203674, + "learning_rate": 2.6171590439233747e-05, + "loss": 0.0322, + "step": 81280 + }, + { + "epoch": 0.17927676194718956, + "grad_norm": 0.11726812273263931, + "learning_rate": 2.6170486980797235e-05, + "loss": 0.0346, + "step": 81290 + }, + { + "epoch": 0.1792988159220877, + "grad_norm": 0.10951678454875946, + "learning_rate": 2.6169383386628752e-05, + "loss": 0.0333, + "step": 81300 + }, + { + "epoch": 0.1793208698969859, + "grad_norm": 0.11676900088787079, + "learning_rate": 2.616827965674172e-05, + "loss": 0.0337, + "step": 81310 + }, + { + "epoch": 0.17934292387188405, + "grad_norm": 0.13080067932605743, + "learning_rate": 2.6167175791149543e-05, + "loss": 0.0356, + "step": 81320 + }, + { + "epoch": 0.1793649778467822, + "grad_norm": 0.1385509967803955, + "learning_rate": 2.6166071789865637e-05, + "loss": 0.0339, + "step": 81330 + }, + { + "epoch": 0.1793870318216804, + "grad_norm": 0.10829445719718933, + "learning_rate": 2.616496765290342e-05, + "loss": 0.035, + "step": 81340 + }, + { + "epoch": 0.17940908579657855, + "grad_norm": 0.08752422034740448, + "learning_rate": 2.6163863380276303e-05, + "loss": 0.0317, + "step": 81350 + }, + { + "epoch": 0.1794311397714767, + "grad_norm": 0.10268618911504745, + "learning_rate": 2.61627589719977e-05, + "loss": 0.0338, + "step": 81360 + }, + { + "epoch": 0.1794531937463749, + "grad_norm": 0.13110382854938507, + "learning_rate": 2.6161654428081044e-05, + "loss": 0.0329, + "step": 81370 + }, + { + "epoch": 0.17947524772127305, + "grad_norm": 0.13133248686790466, + "learning_rate": 2.616054974853974e-05, + "loss": 0.0349, + "step": 81380 + }, + { + "epoch": 0.1794973016961712, + "grad_norm": 0.11871808022260666, + "learning_rate": 2.6159444933387224e-05, + "loss": 0.032, + "step": 81390 + }, + { + "epoch": 0.17951935567106939, + "grad_norm": 0.09574290364980698, + "learning_rate": 2.6158339982636918e-05, + "loss": 0.0335, + "step": 81400 + }, + { + "epoch": 0.17954140964596754, + "grad_norm": 0.11352700740098953, + "learning_rate": 2.6157234896302242e-05, + "loss": 0.0341, + "step": 81410 + }, + { + "epoch": 0.1795634636208657, + "grad_norm": 0.1319034844636917, + "learning_rate": 2.6156129674396633e-05, + "loss": 0.0326, + "step": 81420 + }, + { + "epoch": 0.17958551759576388, + "grad_norm": 0.14203302562236786, + "learning_rate": 2.615502431693351e-05, + "loss": 0.034, + "step": 81430 + }, + { + "epoch": 0.17960757157066204, + "grad_norm": 0.12256801873445511, + "learning_rate": 2.6153918823926308e-05, + "loss": 0.0316, + "step": 81440 + }, + { + "epoch": 0.1796296255455602, + "grad_norm": 0.12493890523910522, + "learning_rate": 2.6152813195388465e-05, + "loss": 0.0348, + "step": 81450 + }, + { + "epoch": 0.17965167952045838, + "grad_norm": 0.10228045284748077, + "learning_rate": 2.615170743133341e-05, + "loss": 0.0334, + "step": 81460 + }, + { + "epoch": 0.17967373349535654, + "grad_norm": 0.11210299283266068, + "learning_rate": 2.615060153177458e-05, + "loss": 0.0351, + "step": 81470 + }, + { + "epoch": 0.1796957874702547, + "grad_norm": 0.10330221056938171, + "learning_rate": 2.614949549672542e-05, + "loss": 0.0348, + "step": 81480 + }, + { + "epoch": 0.17971784144515288, + "grad_norm": 0.10659827291965485, + "learning_rate": 2.614838932619936e-05, + "loss": 0.035, + "step": 81490 + }, + { + "epoch": 0.17973989542005103, + "grad_norm": 0.13280577957630157, + "learning_rate": 2.614728302020984e-05, + "loss": 0.0339, + "step": 81500 + }, + { + "epoch": 0.1797619493949492, + "grad_norm": 0.12088880687952042, + "learning_rate": 2.6146176578770306e-05, + "loss": 0.034, + "step": 81510 + }, + { + "epoch": 0.17978400336984737, + "grad_norm": 0.12300379574298859, + "learning_rate": 2.6145070001894206e-05, + "loss": 0.0315, + "step": 81520 + }, + { + "epoch": 0.17980605734474553, + "grad_norm": 0.12704113125801086, + "learning_rate": 2.6143963289594982e-05, + "loss": 0.035, + "step": 81530 + }, + { + "epoch": 0.17982811131964369, + "grad_norm": 0.105507493019104, + "learning_rate": 2.6142856441886084e-05, + "loss": 0.0348, + "step": 81540 + }, + { + "epoch": 0.17985016529454187, + "grad_norm": 0.16036944091320038, + "learning_rate": 2.6141749458780957e-05, + "loss": 0.0343, + "step": 81550 + }, + { + "epoch": 0.17987221926944003, + "grad_norm": 0.15616579353809357, + "learning_rate": 2.6140642340293054e-05, + "loss": 0.0335, + "step": 81560 + }, + { + "epoch": 0.17989427324433818, + "grad_norm": 0.1590808928012848, + "learning_rate": 2.613953508643583e-05, + "loss": 0.0359, + "step": 81570 + }, + { + "epoch": 0.17991632721923637, + "grad_norm": 0.1266099214553833, + "learning_rate": 2.6138427697222737e-05, + "loss": 0.0353, + "step": 81580 + }, + { + "epoch": 0.17993838119413452, + "grad_norm": 0.09590008854866028, + "learning_rate": 2.6137320172667232e-05, + "loss": 0.0351, + "step": 81590 + }, + { + "epoch": 0.17996043516903268, + "grad_norm": 0.11250235885381699, + "learning_rate": 2.613621251278277e-05, + "loss": 0.0357, + "step": 81600 + }, + { + "epoch": 0.17998248914393086, + "grad_norm": 0.09451566636562347, + "learning_rate": 2.613510471758281e-05, + "loss": 0.0351, + "step": 81610 + }, + { + "epoch": 0.18000454311882902, + "grad_norm": 0.10939250886440277, + "learning_rate": 2.613399678708082e-05, + "loss": 0.0335, + "step": 81620 + }, + { + "epoch": 0.18002659709372718, + "grad_norm": 0.12351281940937042, + "learning_rate": 2.6132888721290255e-05, + "loss": 0.0339, + "step": 81630 + }, + { + "epoch": 0.18004865106862536, + "grad_norm": 0.10436529666185379, + "learning_rate": 2.6131780520224578e-05, + "loss": 0.0333, + "step": 81640 + }, + { + "epoch": 0.18007070504352352, + "grad_norm": 0.11559755355119705, + "learning_rate": 2.6130672183897257e-05, + "loss": 0.0325, + "step": 81650 + }, + { + "epoch": 0.18009275901842167, + "grad_norm": 0.1161288172006607, + "learning_rate": 2.6129563712321763e-05, + "loss": 0.0335, + "step": 81660 + }, + { + "epoch": 0.18011481299331986, + "grad_norm": 0.12399928271770477, + "learning_rate": 2.612845510551156e-05, + "loss": 0.0349, + "step": 81670 + }, + { + "epoch": 0.180136866968218, + "grad_norm": 0.11012212932109833, + "learning_rate": 2.6127346363480122e-05, + "loss": 0.0341, + "step": 81680 + }, + { + "epoch": 0.1801589209431162, + "grad_norm": 0.10112979263067245, + "learning_rate": 2.612623748624092e-05, + "loss": 0.0338, + "step": 81690 + }, + { + "epoch": 0.18018097491801435, + "grad_norm": 0.12761153280735016, + "learning_rate": 2.6125128473807423e-05, + "loss": 0.0347, + "step": 81700 + }, + { + "epoch": 0.1802030288929125, + "grad_norm": 0.1015341505408287, + "learning_rate": 2.6124019326193114e-05, + "loss": 0.0343, + "step": 81710 + }, + { + "epoch": 0.1802250828678107, + "grad_norm": 0.11657204478979111, + "learning_rate": 2.612291004341147e-05, + "loss": 0.0348, + "step": 81720 + }, + { + "epoch": 0.18024713684270885, + "grad_norm": 0.12065649032592773, + "learning_rate": 2.6121800625475963e-05, + "loss": 0.0351, + "step": 81730 + }, + { + "epoch": 0.180269190817607, + "grad_norm": 0.1545269638299942, + "learning_rate": 2.612069107240008e-05, + "loss": 0.0349, + "step": 81740 + }, + { + "epoch": 0.1802912447925052, + "grad_norm": 0.12042370438575745, + "learning_rate": 2.61195813841973e-05, + "loss": 0.033, + "step": 81750 + }, + { + "epoch": 0.18031329876740335, + "grad_norm": 0.10233347117900848, + "learning_rate": 2.6118471560881113e-05, + "loss": 0.0334, + "step": 81760 + }, + { + "epoch": 0.1803353527423015, + "grad_norm": 0.12065856903791428, + "learning_rate": 2.6117361602464993e-05, + "loss": 0.0329, + "step": 81770 + }, + { + "epoch": 0.1803574067171997, + "grad_norm": 0.0797099769115448, + "learning_rate": 2.6116251508962432e-05, + "loss": 0.0347, + "step": 81780 + }, + { + "epoch": 0.18037946069209784, + "grad_norm": 0.18704521656036377, + "learning_rate": 2.6115141280386922e-05, + "loss": 0.0346, + "step": 81790 + }, + { + "epoch": 0.180401514666996, + "grad_norm": 0.13743193447589874, + "learning_rate": 2.611403091675195e-05, + "loss": 0.033, + "step": 81800 + }, + { + "epoch": 0.18042356864189418, + "grad_norm": 0.11260942369699478, + "learning_rate": 2.611292041807101e-05, + "loss": 0.0341, + "step": 81810 + }, + { + "epoch": 0.18044562261679234, + "grad_norm": 0.13768504559993744, + "learning_rate": 2.6111809784357596e-05, + "loss": 0.0336, + "step": 81820 + }, + { + "epoch": 0.1804676765916905, + "grad_norm": 0.11719139665365219, + "learning_rate": 2.6110699015625198e-05, + "loss": 0.037, + "step": 81830 + }, + { + "epoch": 0.18048973056658868, + "grad_norm": 0.10012759268283844, + "learning_rate": 2.6109588111887318e-05, + "loss": 0.0333, + "step": 81840 + }, + { + "epoch": 0.18051178454148684, + "grad_norm": 0.09190261363983154, + "learning_rate": 2.6108477073157456e-05, + "loss": 0.034, + "step": 81850 + }, + { + "epoch": 0.180533838516385, + "grad_norm": 0.09826408326625824, + "learning_rate": 2.610736589944911e-05, + "loss": 0.0344, + "step": 81860 + }, + { + "epoch": 0.18055589249128318, + "grad_norm": 0.10184329748153687, + "learning_rate": 2.610625459077578e-05, + "loss": 0.0321, + "step": 81870 + }, + { + "epoch": 0.18057794646618133, + "grad_norm": 0.1295304000377655, + "learning_rate": 2.6105143147150964e-05, + "loss": 0.0339, + "step": 81880 + }, + { + "epoch": 0.1806000004410795, + "grad_norm": 0.09890414774417877, + "learning_rate": 2.610403156858818e-05, + "loss": 0.0328, + "step": 81890 + }, + { + "epoch": 0.18062205441597767, + "grad_norm": 0.09285156428813934, + "learning_rate": 2.6102919855100925e-05, + "loss": 0.0318, + "step": 81900 + }, + { + "epoch": 0.18064410839087583, + "grad_norm": 0.10873351246118546, + "learning_rate": 2.6101808006702717e-05, + "loss": 0.034, + "step": 81910 + }, + { + "epoch": 0.18066616236577399, + "grad_norm": 0.11758996546268463, + "learning_rate": 2.6100696023407052e-05, + "loss": 0.0349, + "step": 81920 + }, + { + "epoch": 0.18068821634067217, + "grad_norm": 0.10856574773788452, + "learning_rate": 2.609958390522746e-05, + "loss": 0.0323, + "step": 81930 + }, + { + "epoch": 0.18071027031557033, + "grad_norm": 0.1431165635585785, + "learning_rate": 2.6098471652177434e-05, + "loss": 0.0338, + "step": 81940 + }, + { + "epoch": 0.18073232429046848, + "grad_norm": 0.11176487803459167, + "learning_rate": 2.6097359264270505e-05, + "loss": 0.0321, + "step": 81950 + }, + { + "epoch": 0.18075437826536667, + "grad_norm": 0.13949893414974213, + "learning_rate": 2.609624674152018e-05, + "loss": 0.0339, + "step": 81960 + }, + { + "epoch": 0.18077643224026482, + "grad_norm": 0.11136494576931, + "learning_rate": 2.6095134083939982e-05, + "loss": 0.0336, + "step": 81970 + }, + { + "epoch": 0.18079848621516298, + "grad_norm": 0.15473568439483643, + "learning_rate": 2.6094021291543426e-05, + "loss": 0.0336, + "step": 81980 + }, + { + "epoch": 0.18082054019006116, + "grad_norm": 0.13609394431114197, + "learning_rate": 2.609290836434404e-05, + "loss": 0.0336, + "step": 81990 + }, + { + "epoch": 0.18084259416495932, + "grad_norm": 0.15388673543930054, + "learning_rate": 2.6091795302355342e-05, + "loss": 0.0357, + "step": 82000 + }, + { + "epoch": 0.18086464813985748, + "grad_norm": 0.1754292994737625, + "learning_rate": 2.609068210559086e-05, + "loss": 0.034, + "step": 82010 + }, + { + "epoch": 0.18088670211475566, + "grad_norm": 0.12156086415052414, + "learning_rate": 2.6089568774064123e-05, + "loss": 0.0333, + "step": 82020 + }, + { + "epoch": 0.18090875608965382, + "grad_norm": 0.12134640663862228, + "learning_rate": 2.608845530778865e-05, + "loss": 0.0354, + "step": 82030 + }, + { + "epoch": 0.18093081006455197, + "grad_norm": 0.11625880002975464, + "learning_rate": 2.6087341706777978e-05, + "loss": 0.0349, + "step": 82040 + }, + { + "epoch": 0.18095286403945016, + "grad_norm": 0.12048748135566711, + "learning_rate": 2.6086227971045633e-05, + "loss": 0.0361, + "step": 82050 + }, + { + "epoch": 0.1809749180143483, + "grad_norm": 0.1359235942363739, + "learning_rate": 2.6085114100605156e-05, + "loss": 0.0334, + "step": 82060 + }, + { + "epoch": 0.18099697198924647, + "grad_norm": 0.11664145439863205, + "learning_rate": 2.6084000095470072e-05, + "loss": 0.0334, + "step": 82070 + }, + { + "epoch": 0.18101902596414465, + "grad_norm": 0.08932178467512131, + "learning_rate": 2.6082885955653924e-05, + "loss": 0.0328, + "step": 82080 + }, + { + "epoch": 0.1810410799390428, + "grad_norm": 0.0998402088880539, + "learning_rate": 2.6081771681170248e-05, + "loss": 0.0346, + "step": 82090 + }, + { + "epoch": 0.18106313391394097, + "grad_norm": 0.12673892080783844, + "learning_rate": 2.608065727203258e-05, + "loss": 0.0328, + "step": 82100 + }, + { + "epoch": 0.18108518788883915, + "grad_norm": 0.09103989601135254, + "learning_rate": 2.6079542728254467e-05, + "loss": 0.0347, + "step": 82110 + }, + { + "epoch": 0.1811072418637373, + "grad_norm": 0.09963969886302948, + "learning_rate": 2.607842804984945e-05, + "loss": 0.0361, + "step": 82120 + }, + { + "epoch": 0.1811292958386355, + "grad_norm": 0.09537187218666077, + "learning_rate": 2.607731323683107e-05, + "loss": 0.0337, + "step": 82130 + }, + { + "epoch": 0.18115134981353365, + "grad_norm": 0.12051665782928467, + "learning_rate": 2.6076198289212875e-05, + "loss": 0.0325, + "step": 82140 + }, + { + "epoch": 0.1811734037884318, + "grad_norm": 0.10244424641132355, + "learning_rate": 2.6075083207008415e-05, + "loss": 0.035, + "step": 82150 + }, + { + "epoch": 0.18119545776333, + "grad_norm": 0.11652213335037231, + "learning_rate": 2.6073967990231237e-05, + "loss": 0.0333, + "step": 82160 + }, + { + "epoch": 0.18121751173822814, + "grad_norm": 0.08455038070678711, + "learning_rate": 2.6072852638894896e-05, + "loss": 0.0316, + "step": 82170 + }, + { + "epoch": 0.1812395657131263, + "grad_norm": 0.10346876829862595, + "learning_rate": 2.607173715301294e-05, + "loss": 0.033, + "step": 82180 + }, + { + "epoch": 0.18126161968802448, + "grad_norm": 0.1267363280057907, + "learning_rate": 2.6070621532598916e-05, + "loss": 0.0349, + "step": 82190 + }, + { + "epoch": 0.18128367366292264, + "grad_norm": 0.10518770664930344, + "learning_rate": 2.6069505777666396e-05, + "loss": 0.0366, + "step": 82200 + }, + { + "epoch": 0.1813057276378208, + "grad_norm": 0.13059715926647186, + "learning_rate": 2.6068389888228922e-05, + "loss": 0.0337, + "step": 82210 + }, + { + "epoch": 0.18132778161271898, + "grad_norm": 0.14492985606193542, + "learning_rate": 2.6067273864300073e-05, + "loss": 0.0328, + "step": 82220 + }, + { + "epoch": 0.18134983558761714, + "grad_norm": 0.08662354946136475, + "learning_rate": 2.6066157705893386e-05, + "loss": 0.0353, + "step": 82230 + }, + { + "epoch": 0.1813718895625153, + "grad_norm": 0.13786166906356812, + "learning_rate": 2.6065041413022438e-05, + "loss": 0.0341, + "step": 82240 + }, + { + "epoch": 0.18139394353741348, + "grad_norm": 0.1312989890575409, + "learning_rate": 2.6063924985700788e-05, + "loss": 0.0342, + "step": 82250 + }, + { + "epoch": 0.18141599751231163, + "grad_norm": 0.11827759444713593, + "learning_rate": 2.606280842394201e-05, + "loss": 0.0339, + "step": 82260 + }, + { + "epoch": 0.1814380514872098, + "grad_norm": 0.10546513646841049, + "learning_rate": 2.6061691727759656e-05, + "loss": 0.0322, + "step": 82270 + }, + { + "epoch": 0.18146010546210797, + "grad_norm": 0.11464611440896988, + "learning_rate": 2.6060574897167308e-05, + "loss": 0.0335, + "step": 82280 + }, + { + "epoch": 0.18148215943700613, + "grad_norm": 0.16904528439044952, + "learning_rate": 2.605945793217853e-05, + "loss": 0.0332, + "step": 82290 + }, + { + "epoch": 0.18150421341190429, + "grad_norm": 0.1092100739479065, + "learning_rate": 2.60583408328069e-05, + "loss": 0.0317, + "step": 82300 + }, + { + "epoch": 0.18152626738680247, + "grad_norm": 0.09595364332199097, + "learning_rate": 2.6057223599065988e-05, + "loss": 0.0339, + "step": 82310 + }, + { + "epoch": 0.18154832136170063, + "grad_norm": 0.12386906147003174, + "learning_rate": 2.6056106230969367e-05, + "loss": 0.0348, + "step": 82320 + }, + { + "epoch": 0.18157037533659878, + "grad_norm": 0.1007029116153717, + "learning_rate": 2.6054988728530616e-05, + "loss": 0.0326, + "step": 82330 + }, + { + "epoch": 0.18159242931149697, + "grad_norm": 0.10021960735321045, + "learning_rate": 2.6053871091763316e-05, + "loss": 0.0328, + "step": 82340 + }, + { + "epoch": 0.18161448328639512, + "grad_norm": 0.11916058510541916, + "learning_rate": 2.6052753320681043e-05, + "loss": 0.032, + "step": 82350 + }, + { + "epoch": 0.18163653726129328, + "grad_norm": 0.155710831284523, + "learning_rate": 2.6051635415297386e-05, + "loss": 0.0346, + "step": 82360 + }, + { + "epoch": 0.18165859123619146, + "grad_norm": 0.11416265368461609, + "learning_rate": 2.6050517375625923e-05, + "loss": 0.0331, + "step": 82370 + }, + { + "epoch": 0.18168064521108962, + "grad_norm": 0.10852962732315063, + "learning_rate": 2.604939920168024e-05, + "loss": 0.0321, + "step": 82380 + }, + { + "epoch": 0.18170269918598778, + "grad_norm": 0.3028702735900879, + "learning_rate": 2.6048280893473925e-05, + "loss": 0.0331, + "step": 82390 + }, + { + "epoch": 0.18172475316088596, + "grad_norm": 0.13296443223953247, + "learning_rate": 2.6047162451020565e-05, + "loss": 0.0324, + "step": 82400 + }, + { + "epoch": 0.18174680713578412, + "grad_norm": 0.12410970777273178, + "learning_rate": 2.604604387433375e-05, + "loss": 0.0346, + "step": 82410 + }, + { + "epoch": 0.18176886111068227, + "grad_norm": 0.11750780791044235, + "learning_rate": 2.6044925163427073e-05, + "loss": 0.0329, + "step": 82420 + }, + { + "epoch": 0.18179091508558046, + "grad_norm": 0.10127697885036469, + "learning_rate": 2.6043806318314127e-05, + "loss": 0.0331, + "step": 82430 + }, + { + "epoch": 0.1818129690604786, + "grad_norm": 0.11135175824165344, + "learning_rate": 2.6042687339008506e-05, + "loss": 0.0345, + "step": 82440 + }, + { + "epoch": 0.18183502303537677, + "grad_norm": 0.10785237699747086, + "learning_rate": 2.604156822552381e-05, + "loss": 0.033, + "step": 82450 + }, + { + "epoch": 0.18185707701027495, + "grad_norm": 0.12001810222864151, + "learning_rate": 2.6040448977873636e-05, + "loss": 0.0333, + "step": 82460 + }, + { + "epoch": 0.1818791309851731, + "grad_norm": 0.07395382970571518, + "learning_rate": 2.6039329596071582e-05, + "loss": 0.0329, + "step": 82470 + }, + { + "epoch": 0.18190118496007127, + "grad_norm": 0.13760879635810852, + "learning_rate": 2.603821008013125e-05, + "loss": 0.0344, + "step": 82480 + }, + { + "epoch": 0.18192323893496945, + "grad_norm": 0.1131434440612793, + "learning_rate": 2.6037090430066245e-05, + "loss": 0.0359, + "step": 82490 + }, + { + "epoch": 0.1819452929098676, + "grad_norm": 0.10428620129823685, + "learning_rate": 2.603597064589017e-05, + "loss": 0.0337, + "step": 82500 + }, + { + "epoch": 0.18196734688476576, + "grad_norm": 0.11989863216876984, + "learning_rate": 2.603485072761663e-05, + "loss": 0.034, + "step": 82510 + }, + { + "epoch": 0.18198940085966395, + "grad_norm": 0.129997119307518, + "learning_rate": 2.6033730675259236e-05, + "loss": 0.0337, + "step": 82520 + }, + { + "epoch": 0.1820114548345621, + "grad_norm": 0.10640957206487656, + "learning_rate": 2.6032610488831597e-05, + "loss": 0.0351, + "step": 82530 + }, + { + "epoch": 0.18203350880946026, + "grad_norm": 0.11909563094377518, + "learning_rate": 2.6031490168347323e-05, + "loss": 0.0332, + "step": 82540 + }, + { + "epoch": 0.18205556278435844, + "grad_norm": 0.13452312350273132, + "learning_rate": 2.603036971382003e-05, + "loss": 0.0332, + "step": 82550 + }, + { + "epoch": 0.1820776167592566, + "grad_norm": 0.13492925465106964, + "learning_rate": 2.602924912526333e-05, + "loss": 0.0338, + "step": 82560 + }, + { + "epoch": 0.18209967073415478, + "grad_norm": 0.13379119336605072, + "learning_rate": 2.602812840269084e-05, + "loss": 0.0328, + "step": 82570 + }, + { + "epoch": 0.18212172470905294, + "grad_norm": 0.0959472730755806, + "learning_rate": 2.6027007546116177e-05, + "loss": 0.0323, + "step": 82580 + }, + { + "epoch": 0.1821437786839511, + "grad_norm": 0.1273299604654312, + "learning_rate": 2.602588655555296e-05, + "loss": 0.0331, + "step": 82590 + }, + { + "epoch": 0.18216583265884928, + "grad_norm": 0.10028538107872009, + "learning_rate": 2.6024765431014812e-05, + "loss": 0.0336, + "step": 82600 + }, + { + "epoch": 0.18218788663374744, + "grad_norm": 0.10697515308856964, + "learning_rate": 2.6023644172515356e-05, + "loss": 0.0351, + "step": 82610 + }, + { + "epoch": 0.1822099406086456, + "grad_norm": 0.10524009168148041, + "learning_rate": 2.6022522780068214e-05, + "loss": 0.0322, + "step": 82620 + }, + { + "epoch": 0.18223199458354378, + "grad_norm": 0.11806570738554001, + "learning_rate": 2.6021401253687016e-05, + "loss": 0.0343, + "step": 82630 + }, + { + "epoch": 0.18225404855844193, + "grad_norm": 0.13536511361598969, + "learning_rate": 2.6020279593385382e-05, + "loss": 0.0306, + "step": 82640 + }, + { + "epoch": 0.1822761025333401, + "grad_norm": 0.09525511413812637, + "learning_rate": 2.601915779917695e-05, + "loss": 0.0344, + "step": 82650 + }, + { + "epoch": 0.18229815650823827, + "grad_norm": 0.1086287572979927, + "learning_rate": 2.6018035871075347e-05, + "loss": 0.0319, + "step": 82660 + }, + { + "epoch": 0.18232021048313643, + "grad_norm": 0.11887677758932114, + "learning_rate": 2.6016913809094203e-05, + "loss": 0.0337, + "step": 82670 + }, + { + "epoch": 0.1823422644580346, + "grad_norm": 0.10324014723300934, + "learning_rate": 2.6015791613247156e-05, + "loss": 0.0319, + "step": 82680 + }, + { + "epoch": 0.18236431843293277, + "grad_norm": 0.09857213497161865, + "learning_rate": 2.601466928354784e-05, + "loss": 0.037, + "step": 82690 + }, + { + "epoch": 0.18238637240783093, + "grad_norm": 0.10380757600069046, + "learning_rate": 2.6013546820009893e-05, + "loss": 0.0334, + "step": 82700 + }, + { + "epoch": 0.18240842638272908, + "grad_norm": 0.0919630154967308, + "learning_rate": 2.601242422264695e-05, + "loss": 0.0328, + "step": 82710 + }, + { + "epoch": 0.18243048035762727, + "grad_norm": 0.10029815882444382, + "learning_rate": 2.6011301491472662e-05, + "loss": 0.0323, + "step": 82720 + }, + { + "epoch": 0.18245253433252542, + "grad_norm": 0.10603296756744385, + "learning_rate": 2.6010178626500662e-05, + "loss": 0.0347, + "step": 82730 + }, + { + "epoch": 0.18247458830742358, + "grad_norm": 0.11990778893232346, + "learning_rate": 2.6009055627744596e-05, + "loss": 0.0347, + "step": 82740 + }, + { + "epoch": 0.18249664228232176, + "grad_norm": 0.11775574833154678, + "learning_rate": 2.6007932495218107e-05, + "loss": 0.0331, + "step": 82750 + }, + { + "epoch": 0.18251869625721992, + "grad_norm": 0.10076634585857391, + "learning_rate": 2.600680922893485e-05, + "loss": 0.0351, + "step": 82760 + }, + { + "epoch": 0.18254075023211808, + "grad_norm": 0.08569422364234924, + "learning_rate": 2.600568582890846e-05, + "loss": 0.0337, + "step": 82770 + }, + { + "epoch": 0.18256280420701626, + "grad_norm": 0.09965317696332932, + "learning_rate": 2.6004562295152602e-05, + "loss": 0.0335, + "step": 82780 + }, + { + "epoch": 0.18258485818191442, + "grad_norm": 0.11169856041669846, + "learning_rate": 2.6003438627680924e-05, + "loss": 0.0322, + "step": 82790 + }, + { + "epoch": 0.18260691215681257, + "grad_norm": 0.11589821428060532, + "learning_rate": 2.6002314826507077e-05, + "loss": 0.0349, + "step": 82800 + }, + { + "epoch": 0.18262896613171076, + "grad_norm": 0.1072283536195755, + "learning_rate": 2.6001190891644716e-05, + "loss": 0.0343, + "step": 82810 + }, + { + "epoch": 0.1826510201066089, + "grad_norm": 0.09783610701560974, + "learning_rate": 2.6000066823107494e-05, + "loss": 0.0349, + "step": 82820 + }, + { + "epoch": 0.18267307408150707, + "grad_norm": 0.1057969406247139, + "learning_rate": 2.5998942620909083e-05, + "loss": 0.033, + "step": 82830 + }, + { + "epoch": 0.18269512805640525, + "grad_norm": 0.11087442189455032, + "learning_rate": 2.599781828506313e-05, + "loss": 0.0332, + "step": 82840 + }, + { + "epoch": 0.1827171820313034, + "grad_norm": 0.11639190465211868, + "learning_rate": 2.59966938155833e-05, + "loss": 0.0342, + "step": 82850 + }, + { + "epoch": 0.18273923600620157, + "grad_norm": 0.12241200357675552, + "learning_rate": 2.599556921248326e-05, + "loss": 0.0337, + "step": 82860 + }, + { + "epoch": 0.18276128998109975, + "grad_norm": 0.10628607869148254, + "learning_rate": 2.5994444475776672e-05, + "loss": 0.0344, + "step": 82870 + }, + { + "epoch": 0.1827833439559979, + "grad_norm": 0.10038179904222488, + "learning_rate": 2.5993319605477202e-05, + "loss": 0.0351, + "step": 82880 + }, + { + "epoch": 0.18280539793089606, + "grad_norm": 0.11711171269416809, + "learning_rate": 2.5992194601598522e-05, + "loss": 0.0355, + "step": 82890 + }, + { + "epoch": 0.18282745190579425, + "grad_norm": 0.12862758338451385, + "learning_rate": 2.5991069464154297e-05, + "loss": 0.0321, + "step": 82900 + }, + { + "epoch": 0.1828495058806924, + "grad_norm": 0.11705023795366287, + "learning_rate": 2.59899441931582e-05, + "loss": 0.0339, + "step": 82910 + }, + { + "epoch": 0.18287155985559056, + "grad_norm": 0.12352501600980759, + "learning_rate": 2.5988818788623903e-05, + "loss": 0.0322, + "step": 82920 + }, + { + "epoch": 0.18289361383048874, + "grad_norm": 0.10462340712547302, + "learning_rate": 2.598769325056509e-05, + "loss": 0.0328, + "step": 82930 + }, + { + "epoch": 0.1829156678053869, + "grad_norm": 0.08650455623865128, + "learning_rate": 2.598656757899542e-05, + "loss": 0.0327, + "step": 82940 + }, + { + "epoch": 0.18293772178028506, + "grad_norm": 0.11486832052469254, + "learning_rate": 2.598544177392859e-05, + "loss": 0.0334, + "step": 82950 + }, + { + "epoch": 0.18295977575518324, + "grad_norm": 0.11024178564548492, + "learning_rate": 2.5984315835378267e-05, + "loss": 0.0327, + "step": 82960 + }, + { + "epoch": 0.1829818297300814, + "grad_norm": 0.0942033976316452, + "learning_rate": 2.5983189763358132e-05, + "loss": 0.0323, + "step": 82970 + }, + { + "epoch": 0.18300388370497958, + "grad_norm": 0.1373720020055771, + "learning_rate": 2.5982063557881874e-05, + "loss": 0.0338, + "step": 82980 + }, + { + "epoch": 0.18302593767987774, + "grad_norm": 0.09815706312656403, + "learning_rate": 2.598093721896318e-05, + "loss": 0.0348, + "step": 82990 + }, + { + "epoch": 0.1830479916547759, + "grad_norm": 0.10782738029956818, + "learning_rate": 2.5979810746615727e-05, + "loss": 0.0333, + "step": 83000 + }, + { + "epoch": 0.18307004562967408, + "grad_norm": 0.12451884895563126, + "learning_rate": 2.5978684140853206e-05, + "loss": 0.035, + "step": 83010 + }, + { + "epoch": 0.18309209960457223, + "grad_norm": 0.10787557810544968, + "learning_rate": 2.5977557401689304e-05, + "loss": 0.0338, + "step": 83020 + }, + { + "epoch": 0.1831141535794704, + "grad_norm": 0.14160344004631042, + "learning_rate": 2.597643052913772e-05, + "loss": 0.0352, + "step": 83030 + }, + { + "epoch": 0.18313620755436857, + "grad_norm": 0.11587053537368774, + "learning_rate": 2.5975303523212143e-05, + "loss": 0.0351, + "step": 83040 + }, + { + "epoch": 0.18315826152926673, + "grad_norm": 0.10787131637334824, + "learning_rate": 2.5974176383926263e-05, + "loss": 0.035, + "step": 83050 + }, + { + "epoch": 0.1831803155041649, + "grad_norm": 0.09001179039478302, + "learning_rate": 2.5973049111293777e-05, + "loss": 0.0324, + "step": 83060 + }, + { + "epoch": 0.18320236947906307, + "grad_norm": 0.09744726866483688, + "learning_rate": 2.5971921705328386e-05, + "loss": 0.0326, + "step": 83070 + }, + { + "epoch": 0.18322442345396123, + "grad_norm": 0.11916190385818481, + "learning_rate": 2.5970794166043786e-05, + "loss": 0.0342, + "step": 83080 + }, + { + "epoch": 0.18324647742885938, + "grad_norm": 0.1402144581079483, + "learning_rate": 2.596966649345368e-05, + "loss": 0.0342, + "step": 83090 + }, + { + "epoch": 0.18326853140375757, + "grad_norm": 0.10857114940881729, + "learning_rate": 2.596853868757177e-05, + "loss": 0.0335, + "step": 83100 + }, + { + "epoch": 0.18329058537865572, + "grad_norm": 0.10599343478679657, + "learning_rate": 2.5967410748411755e-05, + "loss": 0.0334, + "step": 83110 + }, + { + "epoch": 0.18331263935355388, + "grad_norm": 0.19785410165786743, + "learning_rate": 2.596628267598735e-05, + "loss": 0.0335, + "step": 83120 + }, + { + "epoch": 0.18333469332845206, + "grad_norm": 0.10930249840021133, + "learning_rate": 2.5965154470312253e-05, + "loss": 0.0338, + "step": 83130 + }, + { + "epoch": 0.18335674730335022, + "grad_norm": 0.16108854115009308, + "learning_rate": 2.5964026131400176e-05, + "loss": 0.0334, + "step": 83140 + }, + { + "epoch": 0.18337880127824838, + "grad_norm": 0.13690006732940674, + "learning_rate": 2.5962897659264832e-05, + "loss": 0.0341, + "step": 83150 + }, + { + "epoch": 0.18340085525314656, + "grad_norm": 0.1138555034995079, + "learning_rate": 2.596176905391993e-05, + "loss": 0.033, + "step": 83160 + }, + { + "epoch": 0.18342290922804472, + "grad_norm": 0.12894520163536072, + "learning_rate": 2.5960640315379182e-05, + "loss": 0.0344, + "step": 83170 + }, + { + "epoch": 0.18344496320294287, + "grad_norm": 0.11792370676994324, + "learning_rate": 2.5959511443656306e-05, + "loss": 0.0362, + "step": 83180 + }, + { + "epoch": 0.18346701717784106, + "grad_norm": 0.13822200894355774, + "learning_rate": 2.595838243876502e-05, + "loss": 0.0351, + "step": 83190 + }, + { + "epoch": 0.1834890711527392, + "grad_norm": 0.11460352689027786, + "learning_rate": 2.5957253300719042e-05, + "loss": 0.0335, + "step": 83200 + }, + { + "epoch": 0.18351112512763737, + "grad_norm": 0.10759641975164413, + "learning_rate": 2.595612402953209e-05, + "loss": 0.0343, + "step": 83210 + }, + { + "epoch": 0.18353317910253555, + "grad_norm": 0.12260216474533081, + "learning_rate": 2.5954994625217886e-05, + "loss": 0.0337, + "step": 83220 + }, + { + "epoch": 0.1835552330774337, + "grad_norm": 0.09621062874794006, + "learning_rate": 2.5953865087790157e-05, + "loss": 0.032, + "step": 83230 + }, + { + "epoch": 0.18357728705233187, + "grad_norm": 0.09980855137109756, + "learning_rate": 2.595273541726262e-05, + "loss": 0.0334, + "step": 83240 + }, + { + "epoch": 0.18359934102723005, + "grad_norm": 0.13742077350616455, + "learning_rate": 2.5951605613649008e-05, + "loss": 0.0316, + "step": 83250 + }, + { + "epoch": 0.1836213950021282, + "grad_norm": 0.14926303923130035, + "learning_rate": 2.5950475676963053e-05, + "loss": 0.034, + "step": 83260 + }, + { + "epoch": 0.18364344897702636, + "grad_norm": 0.0912647470831871, + "learning_rate": 2.5949345607218478e-05, + "loss": 0.0331, + "step": 83270 + }, + { + "epoch": 0.18366550295192455, + "grad_norm": 0.11076335608959198, + "learning_rate": 2.594821540442901e-05, + "loss": 0.0334, + "step": 83280 + }, + { + "epoch": 0.1836875569268227, + "grad_norm": 0.09213590621948242, + "learning_rate": 2.594708506860839e-05, + "loss": 0.0336, + "step": 83290 + }, + { + "epoch": 0.18370961090172086, + "grad_norm": 0.17544721066951752, + "learning_rate": 2.5945954599770355e-05, + "loss": 0.032, + "step": 83300 + }, + { + "epoch": 0.18373166487661904, + "grad_norm": 0.10550931841135025, + "learning_rate": 2.5944823997928633e-05, + "loss": 0.0335, + "step": 83310 + }, + { + "epoch": 0.1837537188515172, + "grad_norm": 0.12361986935138702, + "learning_rate": 2.5943693263096966e-05, + "loss": 0.0342, + "step": 83320 + }, + { + "epoch": 0.18377577282641536, + "grad_norm": 0.14628247916698456, + "learning_rate": 2.5942562395289095e-05, + "loss": 0.0336, + "step": 83330 + }, + { + "epoch": 0.18379782680131354, + "grad_norm": 0.11448166519403458, + "learning_rate": 2.5941431394518758e-05, + "loss": 0.0339, + "step": 83340 + }, + { + "epoch": 0.1838198807762117, + "grad_norm": 0.10233476012945175, + "learning_rate": 2.59403002607997e-05, + "loss": 0.0308, + "step": 83350 + }, + { + "epoch": 0.18384193475110985, + "grad_norm": 0.13237802684307098, + "learning_rate": 2.5939168994145667e-05, + "loss": 0.0344, + "step": 83360 + }, + { + "epoch": 0.18386398872600804, + "grad_norm": 0.10150457173585892, + "learning_rate": 2.59380375945704e-05, + "loss": 0.0315, + "step": 83370 + }, + { + "epoch": 0.1838860427009062, + "grad_norm": 0.09440777450799942, + "learning_rate": 2.5936906062087646e-05, + "loss": 0.0323, + "step": 83380 + }, + { + "epoch": 0.18390809667580435, + "grad_norm": 0.1125236228108406, + "learning_rate": 2.593577439671116e-05, + "loss": 0.0344, + "step": 83390 + }, + { + "epoch": 0.18393015065070253, + "grad_norm": 0.10915382206439972, + "learning_rate": 2.593464259845469e-05, + "loss": 0.0327, + "step": 83400 + }, + { + "epoch": 0.1839522046256007, + "grad_norm": 0.11879441142082214, + "learning_rate": 2.593351066733199e-05, + "loss": 0.0336, + "step": 83410 + }, + { + "epoch": 0.18397425860049887, + "grad_norm": 0.13308006525039673, + "learning_rate": 2.5932378603356807e-05, + "loss": 0.0321, + "step": 83420 + }, + { + "epoch": 0.18399631257539703, + "grad_norm": 0.10624061524868011, + "learning_rate": 2.5931246406542908e-05, + "loss": 0.0337, + "step": 83430 + }, + { + "epoch": 0.1840183665502952, + "grad_norm": 0.1100379228591919, + "learning_rate": 2.593011407690404e-05, + "loss": 0.0327, + "step": 83440 + }, + { + "epoch": 0.18404042052519337, + "grad_norm": 0.12227911502122879, + "learning_rate": 2.592898161445397e-05, + "loss": 0.0325, + "step": 83450 + }, + { + "epoch": 0.18406247450009153, + "grad_norm": 0.11200916767120361, + "learning_rate": 2.5927849019206455e-05, + "loss": 0.0344, + "step": 83460 + }, + { + "epoch": 0.18408452847498968, + "grad_norm": 0.13371992111206055, + "learning_rate": 2.5926716291175257e-05, + "loss": 0.0338, + "step": 83470 + }, + { + "epoch": 0.18410658244988787, + "grad_norm": 0.11584380269050598, + "learning_rate": 2.5925583430374135e-05, + "loss": 0.033, + "step": 83480 + }, + { + "epoch": 0.18412863642478602, + "grad_norm": 0.12067905813455582, + "learning_rate": 2.592445043681686e-05, + "loss": 0.0343, + "step": 83490 + }, + { + "epoch": 0.18415069039968418, + "grad_norm": 0.16361448168754578, + "learning_rate": 2.59233173105172e-05, + "loss": 0.0343, + "step": 83500 + }, + { + "epoch": 0.18417274437458236, + "grad_norm": 0.10860598087310791, + "learning_rate": 2.592218405148892e-05, + "loss": 0.034, + "step": 83510 + }, + { + "epoch": 0.18419479834948052, + "grad_norm": 0.14442715048789978, + "learning_rate": 2.5921050659745795e-05, + "loss": 0.0331, + "step": 83520 + }, + { + "epoch": 0.18421685232437868, + "grad_norm": 0.11829753965139389, + "learning_rate": 2.5919917135301586e-05, + "loss": 0.0324, + "step": 83530 + }, + { + "epoch": 0.18423890629927686, + "grad_norm": 0.14375551044940948, + "learning_rate": 2.591878347817008e-05, + "loss": 0.0347, + "step": 83540 + }, + { + "epoch": 0.18426096027417502, + "grad_norm": 0.10192977637052536, + "learning_rate": 2.5917649688365043e-05, + "loss": 0.0337, + "step": 83550 + }, + { + "epoch": 0.18428301424907317, + "grad_norm": 0.095784492790699, + "learning_rate": 2.5916515765900255e-05, + "loss": 0.0316, + "step": 83560 + }, + { + "epoch": 0.18430506822397136, + "grad_norm": 0.10923368483781815, + "learning_rate": 2.5915381710789494e-05, + "loss": 0.0333, + "step": 83570 + }, + { + "epoch": 0.18432712219886951, + "grad_norm": 0.11606040596961975, + "learning_rate": 2.591424752304654e-05, + "loss": 0.034, + "step": 83580 + }, + { + "epoch": 0.18434917617376767, + "grad_norm": 0.10932131111621857, + "learning_rate": 2.5913113202685172e-05, + "loss": 0.0317, + "step": 83590 + }, + { + "epoch": 0.18437123014866585, + "grad_norm": 0.14077810943126678, + "learning_rate": 2.5911978749719177e-05, + "loss": 0.0329, + "step": 83600 + }, + { + "epoch": 0.184393284123564, + "grad_norm": 0.12259773164987564, + "learning_rate": 2.5910844164162337e-05, + "loss": 0.033, + "step": 83610 + }, + { + "epoch": 0.18441533809846217, + "grad_norm": 0.10109580308198929, + "learning_rate": 2.5909709446028436e-05, + "loss": 0.0352, + "step": 83620 + }, + { + "epoch": 0.18443739207336035, + "grad_norm": 0.1022505983710289, + "learning_rate": 2.5908574595331266e-05, + "loss": 0.0343, + "step": 83630 + }, + { + "epoch": 0.1844594460482585, + "grad_norm": 0.13368606567382812, + "learning_rate": 2.590743961208462e-05, + "loss": 0.0323, + "step": 83640 + }, + { + "epoch": 0.18448150002315666, + "grad_norm": 0.10616355389356613, + "learning_rate": 2.5906304496302276e-05, + "loss": 0.0357, + "step": 83650 + }, + { + "epoch": 0.18450355399805485, + "grad_norm": 0.10109791159629822, + "learning_rate": 2.5905169247998043e-05, + "loss": 0.0328, + "step": 83660 + }, + { + "epoch": 0.184525607972953, + "grad_norm": 0.1004137247800827, + "learning_rate": 2.5904033867185702e-05, + "loss": 0.0341, + "step": 83670 + }, + { + "epoch": 0.18454766194785116, + "grad_norm": 0.10193372517824173, + "learning_rate": 2.590289835387906e-05, + "loss": 0.0326, + "step": 83680 + }, + { + "epoch": 0.18456971592274934, + "grad_norm": 0.10720454901456833, + "learning_rate": 2.5901762708091903e-05, + "loss": 0.034, + "step": 83690 + }, + { + "epoch": 0.1845917698976475, + "grad_norm": 0.13861168920993805, + "learning_rate": 2.590062692983804e-05, + "loss": 0.0327, + "step": 83700 + }, + { + "epoch": 0.18461382387254566, + "grad_norm": 0.0958232656121254, + "learning_rate": 2.5899491019131266e-05, + "loss": 0.0321, + "step": 83710 + }, + { + "epoch": 0.18463587784744384, + "grad_norm": 0.12695693969726562, + "learning_rate": 2.5898354975985386e-05, + "loss": 0.0349, + "step": 83720 + }, + { + "epoch": 0.184657931822342, + "grad_norm": 0.08379773050546646, + "learning_rate": 2.58972188004142e-05, + "loss": 0.0331, + "step": 83730 + }, + { + "epoch": 0.18467998579724015, + "grad_norm": 0.11504364758729935, + "learning_rate": 2.5896082492431523e-05, + "loss": 0.0342, + "step": 83740 + }, + { + "epoch": 0.18470203977213834, + "grad_norm": 0.12894666194915771, + "learning_rate": 2.589494605205115e-05, + "loss": 0.0351, + "step": 83750 + }, + { + "epoch": 0.1847240937470365, + "grad_norm": 0.08789178729057312, + "learning_rate": 2.5893809479286902e-05, + "loss": 0.0348, + "step": 83760 + }, + { + "epoch": 0.18474614772193465, + "grad_norm": 0.1045626625418663, + "learning_rate": 2.5892672774152583e-05, + "loss": 0.0315, + "step": 83770 + }, + { + "epoch": 0.18476820169683283, + "grad_norm": 0.09348919242620468, + "learning_rate": 2.5891535936662e-05, + "loss": 0.0323, + "step": 83780 + }, + { + "epoch": 0.184790255671731, + "grad_norm": 0.10018637031316757, + "learning_rate": 2.5890398966828973e-05, + "loss": 0.0327, + "step": 83790 + }, + { + "epoch": 0.18481230964662915, + "grad_norm": 0.10101630538702011, + "learning_rate": 2.5889261864667322e-05, + "loss": 0.032, + "step": 83800 + }, + { + "epoch": 0.18483436362152733, + "grad_norm": 0.11887560039758682, + "learning_rate": 2.5888124630190852e-05, + "loss": 0.0334, + "step": 83810 + }, + { + "epoch": 0.1848564175964255, + "grad_norm": 0.11751630157232285, + "learning_rate": 2.5886987263413388e-05, + "loss": 0.034, + "step": 83820 + }, + { + "epoch": 0.18487847157132364, + "grad_norm": 0.12052895873785019, + "learning_rate": 2.5885849764348755e-05, + "loss": 0.0344, + "step": 83830 + }, + { + "epoch": 0.18490052554622183, + "grad_norm": 0.1347903460264206, + "learning_rate": 2.588471213301077e-05, + "loss": 0.0339, + "step": 83840 + }, + { + "epoch": 0.18492257952111998, + "grad_norm": 0.11069769412279129, + "learning_rate": 2.5883574369413247e-05, + "loss": 0.0341, + "step": 83850 + }, + { + "epoch": 0.18494463349601817, + "grad_norm": 0.09282460063695908, + "learning_rate": 2.5882436473570025e-05, + "loss": 0.0349, + "step": 83860 + }, + { + "epoch": 0.18496668747091632, + "grad_norm": 0.09939751774072647, + "learning_rate": 2.5881298445494926e-05, + "loss": 0.0347, + "step": 83870 + }, + { + "epoch": 0.18498874144581448, + "grad_norm": 0.09214486926794052, + "learning_rate": 2.5880160285201774e-05, + "loss": 0.0345, + "step": 83880 + }, + { + "epoch": 0.18501079542071266, + "grad_norm": 0.10571273416280746, + "learning_rate": 2.5879021992704405e-05, + "loss": 0.0354, + "step": 83890 + }, + { + "epoch": 0.18503284939561082, + "grad_norm": 0.0824337974190712, + "learning_rate": 2.5877883568016647e-05, + "loss": 0.0315, + "step": 83900 + }, + { + "epoch": 0.18505490337050898, + "grad_norm": 0.10075628012418747, + "learning_rate": 2.5876745011152333e-05, + "loss": 0.0348, + "step": 83910 + }, + { + "epoch": 0.18507695734540716, + "grad_norm": 0.10895833373069763, + "learning_rate": 2.5875606322125295e-05, + "loss": 0.0322, + "step": 83920 + }, + { + "epoch": 0.18509901132030532, + "grad_norm": 0.11253329366445541, + "learning_rate": 2.5874467500949374e-05, + "loss": 0.0349, + "step": 83930 + }, + { + "epoch": 0.18512106529520347, + "grad_norm": 0.08036492764949799, + "learning_rate": 2.5873328547638403e-05, + "loss": 0.0357, + "step": 83940 + }, + { + "epoch": 0.18514311927010166, + "grad_norm": 0.10457289963960648, + "learning_rate": 2.587218946220623e-05, + "loss": 0.0344, + "step": 83950 + }, + { + "epoch": 0.18516517324499981, + "grad_norm": 0.11675528436899185, + "learning_rate": 2.5871050244666687e-05, + "loss": 0.036, + "step": 83960 + }, + { + "epoch": 0.18518722721989797, + "grad_norm": 0.10887649655342102, + "learning_rate": 2.586991089503362e-05, + "loss": 0.0347, + "step": 83970 + }, + { + "epoch": 0.18520928119479615, + "grad_norm": 0.111134834587574, + "learning_rate": 2.586877141332087e-05, + "loss": 0.0353, + "step": 83980 + }, + { + "epoch": 0.1852313351696943, + "grad_norm": 0.10170768201351166, + "learning_rate": 2.5867631799542288e-05, + "loss": 0.0311, + "step": 83990 + }, + { + "epoch": 0.18525338914459247, + "grad_norm": 0.1302124261856079, + "learning_rate": 2.586649205371172e-05, + "loss": 0.0329, + "step": 84000 + }, + { + "epoch": 0.18527544311949065, + "grad_norm": 0.16085538268089294, + "learning_rate": 2.586535217584301e-05, + "loss": 0.0333, + "step": 84010 + }, + { + "epoch": 0.1852974970943888, + "grad_norm": 0.11790749430656433, + "learning_rate": 2.5864212165950016e-05, + "loss": 0.0353, + "step": 84020 + }, + { + "epoch": 0.18531955106928696, + "grad_norm": 0.10533857345581055, + "learning_rate": 2.5863072024046584e-05, + "loss": 0.0341, + "step": 84030 + }, + { + "epoch": 0.18534160504418515, + "grad_norm": 0.12181327491998672, + "learning_rate": 2.5861931750146573e-05, + "loss": 0.0333, + "step": 84040 + }, + { + "epoch": 0.1853636590190833, + "grad_norm": 0.0978187620639801, + "learning_rate": 2.5860791344263833e-05, + "loss": 0.0327, + "step": 84050 + }, + { + "epoch": 0.18538571299398146, + "grad_norm": 0.112267404794693, + "learning_rate": 2.5859650806412224e-05, + "loss": 0.0337, + "step": 84060 + }, + { + "epoch": 0.18540776696887964, + "grad_norm": 0.13763226568698883, + "learning_rate": 2.585851013660561e-05, + "loss": 0.0319, + "step": 84070 + }, + { + "epoch": 0.1854298209437778, + "grad_norm": 0.11626698821783066, + "learning_rate": 2.5857369334857843e-05, + "loss": 0.033, + "step": 84080 + }, + { + "epoch": 0.18545187491867596, + "grad_norm": 0.12986452877521515, + "learning_rate": 2.5856228401182785e-05, + "loss": 0.0328, + "step": 84090 + }, + { + "epoch": 0.18547392889357414, + "grad_norm": 0.12700477242469788, + "learning_rate": 2.58550873355943e-05, + "loss": 0.0334, + "step": 84100 + }, + { + "epoch": 0.1854959828684723, + "grad_norm": 0.10947291553020477, + "learning_rate": 2.5853946138106258e-05, + "loss": 0.0338, + "step": 84110 + }, + { + "epoch": 0.18551803684337045, + "grad_norm": 0.11778642982244492, + "learning_rate": 2.5852804808732525e-05, + "loss": 0.0338, + "step": 84120 + }, + { + "epoch": 0.18554009081826864, + "grad_norm": 0.11910020560026169, + "learning_rate": 2.5851663347486963e-05, + "loss": 0.0334, + "step": 84130 + }, + { + "epoch": 0.1855621447931668, + "grad_norm": 0.0843011885881424, + "learning_rate": 2.585052175438345e-05, + "loss": 0.033, + "step": 84140 + }, + { + "epoch": 0.18558419876806495, + "grad_norm": 0.10062450915575027, + "learning_rate": 2.5849380029435847e-05, + "loss": 0.0319, + "step": 84150 + }, + { + "epoch": 0.18560625274296313, + "grad_norm": 0.09941942989826202, + "learning_rate": 2.5848238172658042e-05, + "loss": 0.0316, + "step": 84160 + }, + { + "epoch": 0.1856283067178613, + "grad_norm": 0.11372309923171997, + "learning_rate": 2.5847096184063893e-05, + "loss": 0.0341, + "step": 84170 + }, + { + "epoch": 0.18565036069275945, + "grad_norm": 0.1156703382730484, + "learning_rate": 2.5845954063667286e-05, + "loss": 0.033, + "step": 84180 + }, + { + "epoch": 0.18567241466765763, + "grad_norm": 0.11165736615657806, + "learning_rate": 2.5844811811482097e-05, + "loss": 0.0345, + "step": 84190 + }, + { + "epoch": 0.1856944686425558, + "grad_norm": 0.10020168870687485, + "learning_rate": 2.5843669427522207e-05, + "loss": 0.0335, + "step": 84200 + }, + { + "epoch": 0.18571652261745394, + "grad_norm": 0.09002503007650375, + "learning_rate": 2.5842526911801494e-05, + "loss": 0.0334, + "step": 84210 + }, + { + "epoch": 0.18573857659235213, + "grad_norm": 0.09883560240268707, + "learning_rate": 2.584138426433384e-05, + "loss": 0.034, + "step": 84220 + }, + { + "epoch": 0.18576063056725028, + "grad_norm": 0.09873608499765396, + "learning_rate": 2.5840241485133132e-05, + "loss": 0.0331, + "step": 84230 + }, + { + "epoch": 0.18578268454214844, + "grad_norm": 0.1039901077747345, + "learning_rate": 2.5839098574213256e-05, + "loss": 0.0335, + "step": 84240 + }, + { + "epoch": 0.18580473851704662, + "grad_norm": 0.11198177933692932, + "learning_rate": 2.58379555315881e-05, + "loss": 0.0353, + "step": 84250 + }, + { + "epoch": 0.18582679249194478, + "grad_norm": 0.12041007727384567, + "learning_rate": 2.583681235727155e-05, + "loss": 0.0318, + "step": 84260 + }, + { + "epoch": 0.18584884646684297, + "grad_norm": 0.11349494755268097, + "learning_rate": 2.58356690512775e-05, + "loss": 0.0335, + "step": 84270 + }, + { + "epoch": 0.18587090044174112, + "grad_norm": 0.08761904388666153, + "learning_rate": 2.583452561361984e-05, + "loss": 0.0331, + "step": 84280 + }, + { + "epoch": 0.18589295441663928, + "grad_norm": 0.1000838503241539, + "learning_rate": 2.583338204431246e-05, + "loss": 0.0338, + "step": 84290 + }, + { + "epoch": 0.18591500839153746, + "grad_norm": 0.13084226846694946, + "learning_rate": 2.583223834336926e-05, + "loss": 0.0329, + "step": 84300 + }, + { + "epoch": 0.18593706236643562, + "grad_norm": 0.11791811883449554, + "learning_rate": 2.583109451080414e-05, + "loss": 0.0341, + "step": 84310 + }, + { + "epoch": 0.18595911634133377, + "grad_norm": 0.1644982248544693, + "learning_rate": 2.5829950546631e-05, + "loss": 0.0335, + "step": 84320 + }, + { + "epoch": 0.18598117031623196, + "grad_norm": 0.12393836677074432, + "learning_rate": 2.5828806450863727e-05, + "loss": 0.0318, + "step": 84330 + }, + { + "epoch": 0.18600322429113011, + "grad_norm": 0.13134998083114624, + "learning_rate": 2.5827662223516236e-05, + "loss": 0.0324, + "step": 84340 + }, + { + "epoch": 0.18602527826602827, + "grad_norm": 0.09363047033548355, + "learning_rate": 2.5826517864602427e-05, + "loss": 0.0336, + "step": 84350 + }, + { + "epoch": 0.18604733224092646, + "grad_norm": 0.12340173125267029, + "learning_rate": 2.5825373374136202e-05, + "loss": 0.0332, + "step": 84360 + }, + { + "epoch": 0.1860693862158246, + "grad_norm": 0.12876249849796295, + "learning_rate": 2.582422875213147e-05, + "loss": 0.0328, + "step": 84370 + }, + { + "epoch": 0.18609144019072277, + "grad_norm": 0.11028323322534561, + "learning_rate": 2.582308399860214e-05, + "loss": 0.0336, + "step": 84380 + }, + { + "epoch": 0.18611349416562095, + "grad_norm": 0.12285353243350983, + "learning_rate": 2.5821939113562118e-05, + "loss": 0.034, + "step": 84390 + }, + { + "epoch": 0.1861355481405191, + "grad_norm": 0.12578807771205902, + "learning_rate": 2.5820794097025322e-05, + "loss": 0.0375, + "step": 84400 + }, + { + "epoch": 0.18615760211541726, + "grad_norm": 0.09833597391843796, + "learning_rate": 2.581964894900566e-05, + "loss": 0.0344, + "step": 84410 + }, + { + "epoch": 0.18617965609031545, + "grad_norm": 0.1480150818824768, + "learning_rate": 2.5818503669517046e-05, + "loss": 0.0333, + "step": 84420 + }, + { + "epoch": 0.1862017100652136, + "grad_norm": 0.13392111659049988, + "learning_rate": 2.5817358258573395e-05, + "loss": 0.0341, + "step": 84430 + }, + { + "epoch": 0.18622376404011176, + "grad_norm": 0.12247513979673386, + "learning_rate": 2.5816212716188632e-05, + "loss": 0.0362, + "step": 84440 + }, + { + "epoch": 0.18624581801500995, + "grad_norm": 0.10053983330726624, + "learning_rate": 2.581506704237667e-05, + "loss": 0.0315, + "step": 84450 + }, + { + "epoch": 0.1862678719899081, + "grad_norm": 0.09070880711078644, + "learning_rate": 2.581392123715144e-05, + "loss": 0.034, + "step": 84460 + }, + { + "epoch": 0.18628992596480626, + "grad_norm": 0.08212810754776001, + "learning_rate": 2.581277530052685e-05, + "loss": 0.0335, + "step": 84470 + }, + { + "epoch": 0.18631197993970444, + "grad_norm": 0.10711666941642761, + "learning_rate": 2.5811629232516834e-05, + "loss": 0.0333, + "step": 84480 + }, + { + "epoch": 0.1863340339146026, + "grad_norm": 0.11413656920194626, + "learning_rate": 2.5810483033135312e-05, + "loss": 0.0334, + "step": 84490 + }, + { + "epoch": 0.18635608788950075, + "grad_norm": 0.1100364625453949, + "learning_rate": 2.5809336702396216e-05, + "loss": 0.0333, + "step": 84500 + }, + { + "epoch": 0.18637814186439894, + "grad_norm": 0.09924869239330292, + "learning_rate": 2.5808190240313473e-05, + "loss": 0.0344, + "step": 84510 + }, + { + "epoch": 0.1864001958392971, + "grad_norm": 0.11038076132535934, + "learning_rate": 2.5807043646901017e-05, + "loss": 0.0333, + "step": 84520 + }, + { + "epoch": 0.18642224981419525, + "grad_norm": 0.0755314826965332, + "learning_rate": 2.5805896922172768e-05, + "loss": 0.0344, + "step": 84530 + }, + { + "epoch": 0.18644430378909344, + "grad_norm": 0.10570541769266129, + "learning_rate": 2.580475006614268e-05, + "loss": 0.0351, + "step": 84540 + }, + { + "epoch": 0.1864663577639916, + "grad_norm": 0.11973166465759277, + "learning_rate": 2.580360307882467e-05, + "loss": 0.0318, + "step": 84550 + }, + { + "epoch": 0.18648841173888975, + "grad_norm": 0.10053271055221558, + "learning_rate": 2.5802455960232686e-05, + "loss": 0.0343, + "step": 84560 + }, + { + "epoch": 0.18651046571378793, + "grad_norm": 0.10273640602827072, + "learning_rate": 2.5801308710380663e-05, + "loss": 0.0345, + "step": 84570 + }, + { + "epoch": 0.1865325196886861, + "grad_norm": 0.12501128017902374, + "learning_rate": 2.5800161329282538e-05, + "loss": 0.0341, + "step": 84580 + }, + { + "epoch": 0.18655457366358424, + "grad_norm": 0.12924188375473022, + "learning_rate": 2.579901381695226e-05, + "loss": 0.033, + "step": 84590 + }, + { + "epoch": 0.18657662763848243, + "grad_norm": 0.10998356342315674, + "learning_rate": 2.5797866173403764e-05, + "loss": 0.0334, + "step": 84600 + }, + { + "epoch": 0.18659868161338058, + "grad_norm": 0.10282735526561737, + "learning_rate": 2.5796718398651e-05, + "loss": 0.0331, + "step": 84610 + }, + { + "epoch": 0.18662073558827874, + "grad_norm": 0.1260881870985031, + "learning_rate": 2.5795570492707914e-05, + "loss": 0.0321, + "step": 84620 + }, + { + "epoch": 0.18664278956317693, + "grad_norm": 0.09481827169656754, + "learning_rate": 2.5794422455588456e-05, + "loss": 0.0327, + "step": 84630 + }, + { + "epoch": 0.18666484353807508, + "grad_norm": 0.09990253299474716, + "learning_rate": 2.579327428730657e-05, + "loss": 0.032, + "step": 84640 + }, + { + "epoch": 0.18668689751297324, + "grad_norm": 0.11913446336984634, + "learning_rate": 2.5792125987876215e-05, + "loss": 0.0353, + "step": 84650 + }, + { + "epoch": 0.18670895148787142, + "grad_norm": 0.1197209283709526, + "learning_rate": 2.5790977557311336e-05, + "loss": 0.0348, + "step": 84660 + }, + { + "epoch": 0.18673100546276958, + "grad_norm": 0.11351571977138519, + "learning_rate": 2.5789828995625892e-05, + "loss": 0.0349, + "step": 84670 + }, + { + "epoch": 0.18675305943766773, + "grad_norm": 0.11672205477952957, + "learning_rate": 2.5788680302833843e-05, + "loss": 0.0349, + "step": 84680 + }, + { + "epoch": 0.18677511341256592, + "grad_norm": 0.1085798591375351, + "learning_rate": 2.5787531478949137e-05, + "loss": 0.0329, + "step": 84690 + }, + { + "epoch": 0.18679716738746407, + "grad_norm": 0.09694113582372665, + "learning_rate": 2.578638252398574e-05, + "loss": 0.0335, + "step": 84700 + }, + { + "epoch": 0.18681922136236226, + "grad_norm": 0.09797705709934235, + "learning_rate": 2.5785233437957608e-05, + "loss": 0.033, + "step": 84710 + }, + { + "epoch": 0.18684127533726042, + "grad_norm": 0.11042830348014832, + "learning_rate": 2.5784084220878714e-05, + "loss": 0.0338, + "step": 84720 + }, + { + "epoch": 0.18686332931215857, + "grad_norm": 0.12421703338623047, + "learning_rate": 2.578293487276301e-05, + "loss": 0.0347, + "step": 84730 + }, + { + "epoch": 0.18688538328705676, + "grad_norm": 0.10044203698635101, + "learning_rate": 2.5781785393624467e-05, + "loss": 0.0332, + "step": 84740 + }, + { + "epoch": 0.1869074372619549, + "grad_norm": 0.12931565940380096, + "learning_rate": 2.578063578347705e-05, + "loss": 0.033, + "step": 84750 + }, + { + "epoch": 0.18692949123685307, + "grad_norm": 0.09036451578140259, + "learning_rate": 2.577948604233473e-05, + "loss": 0.0339, + "step": 84760 + }, + { + "epoch": 0.18695154521175125, + "grad_norm": 0.12195408344268799, + "learning_rate": 2.577833617021148e-05, + "loss": 0.0333, + "step": 84770 + }, + { + "epoch": 0.1869735991866494, + "grad_norm": 0.12013082206249237, + "learning_rate": 2.577718616712127e-05, + "loss": 0.0328, + "step": 84780 + }, + { + "epoch": 0.18699565316154756, + "grad_norm": 0.0970437154173851, + "learning_rate": 2.5776036033078067e-05, + "loss": 0.0336, + "step": 84790 + }, + { + "epoch": 0.18701770713644575, + "grad_norm": 0.12078727781772614, + "learning_rate": 2.5774885768095858e-05, + "loss": 0.0343, + "step": 84800 + }, + { + "epoch": 0.1870397611113439, + "grad_norm": 0.10450682044029236, + "learning_rate": 2.577373537218861e-05, + "loss": 0.0322, + "step": 84810 + }, + { + "epoch": 0.18706181508624206, + "grad_norm": 0.13890871405601501, + "learning_rate": 2.5772584845370307e-05, + "loss": 0.0351, + "step": 84820 + }, + { + "epoch": 0.18708386906114025, + "grad_norm": 0.1183527335524559, + "learning_rate": 2.5771434187654927e-05, + "loss": 0.0345, + "step": 84830 + }, + { + "epoch": 0.1871059230360384, + "grad_norm": 0.10669481009244919, + "learning_rate": 2.577028339905645e-05, + "loss": 0.0324, + "step": 84840 + }, + { + "epoch": 0.18712797701093656, + "grad_norm": 0.10273464024066925, + "learning_rate": 2.5769132479588863e-05, + "loss": 0.0336, + "step": 84850 + }, + { + "epoch": 0.18715003098583474, + "grad_norm": 0.12613339722156525, + "learning_rate": 2.5767981429266146e-05, + "loss": 0.0348, + "step": 84860 + }, + { + "epoch": 0.1871720849607329, + "grad_norm": 0.09906277805566788, + "learning_rate": 2.576683024810229e-05, + "loss": 0.0349, + "step": 84870 + }, + { + "epoch": 0.18719413893563105, + "grad_norm": 0.14090779423713684, + "learning_rate": 2.5765678936111282e-05, + "loss": 0.0327, + "step": 84880 + }, + { + "epoch": 0.18721619291052924, + "grad_norm": 0.1253347396850586, + "learning_rate": 2.5764527493307107e-05, + "loss": 0.0344, + "step": 84890 + }, + { + "epoch": 0.1872382468854274, + "grad_norm": 0.08965059369802475, + "learning_rate": 2.5763375919703763e-05, + "loss": 0.0343, + "step": 84900 + }, + { + "epoch": 0.18726030086032555, + "grad_norm": 0.07807936519384384, + "learning_rate": 2.5762224215315234e-05, + "loss": 0.031, + "step": 84910 + }, + { + "epoch": 0.18728235483522374, + "grad_norm": 0.10629940778017044, + "learning_rate": 2.576107238015553e-05, + "loss": 0.0329, + "step": 84920 + }, + { + "epoch": 0.1873044088101219, + "grad_norm": 0.12475327402353287, + "learning_rate": 2.5759920414238625e-05, + "loss": 0.0335, + "step": 84930 + }, + { + "epoch": 0.18732646278502005, + "grad_norm": 0.14313623309135437, + "learning_rate": 2.5758768317578534e-05, + "loss": 0.0321, + "step": 84940 + }, + { + "epoch": 0.18734851675991823, + "grad_norm": 0.12372015416622162, + "learning_rate": 2.5757616090189245e-05, + "loss": 0.0331, + "step": 84950 + }, + { + "epoch": 0.1873705707348164, + "grad_norm": 0.15646888315677643, + "learning_rate": 2.5756463732084774e-05, + "loss": 0.0337, + "step": 84960 + }, + { + "epoch": 0.18739262470971454, + "grad_norm": 0.10278648138046265, + "learning_rate": 2.5755311243279105e-05, + "loss": 0.0326, + "step": 84970 + }, + { + "epoch": 0.18741467868461273, + "grad_norm": 0.10214127600193024, + "learning_rate": 2.5754158623786247e-05, + "loss": 0.0334, + "step": 84980 + }, + { + "epoch": 0.18743673265951089, + "grad_norm": 0.08513208478689194, + "learning_rate": 2.5753005873620216e-05, + "loss": 0.0325, + "step": 84990 + }, + { + "epoch": 0.18745878663440904, + "grad_norm": 0.1316191405057907, + "learning_rate": 2.5751852992795007e-05, + "loss": 0.0338, + "step": 85000 + }, + { + "epoch": 0.18748084060930723, + "grad_norm": 0.07783669978380203, + "learning_rate": 2.5750699981324632e-05, + "loss": 0.0326, + "step": 85010 + }, + { + "epoch": 0.18750289458420538, + "grad_norm": 0.12208881974220276, + "learning_rate": 2.5749546839223104e-05, + "loss": 0.0324, + "step": 85020 + }, + { + "epoch": 0.18752494855910354, + "grad_norm": 0.130916029214859, + "learning_rate": 2.574839356650443e-05, + "loss": 0.036, + "step": 85030 + }, + { + "epoch": 0.18754700253400172, + "grad_norm": 0.1229529157280922, + "learning_rate": 2.5747240163182628e-05, + "loss": 0.0345, + "step": 85040 + }, + { + "epoch": 0.18756905650889988, + "grad_norm": 0.0867663249373436, + "learning_rate": 2.5746086629271712e-05, + "loss": 0.0327, + "step": 85050 + }, + { + "epoch": 0.18759111048379803, + "grad_norm": 0.1388309746980667, + "learning_rate": 2.5744932964785695e-05, + "loss": 0.0346, + "step": 85060 + }, + { + "epoch": 0.18761316445869622, + "grad_norm": 0.12969635426998138, + "learning_rate": 2.57437791697386e-05, + "loss": 0.0346, + "step": 85070 + }, + { + "epoch": 0.18763521843359438, + "grad_norm": 0.16982203722000122, + "learning_rate": 2.574262524414444e-05, + "loss": 0.0323, + "step": 85080 + }, + { + "epoch": 0.18765727240849253, + "grad_norm": 0.15655463933944702, + "learning_rate": 2.574147118801724e-05, + "loss": 0.0342, + "step": 85090 + }, + { + "epoch": 0.18767932638339072, + "grad_norm": 0.16384001076221466, + "learning_rate": 2.574031700137103e-05, + "loss": 0.0345, + "step": 85100 + }, + { + "epoch": 0.18770138035828887, + "grad_norm": 0.11632061749696732, + "learning_rate": 2.5739162684219827e-05, + "loss": 0.034, + "step": 85110 + }, + { + "epoch": 0.18772343433318703, + "grad_norm": 0.1039438545703888, + "learning_rate": 2.573800823657765e-05, + "loss": 0.0335, + "step": 85120 + }, + { + "epoch": 0.1877454883080852, + "grad_norm": 0.13774079084396362, + "learning_rate": 2.5736853658458543e-05, + "loss": 0.0321, + "step": 85130 + }, + { + "epoch": 0.18776754228298337, + "grad_norm": 0.10411244630813599, + "learning_rate": 2.573569894987652e-05, + "loss": 0.0335, + "step": 85140 + }, + { + "epoch": 0.18778959625788155, + "grad_norm": 0.11465663462877274, + "learning_rate": 2.5734544110845624e-05, + "loss": 0.0326, + "step": 85150 + }, + { + "epoch": 0.1878116502327797, + "grad_norm": 0.1051025465130806, + "learning_rate": 2.5733389141379882e-05, + "loss": 0.0329, + "step": 85160 + }, + { + "epoch": 0.18783370420767787, + "grad_norm": 0.09934850037097931, + "learning_rate": 2.5732234041493322e-05, + "loss": 0.0326, + "step": 85170 + }, + { + "epoch": 0.18785575818257605, + "grad_norm": 0.15722519159317017, + "learning_rate": 2.573107881119999e-05, + "loss": 0.0343, + "step": 85180 + }, + { + "epoch": 0.1878778121574742, + "grad_norm": 0.13375772535800934, + "learning_rate": 2.5729923450513923e-05, + "loss": 0.0326, + "step": 85190 + }, + { + "epoch": 0.18789986613237236, + "grad_norm": 0.12783004343509674, + "learning_rate": 2.572876795944915e-05, + "loss": 0.0348, + "step": 85200 + }, + { + "epoch": 0.18792192010727055, + "grad_norm": 0.10439912229776382, + "learning_rate": 2.5727612338019716e-05, + "loss": 0.0353, + "step": 85210 + }, + { + "epoch": 0.1879439740821687, + "grad_norm": 0.1418871283531189, + "learning_rate": 2.572645658623967e-05, + "loss": 0.0347, + "step": 85220 + }, + { + "epoch": 0.18796602805706686, + "grad_norm": 0.12070401757955551, + "learning_rate": 2.5725300704123042e-05, + "loss": 0.0352, + "step": 85230 + }, + { + "epoch": 0.18798808203196504, + "grad_norm": 0.10427557677030563, + "learning_rate": 2.5724144691683886e-05, + "loss": 0.0322, + "step": 85240 + }, + { + "epoch": 0.1880101360068632, + "grad_norm": 0.14194276928901672, + "learning_rate": 2.5722988548936244e-05, + "loss": 0.0337, + "step": 85250 + }, + { + "epoch": 0.18803218998176136, + "grad_norm": 0.09268515557050705, + "learning_rate": 2.5721832275894168e-05, + "loss": 0.0341, + "step": 85260 + }, + { + "epoch": 0.18805424395665954, + "grad_norm": 0.12303943186998367, + "learning_rate": 2.572067587257171e-05, + "loss": 0.0305, + "step": 85270 + }, + { + "epoch": 0.1880762979315577, + "grad_norm": 0.1288634091615677, + "learning_rate": 2.5719519338982916e-05, + "loss": 0.0339, + "step": 85280 + }, + { + "epoch": 0.18809835190645585, + "grad_norm": 0.1150696650147438, + "learning_rate": 2.5718362675141844e-05, + "loss": 0.0342, + "step": 85290 + }, + { + "epoch": 0.18812040588135404, + "grad_norm": 0.10559042543172836, + "learning_rate": 2.5717205881062545e-05, + "loss": 0.0335, + "step": 85300 + }, + { + "epoch": 0.1881424598562522, + "grad_norm": 0.11227284371852875, + "learning_rate": 2.5716048956759073e-05, + "loss": 0.0314, + "step": 85310 + }, + { + "epoch": 0.18816451383115035, + "grad_norm": 0.1261839121580124, + "learning_rate": 2.5714891902245488e-05, + "loss": 0.0338, + "step": 85320 + }, + { + "epoch": 0.18818656780604853, + "grad_norm": 0.11195728182792664, + "learning_rate": 2.571373471753585e-05, + "loss": 0.0303, + "step": 85330 + }, + { + "epoch": 0.1882086217809467, + "grad_norm": 0.10075420141220093, + "learning_rate": 2.5712577402644225e-05, + "loss": 0.0356, + "step": 85340 + }, + { + "epoch": 0.18823067575584485, + "grad_norm": 0.10948925465345383, + "learning_rate": 2.5711419957584666e-05, + "loss": 0.0337, + "step": 85350 + }, + { + "epoch": 0.18825272973074303, + "grad_norm": 0.08845899254083633, + "learning_rate": 2.5710262382371245e-05, + "loss": 0.0336, + "step": 85360 + }, + { + "epoch": 0.18827478370564119, + "grad_norm": 0.11404956877231598, + "learning_rate": 2.570910467701802e-05, + "loss": 0.0351, + "step": 85370 + }, + { + "epoch": 0.18829683768053934, + "grad_norm": 0.10450809448957443, + "learning_rate": 2.5707946841539062e-05, + "loss": 0.0332, + "step": 85380 + }, + { + "epoch": 0.18831889165543753, + "grad_norm": 0.12115707248449326, + "learning_rate": 2.570678887594844e-05, + "loss": 0.0337, + "step": 85390 + }, + { + "epoch": 0.18834094563033568, + "grad_norm": 0.13205298781394958, + "learning_rate": 2.5705630780260224e-05, + "loss": 0.0326, + "step": 85400 + }, + { + "epoch": 0.18836299960523384, + "grad_norm": 0.10896807909011841, + "learning_rate": 2.5704472554488487e-05, + "loss": 0.0351, + "step": 85410 + }, + { + "epoch": 0.18838505358013202, + "grad_norm": 0.1115478128194809, + "learning_rate": 2.5703314198647302e-05, + "loss": 0.0338, + "step": 85420 + }, + { + "epoch": 0.18840710755503018, + "grad_norm": 0.12684787809848785, + "learning_rate": 2.5702155712750744e-05, + "loss": 0.0334, + "step": 85430 + }, + { + "epoch": 0.18842916152992834, + "grad_norm": 0.1050758957862854, + "learning_rate": 2.570099709681289e-05, + "loss": 0.0346, + "step": 85440 + }, + { + "epoch": 0.18845121550482652, + "grad_norm": 0.09715079516172409, + "learning_rate": 2.5699838350847818e-05, + "loss": 0.0344, + "step": 85450 + }, + { + "epoch": 0.18847326947972468, + "grad_norm": 0.15104956924915314, + "learning_rate": 2.569867947486961e-05, + "loss": 0.033, + "step": 85460 + }, + { + "epoch": 0.18849532345462283, + "grad_norm": 0.1015796810388565, + "learning_rate": 2.569752046889234e-05, + "loss": 0.0341, + "step": 85470 + }, + { + "epoch": 0.18851737742952102, + "grad_norm": 0.10465025156736374, + "learning_rate": 2.5696361332930098e-05, + "loss": 0.0337, + "step": 85480 + }, + { + "epoch": 0.18853943140441917, + "grad_norm": 0.12291381508111954, + "learning_rate": 2.5695202066996965e-05, + "loss": 0.0326, + "step": 85490 + }, + { + "epoch": 0.18856148537931733, + "grad_norm": 0.104006327688694, + "learning_rate": 2.5694042671107033e-05, + "loss": 0.0334, + "step": 85500 + }, + { + "epoch": 0.1885835393542155, + "grad_norm": 0.10409774631261826, + "learning_rate": 2.5692883145274383e-05, + "loss": 0.0337, + "step": 85510 + }, + { + "epoch": 0.18860559332911367, + "grad_norm": 0.12910743057727814, + "learning_rate": 2.569172348951311e-05, + "loss": 0.0354, + "step": 85520 + }, + { + "epoch": 0.18862764730401183, + "grad_norm": 0.1063634380698204, + "learning_rate": 2.5690563703837297e-05, + "loss": 0.0324, + "step": 85530 + }, + { + "epoch": 0.18864970127891, + "grad_norm": 0.11226219683885574, + "learning_rate": 2.5689403788261044e-05, + "loss": 0.035, + "step": 85540 + }, + { + "epoch": 0.18867175525380817, + "grad_norm": 0.10288248211145401, + "learning_rate": 2.5688243742798445e-05, + "loss": 0.0322, + "step": 85550 + }, + { + "epoch": 0.18869380922870635, + "grad_norm": 0.10468834638595581, + "learning_rate": 2.5687083567463587e-05, + "loss": 0.0326, + "step": 85560 + }, + { + "epoch": 0.1887158632036045, + "grad_norm": 0.11347953975200653, + "learning_rate": 2.568592326227058e-05, + "loss": 0.0325, + "step": 85570 + }, + { + "epoch": 0.18873791717850266, + "grad_norm": 0.08060816675424576, + "learning_rate": 2.5684762827233513e-05, + "loss": 0.0336, + "step": 85580 + }, + { + "epoch": 0.18875997115340085, + "grad_norm": 0.09919006377458572, + "learning_rate": 2.5683602262366488e-05, + "loss": 0.0321, + "step": 85590 + }, + { + "epoch": 0.188782025128299, + "grad_norm": 0.10087428987026215, + "learning_rate": 2.5682441567683615e-05, + "loss": 0.0338, + "step": 85600 + }, + { + "epoch": 0.18880407910319716, + "grad_norm": 0.0900551974773407, + "learning_rate": 2.5681280743198986e-05, + "loss": 0.0325, + "step": 85610 + }, + { + "epoch": 0.18882613307809534, + "grad_norm": 0.1248774603009224, + "learning_rate": 2.5680119788926708e-05, + "loss": 0.0315, + "step": 85620 + }, + { + "epoch": 0.1888481870529935, + "grad_norm": 0.2164272665977478, + "learning_rate": 2.5678958704880898e-05, + "loss": 0.0354, + "step": 85630 + }, + { + "epoch": 0.18887024102789166, + "grad_norm": 0.12189250439405441, + "learning_rate": 2.5677797491075654e-05, + "loss": 0.0323, + "step": 85640 + }, + { + "epoch": 0.18889229500278984, + "grad_norm": 0.08838958293199539, + "learning_rate": 2.567663614752509e-05, + "loss": 0.0331, + "step": 85650 + }, + { + "epoch": 0.188914348977688, + "grad_norm": 0.09860806912183762, + "learning_rate": 2.5675474674243312e-05, + "loss": 0.0343, + "step": 85660 + }, + { + "epoch": 0.18893640295258615, + "grad_norm": 0.09675818681716919, + "learning_rate": 2.567431307124444e-05, + "loss": 0.0331, + "step": 85670 + }, + { + "epoch": 0.18895845692748434, + "grad_norm": 0.10752811282873154, + "learning_rate": 2.567315133854259e-05, + "loss": 0.0343, + "step": 85680 + }, + { + "epoch": 0.1889805109023825, + "grad_norm": 0.10996933281421661, + "learning_rate": 2.5671989476151867e-05, + "loss": 0.0338, + "step": 85690 + }, + { + "epoch": 0.18900256487728065, + "grad_norm": 0.12367398291826248, + "learning_rate": 2.56708274840864e-05, + "loss": 0.034, + "step": 85700 + }, + { + "epoch": 0.18902461885217883, + "grad_norm": 0.1457345187664032, + "learning_rate": 2.5669665362360305e-05, + "loss": 0.0334, + "step": 85710 + }, + { + "epoch": 0.189046672827077, + "grad_norm": 0.12025564163923264, + "learning_rate": 2.5668503110987698e-05, + "loss": 0.0348, + "step": 85720 + }, + { + "epoch": 0.18906872680197515, + "grad_norm": 0.12273959815502167, + "learning_rate": 2.5667340729982708e-05, + "loss": 0.0348, + "step": 85730 + }, + { + "epoch": 0.18909078077687333, + "grad_norm": 0.13287505507469177, + "learning_rate": 2.5666178219359454e-05, + "loss": 0.0335, + "step": 85740 + }, + { + "epoch": 0.18911283475177149, + "grad_norm": 0.11212105304002762, + "learning_rate": 2.566501557913207e-05, + "loss": 0.0334, + "step": 85750 + }, + { + "epoch": 0.18913488872666964, + "grad_norm": 0.0979062095284462, + "learning_rate": 2.566385280931467e-05, + "loss": 0.0341, + "step": 85760 + }, + { + "epoch": 0.18915694270156783, + "grad_norm": 0.12127434462308884, + "learning_rate": 2.5662689909921397e-05, + "loss": 0.0341, + "step": 85770 + }, + { + "epoch": 0.18917899667646598, + "grad_norm": 0.11430533975362778, + "learning_rate": 2.5661526880966374e-05, + "loss": 0.0327, + "step": 85780 + }, + { + "epoch": 0.18920105065136414, + "grad_norm": 0.11253393441438675, + "learning_rate": 2.566036372246373e-05, + "loss": 0.0333, + "step": 85790 + }, + { + "epoch": 0.18922310462626232, + "grad_norm": 0.11022568494081497, + "learning_rate": 2.56592004344276e-05, + "loss": 0.0343, + "step": 85800 + }, + { + "epoch": 0.18924515860116048, + "grad_norm": 0.10370147973299026, + "learning_rate": 2.5658037016872122e-05, + "loss": 0.0346, + "step": 85810 + }, + { + "epoch": 0.18926721257605864, + "grad_norm": 0.10861991345882416, + "learning_rate": 2.5656873469811433e-05, + "loss": 0.0342, + "step": 85820 + }, + { + "epoch": 0.18928926655095682, + "grad_norm": 0.09019330888986588, + "learning_rate": 2.565570979325967e-05, + "loss": 0.0334, + "step": 85830 + }, + { + "epoch": 0.18931132052585498, + "grad_norm": 0.11813251674175262, + "learning_rate": 2.5654545987230974e-05, + "loss": 0.0336, + "step": 85840 + }, + { + "epoch": 0.18933337450075313, + "grad_norm": 0.12675286829471588, + "learning_rate": 2.5653382051739478e-05, + "loss": 0.0338, + "step": 85850 + }, + { + "epoch": 0.18935542847565132, + "grad_norm": 0.11297095566987991, + "learning_rate": 2.5652217986799338e-05, + "loss": 0.0338, + "step": 85860 + }, + { + "epoch": 0.18937748245054947, + "grad_norm": 0.10570168495178223, + "learning_rate": 2.565105379242469e-05, + "loss": 0.0363, + "step": 85870 + }, + { + "epoch": 0.18939953642544763, + "grad_norm": 0.11514914035797119, + "learning_rate": 2.5649889468629682e-05, + "loss": 0.0331, + "step": 85880 + }, + { + "epoch": 0.1894215904003458, + "grad_norm": 0.11704779416322708, + "learning_rate": 2.564872501542846e-05, + "loss": 0.031, + "step": 85890 + }, + { + "epoch": 0.18944364437524397, + "grad_norm": 0.11193514615297318, + "learning_rate": 2.5647560432835177e-05, + "loss": 0.0325, + "step": 85900 + }, + { + "epoch": 0.18946569835014213, + "grad_norm": 0.12554898858070374, + "learning_rate": 2.564639572086398e-05, + "loss": 0.0337, + "step": 85910 + }, + { + "epoch": 0.1894877523250403, + "grad_norm": 0.0864499881863594, + "learning_rate": 2.5645230879529022e-05, + "loss": 0.034, + "step": 85920 + }, + { + "epoch": 0.18950980629993847, + "grad_norm": 0.11191099882125854, + "learning_rate": 2.5644065908844463e-05, + "loss": 0.0314, + "step": 85930 + }, + { + "epoch": 0.18953186027483662, + "grad_norm": 0.18978647887706757, + "learning_rate": 2.5642900808824455e-05, + "loss": 0.0356, + "step": 85940 + }, + { + "epoch": 0.1895539142497348, + "grad_norm": 0.0907294750213623, + "learning_rate": 2.5641735579483145e-05, + "loss": 0.031, + "step": 85950 + }, + { + "epoch": 0.18957596822463296, + "grad_norm": 0.10810613632202148, + "learning_rate": 2.5640570220834704e-05, + "loss": 0.0325, + "step": 85960 + }, + { + "epoch": 0.18959802219953112, + "grad_norm": 0.15473142266273499, + "learning_rate": 2.5639404732893294e-05, + "loss": 0.0332, + "step": 85970 + }, + { + "epoch": 0.1896200761744293, + "grad_norm": 0.15326914191246033, + "learning_rate": 2.5638239115673064e-05, + "loss": 0.0344, + "step": 85980 + }, + { + "epoch": 0.18964213014932746, + "grad_norm": 0.09195548295974731, + "learning_rate": 2.5637073369188185e-05, + "loss": 0.033, + "step": 85990 + }, + { + "epoch": 0.18966418412422564, + "grad_norm": 0.11332730203866959, + "learning_rate": 2.5635907493452825e-05, + "loss": 0.0348, + "step": 86000 + }, + { + "epoch": 0.1896862380991238, + "grad_norm": 0.132793590426445, + "learning_rate": 2.563474148848115e-05, + "loss": 0.0328, + "step": 86010 + }, + { + "epoch": 0.18970829207402196, + "grad_norm": 0.08491350710391998, + "learning_rate": 2.5633575354287322e-05, + "loss": 0.0353, + "step": 86020 + }, + { + "epoch": 0.18973034604892014, + "grad_norm": 0.09859348088502884, + "learning_rate": 2.563240909088551e-05, + "loss": 0.0334, + "step": 86030 + }, + { + "epoch": 0.1897524000238183, + "grad_norm": 0.09854135662317276, + "learning_rate": 2.563124269828989e-05, + "loss": 0.0332, + "step": 86040 + }, + { + "epoch": 0.18977445399871645, + "grad_norm": 0.1202395111322403, + "learning_rate": 2.5630076176514637e-05, + "loss": 0.0345, + "step": 86050 + }, + { + "epoch": 0.18979650797361464, + "grad_norm": 0.14640294015407562, + "learning_rate": 2.5628909525573917e-05, + "loss": 0.0348, + "step": 86060 + }, + { + "epoch": 0.1898185619485128, + "grad_norm": 0.11622775346040726, + "learning_rate": 2.5627742745481913e-05, + "loss": 0.0329, + "step": 86070 + }, + { + "epoch": 0.18984061592341095, + "grad_norm": 0.11464031040668488, + "learning_rate": 2.5626575836252804e-05, + "loss": 0.0346, + "step": 86080 + }, + { + "epoch": 0.18986266989830913, + "grad_norm": 0.09646182507276535, + "learning_rate": 2.5625408797900763e-05, + "loss": 0.0323, + "step": 86090 + }, + { + "epoch": 0.1898847238732073, + "grad_norm": 0.11987199634313583, + "learning_rate": 2.562424163043997e-05, + "loss": 0.0346, + "step": 86100 + }, + { + "epoch": 0.18990677784810545, + "grad_norm": 0.11587236821651459, + "learning_rate": 2.5623074333884617e-05, + "loss": 0.032, + "step": 86110 + }, + { + "epoch": 0.18992883182300363, + "grad_norm": 0.11194739490747452, + "learning_rate": 2.5621906908248875e-05, + "loss": 0.0348, + "step": 86120 + }, + { + "epoch": 0.1899508857979018, + "grad_norm": 0.1326143443584442, + "learning_rate": 2.5620739353546938e-05, + "loss": 0.0333, + "step": 86130 + }, + { + "epoch": 0.18997293977279994, + "grad_norm": 0.13867530226707458, + "learning_rate": 2.5619571669792992e-05, + "loss": 0.0335, + "step": 86140 + }, + { + "epoch": 0.18999499374769813, + "grad_norm": 0.12520548701286316, + "learning_rate": 2.5618403857001217e-05, + "loss": 0.0335, + "step": 86150 + }, + { + "epoch": 0.19001704772259628, + "grad_norm": 0.10751483589410782, + "learning_rate": 2.5617235915185816e-05, + "loss": 0.0331, + "step": 86160 + }, + { + "epoch": 0.19003910169749444, + "grad_norm": 0.11526135355234146, + "learning_rate": 2.561606784436097e-05, + "loss": 0.0327, + "step": 86170 + }, + { + "epoch": 0.19006115567239262, + "grad_norm": 0.10175454616546631, + "learning_rate": 2.561489964454088e-05, + "loss": 0.0324, + "step": 86180 + }, + { + "epoch": 0.19008320964729078, + "grad_norm": 0.13833366334438324, + "learning_rate": 2.5613731315739737e-05, + "loss": 0.0366, + "step": 86190 + }, + { + "epoch": 0.19010526362218894, + "grad_norm": 0.10944182425737381, + "learning_rate": 2.561256285797174e-05, + "loss": 0.0355, + "step": 86200 + }, + { + "epoch": 0.19012731759708712, + "grad_norm": 0.11510233581066132, + "learning_rate": 2.5611394271251077e-05, + "loss": 0.0346, + "step": 86210 + }, + { + "epoch": 0.19014937157198528, + "grad_norm": 0.1311793178319931, + "learning_rate": 2.5610225555591962e-05, + "loss": 0.0345, + "step": 86220 + }, + { + "epoch": 0.19017142554688343, + "grad_norm": 0.09861066937446594, + "learning_rate": 2.5609056711008585e-05, + "loss": 0.0345, + "step": 86230 + }, + { + "epoch": 0.19019347952178162, + "grad_norm": 0.10582178831100464, + "learning_rate": 2.560788773751515e-05, + "loss": 0.0339, + "step": 86240 + }, + { + "epoch": 0.19021553349667977, + "grad_norm": 0.1102227196097374, + "learning_rate": 2.5606718635125866e-05, + "loss": 0.0325, + "step": 86250 + }, + { + "epoch": 0.19023758747157793, + "grad_norm": 0.1112651526927948, + "learning_rate": 2.5605549403854936e-05, + "loss": 0.0336, + "step": 86260 + }, + { + "epoch": 0.1902596414464761, + "grad_norm": 0.10076964646577835, + "learning_rate": 2.560438004371657e-05, + "loss": 0.0337, + "step": 86270 + }, + { + "epoch": 0.19028169542137427, + "grad_norm": 0.0930318832397461, + "learning_rate": 2.560321055472497e-05, + "loss": 0.033, + "step": 86280 + }, + { + "epoch": 0.19030374939627243, + "grad_norm": 0.08678799122571945, + "learning_rate": 2.5602040936894353e-05, + "loss": 0.0324, + "step": 86290 + }, + { + "epoch": 0.1903258033711706, + "grad_norm": 0.13414356112480164, + "learning_rate": 2.560087119023893e-05, + "loss": 0.0335, + "step": 86300 + }, + { + "epoch": 0.19034785734606877, + "grad_norm": 0.08376440405845642, + "learning_rate": 2.559970131477291e-05, + "loss": 0.0331, + "step": 86310 + }, + { + "epoch": 0.19036991132096692, + "grad_norm": 0.13491782546043396, + "learning_rate": 2.5598531310510512e-05, + "loss": 0.0346, + "step": 86320 + }, + { + "epoch": 0.1903919652958651, + "grad_norm": 0.09934500604867935, + "learning_rate": 2.5597361177465955e-05, + "loss": 0.0331, + "step": 86330 + }, + { + "epoch": 0.19041401927076326, + "grad_norm": 0.13422033190727234, + "learning_rate": 2.559619091565345e-05, + "loss": 0.0337, + "step": 86340 + }, + { + "epoch": 0.19043607324566142, + "grad_norm": 0.36364850401878357, + "learning_rate": 2.5595020525087223e-05, + "loss": 0.0344, + "step": 86350 + }, + { + "epoch": 0.1904581272205596, + "grad_norm": 0.12094614654779434, + "learning_rate": 2.5593850005781493e-05, + "loss": 0.0343, + "step": 86360 + }, + { + "epoch": 0.19048018119545776, + "grad_norm": 0.12221099436283112, + "learning_rate": 2.559267935775048e-05, + "loss": 0.0334, + "step": 86370 + }, + { + "epoch": 0.19050223517035592, + "grad_norm": 0.11004725098609924, + "learning_rate": 2.559150858100842e-05, + "loss": 0.033, + "step": 86380 + }, + { + "epoch": 0.1905242891452541, + "grad_norm": 0.09196054935455322, + "learning_rate": 2.5590337675569527e-05, + "loss": 0.0336, + "step": 86390 + }, + { + "epoch": 0.19054634312015226, + "grad_norm": 0.14508511126041412, + "learning_rate": 2.5589166641448028e-05, + "loss": 0.0342, + "step": 86400 + }, + { + "epoch": 0.19056839709505044, + "grad_norm": 0.1231282502412796, + "learning_rate": 2.558799547865816e-05, + "loss": 0.0321, + "step": 86410 + }, + { + "epoch": 0.1905904510699486, + "grad_norm": 0.09361407160758972, + "learning_rate": 2.5586824187214152e-05, + "loss": 0.0315, + "step": 86420 + }, + { + "epoch": 0.19061250504484675, + "grad_norm": 0.09506280720233917, + "learning_rate": 2.558565276713023e-05, + "loss": 0.0341, + "step": 86430 + }, + { + "epoch": 0.19063455901974494, + "grad_norm": 0.10771697014570236, + "learning_rate": 2.5584481218420632e-05, + "loss": 0.0336, + "step": 86440 + }, + { + "epoch": 0.1906566129946431, + "grad_norm": 0.11360274255275726, + "learning_rate": 2.55833095410996e-05, + "loss": 0.033, + "step": 86450 + }, + { + "epoch": 0.19067866696954125, + "grad_norm": 0.11888719350099564, + "learning_rate": 2.558213773518136e-05, + "loss": 0.0327, + "step": 86460 + }, + { + "epoch": 0.19070072094443943, + "grad_norm": 0.10498954355716705, + "learning_rate": 2.558096580068016e-05, + "loss": 0.0344, + "step": 86470 + }, + { + "epoch": 0.1907227749193376, + "grad_norm": 0.11040877550840378, + "learning_rate": 2.557979373761023e-05, + "loss": 0.0352, + "step": 86480 + }, + { + "epoch": 0.19074482889423575, + "grad_norm": 0.10585123300552368, + "learning_rate": 2.5578621545985813e-05, + "loss": 0.0333, + "step": 86490 + }, + { + "epoch": 0.19076688286913393, + "grad_norm": 0.10586889833211899, + "learning_rate": 2.5577449225821167e-05, + "loss": 0.0322, + "step": 86500 + }, + { + "epoch": 0.1907889368440321, + "grad_norm": 0.08705378323793411, + "learning_rate": 2.557627677713052e-05, + "loss": 0.0359, + "step": 86510 + }, + { + "epoch": 0.19081099081893024, + "grad_norm": 0.1063535213470459, + "learning_rate": 2.5575104199928127e-05, + "loss": 0.0328, + "step": 86520 + }, + { + "epoch": 0.19083304479382843, + "grad_norm": 0.11577498912811279, + "learning_rate": 2.557393149422823e-05, + "loss": 0.0321, + "step": 86530 + }, + { + "epoch": 0.19085509876872658, + "grad_norm": 0.13399533927440643, + "learning_rate": 2.5572758660045086e-05, + "loss": 0.0313, + "step": 86540 + }, + { + "epoch": 0.19087715274362474, + "grad_norm": 0.12133731693029404, + "learning_rate": 2.5571585697392942e-05, + "loss": 0.0315, + "step": 86550 + }, + { + "epoch": 0.19089920671852292, + "grad_norm": 0.09180031716823578, + "learning_rate": 2.5570412606286046e-05, + "loss": 0.0325, + "step": 86560 + }, + { + "epoch": 0.19092126069342108, + "grad_norm": 0.143723726272583, + "learning_rate": 2.556923938673866e-05, + "loss": 0.0339, + "step": 86570 + }, + { + "epoch": 0.19094331466831924, + "grad_norm": 0.10079687833786011, + "learning_rate": 2.5568066038765036e-05, + "loss": 0.034, + "step": 86580 + }, + { + "epoch": 0.19096536864321742, + "grad_norm": 0.11758852005004883, + "learning_rate": 2.556689256237943e-05, + "loss": 0.034, + "step": 86590 + }, + { + "epoch": 0.19098742261811558, + "grad_norm": 0.11863239109516144, + "learning_rate": 2.5565718957596108e-05, + "loss": 0.0334, + "step": 86600 + }, + { + "epoch": 0.19100947659301373, + "grad_norm": 0.13231603801250458, + "learning_rate": 2.5564545224429316e-05, + "loss": 0.0322, + "step": 86610 + }, + { + "epoch": 0.19103153056791192, + "grad_norm": 0.13496927917003632, + "learning_rate": 2.5563371362893333e-05, + "loss": 0.033, + "step": 86620 + }, + { + "epoch": 0.19105358454281007, + "grad_norm": 0.10412980616092682, + "learning_rate": 2.5562197373002413e-05, + "loss": 0.0337, + "step": 86630 + }, + { + "epoch": 0.19107563851770823, + "grad_norm": 0.1368284374475479, + "learning_rate": 2.5561023254770818e-05, + "loss": 0.0328, + "step": 86640 + }, + { + "epoch": 0.1910976924926064, + "grad_norm": 0.0992073267698288, + "learning_rate": 2.5559849008212824e-05, + "loss": 0.0327, + "step": 86650 + }, + { + "epoch": 0.19111974646750457, + "grad_norm": 0.09392037987709045, + "learning_rate": 2.5558674633342695e-05, + "loss": 0.0313, + "step": 86660 + }, + { + "epoch": 0.19114180044240273, + "grad_norm": 0.1105855405330658, + "learning_rate": 2.555750013017469e-05, + "loss": 0.0347, + "step": 86670 + }, + { + "epoch": 0.1911638544173009, + "grad_norm": 0.09898720681667328, + "learning_rate": 2.5556325498723102e-05, + "loss": 0.032, + "step": 86680 + }, + { + "epoch": 0.19118590839219907, + "grad_norm": 0.10255416482686996, + "learning_rate": 2.5555150739002188e-05, + "loss": 0.0335, + "step": 86690 + }, + { + "epoch": 0.19120796236709722, + "grad_norm": 0.11491788923740387, + "learning_rate": 2.555397585102623e-05, + "loss": 0.0365, + "step": 86700 + }, + { + "epoch": 0.1912300163419954, + "grad_norm": 0.09774725884199142, + "learning_rate": 2.5552800834809496e-05, + "loss": 0.0329, + "step": 86710 + }, + { + "epoch": 0.19125207031689356, + "grad_norm": 0.1061306744813919, + "learning_rate": 2.5551625690366268e-05, + "loss": 0.0303, + "step": 86720 + }, + { + "epoch": 0.19127412429179172, + "grad_norm": 0.11931706964969635, + "learning_rate": 2.5550450417710834e-05, + "loss": 0.0345, + "step": 86730 + }, + { + "epoch": 0.1912961782666899, + "grad_norm": 0.08041465282440186, + "learning_rate": 2.5549275016857458e-05, + "loss": 0.0346, + "step": 86740 + }, + { + "epoch": 0.19131823224158806, + "grad_norm": 0.09947294741868973, + "learning_rate": 2.5548099487820434e-05, + "loss": 0.0344, + "step": 86750 + }, + { + "epoch": 0.19134028621648622, + "grad_norm": 0.14328327775001526, + "learning_rate": 2.554692383061404e-05, + "loss": 0.0324, + "step": 86760 + }, + { + "epoch": 0.1913623401913844, + "grad_norm": 0.09177370369434357, + "learning_rate": 2.5545748045252565e-05, + "loss": 0.0331, + "step": 86770 + }, + { + "epoch": 0.19138439416628256, + "grad_norm": 0.09617976099252701, + "learning_rate": 2.5544572131750296e-05, + "loss": 0.033, + "step": 86780 + }, + { + "epoch": 0.1914064481411807, + "grad_norm": 0.12431368231773376, + "learning_rate": 2.5543396090121513e-05, + "loss": 0.0349, + "step": 86790 + }, + { + "epoch": 0.1914285021160789, + "grad_norm": 0.11542283743619919, + "learning_rate": 2.5542219920380518e-05, + "loss": 0.0315, + "step": 86800 + }, + { + "epoch": 0.19145055609097705, + "grad_norm": 0.12265947461128235, + "learning_rate": 2.55410436225416e-05, + "loss": 0.0348, + "step": 86810 + }, + { + "epoch": 0.1914726100658752, + "grad_norm": 0.09964333474636078, + "learning_rate": 2.5539867196619046e-05, + "loss": 0.0343, + "step": 86820 + }, + { + "epoch": 0.1914946640407734, + "grad_norm": 0.08238319307565689, + "learning_rate": 2.5538690642627152e-05, + "loss": 0.033, + "step": 86830 + }, + { + "epoch": 0.19151671801567155, + "grad_norm": 0.09878350049257278, + "learning_rate": 2.553751396058022e-05, + "loss": 0.034, + "step": 86840 + }, + { + "epoch": 0.19153877199056973, + "grad_norm": 0.15434381365776062, + "learning_rate": 2.5536337150492543e-05, + "loss": 0.0336, + "step": 86850 + }, + { + "epoch": 0.1915608259654679, + "grad_norm": 0.10516682267189026, + "learning_rate": 2.5535160212378424e-05, + "loss": 0.0325, + "step": 86860 + }, + { + "epoch": 0.19158287994036605, + "grad_norm": 0.1466553509235382, + "learning_rate": 2.553398314625216e-05, + "loss": 0.0346, + "step": 86870 + }, + { + "epoch": 0.19160493391526423, + "grad_norm": 0.12392624467611313, + "learning_rate": 2.5532805952128056e-05, + "loss": 0.0317, + "step": 86880 + }, + { + "epoch": 0.1916269878901624, + "grad_norm": 0.08021976053714752, + "learning_rate": 2.553162863002041e-05, + "loss": 0.0329, + "step": 86890 + }, + { + "epoch": 0.19164904186506054, + "grad_norm": 0.10966150462627411, + "learning_rate": 2.553045117994354e-05, + "loss": 0.0334, + "step": 86900 + }, + { + "epoch": 0.19167109583995873, + "grad_norm": 0.11692608147859573, + "learning_rate": 2.552927360191174e-05, + "loss": 0.0329, + "step": 86910 + }, + { + "epoch": 0.19169314981485688, + "grad_norm": 0.09008163958787918, + "learning_rate": 2.552809589593933e-05, + "loss": 0.0335, + "step": 86920 + }, + { + "epoch": 0.19171520378975504, + "grad_norm": 0.10288514941930771, + "learning_rate": 2.552691806204061e-05, + "loss": 0.0342, + "step": 86930 + }, + { + "epoch": 0.19173725776465322, + "grad_norm": 0.08650781959295273, + "learning_rate": 2.5525740100229898e-05, + "loss": 0.0325, + "step": 86940 + }, + { + "epoch": 0.19175931173955138, + "grad_norm": 0.12997619807720184, + "learning_rate": 2.5524562010521503e-05, + "loss": 0.033, + "step": 86950 + }, + { + "epoch": 0.19178136571444954, + "grad_norm": 0.12189587205648422, + "learning_rate": 2.5523383792929748e-05, + "loss": 0.0335, + "step": 86960 + }, + { + "epoch": 0.19180341968934772, + "grad_norm": 0.0952959954738617, + "learning_rate": 2.5522205447468937e-05, + "loss": 0.0356, + "step": 86970 + }, + { + "epoch": 0.19182547366424588, + "grad_norm": 0.09892743825912476, + "learning_rate": 2.5521026974153405e-05, + "loss": 0.0332, + "step": 86980 + }, + { + "epoch": 0.19184752763914403, + "grad_norm": 0.12275324761867523, + "learning_rate": 2.5519848372997454e-05, + "loss": 0.0343, + "step": 86990 + }, + { + "epoch": 0.19186958161404222, + "grad_norm": 0.12211424857378006, + "learning_rate": 2.5518669644015418e-05, + "loss": 0.034, + "step": 87000 + }, + { + "epoch": 0.19189163558894037, + "grad_norm": 0.13483992218971252, + "learning_rate": 2.551749078722161e-05, + "loss": 0.0343, + "step": 87010 + }, + { + "epoch": 0.19191368956383853, + "grad_norm": 0.10952024906873703, + "learning_rate": 2.5516311802630364e-05, + "loss": 0.0317, + "step": 87020 + }, + { + "epoch": 0.19193574353873671, + "grad_norm": 0.12838289141654968, + "learning_rate": 2.5515132690256e-05, + "loss": 0.0327, + "step": 87030 + }, + { + "epoch": 0.19195779751363487, + "grad_norm": 0.120008185505867, + "learning_rate": 2.551395345011284e-05, + "loss": 0.0342, + "step": 87040 + }, + { + "epoch": 0.19197985148853303, + "grad_norm": 0.09900153428316116, + "learning_rate": 2.5512774082215223e-05, + "loss": 0.0334, + "step": 87050 + }, + { + "epoch": 0.1920019054634312, + "grad_norm": 0.09955921024084091, + "learning_rate": 2.5511594586577473e-05, + "loss": 0.0345, + "step": 87060 + }, + { + "epoch": 0.19202395943832937, + "grad_norm": 0.09399441629648209, + "learning_rate": 2.5510414963213925e-05, + "loss": 0.035, + "step": 87070 + }, + { + "epoch": 0.19204601341322752, + "grad_norm": 0.09454940259456635, + "learning_rate": 2.5509235212138916e-05, + "loss": 0.0349, + "step": 87080 + }, + { + "epoch": 0.1920680673881257, + "grad_norm": 0.10199233889579773, + "learning_rate": 2.550805533336677e-05, + "loss": 0.0335, + "step": 87090 + }, + { + "epoch": 0.19209012136302386, + "grad_norm": 0.1121784895658493, + "learning_rate": 2.5506875326911837e-05, + "loss": 0.033, + "step": 87100 + }, + { + "epoch": 0.19211217533792202, + "grad_norm": 0.12697181105613708, + "learning_rate": 2.550569519278844e-05, + "loss": 0.0342, + "step": 87110 + }, + { + "epoch": 0.1921342293128202, + "grad_norm": 0.1063341498374939, + "learning_rate": 2.5504514931010932e-05, + "loss": 0.0327, + "step": 87120 + }, + { + "epoch": 0.19215628328771836, + "grad_norm": 0.0808485820889473, + "learning_rate": 2.5503334541593652e-05, + "loss": 0.0325, + "step": 87130 + }, + { + "epoch": 0.19217833726261652, + "grad_norm": 0.11145287752151489, + "learning_rate": 2.5502154024550944e-05, + "loss": 0.0365, + "step": 87140 + }, + { + "epoch": 0.1922003912375147, + "grad_norm": 0.11099628359079361, + "learning_rate": 2.550097337989714e-05, + "loss": 0.0317, + "step": 87150 + }, + { + "epoch": 0.19222244521241286, + "grad_norm": 0.11769410967826843, + "learning_rate": 2.54997926076466e-05, + "loss": 0.0342, + "step": 87160 + }, + { + "epoch": 0.192244499187311, + "grad_norm": 0.08341164141893387, + "learning_rate": 2.5498611707813664e-05, + "loss": 0.0329, + "step": 87170 + }, + { + "epoch": 0.1922665531622092, + "grad_norm": 0.10949933528900146, + "learning_rate": 2.5497430680412682e-05, + "loss": 0.0355, + "step": 87180 + }, + { + "epoch": 0.19228860713710735, + "grad_norm": 0.10980390012264252, + "learning_rate": 2.5496249525458013e-05, + "loss": 0.0334, + "step": 87190 + }, + { + "epoch": 0.1923106611120055, + "grad_norm": 0.12498418986797333, + "learning_rate": 2.5495068242963996e-05, + "loss": 0.0329, + "step": 87200 + }, + { + "epoch": 0.1923327150869037, + "grad_norm": 0.10249780118465424, + "learning_rate": 2.5493886832944992e-05, + "loss": 0.0321, + "step": 87210 + }, + { + "epoch": 0.19235476906180185, + "grad_norm": 0.09395256638526917, + "learning_rate": 2.5492705295415355e-05, + "loss": 0.0316, + "step": 87220 + }, + { + "epoch": 0.1923768230367, + "grad_norm": 0.09984906762838364, + "learning_rate": 2.5491523630389442e-05, + "loss": 0.0335, + "step": 87230 + }, + { + "epoch": 0.1923988770115982, + "grad_norm": 0.10916256159543991, + "learning_rate": 2.549034183788161e-05, + "loss": 0.0356, + "step": 87240 + }, + { + "epoch": 0.19242093098649635, + "grad_norm": 0.13166013360023499, + "learning_rate": 2.5489159917906224e-05, + "loss": 0.031, + "step": 87250 + }, + { + "epoch": 0.1924429849613945, + "grad_norm": 0.08475461602210999, + "learning_rate": 2.548797787047764e-05, + "loss": 0.0314, + "step": 87260 + }, + { + "epoch": 0.1924650389362927, + "grad_norm": 0.09359075874090195, + "learning_rate": 2.5486795695610224e-05, + "loss": 0.034, + "step": 87270 + }, + { + "epoch": 0.19248709291119084, + "grad_norm": 0.12182501703500748, + "learning_rate": 2.5485613393318337e-05, + "loss": 0.0344, + "step": 87280 + }, + { + "epoch": 0.19250914688608903, + "grad_norm": 0.12787151336669922, + "learning_rate": 2.5484430963616344e-05, + "loss": 0.0351, + "step": 87290 + }, + { + "epoch": 0.19253120086098718, + "grad_norm": 0.10019958019256592, + "learning_rate": 2.548324840651862e-05, + "loss": 0.032, + "step": 87300 + }, + { + "epoch": 0.19255325483588534, + "grad_norm": 0.0902242586016655, + "learning_rate": 2.548206572203953e-05, + "loss": 0.0317, + "step": 87310 + }, + { + "epoch": 0.19257530881078352, + "grad_norm": 0.11242223531007767, + "learning_rate": 2.5480882910193442e-05, + "loss": 0.0333, + "step": 87320 + }, + { + "epoch": 0.19259736278568168, + "grad_norm": 0.11158882081508636, + "learning_rate": 2.547969997099473e-05, + "loss": 0.0324, + "step": 87330 + }, + { + "epoch": 0.19261941676057984, + "grad_norm": 0.10439855605363846, + "learning_rate": 2.5478516904457777e-05, + "loss": 0.0344, + "step": 87340 + }, + { + "epoch": 0.19264147073547802, + "grad_norm": 0.12737849354743958, + "learning_rate": 2.5477333710596944e-05, + "loss": 0.0325, + "step": 87350 + }, + { + "epoch": 0.19266352471037618, + "grad_norm": 0.12059297412633896, + "learning_rate": 2.5476150389426615e-05, + "loss": 0.0327, + "step": 87360 + }, + { + "epoch": 0.19268557868527433, + "grad_norm": 0.10643669962882996, + "learning_rate": 2.547496694096117e-05, + "loss": 0.0328, + "step": 87370 + }, + { + "epoch": 0.19270763266017252, + "grad_norm": 0.09967482835054398, + "learning_rate": 2.5473783365214983e-05, + "loss": 0.0331, + "step": 87380 + }, + { + "epoch": 0.19272968663507067, + "grad_norm": 0.11119163781404495, + "learning_rate": 2.547259966220244e-05, + "loss": 0.034, + "step": 87390 + }, + { + "epoch": 0.19275174060996883, + "grad_norm": 0.11979104578495026, + "learning_rate": 2.5471415831937925e-05, + "loss": 0.0323, + "step": 87400 + }, + { + "epoch": 0.19277379458486701, + "grad_norm": 0.12958934903144836, + "learning_rate": 2.547023187443582e-05, + "loss": 0.0362, + "step": 87410 + }, + { + "epoch": 0.19279584855976517, + "grad_norm": 0.09897752851247787, + "learning_rate": 2.5469047789710512e-05, + "loss": 0.0326, + "step": 87420 + }, + { + "epoch": 0.19281790253466333, + "grad_norm": 0.10986194014549255, + "learning_rate": 2.546786357777639e-05, + "loss": 0.0319, + "step": 87430 + }, + { + "epoch": 0.1928399565095615, + "grad_norm": 0.08846211433410645, + "learning_rate": 2.546667923864784e-05, + "loss": 0.033, + "step": 87440 + }, + { + "epoch": 0.19286201048445967, + "grad_norm": 0.10933534801006317, + "learning_rate": 2.546549477233926e-05, + "loss": 0.0327, + "step": 87450 + }, + { + "epoch": 0.19288406445935782, + "grad_norm": 0.1230756863951683, + "learning_rate": 2.5464310178865034e-05, + "loss": 0.0332, + "step": 87460 + }, + { + "epoch": 0.192906118434256, + "grad_norm": 0.11291245371103287, + "learning_rate": 2.5463125458239558e-05, + "loss": 0.0336, + "step": 87470 + }, + { + "epoch": 0.19292817240915416, + "grad_norm": 0.11038361489772797, + "learning_rate": 2.5461940610477232e-05, + "loss": 0.0325, + "step": 87480 + }, + { + "epoch": 0.19295022638405232, + "grad_norm": 0.09628463536500931, + "learning_rate": 2.5460755635592445e-05, + "loss": 0.0312, + "step": 87490 + }, + { + "epoch": 0.1929722803589505, + "grad_norm": 0.11566609144210815, + "learning_rate": 2.5459570533599607e-05, + "loss": 0.0332, + "step": 87500 + }, + { + "epoch": 0.19299433433384866, + "grad_norm": 0.12659187614917755, + "learning_rate": 2.545838530451311e-05, + "loss": 0.033, + "step": 87510 + }, + { + "epoch": 0.19301638830874682, + "grad_norm": 0.10964947193861008, + "learning_rate": 2.5457199948347352e-05, + "loss": 0.0344, + "step": 87520 + }, + { + "epoch": 0.193038442283645, + "grad_norm": 0.09916700422763824, + "learning_rate": 2.5456014465116745e-05, + "loss": 0.034, + "step": 87530 + }, + { + "epoch": 0.19306049625854316, + "grad_norm": 0.1374383419752121, + "learning_rate": 2.5454828854835688e-05, + "loss": 0.0346, + "step": 87540 + }, + { + "epoch": 0.1930825502334413, + "grad_norm": 0.09834275394678116, + "learning_rate": 2.5453643117518595e-05, + "loss": 0.0347, + "step": 87550 + }, + { + "epoch": 0.1931046042083395, + "grad_norm": 0.0959656611084938, + "learning_rate": 2.5452457253179862e-05, + "loss": 0.0321, + "step": 87560 + }, + { + "epoch": 0.19312665818323765, + "grad_norm": 0.09622934460639954, + "learning_rate": 2.545127126183391e-05, + "loss": 0.0326, + "step": 87570 + }, + { + "epoch": 0.1931487121581358, + "grad_norm": 0.11090053617954254, + "learning_rate": 2.5450085143495138e-05, + "loss": 0.0341, + "step": 87580 + }, + { + "epoch": 0.193170766133034, + "grad_norm": 0.09134811162948608, + "learning_rate": 2.544889889817797e-05, + "loss": 0.0326, + "step": 87590 + }, + { + "epoch": 0.19319282010793215, + "grad_norm": 0.11357109248638153, + "learning_rate": 2.5447712525896814e-05, + "loss": 0.0334, + "step": 87600 + }, + { + "epoch": 0.1932148740828303, + "grad_norm": 0.11219270527362823, + "learning_rate": 2.5446526026666083e-05, + "loss": 0.0343, + "step": 87610 + }, + { + "epoch": 0.1932369280577285, + "grad_norm": 0.1301131695508957, + "learning_rate": 2.5445339400500205e-05, + "loss": 0.0344, + "step": 87620 + }, + { + "epoch": 0.19325898203262665, + "grad_norm": 0.09447430074214935, + "learning_rate": 2.5444152647413585e-05, + "loss": 0.0323, + "step": 87630 + }, + { + "epoch": 0.1932810360075248, + "grad_norm": 0.08138403296470642, + "learning_rate": 2.544296576742065e-05, + "loss": 0.031, + "step": 87640 + }, + { + "epoch": 0.193303089982423, + "grad_norm": 0.12280598282814026, + "learning_rate": 2.5441778760535824e-05, + "loss": 0.0334, + "step": 87650 + }, + { + "epoch": 0.19332514395732114, + "grad_norm": 0.10904145240783691, + "learning_rate": 2.544059162677353e-05, + "loss": 0.0338, + "step": 87660 + }, + { + "epoch": 0.1933471979322193, + "grad_norm": 0.10090546309947968, + "learning_rate": 2.5439404366148182e-05, + "loss": 0.0352, + "step": 87670 + }, + { + "epoch": 0.19336925190711748, + "grad_norm": 0.14228351414203644, + "learning_rate": 2.5438216978674217e-05, + "loss": 0.0334, + "step": 87680 + }, + { + "epoch": 0.19339130588201564, + "grad_norm": 0.08764813840389252, + "learning_rate": 2.5437029464366064e-05, + "loss": 0.0328, + "step": 87690 + }, + { + "epoch": 0.19341335985691382, + "grad_norm": 0.0944867879152298, + "learning_rate": 2.543584182323815e-05, + "loss": 0.0325, + "step": 87700 + }, + { + "epoch": 0.19343541383181198, + "grad_norm": 0.09420900791883469, + "learning_rate": 2.54346540553049e-05, + "loss": 0.0315, + "step": 87710 + }, + { + "epoch": 0.19345746780671014, + "grad_norm": 0.10219374299049377, + "learning_rate": 2.543346616058075e-05, + "loss": 0.0321, + "step": 87720 + }, + { + "epoch": 0.19347952178160832, + "grad_norm": 0.09041371196508408, + "learning_rate": 2.5432278139080137e-05, + "loss": 0.0327, + "step": 87730 + }, + { + "epoch": 0.19350157575650648, + "grad_norm": 0.12240121513605118, + "learning_rate": 2.54310899908175e-05, + "loss": 0.034, + "step": 87740 + }, + { + "epoch": 0.19352362973140463, + "grad_norm": 0.12225539982318878, + "learning_rate": 2.5429901715807267e-05, + "loss": 0.0333, + "step": 87750 + }, + { + "epoch": 0.19354568370630282, + "grad_norm": 0.08934969455003738, + "learning_rate": 2.542871331406388e-05, + "loss": 0.03, + "step": 87760 + }, + { + "epoch": 0.19356773768120097, + "grad_norm": 0.12452881783246994, + "learning_rate": 2.542752478560178e-05, + "loss": 0.0322, + "step": 87770 + }, + { + "epoch": 0.19358979165609913, + "grad_norm": 0.1419249325990677, + "learning_rate": 2.542633613043541e-05, + "loss": 0.0323, + "step": 87780 + }, + { + "epoch": 0.19361184563099731, + "grad_norm": 0.11732687056064606, + "learning_rate": 2.542514734857921e-05, + "loss": 0.0332, + "step": 87790 + }, + { + "epoch": 0.19363389960589547, + "grad_norm": 0.09698359668254852, + "learning_rate": 2.5423958440047634e-05, + "loss": 0.0309, + "step": 87800 + }, + { + "epoch": 0.19365595358079363, + "grad_norm": 0.1018759161233902, + "learning_rate": 2.5422769404855113e-05, + "loss": 0.0328, + "step": 87810 + }, + { + "epoch": 0.1936780075556918, + "grad_norm": 0.08627337962388992, + "learning_rate": 2.5421580243016107e-05, + "loss": 0.0321, + "step": 87820 + }, + { + "epoch": 0.19370006153058997, + "grad_norm": 0.1047976016998291, + "learning_rate": 2.5420390954545058e-05, + "loss": 0.034, + "step": 87830 + }, + { + "epoch": 0.19372211550548812, + "grad_norm": 0.13461650907993317, + "learning_rate": 2.5419201539456424e-05, + "loss": 0.0333, + "step": 87840 + }, + { + "epoch": 0.1937441694803863, + "grad_norm": 0.10853743553161621, + "learning_rate": 2.5418011997764652e-05, + "loss": 0.0328, + "step": 87850 + }, + { + "epoch": 0.19376622345528446, + "grad_norm": 0.09152102470397949, + "learning_rate": 2.5416822329484192e-05, + "loss": 0.0326, + "step": 87860 + }, + { + "epoch": 0.19378827743018262, + "grad_norm": 0.12533913552761078, + "learning_rate": 2.5415632534629512e-05, + "loss": 0.0331, + "step": 87870 + }, + { + "epoch": 0.1938103314050808, + "grad_norm": 0.15935158729553223, + "learning_rate": 2.5414442613215066e-05, + "loss": 0.0347, + "step": 87880 + }, + { + "epoch": 0.19383238537997896, + "grad_norm": 0.11079184710979462, + "learning_rate": 2.5413252565255306e-05, + "loss": 0.0334, + "step": 87890 + }, + { + "epoch": 0.19385443935487712, + "grad_norm": 0.11821866035461426, + "learning_rate": 2.5412062390764694e-05, + "loss": 0.0327, + "step": 87900 + }, + { + "epoch": 0.1938764933297753, + "grad_norm": 0.10034307837486267, + "learning_rate": 2.5410872089757694e-05, + "loss": 0.0345, + "step": 87910 + }, + { + "epoch": 0.19389854730467346, + "grad_norm": 0.1052636057138443, + "learning_rate": 2.5409681662248772e-05, + "loss": 0.033, + "step": 87920 + }, + { + "epoch": 0.1939206012795716, + "grad_norm": 0.07736329734325409, + "learning_rate": 2.5408491108252384e-05, + "loss": 0.0318, + "step": 87930 + }, + { + "epoch": 0.1939426552544698, + "grad_norm": 0.10190455615520477, + "learning_rate": 2.540730042778301e-05, + "loss": 0.0345, + "step": 87940 + }, + { + "epoch": 0.19396470922936795, + "grad_norm": 0.08680398762226105, + "learning_rate": 2.54061096208551e-05, + "loss": 0.0303, + "step": 87950 + }, + { + "epoch": 0.1939867632042661, + "grad_norm": 0.09130673110485077, + "learning_rate": 2.5404918687483142e-05, + "loss": 0.0324, + "step": 87960 + }, + { + "epoch": 0.1940088171791643, + "grad_norm": 0.11763478815555573, + "learning_rate": 2.5403727627681588e-05, + "loss": 0.0361, + "step": 87970 + }, + { + "epoch": 0.19403087115406245, + "grad_norm": 0.1102641299366951, + "learning_rate": 2.540253644146493e-05, + "loss": 0.0341, + "step": 87980 + }, + { + "epoch": 0.1940529251289606, + "grad_norm": 0.10758210718631744, + "learning_rate": 2.540134512884763e-05, + "loss": 0.0322, + "step": 87990 + }, + { + "epoch": 0.1940749791038588, + "grad_norm": 0.1276312917470932, + "learning_rate": 2.540015368984416e-05, + "loss": 0.0316, + "step": 88000 + }, + { + "epoch": 0.19409703307875695, + "grad_norm": 0.08491349965333939, + "learning_rate": 2.539896212446901e-05, + "loss": 0.036, + "step": 88010 + }, + { + "epoch": 0.1941190870536551, + "grad_norm": 0.14914968609809875, + "learning_rate": 2.5397770432736648e-05, + "loss": 0.0323, + "step": 88020 + }, + { + "epoch": 0.1941411410285533, + "grad_norm": 0.10984136164188385, + "learning_rate": 2.5396578614661557e-05, + "loss": 0.0312, + "step": 88030 + }, + { + "epoch": 0.19416319500345144, + "grad_norm": 0.14207999408245087, + "learning_rate": 2.5395386670258227e-05, + "loss": 0.0338, + "step": 88040 + }, + { + "epoch": 0.1941852489783496, + "grad_norm": 0.10544757544994354, + "learning_rate": 2.539419459954113e-05, + "loss": 0.0338, + "step": 88050 + }, + { + "epoch": 0.19420730295324778, + "grad_norm": 0.10774567723274231, + "learning_rate": 2.5393002402524755e-05, + "loss": 0.035, + "step": 88060 + }, + { + "epoch": 0.19422935692814594, + "grad_norm": 0.10195888578891754, + "learning_rate": 2.5391810079223588e-05, + "loss": 0.0314, + "step": 88070 + }, + { + "epoch": 0.1942514109030441, + "grad_norm": 0.11859463900327682, + "learning_rate": 2.5390617629652116e-05, + "loss": 0.0321, + "step": 88080 + }, + { + "epoch": 0.19427346487794228, + "grad_norm": 0.17919261753559113, + "learning_rate": 2.5389425053824834e-05, + "loss": 0.0319, + "step": 88090 + }, + { + "epoch": 0.19429551885284044, + "grad_norm": 0.10748418420553207, + "learning_rate": 2.5388232351756225e-05, + "loss": 0.031, + "step": 88100 + }, + { + "epoch": 0.1943175728277386, + "grad_norm": 0.11559268087148666, + "learning_rate": 2.538703952346078e-05, + "loss": 0.0318, + "step": 88110 + }, + { + "epoch": 0.19433962680263678, + "grad_norm": 0.1423092633485794, + "learning_rate": 2.5385846568953006e-05, + "loss": 0.0311, + "step": 88120 + }, + { + "epoch": 0.19436168077753493, + "grad_norm": 0.10907130688428879, + "learning_rate": 2.5384653488247385e-05, + "loss": 0.0347, + "step": 88130 + }, + { + "epoch": 0.19438373475243312, + "grad_norm": 0.11965791881084442, + "learning_rate": 2.5383460281358425e-05, + "loss": 0.0335, + "step": 88140 + }, + { + "epoch": 0.19440578872733127, + "grad_norm": 0.11860159039497375, + "learning_rate": 2.5382266948300615e-05, + "loss": 0.0342, + "step": 88150 + }, + { + "epoch": 0.19442784270222943, + "grad_norm": 0.11537595838308334, + "learning_rate": 2.5381073489088458e-05, + "loss": 0.0334, + "step": 88160 + }, + { + "epoch": 0.19444989667712761, + "grad_norm": 0.11027953773736954, + "learning_rate": 2.5379879903736457e-05, + "loss": 0.0322, + "step": 88170 + }, + { + "epoch": 0.19447195065202577, + "grad_norm": 0.11166559159755707, + "learning_rate": 2.5378686192259118e-05, + "loss": 0.0321, + "step": 88180 + }, + { + "epoch": 0.19449400462692393, + "grad_norm": 0.09698672592639923, + "learning_rate": 2.5377492354670937e-05, + "loss": 0.0305, + "step": 88190 + }, + { + "epoch": 0.1945160586018221, + "grad_norm": 0.09604819118976593, + "learning_rate": 2.5376298390986428e-05, + "loss": 0.0325, + "step": 88200 + }, + { + "epoch": 0.19453811257672027, + "grad_norm": 0.11365777254104614, + "learning_rate": 2.53751043012201e-05, + "loss": 0.0345, + "step": 88210 + }, + { + "epoch": 0.19456016655161842, + "grad_norm": 0.1333056539297104, + "learning_rate": 2.5373910085386456e-05, + "loss": 0.034, + "step": 88220 + }, + { + "epoch": 0.1945822205265166, + "grad_norm": 0.12000343203544617, + "learning_rate": 2.537271574350001e-05, + "loss": 0.0355, + "step": 88230 + }, + { + "epoch": 0.19460427450141476, + "grad_norm": 0.11604708433151245, + "learning_rate": 2.537152127557528e-05, + "loss": 0.0324, + "step": 88240 + }, + { + "epoch": 0.19462632847631292, + "grad_norm": 0.1037454828619957, + "learning_rate": 2.5370326681626767e-05, + "loss": 0.032, + "step": 88250 + }, + { + "epoch": 0.1946483824512111, + "grad_norm": 0.10045619308948517, + "learning_rate": 2.5369131961668996e-05, + "loss": 0.0339, + "step": 88260 + }, + { + "epoch": 0.19467043642610926, + "grad_norm": 0.08976807445287704, + "learning_rate": 2.5367937115716484e-05, + "loss": 0.0325, + "step": 88270 + }, + { + "epoch": 0.19469249040100742, + "grad_norm": 0.10170462727546692, + "learning_rate": 2.5366742143783745e-05, + "loss": 0.0329, + "step": 88280 + }, + { + "epoch": 0.1947145443759056, + "grad_norm": 0.09523851424455643, + "learning_rate": 2.53655470458853e-05, + "loss": 0.0303, + "step": 88290 + }, + { + "epoch": 0.19473659835080376, + "grad_norm": 0.10729517042636871, + "learning_rate": 2.5364351822035677e-05, + "loss": 0.0336, + "step": 88300 + }, + { + "epoch": 0.19475865232570191, + "grad_norm": 0.10866089165210724, + "learning_rate": 2.5363156472249387e-05, + "loss": 0.0344, + "step": 88310 + }, + { + "epoch": 0.1947807063006001, + "grad_norm": 0.11467069387435913, + "learning_rate": 2.5361960996540964e-05, + "loss": 0.0328, + "step": 88320 + }, + { + "epoch": 0.19480276027549825, + "grad_norm": 0.09609740972518921, + "learning_rate": 2.536076539492493e-05, + "loss": 0.0334, + "step": 88330 + }, + { + "epoch": 0.1948248142503964, + "grad_norm": 0.11577298492193222, + "learning_rate": 2.5359569667415818e-05, + "loss": 0.0344, + "step": 88340 + }, + { + "epoch": 0.1948468682252946, + "grad_norm": 0.12720495462417603, + "learning_rate": 2.535837381402815e-05, + "loss": 0.0326, + "step": 88350 + }, + { + "epoch": 0.19486892220019275, + "grad_norm": 0.10672540962696075, + "learning_rate": 2.535717783477646e-05, + "loss": 0.0336, + "step": 88360 + }, + { + "epoch": 0.1948909761750909, + "grad_norm": 0.1166519895195961, + "learning_rate": 2.5355981729675283e-05, + "loss": 0.0345, + "step": 88370 + }, + { + "epoch": 0.1949130301499891, + "grad_norm": 0.10630429536104202, + "learning_rate": 2.5354785498739148e-05, + "loss": 0.0345, + "step": 88380 + }, + { + "epoch": 0.19493508412488725, + "grad_norm": 0.1461835503578186, + "learning_rate": 2.5353589141982594e-05, + "loss": 0.0324, + "step": 88390 + }, + { + "epoch": 0.1949571380997854, + "grad_norm": 0.10456996411085129, + "learning_rate": 2.5352392659420157e-05, + "loss": 0.0352, + "step": 88400 + }, + { + "epoch": 0.1949791920746836, + "grad_norm": 0.10313840955495834, + "learning_rate": 2.5351196051066374e-05, + "loss": 0.0331, + "step": 88410 + }, + { + "epoch": 0.19500124604958174, + "grad_norm": 0.11670901626348495, + "learning_rate": 2.534999931693579e-05, + "loss": 0.0348, + "step": 88420 + }, + { + "epoch": 0.1950233000244799, + "grad_norm": 0.10947223007678986, + "learning_rate": 2.5348802457042935e-05, + "loss": 0.032, + "step": 88430 + }, + { + "epoch": 0.19504535399937808, + "grad_norm": 0.12574295699596405, + "learning_rate": 2.534760547140236e-05, + "loss": 0.0338, + "step": 88440 + }, + { + "epoch": 0.19506740797427624, + "grad_norm": 0.12456028908491135, + "learning_rate": 2.5346408360028614e-05, + "loss": 0.032, + "step": 88450 + }, + { + "epoch": 0.1950894619491744, + "grad_norm": 0.11651458591222763, + "learning_rate": 2.5345211122936233e-05, + "loss": 0.0328, + "step": 88460 + }, + { + "epoch": 0.19511151592407258, + "grad_norm": 0.11540349572896957, + "learning_rate": 2.5344013760139776e-05, + "loss": 0.0326, + "step": 88470 + }, + { + "epoch": 0.19513356989897074, + "grad_norm": 0.11186211556196213, + "learning_rate": 2.5342816271653777e-05, + "loss": 0.0319, + "step": 88480 + }, + { + "epoch": 0.1951556238738689, + "grad_norm": 0.09827333688735962, + "learning_rate": 2.5341618657492802e-05, + "loss": 0.0316, + "step": 88490 + }, + { + "epoch": 0.19517767784876708, + "grad_norm": 0.3506397604942322, + "learning_rate": 2.5340420917671392e-05, + "loss": 0.0323, + "step": 88500 + }, + { + "epoch": 0.19519973182366523, + "grad_norm": 0.14834418892860413, + "learning_rate": 2.5339223052204107e-05, + "loss": 0.0357, + "step": 88510 + }, + { + "epoch": 0.1952217857985634, + "grad_norm": 0.10034399479627609, + "learning_rate": 2.5338025061105498e-05, + "loss": 0.034, + "step": 88520 + }, + { + "epoch": 0.19524383977346157, + "grad_norm": 0.12892429530620575, + "learning_rate": 2.5336826944390127e-05, + "loss": 0.0325, + "step": 88530 + }, + { + "epoch": 0.19526589374835973, + "grad_norm": 0.10502506792545319, + "learning_rate": 2.5335628702072544e-05, + "loss": 0.0344, + "step": 88540 + }, + { + "epoch": 0.1952879477232579, + "grad_norm": 0.11043020337820053, + "learning_rate": 2.5334430334167323e-05, + "loss": 0.0348, + "step": 88550 + }, + { + "epoch": 0.19531000169815607, + "grad_norm": 0.10222955793142319, + "learning_rate": 2.5333231840689007e-05, + "loss": 0.0352, + "step": 88560 + }, + { + "epoch": 0.19533205567305423, + "grad_norm": 0.1910989135503769, + "learning_rate": 2.5332033221652172e-05, + "loss": 0.0326, + "step": 88570 + }, + { + "epoch": 0.1953541096479524, + "grad_norm": 0.11733604967594147, + "learning_rate": 2.533083447707138e-05, + "loss": 0.0333, + "step": 88580 + }, + { + "epoch": 0.19537616362285057, + "grad_norm": 0.1055738553404808, + "learning_rate": 2.532963560696119e-05, + "loss": 0.0345, + "step": 88590 + }, + { + "epoch": 0.19539821759774872, + "grad_norm": 0.11911321431398392, + "learning_rate": 2.532843661133618e-05, + "loss": 0.0335, + "step": 88600 + }, + { + "epoch": 0.1954202715726469, + "grad_norm": 0.10829953849315643, + "learning_rate": 2.5327237490210908e-05, + "loss": 0.0315, + "step": 88610 + }, + { + "epoch": 0.19544232554754506, + "grad_norm": 0.10813605785369873, + "learning_rate": 2.532603824359995e-05, + "loss": 0.0315, + "step": 88620 + }, + { + "epoch": 0.19546437952244322, + "grad_norm": 0.09118840843439102, + "learning_rate": 2.532483887151788e-05, + "loss": 0.0334, + "step": 88630 + }, + { + "epoch": 0.1954864334973414, + "grad_norm": 0.12187916785478592, + "learning_rate": 2.532363937397927e-05, + "loss": 0.0329, + "step": 88640 + }, + { + "epoch": 0.19550848747223956, + "grad_norm": 0.10782977938652039, + "learning_rate": 2.53224397509987e-05, + "loss": 0.0334, + "step": 88650 + }, + { + "epoch": 0.19553054144713772, + "grad_norm": 0.10975240170955658, + "learning_rate": 2.5321240002590732e-05, + "loss": 0.0343, + "step": 88660 + }, + { + "epoch": 0.1955525954220359, + "grad_norm": 0.1148558184504509, + "learning_rate": 2.5320040128769958e-05, + "loss": 0.031, + "step": 88670 + }, + { + "epoch": 0.19557464939693406, + "grad_norm": 0.12377013266086578, + "learning_rate": 2.531884012955095e-05, + "loss": 0.0351, + "step": 88680 + }, + { + "epoch": 0.19559670337183221, + "grad_norm": 0.09599137306213379, + "learning_rate": 2.5317640004948294e-05, + "loss": 0.0332, + "step": 88690 + }, + { + "epoch": 0.1956187573467304, + "grad_norm": 0.1124526858329773, + "learning_rate": 2.531643975497657e-05, + "loss": 0.0301, + "step": 88700 + }, + { + "epoch": 0.19564081132162855, + "grad_norm": 0.08624959737062454, + "learning_rate": 2.531523937965036e-05, + "loss": 0.0323, + "step": 88710 + }, + { + "epoch": 0.1956628652965267, + "grad_norm": 0.1308547556400299, + "learning_rate": 2.5314038878984256e-05, + "loss": 0.0331, + "step": 88720 + }, + { + "epoch": 0.1956849192714249, + "grad_norm": 0.09556794166564941, + "learning_rate": 2.531283825299284e-05, + "loss": 0.0337, + "step": 88730 + }, + { + "epoch": 0.19570697324632305, + "grad_norm": 0.1276433765888214, + "learning_rate": 2.53116375016907e-05, + "loss": 0.0345, + "step": 88740 + }, + { + "epoch": 0.1957290272212212, + "grad_norm": 0.09607736021280289, + "learning_rate": 2.5310436625092432e-05, + "loss": 0.035, + "step": 88750 + }, + { + "epoch": 0.1957510811961194, + "grad_norm": 0.10246260464191437, + "learning_rate": 2.5309235623212626e-05, + "loss": 0.033, + "step": 88760 + }, + { + "epoch": 0.19577313517101755, + "grad_norm": 0.1296367347240448, + "learning_rate": 2.5308034496065867e-05, + "loss": 0.035, + "step": 88770 + }, + { + "epoch": 0.1957951891459157, + "grad_norm": 0.11581405252218246, + "learning_rate": 2.5306833243666764e-05, + "loss": 0.0344, + "step": 88780 + }, + { + "epoch": 0.1958172431208139, + "grad_norm": 0.1169232428073883, + "learning_rate": 2.53056318660299e-05, + "loss": 0.0324, + "step": 88790 + }, + { + "epoch": 0.19583929709571204, + "grad_norm": 0.10705263912677765, + "learning_rate": 2.530443036316988e-05, + "loss": 0.0325, + "step": 88800 + }, + { + "epoch": 0.1958613510706102, + "grad_norm": 0.11806502938270569, + "learning_rate": 2.5303228735101306e-05, + "loss": 0.0324, + "step": 88810 + }, + { + "epoch": 0.19588340504550839, + "grad_norm": 0.10318754613399506, + "learning_rate": 2.530202698183877e-05, + "loss": 0.0335, + "step": 88820 + }, + { + "epoch": 0.19590545902040654, + "grad_norm": 0.10220260918140411, + "learning_rate": 2.5300825103396887e-05, + "loss": 0.0338, + "step": 88830 + }, + { + "epoch": 0.1959275129953047, + "grad_norm": 0.11613113433122635, + "learning_rate": 2.5299623099790243e-05, + "loss": 0.0314, + "step": 88840 + }, + { + "epoch": 0.19594956697020288, + "grad_norm": 0.09574770927429199, + "learning_rate": 2.5298420971033458e-05, + "loss": 0.0343, + "step": 88850 + }, + { + "epoch": 0.19597162094510104, + "grad_norm": 0.1808609515428543, + "learning_rate": 2.5297218717141138e-05, + "loss": 0.0333, + "step": 88860 + }, + { + "epoch": 0.1959936749199992, + "grad_norm": 0.11462206393480301, + "learning_rate": 2.5296016338127882e-05, + "loss": 0.034, + "step": 88870 + }, + { + "epoch": 0.19601572889489738, + "grad_norm": 0.12769214808940887, + "learning_rate": 2.5294813834008314e-05, + "loss": 0.0331, + "step": 88880 + }, + { + "epoch": 0.19603778286979553, + "grad_norm": 0.12137368321418762, + "learning_rate": 2.5293611204797033e-05, + "loss": 0.0332, + "step": 88890 + }, + { + "epoch": 0.1960598368446937, + "grad_norm": 0.11207466572523117, + "learning_rate": 2.5292408450508663e-05, + "loss": 0.0336, + "step": 88900 + }, + { + "epoch": 0.19608189081959188, + "grad_norm": 0.10216074436903, + "learning_rate": 2.5291205571157802e-05, + "loss": 0.0323, + "step": 88910 + }, + { + "epoch": 0.19610394479449003, + "grad_norm": 0.1261090785264969, + "learning_rate": 2.5290002566759085e-05, + "loss": 0.0337, + "step": 88920 + }, + { + "epoch": 0.1961259987693882, + "grad_norm": 0.12473280727863312, + "learning_rate": 2.5288799437327117e-05, + "loss": 0.0342, + "step": 88930 + }, + { + "epoch": 0.19614805274428637, + "grad_norm": 0.12289794534444809, + "learning_rate": 2.5287596182876528e-05, + "loss": 0.0349, + "step": 88940 + }, + { + "epoch": 0.19617010671918453, + "grad_norm": 0.12246987968683243, + "learning_rate": 2.5286392803421925e-05, + "loss": 0.0325, + "step": 88950 + }, + { + "epoch": 0.19619216069408268, + "grad_norm": 0.09365900605916977, + "learning_rate": 2.5285189298977944e-05, + "loss": 0.0331, + "step": 88960 + }, + { + "epoch": 0.19621421466898087, + "grad_norm": 0.10027342289686203, + "learning_rate": 2.52839856695592e-05, + "loss": 0.0325, + "step": 88970 + }, + { + "epoch": 0.19623626864387902, + "grad_norm": 0.09435290098190308, + "learning_rate": 2.5282781915180318e-05, + "loss": 0.0341, + "step": 88980 + }, + { + "epoch": 0.1962583226187772, + "grad_norm": 0.11281267553567886, + "learning_rate": 2.5281578035855926e-05, + "loss": 0.0328, + "step": 88990 + }, + { + "epoch": 0.19628037659367537, + "grad_norm": 0.10056506097316742, + "learning_rate": 2.528037403160066e-05, + "loss": 0.0332, + "step": 89000 + }, + { + "epoch": 0.19630243056857352, + "grad_norm": 0.0934072881937027, + "learning_rate": 2.527916990242914e-05, + "loss": 0.0315, + "step": 89010 + }, + { + "epoch": 0.1963244845434717, + "grad_norm": 0.2110811471939087, + "learning_rate": 2.5277965648355998e-05, + "loss": 0.0326, + "step": 89020 + }, + { + "epoch": 0.19634653851836986, + "grad_norm": 0.12003614008426666, + "learning_rate": 2.5276761269395873e-05, + "loss": 0.0341, + "step": 89030 + }, + { + "epoch": 0.19636859249326802, + "grad_norm": 0.10289989411830902, + "learning_rate": 2.5275556765563396e-05, + "loss": 0.0329, + "step": 89040 + }, + { + "epoch": 0.1963906464681662, + "grad_norm": 0.11022063344717026, + "learning_rate": 2.5274352136873194e-05, + "loss": 0.0345, + "step": 89050 + }, + { + "epoch": 0.19641270044306436, + "grad_norm": 0.1188773587346077, + "learning_rate": 2.527314738333992e-05, + "loss": 0.0338, + "step": 89060 + }, + { + "epoch": 0.19643475441796251, + "grad_norm": 0.09083785861730576, + "learning_rate": 2.527194250497821e-05, + "loss": 0.0333, + "step": 89070 + }, + { + "epoch": 0.1964568083928607, + "grad_norm": 0.1095498576760292, + "learning_rate": 2.5270737501802694e-05, + "loss": 0.0328, + "step": 89080 + }, + { + "epoch": 0.19647886236775886, + "grad_norm": 0.10120062530040741, + "learning_rate": 2.526953237382802e-05, + "loss": 0.0319, + "step": 89090 + }, + { + "epoch": 0.196500916342657, + "grad_norm": 0.15228988230228424, + "learning_rate": 2.5268327121068835e-05, + "loss": 0.0341, + "step": 89100 + }, + { + "epoch": 0.1965229703175552, + "grad_norm": 0.08913110196590424, + "learning_rate": 2.526712174353978e-05, + "loss": 0.0327, + "step": 89110 + }, + { + "epoch": 0.19654502429245335, + "grad_norm": 0.10936233401298523, + "learning_rate": 2.52659162412555e-05, + "loss": 0.0327, + "step": 89120 + }, + { + "epoch": 0.1965670782673515, + "grad_norm": 0.10326045006513596, + "learning_rate": 2.5264710614230644e-05, + "loss": 0.0345, + "step": 89130 + }, + { + "epoch": 0.1965891322422497, + "grad_norm": 0.09476683288812637, + "learning_rate": 2.5263504862479864e-05, + "loss": 0.0319, + "step": 89140 + }, + { + "epoch": 0.19661118621714785, + "grad_norm": 0.10391465574502945, + "learning_rate": 2.5262298986017815e-05, + "loss": 0.0348, + "step": 89150 + }, + { + "epoch": 0.196633240192046, + "grad_norm": 0.11223974078893661, + "learning_rate": 2.5261092984859136e-05, + "loss": 0.0343, + "step": 89160 + }, + { + "epoch": 0.1966552941669442, + "grad_norm": 0.0807669460773468, + "learning_rate": 2.5259886859018494e-05, + "loss": 0.0332, + "step": 89170 + }, + { + "epoch": 0.19667734814184235, + "grad_norm": 0.10024738311767578, + "learning_rate": 2.5258680608510534e-05, + "loss": 0.0315, + "step": 89180 + }, + { + "epoch": 0.1966994021167405, + "grad_norm": 0.1362086683511734, + "learning_rate": 2.5257474233349925e-05, + "loss": 0.0352, + "step": 89190 + }, + { + "epoch": 0.19672145609163869, + "grad_norm": 0.102863609790802, + "learning_rate": 2.525626773355132e-05, + "loss": 0.0325, + "step": 89200 + }, + { + "epoch": 0.19674351006653684, + "grad_norm": 0.10409678518772125, + "learning_rate": 2.5255061109129375e-05, + "loss": 0.0338, + "step": 89210 + }, + { + "epoch": 0.196765564041435, + "grad_norm": 0.09640488773584366, + "learning_rate": 2.5253854360098754e-05, + "loss": 0.0324, + "step": 89220 + }, + { + "epoch": 0.19678761801633318, + "grad_norm": 0.09610101580619812, + "learning_rate": 2.525264748647412e-05, + "loss": 0.0337, + "step": 89230 + }, + { + "epoch": 0.19680967199123134, + "grad_norm": 0.09202240407466888, + "learning_rate": 2.5251440488270146e-05, + "loss": 0.0328, + "step": 89240 + }, + { + "epoch": 0.1968317259661295, + "grad_norm": 0.12621864676475525, + "learning_rate": 2.5250233365501487e-05, + "loss": 0.0339, + "step": 89250 + }, + { + "epoch": 0.19685377994102768, + "grad_norm": 0.11195891350507736, + "learning_rate": 2.5249026118182814e-05, + "loss": 0.0325, + "step": 89260 + }, + { + "epoch": 0.19687583391592584, + "grad_norm": 0.11666920781135559, + "learning_rate": 2.52478187463288e-05, + "loss": 0.0314, + "step": 89270 + }, + { + "epoch": 0.196897887890824, + "grad_norm": 0.09944503009319305, + "learning_rate": 2.524661124995411e-05, + "loss": 0.0325, + "step": 89280 + }, + { + "epoch": 0.19691994186572218, + "grad_norm": 0.10033261030912399, + "learning_rate": 2.5245403629073422e-05, + "loss": 0.0333, + "step": 89290 + }, + { + "epoch": 0.19694199584062033, + "grad_norm": 0.09672513604164124, + "learning_rate": 2.52441958837014e-05, + "loss": 0.0341, + "step": 89300 + }, + { + "epoch": 0.1969640498155185, + "grad_norm": 0.09308035671710968, + "learning_rate": 2.5242988013852732e-05, + "loss": 0.0344, + "step": 89310 + }, + { + "epoch": 0.19698610379041667, + "grad_norm": 0.11748793721199036, + "learning_rate": 2.5241780019542085e-05, + "loss": 0.0323, + "step": 89320 + }, + { + "epoch": 0.19700815776531483, + "grad_norm": 0.11917532980442047, + "learning_rate": 2.5240571900784144e-05, + "loss": 0.0329, + "step": 89330 + }, + { + "epoch": 0.19703021174021298, + "grad_norm": 0.11265714466571808, + "learning_rate": 2.5239363657593584e-05, + "loss": 0.0339, + "step": 89340 + }, + { + "epoch": 0.19705226571511117, + "grad_norm": 0.11784770339727402, + "learning_rate": 2.523815528998509e-05, + "loss": 0.034, + "step": 89350 + }, + { + "epoch": 0.19707431969000933, + "grad_norm": 0.09904246032238007, + "learning_rate": 2.5236946797973342e-05, + "loss": 0.0325, + "step": 89360 + }, + { + "epoch": 0.19709637366490748, + "grad_norm": 0.10176767408847809, + "learning_rate": 2.5235738181573022e-05, + "loss": 0.0319, + "step": 89370 + }, + { + "epoch": 0.19711842763980567, + "grad_norm": 0.07246215641498566, + "learning_rate": 2.5234529440798822e-05, + "loss": 0.0319, + "step": 89380 + }, + { + "epoch": 0.19714048161470382, + "grad_norm": 0.11507490277290344, + "learning_rate": 2.5233320575665427e-05, + "loss": 0.0344, + "step": 89390 + }, + { + "epoch": 0.19716253558960198, + "grad_norm": 0.10673555731773376, + "learning_rate": 2.5232111586187523e-05, + "loss": 0.032, + "step": 89400 + }, + { + "epoch": 0.19718458956450016, + "grad_norm": 0.08103639632463455, + "learning_rate": 2.52309024723798e-05, + "loss": 0.0336, + "step": 89410 + }, + { + "epoch": 0.19720664353939832, + "grad_norm": 0.1447765976190567, + "learning_rate": 2.5229693234256957e-05, + "loss": 0.0322, + "step": 89420 + }, + { + "epoch": 0.1972286975142965, + "grad_norm": 0.11658104509115219, + "learning_rate": 2.522848387183368e-05, + "loss": 0.0321, + "step": 89430 + }, + { + "epoch": 0.19725075148919466, + "grad_norm": 0.09211207926273346, + "learning_rate": 2.5227274385124672e-05, + "loss": 0.0339, + "step": 89440 + }, + { + "epoch": 0.19727280546409282, + "grad_norm": 0.1342763751745224, + "learning_rate": 2.522606477414462e-05, + "loss": 0.0344, + "step": 89450 + }, + { + "epoch": 0.197294859438991, + "grad_norm": 0.11280945688486099, + "learning_rate": 2.5224855038908225e-05, + "loss": 0.031, + "step": 89460 + }, + { + "epoch": 0.19731691341388916, + "grad_norm": 0.10522044450044632, + "learning_rate": 2.5223645179430188e-05, + "loss": 0.0326, + "step": 89470 + }, + { + "epoch": 0.1973389673887873, + "grad_norm": 0.1281757354736328, + "learning_rate": 2.522243519572521e-05, + "loss": 0.0323, + "step": 89480 + }, + { + "epoch": 0.1973610213636855, + "grad_norm": 0.09653200209140778, + "learning_rate": 2.5221225087807992e-05, + "loss": 0.0341, + "step": 89490 + }, + { + "epoch": 0.19738307533858365, + "grad_norm": 0.1165379211306572, + "learning_rate": 2.522001485569324e-05, + "loss": 0.0327, + "step": 89500 + }, + { + "epoch": 0.1974051293134818, + "grad_norm": 0.10814208537340164, + "learning_rate": 2.5218804499395654e-05, + "loss": 0.0339, + "step": 89510 + }, + { + "epoch": 0.19742718328838, + "grad_norm": 0.09527240693569183, + "learning_rate": 2.5217594018929954e-05, + "loss": 0.0318, + "step": 89520 + }, + { + "epoch": 0.19744923726327815, + "grad_norm": 0.08054034411907196, + "learning_rate": 2.5216383414310836e-05, + "loss": 0.0342, + "step": 89530 + }, + { + "epoch": 0.1974712912381763, + "grad_norm": 0.10999258607625961, + "learning_rate": 2.5215172685553013e-05, + "loss": 0.0331, + "step": 89540 + }, + { + "epoch": 0.1974933452130745, + "grad_norm": 0.10799966752529144, + "learning_rate": 2.521396183267119e-05, + "loss": 0.0342, + "step": 89550 + }, + { + "epoch": 0.19751539918797265, + "grad_norm": 0.11587398499250412, + "learning_rate": 2.5212750855680097e-05, + "loss": 0.034, + "step": 89560 + }, + { + "epoch": 0.1975374531628708, + "grad_norm": 0.11369997262954712, + "learning_rate": 2.5211539754594433e-05, + "loss": 0.0339, + "step": 89570 + }, + { + "epoch": 0.19755950713776899, + "grad_norm": 0.10440534353256226, + "learning_rate": 2.5210328529428923e-05, + "loss": 0.0329, + "step": 89580 + }, + { + "epoch": 0.19758156111266714, + "grad_norm": 0.10891126096248627, + "learning_rate": 2.520911718019828e-05, + "loss": 0.0326, + "step": 89590 + }, + { + "epoch": 0.1976036150875653, + "grad_norm": 0.07969895005226135, + "learning_rate": 2.5207905706917228e-05, + "loss": 0.0312, + "step": 89600 + }, + { + "epoch": 0.19762566906246348, + "grad_norm": 0.08698932081460953, + "learning_rate": 2.520669410960048e-05, + "loss": 0.0325, + "step": 89610 + }, + { + "epoch": 0.19764772303736164, + "grad_norm": 0.08560696244239807, + "learning_rate": 2.5205482388262758e-05, + "loss": 0.031, + "step": 89620 + }, + { + "epoch": 0.1976697770122598, + "grad_norm": 0.10872361063957214, + "learning_rate": 2.5204270542918798e-05, + "loss": 0.035, + "step": 89630 + }, + { + "epoch": 0.19769183098715798, + "grad_norm": 0.14326010644435883, + "learning_rate": 2.5203058573583306e-05, + "loss": 0.0315, + "step": 89640 + }, + { + "epoch": 0.19771388496205614, + "grad_norm": 0.09866654127836227, + "learning_rate": 2.5201846480271028e-05, + "loss": 0.0332, + "step": 89650 + }, + { + "epoch": 0.1977359389369543, + "grad_norm": 0.10007953643798828, + "learning_rate": 2.520063426299668e-05, + "loss": 0.0337, + "step": 89660 + }, + { + "epoch": 0.19775799291185248, + "grad_norm": 0.07111154496669769, + "learning_rate": 2.5199421921774994e-05, + "loss": 0.0319, + "step": 89670 + }, + { + "epoch": 0.19778004688675063, + "grad_norm": 0.09481178224086761, + "learning_rate": 2.51982094566207e-05, + "loss": 0.0317, + "step": 89680 + }, + { + "epoch": 0.1978021008616488, + "grad_norm": 0.09996240586042404, + "learning_rate": 2.5196996867548538e-05, + "loss": 0.035, + "step": 89690 + }, + { + "epoch": 0.19782415483654697, + "grad_norm": 0.12240394204854965, + "learning_rate": 2.5195784154573236e-05, + "loss": 0.0337, + "step": 89700 + }, + { + "epoch": 0.19784620881144513, + "grad_norm": 0.08774195611476898, + "learning_rate": 2.5194571317709524e-05, + "loss": 0.0332, + "step": 89710 + }, + { + "epoch": 0.19786826278634329, + "grad_norm": 0.09066301584243774, + "learning_rate": 2.519335835697215e-05, + "loss": 0.0342, + "step": 89720 + }, + { + "epoch": 0.19789031676124147, + "grad_norm": 0.12359645962715149, + "learning_rate": 2.5192145272375847e-05, + "loss": 0.0368, + "step": 89730 + }, + { + "epoch": 0.19791237073613963, + "grad_norm": 0.1588270515203476, + "learning_rate": 2.5190932063935355e-05, + "loss": 0.0364, + "step": 89740 + }, + { + "epoch": 0.19793442471103778, + "grad_norm": 0.09907222539186478, + "learning_rate": 2.5189718731665413e-05, + "loss": 0.0336, + "step": 89750 + }, + { + "epoch": 0.19795647868593597, + "grad_norm": 0.1421932727098465, + "learning_rate": 2.5188505275580775e-05, + "loss": 0.0333, + "step": 89760 + }, + { + "epoch": 0.19797853266083412, + "grad_norm": 0.14901824295520782, + "learning_rate": 2.5187291695696177e-05, + "loss": 0.0335, + "step": 89770 + }, + { + "epoch": 0.19800058663573228, + "grad_norm": 0.12446462363004684, + "learning_rate": 2.5186077992026363e-05, + "loss": 0.0338, + "step": 89780 + }, + { + "epoch": 0.19802264061063046, + "grad_norm": 0.14465513825416565, + "learning_rate": 2.518486416458609e-05, + "loss": 0.0334, + "step": 89790 + }, + { + "epoch": 0.19804469458552862, + "grad_norm": 0.14801393449306488, + "learning_rate": 2.5183650213390094e-05, + "loss": 0.0333, + "step": 89800 + }, + { + "epoch": 0.19806674856042678, + "grad_norm": 0.09725518524646759, + "learning_rate": 2.518243613845314e-05, + "loss": 0.0341, + "step": 89810 + }, + { + "epoch": 0.19808880253532496, + "grad_norm": 0.08665681630373001, + "learning_rate": 2.5181221939789963e-05, + "loss": 0.0342, + "step": 89820 + }, + { + "epoch": 0.19811085651022312, + "grad_norm": 0.09942994266748428, + "learning_rate": 2.5180007617415334e-05, + "loss": 0.0323, + "step": 89830 + }, + { + "epoch": 0.1981329104851213, + "grad_norm": 0.10737113654613495, + "learning_rate": 2.5178793171344e-05, + "loss": 0.0334, + "step": 89840 + }, + { + "epoch": 0.19815496446001946, + "grad_norm": 0.09800852835178375, + "learning_rate": 2.5177578601590718e-05, + "loss": 0.0328, + "step": 89850 + }, + { + "epoch": 0.1981770184349176, + "grad_norm": 0.11170497536659241, + "learning_rate": 2.517636390817025e-05, + "loss": 0.0333, + "step": 89860 + }, + { + "epoch": 0.1981990724098158, + "grad_norm": 0.12443267554044724, + "learning_rate": 2.5175149091097346e-05, + "loss": 0.0323, + "step": 89870 + }, + { + "epoch": 0.19822112638471395, + "grad_norm": 0.13233013451099396, + "learning_rate": 2.5173934150386776e-05, + "loss": 0.0356, + "step": 89880 + }, + { + "epoch": 0.1982431803596121, + "grad_norm": 0.15733125805854797, + "learning_rate": 2.51727190860533e-05, + "loss": 0.0346, + "step": 89890 + }, + { + "epoch": 0.1982652343345103, + "grad_norm": 0.12369387596845627, + "learning_rate": 2.517150389811169e-05, + "loss": 0.0342, + "step": 89900 + }, + { + "epoch": 0.19828728830940845, + "grad_norm": 0.1878354400396347, + "learning_rate": 2.5170288586576695e-05, + "loss": 0.0339, + "step": 89910 + }, + { + "epoch": 0.1983093422843066, + "grad_norm": 0.0884581133723259, + "learning_rate": 2.5169073151463096e-05, + "loss": 0.0321, + "step": 89920 + }, + { + "epoch": 0.1983313962592048, + "grad_norm": 0.1073065772652626, + "learning_rate": 2.516785759278566e-05, + "loss": 0.0319, + "step": 89930 + }, + { + "epoch": 0.19835345023410295, + "grad_norm": 0.10333054512739182, + "learning_rate": 2.5166641910559147e-05, + "loss": 0.0335, + "step": 89940 + }, + { + "epoch": 0.1983755042090011, + "grad_norm": 0.10184329748153687, + "learning_rate": 2.516542610479834e-05, + "loss": 0.0327, + "step": 89950 + }, + { + "epoch": 0.1983975581838993, + "grad_norm": 0.11672232300043106, + "learning_rate": 2.5164210175518003e-05, + "loss": 0.0334, + "step": 89960 + }, + { + "epoch": 0.19841961215879744, + "grad_norm": 0.0908874124288559, + "learning_rate": 2.5162994122732922e-05, + "loss": 0.0334, + "step": 89970 + }, + { + "epoch": 0.1984416661336956, + "grad_norm": 0.11999296396970749, + "learning_rate": 2.516177794645786e-05, + "loss": 0.0328, + "step": 89980 + }, + { + "epoch": 0.19846372010859378, + "grad_norm": 0.1074289008975029, + "learning_rate": 2.516056164670761e-05, + "loss": 0.0342, + "step": 89990 + }, + { + "epoch": 0.19848577408349194, + "grad_norm": 0.10561694949865341, + "learning_rate": 2.5159345223496936e-05, + "loss": 0.0338, + "step": 90000 + }, + { + "epoch": 0.1985078280583901, + "grad_norm": 0.10584673285484314, + "learning_rate": 2.5158128676840632e-05, + "loss": 0.0311, + "step": 90010 + }, + { + "epoch": 0.19852988203328828, + "grad_norm": 0.10517878085374832, + "learning_rate": 2.515691200675347e-05, + "loss": 0.0333, + "step": 90020 + }, + { + "epoch": 0.19855193600818644, + "grad_norm": 0.10752605646848679, + "learning_rate": 2.5155695213250235e-05, + "loss": 0.0343, + "step": 90030 + }, + { + "epoch": 0.1985739899830846, + "grad_norm": 0.10290884971618652, + "learning_rate": 2.5154478296345716e-05, + "loss": 0.0338, + "step": 90040 + }, + { + "epoch": 0.19859604395798278, + "grad_norm": 0.07920685410499573, + "learning_rate": 2.51532612560547e-05, + "loss": 0.034, + "step": 90050 + }, + { + "epoch": 0.19861809793288093, + "grad_norm": 0.08822523057460785, + "learning_rate": 2.5152044092391975e-05, + "loss": 0.0315, + "step": 90060 + }, + { + "epoch": 0.1986401519077791, + "grad_norm": 0.09199100732803345, + "learning_rate": 2.5150826805372327e-05, + "loss": 0.0318, + "step": 90070 + }, + { + "epoch": 0.19866220588267727, + "grad_norm": 0.1225055605173111, + "learning_rate": 2.5149609395010547e-05, + "loss": 0.0319, + "step": 90080 + }, + { + "epoch": 0.19868425985757543, + "grad_norm": 0.08682624250650406, + "learning_rate": 2.5148391861321434e-05, + "loss": 0.0301, + "step": 90090 + }, + { + "epoch": 0.19870631383247359, + "grad_norm": 0.10695075243711472, + "learning_rate": 2.5147174204319777e-05, + "loss": 0.0317, + "step": 90100 + }, + { + "epoch": 0.19872836780737177, + "grad_norm": 0.11046357452869415, + "learning_rate": 2.5145956424020373e-05, + "loss": 0.0362, + "step": 90110 + }, + { + "epoch": 0.19875042178226993, + "grad_norm": 0.12009332329034805, + "learning_rate": 2.514473852043802e-05, + "loss": 0.0327, + "step": 90120 + }, + { + "epoch": 0.19877247575716808, + "grad_norm": 0.09237553924322128, + "learning_rate": 2.5143520493587514e-05, + "loss": 0.0328, + "step": 90130 + }, + { + "epoch": 0.19879452973206627, + "grad_norm": 0.1429394632577896, + "learning_rate": 2.5142302343483654e-05, + "loss": 0.0327, + "step": 90140 + }, + { + "epoch": 0.19881658370696442, + "grad_norm": 0.11155404150485992, + "learning_rate": 2.5141084070141252e-05, + "loss": 0.0345, + "step": 90150 + }, + { + "epoch": 0.19883863768186258, + "grad_norm": 0.08896283805370331, + "learning_rate": 2.5139865673575098e-05, + "loss": 0.0353, + "step": 90160 + }, + { + "epoch": 0.19886069165676076, + "grad_norm": 0.1153583750128746, + "learning_rate": 2.5138647153800006e-05, + "loss": 0.034, + "step": 90170 + }, + { + "epoch": 0.19888274563165892, + "grad_norm": 0.11311129480600357, + "learning_rate": 2.5137428510830775e-05, + "loss": 0.0316, + "step": 90180 + }, + { + "epoch": 0.19890479960655708, + "grad_norm": 0.09601089358329773, + "learning_rate": 2.513620974468222e-05, + "loss": 0.0305, + "step": 90190 + }, + { + "epoch": 0.19892685358145526, + "grad_norm": 0.13679935038089752, + "learning_rate": 2.5134990855369146e-05, + "loss": 0.0336, + "step": 90200 + }, + { + "epoch": 0.19894890755635342, + "grad_norm": 0.12143616378307343, + "learning_rate": 2.513377184290636e-05, + "loss": 0.0338, + "step": 90210 + }, + { + "epoch": 0.19897096153125157, + "grad_norm": 0.1331164836883545, + "learning_rate": 2.513255270730868e-05, + "loss": 0.0324, + "step": 90220 + }, + { + "epoch": 0.19899301550614976, + "grad_norm": 0.11524458229541779, + "learning_rate": 2.5131333448590917e-05, + "loss": 0.0342, + "step": 90230 + }, + { + "epoch": 0.1990150694810479, + "grad_norm": 0.10343856364488602, + "learning_rate": 2.5130114066767887e-05, + "loss": 0.0342, + "step": 90240 + }, + { + "epoch": 0.19903712345594607, + "grad_norm": 0.10943413525819778, + "learning_rate": 2.512889456185441e-05, + "loss": 0.0334, + "step": 90250 + }, + { + "epoch": 0.19905917743084425, + "grad_norm": 0.09591887146234512, + "learning_rate": 2.5127674933865302e-05, + "loss": 0.0329, + "step": 90260 + }, + { + "epoch": 0.1990812314057424, + "grad_norm": 0.096041239798069, + "learning_rate": 2.5126455182815376e-05, + "loss": 0.0329, + "step": 90270 + }, + { + "epoch": 0.1991032853806406, + "grad_norm": 0.08999054878950119, + "learning_rate": 2.512523530871946e-05, + "loss": 0.0332, + "step": 90280 + }, + { + "epoch": 0.19912533935553875, + "grad_norm": 0.09879177808761597, + "learning_rate": 2.512401531159238e-05, + "loss": 0.0334, + "step": 90290 + }, + { + "epoch": 0.1991473933304369, + "grad_norm": 0.09424732625484467, + "learning_rate": 2.512279519144895e-05, + "loss": 0.0318, + "step": 90300 + }, + { + "epoch": 0.1991694473053351, + "grad_norm": 0.12292057275772095, + "learning_rate": 2.5121574948304006e-05, + "loss": 0.034, + "step": 90310 + }, + { + "epoch": 0.19919150128023325, + "grad_norm": 0.10293864458799362, + "learning_rate": 2.5120354582172363e-05, + "loss": 0.0329, + "step": 90320 + }, + { + "epoch": 0.1992135552551314, + "grad_norm": 0.12078153342008591, + "learning_rate": 2.511913409306886e-05, + "loss": 0.0328, + "step": 90330 + }, + { + "epoch": 0.1992356092300296, + "grad_norm": 0.1034015417098999, + "learning_rate": 2.5117913481008328e-05, + "loss": 0.0335, + "step": 90340 + }, + { + "epoch": 0.19925766320492774, + "grad_norm": 0.09301947802305222, + "learning_rate": 2.5116692746005584e-05, + "loss": 0.0312, + "step": 90350 + }, + { + "epoch": 0.1992797171798259, + "grad_norm": 0.0911988690495491, + "learning_rate": 2.511547188807548e-05, + "loss": 0.033, + "step": 90360 + }, + { + "epoch": 0.19930177115472408, + "grad_norm": 0.13256379961967468, + "learning_rate": 2.511425090723284e-05, + "loss": 0.0328, + "step": 90370 + }, + { + "epoch": 0.19932382512962224, + "grad_norm": 0.1030178815126419, + "learning_rate": 2.5113029803492504e-05, + "loss": 0.0337, + "step": 90380 + }, + { + "epoch": 0.1993458791045204, + "grad_norm": 0.11497621238231659, + "learning_rate": 2.5111808576869303e-05, + "loss": 0.0328, + "step": 90390 + }, + { + "epoch": 0.19936793307941858, + "grad_norm": 0.12191323190927505, + "learning_rate": 2.5110587227378084e-05, + "loss": 0.034, + "step": 90400 + }, + { + "epoch": 0.19938998705431674, + "grad_norm": 0.12549994885921478, + "learning_rate": 2.5109365755033685e-05, + "loss": 0.0315, + "step": 90410 + }, + { + "epoch": 0.1994120410292149, + "grad_norm": 0.11185752600431442, + "learning_rate": 2.5108144159850945e-05, + "loss": 0.0329, + "step": 90420 + }, + { + "epoch": 0.19943409500411308, + "grad_norm": 0.11839693784713745, + "learning_rate": 2.510692244184471e-05, + "loss": 0.0327, + "step": 90430 + }, + { + "epoch": 0.19945614897901123, + "grad_norm": 0.15195217728614807, + "learning_rate": 2.5105700601029825e-05, + "loss": 0.0347, + "step": 90440 + }, + { + "epoch": 0.1994782029539094, + "grad_norm": 0.11192797869443893, + "learning_rate": 2.510447863742114e-05, + "loss": 0.0344, + "step": 90450 + }, + { + "epoch": 0.19950025692880757, + "grad_norm": 0.14347237348556519, + "learning_rate": 2.510325655103349e-05, + "loss": 0.0331, + "step": 90460 + }, + { + "epoch": 0.19952231090370573, + "grad_norm": 0.1002664566040039, + "learning_rate": 2.510203434188174e-05, + "loss": 0.0327, + "step": 90470 + }, + { + "epoch": 0.19954436487860389, + "grad_norm": 0.09818600863218307, + "learning_rate": 2.5100812009980736e-05, + "loss": 0.0329, + "step": 90480 + }, + { + "epoch": 0.19956641885350207, + "grad_norm": 0.11649656295776367, + "learning_rate": 2.5099589555345327e-05, + "loss": 0.0319, + "step": 90490 + }, + { + "epoch": 0.19958847282840023, + "grad_norm": 0.11996065080165863, + "learning_rate": 2.509836697799037e-05, + "loss": 0.0346, + "step": 90500 + }, + { + "epoch": 0.19961052680329838, + "grad_norm": 0.11565268784761429, + "learning_rate": 2.5097144277930722e-05, + "loss": 0.0345, + "step": 90510 + }, + { + "epoch": 0.19963258077819657, + "grad_norm": 0.1281561553478241, + "learning_rate": 2.509592145518123e-05, + "loss": 0.034, + "step": 90520 + }, + { + "epoch": 0.19965463475309472, + "grad_norm": 0.1006859615445137, + "learning_rate": 2.509469850975677e-05, + "loss": 0.0318, + "step": 90530 + }, + { + "epoch": 0.19967668872799288, + "grad_norm": 0.13050059974193573, + "learning_rate": 2.5093475441672186e-05, + "loss": 0.0323, + "step": 90540 + }, + { + "epoch": 0.19969874270289106, + "grad_norm": 0.10796824097633362, + "learning_rate": 2.509225225094235e-05, + "loss": 0.0341, + "step": 90550 + }, + { + "epoch": 0.19972079667778922, + "grad_norm": 0.12284433841705322, + "learning_rate": 2.5091028937582115e-05, + "loss": 0.0316, + "step": 90560 + }, + { + "epoch": 0.19974285065268738, + "grad_norm": 0.10348573327064514, + "learning_rate": 2.5089805501606357e-05, + "loss": 0.0325, + "step": 90570 + }, + { + "epoch": 0.19976490462758556, + "grad_norm": 0.10892778635025024, + "learning_rate": 2.508858194302993e-05, + "loss": 0.0336, + "step": 90580 + }, + { + "epoch": 0.19978695860248372, + "grad_norm": 0.09731637686491013, + "learning_rate": 2.508735826186771e-05, + "loss": 0.0338, + "step": 90590 + }, + { + "epoch": 0.19980901257738187, + "grad_norm": 0.12890997529029846, + "learning_rate": 2.508613445813456e-05, + "loss": 0.0338, + "step": 90600 + }, + { + "epoch": 0.19983106655228006, + "grad_norm": 0.11639820784330368, + "learning_rate": 2.5084910531845362e-05, + "loss": 0.031, + "step": 90610 + }, + { + "epoch": 0.1998531205271782, + "grad_norm": 0.11838064342737198, + "learning_rate": 2.5083686483014968e-05, + "loss": 0.0326, + "step": 90620 + }, + { + "epoch": 0.19987517450207637, + "grad_norm": 0.1200229749083519, + "learning_rate": 2.5082462311658274e-05, + "loss": 0.0326, + "step": 90630 + }, + { + "epoch": 0.19989722847697455, + "grad_norm": 0.13434311747550964, + "learning_rate": 2.5081238017790135e-05, + "loss": 0.0329, + "step": 90640 + }, + { + "epoch": 0.1999192824518727, + "grad_norm": 0.11736005544662476, + "learning_rate": 2.508001360142544e-05, + "loss": 0.0332, + "step": 90650 + }, + { + "epoch": 0.19994133642677087, + "grad_norm": 0.09871197491884232, + "learning_rate": 2.507878906257906e-05, + "loss": 0.0338, + "step": 90660 + }, + { + "epoch": 0.19996339040166905, + "grad_norm": 0.12794548273086548, + "learning_rate": 2.507756440126588e-05, + "loss": 0.0324, + "step": 90670 + }, + { + "epoch": 0.1999854443765672, + "grad_norm": 0.09275364130735397, + "learning_rate": 2.5076339617500777e-05, + "loss": 0.0313, + "step": 90680 + }, + { + "epoch": 0.20000749835146536, + "grad_norm": 0.11589783430099487, + "learning_rate": 2.507511471129863e-05, + "loss": 0.0327, + "step": 90690 + }, + { + "epoch": 0.20002955232636355, + "grad_norm": 0.14230160415172577, + "learning_rate": 2.507388968267433e-05, + "loss": 0.032, + "step": 90700 + }, + { + "epoch": 0.2000516063012617, + "grad_norm": 0.09959530085325241, + "learning_rate": 2.5072664531642756e-05, + "loss": 0.0331, + "step": 90710 + }, + { + "epoch": 0.2000736602761599, + "grad_norm": 0.142619326710701, + "learning_rate": 2.5071439258218802e-05, + "loss": 0.0327, + "step": 90720 + }, + { + "epoch": 0.20009571425105804, + "grad_norm": 0.14757795631885529, + "learning_rate": 2.5070213862417354e-05, + "loss": 0.0323, + "step": 90730 + }, + { + "epoch": 0.2001177682259562, + "grad_norm": 0.10339643061161041, + "learning_rate": 2.5068988344253296e-05, + "loss": 0.0341, + "step": 90740 + }, + { + "epoch": 0.20013982220085438, + "grad_norm": 0.11211787909269333, + "learning_rate": 2.5067762703741522e-05, + "loss": 0.034, + "step": 90750 + }, + { + "epoch": 0.20016187617575254, + "grad_norm": 0.12045348435640335, + "learning_rate": 2.5066536940896927e-05, + "loss": 0.0329, + "step": 90760 + }, + { + "epoch": 0.2001839301506507, + "grad_norm": 0.11791035532951355, + "learning_rate": 2.5065311055734404e-05, + "loss": 0.0356, + "step": 90770 + }, + { + "epoch": 0.20020598412554888, + "grad_norm": 0.10668296366930008, + "learning_rate": 2.5064085048268845e-05, + "loss": 0.0317, + "step": 90780 + }, + { + "epoch": 0.20022803810044704, + "grad_norm": 0.09242177754640579, + "learning_rate": 2.5062858918515157e-05, + "loss": 0.0324, + "step": 90790 + }, + { + "epoch": 0.2002500920753452, + "grad_norm": 0.10728146880865097, + "learning_rate": 2.5061632666488225e-05, + "loss": 0.0329, + "step": 90800 + }, + { + "epoch": 0.20027214605024338, + "grad_norm": 0.09797323495149612, + "learning_rate": 2.5060406292202963e-05, + "loss": 0.0338, + "step": 90810 + }, + { + "epoch": 0.20029420002514153, + "grad_norm": 0.11705458909273148, + "learning_rate": 2.5059179795674265e-05, + "loss": 0.0315, + "step": 90820 + }, + { + "epoch": 0.2003162540000397, + "grad_norm": 0.11045743525028229, + "learning_rate": 2.5057953176917032e-05, + "loss": 0.033, + "step": 90830 + }, + { + "epoch": 0.20033830797493787, + "grad_norm": 0.09985888749361038, + "learning_rate": 2.5056726435946175e-05, + "loss": 0.033, + "step": 90840 + }, + { + "epoch": 0.20036036194983603, + "grad_norm": 0.10683353990316391, + "learning_rate": 2.5055499572776593e-05, + "loss": 0.0353, + "step": 90850 + }, + { + "epoch": 0.2003824159247342, + "grad_norm": 0.12122286111116409, + "learning_rate": 2.50542725874232e-05, + "loss": 0.0323, + "step": 90860 + }, + { + "epoch": 0.20040446989963237, + "grad_norm": 0.10680822283029556, + "learning_rate": 2.5053045479900897e-05, + "loss": 0.0319, + "step": 90870 + }, + { + "epoch": 0.20042652387453053, + "grad_norm": 0.10782060772180557, + "learning_rate": 2.5051818250224604e-05, + "loss": 0.0321, + "step": 90880 + }, + { + "epoch": 0.20044857784942868, + "grad_norm": 0.09093503654003143, + "learning_rate": 2.505059089840923e-05, + "loss": 0.0315, + "step": 90890 + }, + { + "epoch": 0.20047063182432687, + "grad_norm": 0.09148510545492172, + "learning_rate": 2.5049363424469684e-05, + "loss": 0.0318, + "step": 90900 + }, + { + "epoch": 0.20049268579922502, + "grad_norm": 0.09743491560220718, + "learning_rate": 2.5048135828420886e-05, + "loss": 0.0343, + "step": 90910 + }, + { + "epoch": 0.20051473977412318, + "grad_norm": 0.1193072572350502, + "learning_rate": 2.504690811027775e-05, + "loss": 0.0324, + "step": 90920 + }, + { + "epoch": 0.20053679374902136, + "grad_norm": 0.13401587307453156, + "learning_rate": 2.5045680270055196e-05, + "loss": 0.0322, + "step": 90930 + }, + { + "epoch": 0.20055884772391952, + "grad_norm": 0.11481829732656479, + "learning_rate": 2.504445230776814e-05, + "loss": 0.0322, + "step": 90940 + }, + { + "epoch": 0.20058090169881768, + "grad_norm": 0.12223388999700546, + "learning_rate": 2.504322422343151e-05, + "loss": 0.0339, + "step": 90950 + }, + { + "epoch": 0.20060295567371586, + "grad_norm": 0.11566653102636337, + "learning_rate": 2.5041996017060217e-05, + "loss": 0.0324, + "step": 90960 + }, + { + "epoch": 0.20062500964861402, + "grad_norm": 0.09586793929338455, + "learning_rate": 2.5040767688669196e-05, + "loss": 0.0335, + "step": 90970 + }, + { + "epoch": 0.20064706362351217, + "grad_norm": 0.09717848896980286, + "learning_rate": 2.5039539238273363e-05, + "loss": 0.0341, + "step": 90980 + }, + { + "epoch": 0.20066911759841036, + "grad_norm": 0.09468330442905426, + "learning_rate": 2.5038310665887648e-05, + "loss": 0.0316, + "step": 90990 + }, + { + "epoch": 0.2006911715733085, + "grad_norm": 0.10572487860918045, + "learning_rate": 2.5037081971526986e-05, + "loss": 0.0331, + "step": 91000 + }, + { + "epoch": 0.20071322554820667, + "grad_norm": 0.12143426388502121, + "learning_rate": 2.5035853155206302e-05, + "loss": 0.0318, + "step": 91010 + }, + { + "epoch": 0.20073527952310485, + "grad_norm": 0.10284112393856049, + "learning_rate": 2.5034624216940525e-05, + "loss": 0.0341, + "step": 91020 + }, + { + "epoch": 0.200757333498003, + "grad_norm": 0.10660416632890701, + "learning_rate": 2.5033395156744588e-05, + "loss": 0.032, + "step": 91030 + }, + { + "epoch": 0.20077938747290117, + "grad_norm": 0.13610823452472687, + "learning_rate": 2.5032165974633428e-05, + "loss": 0.0329, + "step": 91040 + }, + { + "epoch": 0.20080144144779935, + "grad_norm": 0.08614375442266464, + "learning_rate": 2.503093667062198e-05, + "loss": 0.0332, + "step": 91050 + }, + { + "epoch": 0.2008234954226975, + "grad_norm": 0.09882313013076782, + "learning_rate": 2.502970724472518e-05, + "loss": 0.0334, + "step": 91060 + }, + { + "epoch": 0.20084554939759566, + "grad_norm": 0.13005580008029938, + "learning_rate": 2.5028477696957967e-05, + "loss": 0.032, + "step": 91070 + }, + { + "epoch": 0.20086760337249385, + "grad_norm": 0.09586111456155777, + "learning_rate": 2.502724802733528e-05, + "loss": 0.0347, + "step": 91080 + }, + { + "epoch": 0.200889657347392, + "grad_norm": 0.1032654270529747, + "learning_rate": 2.502601823587207e-05, + "loss": 0.0336, + "step": 91090 + }, + { + "epoch": 0.20091171132229016, + "grad_norm": 0.11268588900566101, + "learning_rate": 2.502478832258326e-05, + "loss": 0.033, + "step": 91100 + }, + { + "epoch": 0.20093376529718834, + "grad_norm": 0.12215445935726166, + "learning_rate": 2.5023558287483816e-05, + "loss": 0.0323, + "step": 91110 + }, + { + "epoch": 0.2009558192720865, + "grad_norm": 0.09130201488733292, + "learning_rate": 2.502232813058867e-05, + "loss": 0.0312, + "step": 91120 + }, + { + "epoch": 0.20097787324698468, + "grad_norm": 0.09468741714954376, + "learning_rate": 2.5021097851912774e-05, + "loss": 0.0327, + "step": 91130 + }, + { + "epoch": 0.20099992722188284, + "grad_norm": 0.11081244796514511, + "learning_rate": 2.5019867451471082e-05, + "loss": 0.0328, + "step": 91140 + }, + { + "epoch": 0.201021981196781, + "grad_norm": 0.11076118052005768, + "learning_rate": 2.5018636929278542e-05, + "loss": 0.0328, + "step": 91150 + }, + { + "epoch": 0.20104403517167918, + "grad_norm": 0.12695807218551636, + "learning_rate": 2.50174062853501e-05, + "loss": 0.0339, + "step": 91160 + }, + { + "epoch": 0.20106608914657734, + "grad_norm": 0.10176286846399307, + "learning_rate": 2.501617551970071e-05, + "loss": 0.0326, + "step": 91170 + }, + { + "epoch": 0.2010881431214755, + "grad_norm": 0.10891919583082199, + "learning_rate": 2.501494463234534e-05, + "loss": 0.0331, + "step": 91180 + }, + { + "epoch": 0.20111019709637368, + "grad_norm": 0.09278804063796997, + "learning_rate": 2.501371362329893e-05, + "loss": 0.0328, + "step": 91190 + }, + { + "epoch": 0.20113225107127183, + "grad_norm": 0.10685782879590988, + "learning_rate": 2.5012482492576448e-05, + "loss": 0.0336, + "step": 91200 + }, + { + "epoch": 0.20115430504617, + "grad_norm": 0.11526663601398468, + "learning_rate": 2.5011251240192846e-05, + "loss": 0.0343, + "step": 91210 + }, + { + "epoch": 0.20117635902106817, + "grad_norm": 0.09871531277894974, + "learning_rate": 2.5010019866163093e-05, + "loss": 0.0338, + "step": 91220 + }, + { + "epoch": 0.20119841299596633, + "grad_norm": 0.11642493307590485, + "learning_rate": 2.500878837050215e-05, + "loss": 0.0327, + "step": 91230 + }, + { + "epoch": 0.2012204669708645, + "grad_norm": 0.12475372105836868, + "learning_rate": 2.5007556753224974e-05, + "loss": 0.0335, + "step": 91240 + }, + { + "epoch": 0.20124252094576267, + "grad_norm": 0.1048172265291214, + "learning_rate": 2.5006325014346532e-05, + "loss": 0.0329, + "step": 91250 + }, + { + "epoch": 0.20126457492066083, + "grad_norm": 0.0985722467303276, + "learning_rate": 2.50050931538818e-05, + "loss": 0.0324, + "step": 91260 + }, + { + "epoch": 0.20128662889555898, + "grad_norm": 0.17715243995189667, + "learning_rate": 2.500386117184574e-05, + "loss": 0.0328, + "step": 91270 + }, + { + "epoch": 0.20130868287045717, + "grad_norm": 0.09598474204540253, + "learning_rate": 2.500262906825332e-05, + "loss": 0.0336, + "step": 91280 + }, + { + "epoch": 0.20133073684535532, + "grad_norm": 0.10335735976696014, + "learning_rate": 2.500139684311951e-05, + "loss": 0.0323, + "step": 91290 + }, + { + "epoch": 0.20135279082025348, + "grad_norm": 0.09313292056322098, + "learning_rate": 2.5000164496459288e-05, + "loss": 0.0328, + "step": 91300 + }, + { + "epoch": 0.20137484479515166, + "grad_norm": 0.10859379172325134, + "learning_rate": 2.4998932028287624e-05, + "loss": 0.0328, + "step": 91310 + }, + { + "epoch": 0.20139689877004982, + "grad_norm": 0.08695786446332932, + "learning_rate": 2.4997699438619498e-05, + "loss": 0.0318, + "step": 91320 + }, + { + "epoch": 0.20141895274494798, + "grad_norm": 0.11066855490207672, + "learning_rate": 2.499646672746988e-05, + "loss": 0.0327, + "step": 91330 + }, + { + "epoch": 0.20144100671984616, + "grad_norm": 0.11746161431074142, + "learning_rate": 2.499523389485376e-05, + "loss": 0.0341, + "step": 91340 + }, + { + "epoch": 0.20146306069474432, + "grad_norm": 0.1056126207113266, + "learning_rate": 2.4994000940786103e-05, + "loss": 0.0331, + "step": 91350 + }, + { + "epoch": 0.20148511466964247, + "grad_norm": 0.08920740336179733, + "learning_rate": 2.4992767865281905e-05, + "loss": 0.031, + "step": 91360 + }, + { + "epoch": 0.20150716864454066, + "grad_norm": 0.1228218823671341, + "learning_rate": 2.4991534668356137e-05, + "loss": 0.0342, + "step": 91370 + }, + { + "epoch": 0.2015292226194388, + "grad_norm": 0.10639329254627228, + "learning_rate": 2.4990301350023795e-05, + "loss": 0.0329, + "step": 91380 + }, + { + "epoch": 0.20155127659433697, + "grad_norm": 0.11180827766656876, + "learning_rate": 2.4989067910299858e-05, + "loss": 0.0328, + "step": 91390 + }, + { + "epoch": 0.20157333056923515, + "grad_norm": 0.0947694182395935, + "learning_rate": 2.498783434919931e-05, + "loss": 0.0329, + "step": 91400 + }, + { + "epoch": 0.2015953845441333, + "grad_norm": 0.11038266867399216, + "learning_rate": 2.498660066673715e-05, + "loss": 0.0331, + "step": 91410 + }, + { + "epoch": 0.20161743851903147, + "grad_norm": 0.14857399463653564, + "learning_rate": 2.498536686292836e-05, + "loss": 0.0322, + "step": 91420 + }, + { + "epoch": 0.20163949249392965, + "grad_norm": 0.10622338205575943, + "learning_rate": 2.4984132937787938e-05, + "loss": 0.0312, + "step": 91430 + }, + { + "epoch": 0.2016615464688278, + "grad_norm": 0.10437152534723282, + "learning_rate": 2.498289889133087e-05, + "loss": 0.0329, + "step": 91440 + }, + { + "epoch": 0.20168360044372596, + "grad_norm": 0.10697855800390244, + "learning_rate": 2.498166472357216e-05, + "loss": 0.033, + "step": 91450 + }, + { + "epoch": 0.20170565441862415, + "grad_norm": 0.10714263468980789, + "learning_rate": 2.4980430434526795e-05, + "loss": 0.032, + "step": 91460 + }, + { + "epoch": 0.2017277083935223, + "grad_norm": 0.10234901309013367, + "learning_rate": 2.497919602420978e-05, + "loss": 0.0324, + "step": 91470 + }, + { + "epoch": 0.20174976236842046, + "grad_norm": 0.16848528385162354, + "learning_rate": 2.497796149263611e-05, + "loss": 0.0309, + "step": 91480 + }, + { + "epoch": 0.20177181634331864, + "grad_norm": 0.0928460955619812, + "learning_rate": 2.4976726839820786e-05, + "loss": 0.0338, + "step": 91490 + }, + { + "epoch": 0.2017938703182168, + "grad_norm": 0.14054515957832336, + "learning_rate": 2.497549206577881e-05, + "loss": 0.0317, + "step": 91500 + }, + { + "epoch": 0.20181592429311496, + "grad_norm": 0.09137681126594543, + "learning_rate": 2.497425717052519e-05, + "loss": 0.034, + "step": 91510 + }, + { + "epoch": 0.20183797826801314, + "grad_norm": 0.11400552839040756, + "learning_rate": 2.497302215407493e-05, + "loss": 0.0329, + "step": 91520 + }, + { + "epoch": 0.2018600322429113, + "grad_norm": 0.13127003610134125, + "learning_rate": 2.4971787016443032e-05, + "loss": 0.0323, + "step": 91530 + }, + { + "epoch": 0.20188208621780945, + "grad_norm": 0.12118404358625412, + "learning_rate": 2.4970551757644504e-05, + "loss": 0.0329, + "step": 91540 + }, + { + "epoch": 0.20190414019270764, + "grad_norm": 0.11333578824996948, + "learning_rate": 2.496931637769436e-05, + "loss": 0.0324, + "step": 91550 + }, + { + "epoch": 0.2019261941676058, + "grad_norm": 0.10583750903606415, + "learning_rate": 2.4968080876607612e-05, + "loss": 0.032, + "step": 91560 + }, + { + "epoch": 0.20194824814250398, + "grad_norm": 0.11451488733291626, + "learning_rate": 2.4966845254399267e-05, + "loss": 0.0314, + "step": 91570 + }, + { + "epoch": 0.20197030211740213, + "grad_norm": 0.12230340391397476, + "learning_rate": 2.4965609511084347e-05, + "loss": 0.0318, + "step": 91580 + }, + { + "epoch": 0.2019923560923003, + "grad_norm": 0.12279172241687775, + "learning_rate": 2.4964373646677858e-05, + "loss": 0.0323, + "step": 91590 + }, + { + "epoch": 0.20201441006719847, + "grad_norm": 0.09729668498039246, + "learning_rate": 2.496313766119482e-05, + "loss": 0.0319, + "step": 91600 + }, + { + "epoch": 0.20203646404209663, + "grad_norm": 0.10944455116987228, + "learning_rate": 2.4961901554650257e-05, + "loss": 0.0331, + "step": 91610 + }, + { + "epoch": 0.2020585180169948, + "grad_norm": 0.1471790373325348, + "learning_rate": 2.496066532705918e-05, + "loss": 0.0326, + "step": 91620 + }, + { + "epoch": 0.20208057199189297, + "grad_norm": 0.14157606661319733, + "learning_rate": 2.4959428978436614e-05, + "loss": 0.0323, + "step": 91630 + }, + { + "epoch": 0.20210262596679113, + "grad_norm": 0.13698439300060272, + "learning_rate": 2.4958192508797593e-05, + "loss": 0.0341, + "step": 91640 + }, + { + "epoch": 0.20212467994168928, + "grad_norm": 0.11475858092308044, + "learning_rate": 2.495695591815712e-05, + "loss": 0.0324, + "step": 91650 + }, + { + "epoch": 0.20214673391658747, + "grad_norm": 0.11543145030736923, + "learning_rate": 2.495571920653024e-05, + "loss": 0.0316, + "step": 91660 + }, + { + "epoch": 0.20216878789148562, + "grad_norm": 0.10095986723899841, + "learning_rate": 2.495448237393196e-05, + "loss": 0.0313, + "step": 91670 + }, + { + "epoch": 0.20219084186638378, + "grad_norm": 0.11102127283811569, + "learning_rate": 2.4953245420377332e-05, + "loss": 0.0335, + "step": 91680 + }, + { + "epoch": 0.20221289584128196, + "grad_norm": 0.10766879469156265, + "learning_rate": 2.495200834588137e-05, + "loss": 0.0342, + "step": 91690 + }, + { + "epoch": 0.20223494981618012, + "grad_norm": 0.11165275424718857, + "learning_rate": 2.4950771150459112e-05, + "loss": 0.033, + "step": 91700 + }, + { + "epoch": 0.20225700379107828, + "grad_norm": 0.11366568505764008, + "learning_rate": 2.4949533834125587e-05, + "loss": 0.0345, + "step": 91710 + }, + { + "epoch": 0.20227905776597646, + "grad_norm": 0.11484935134649277, + "learning_rate": 2.4948296396895833e-05, + "loss": 0.0333, + "step": 91720 + }, + { + "epoch": 0.20230111174087462, + "grad_norm": 0.12338677048683167, + "learning_rate": 2.4947058838784888e-05, + "loss": 0.0335, + "step": 91730 + }, + { + "epoch": 0.20232316571577277, + "grad_norm": 0.11619085818529129, + "learning_rate": 2.4945821159807783e-05, + "loss": 0.032, + "step": 91740 + }, + { + "epoch": 0.20234521969067096, + "grad_norm": 0.10876918584108353, + "learning_rate": 2.4944583359979562e-05, + "loss": 0.0344, + "step": 91750 + }, + { + "epoch": 0.20236727366556911, + "grad_norm": 0.10000599175691605, + "learning_rate": 2.4943345439315263e-05, + "loss": 0.0349, + "step": 91760 + }, + { + "epoch": 0.20238932764046727, + "grad_norm": 0.09848153591156006, + "learning_rate": 2.4942107397829927e-05, + "loss": 0.0313, + "step": 91770 + }, + { + "epoch": 0.20241138161536545, + "grad_norm": 0.11047166585922241, + "learning_rate": 2.4940869235538604e-05, + "loss": 0.0319, + "step": 91780 + }, + { + "epoch": 0.2024334355902636, + "grad_norm": 0.1187826469540596, + "learning_rate": 2.493963095245633e-05, + "loss": 0.0329, + "step": 91790 + }, + { + "epoch": 0.20245548956516177, + "grad_norm": 0.13467542827129364, + "learning_rate": 2.4938392548598157e-05, + "loss": 0.0339, + "step": 91800 + }, + { + "epoch": 0.20247754354005995, + "grad_norm": 0.1116815134882927, + "learning_rate": 2.493715402397913e-05, + "loss": 0.0337, + "step": 91810 + }, + { + "epoch": 0.2024995975149581, + "grad_norm": 0.10875142365694046, + "learning_rate": 2.4935915378614302e-05, + "loss": 0.0335, + "step": 91820 + }, + { + "epoch": 0.20252165148985626, + "grad_norm": 0.12551367282867432, + "learning_rate": 2.4934676612518718e-05, + "loss": 0.0322, + "step": 91830 + }, + { + "epoch": 0.20254370546475445, + "grad_norm": 0.09168051183223724, + "learning_rate": 2.493343772570744e-05, + "loss": 0.0328, + "step": 91840 + }, + { + "epoch": 0.2025657594396526, + "grad_norm": 0.22240833938121796, + "learning_rate": 2.4932198718195505e-05, + "loss": 0.0332, + "step": 91850 + }, + { + "epoch": 0.20258781341455076, + "grad_norm": 0.10362622886896133, + "learning_rate": 2.4930959589997984e-05, + "loss": 0.0333, + "step": 91860 + }, + { + "epoch": 0.20260986738944894, + "grad_norm": 0.11782988160848618, + "learning_rate": 2.4929720341129927e-05, + "loss": 0.0345, + "step": 91870 + }, + { + "epoch": 0.2026319213643471, + "grad_norm": 0.10603398084640503, + "learning_rate": 2.4928480971606393e-05, + "loss": 0.0331, + "step": 91880 + }, + { + "epoch": 0.20265397533924526, + "grad_norm": 0.1059839203953743, + "learning_rate": 2.4927241481442437e-05, + "loss": 0.0339, + "step": 91890 + }, + { + "epoch": 0.20267602931414344, + "grad_norm": 0.11447340995073318, + "learning_rate": 2.492600187065313e-05, + "loss": 0.0342, + "step": 91900 + }, + { + "epoch": 0.2026980832890416, + "grad_norm": 0.11584513634443283, + "learning_rate": 2.4924762139253526e-05, + "loss": 0.0325, + "step": 91910 + }, + { + "epoch": 0.20272013726393975, + "grad_norm": 0.1239982396364212, + "learning_rate": 2.4923522287258688e-05, + "loss": 0.0334, + "step": 91920 + }, + { + "epoch": 0.20274219123883794, + "grad_norm": 0.1094663068652153, + "learning_rate": 2.492228231468369e-05, + "loss": 0.0326, + "step": 91930 + }, + { + "epoch": 0.2027642452137361, + "grad_norm": 0.0927378311753273, + "learning_rate": 2.492104222154359e-05, + "loss": 0.0331, + "step": 91940 + }, + { + "epoch": 0.20278629918863425, + "grad_norm": 0.11611949652433395, + "learning_rate": 2.4919802007853463e-05, + "loss": 0.0323, + "step": 91950 + }, + { + "epoch": 0.20280835316353243, + "grad_norm": 0.10566096007823944, + "learning_rate": 2.4918561673628375e-05, + "loss": 0.0334, + "step": 91960 + }, + { + "epoch": 0.2028304071384306, + "grad_norm": 0.11695921421051025, + "learning_rate": 2.49173212188834e-05, + "loss": 0.0327, + "step": 91970 + }, + { + "epoch": 0.20285246111332875, + "grad_norm": 0.13404476642608643, + "learning_rate": 2.4916080643633606e-05, + "loss": 0.0344, + "step": 91980 + }, + { + "epoch": 0.20287451508822693, + "grad_norm": 0.13024963438510895, + "learning_rate": 2.4914839947894067e-05, + "loss": 0.035, + "step": 91990 + }, + { + "epoch": 0.2028965690631251, + "grad_norm": 0.10724122077226639, + "learning_rate": 2.4913599131679863e-05, + "loss": 0.0326, + "step": 92000 + }, + { + "epoch": 0.20291862303802327, + "grad_norm": 0.10937409847974777, + "learning_rate": 2.491235819500607e-05, + "loss": 0.0349, + "step": 92010 + }, + { + "epoch": 0.20294067701292143, + "grad_norm": 0.12068389356136322, + "learning_rate": 2.491111713788777e-05, + "loss": 0.0327, + "step": 92020 + }, + { + "epoch": 0.20296273098781958, + "grad_norm": 0.08642752468585968, + "learning_rate": 2.4909875960340038e-05, + "loss": 0.0326, + "step": 92030 + }, + { + "epoch": 0.20298478496271777, + "grad_norm": 0.10865237563848495, + "learning_rate": 2.4908634662377952e-05, + "loss": 0.0332, + "step": 92040 + }, + { + "epoch": 0.20300683893761592, + "grad_norm": 0.11606119573116302, + "learning_rate": 2.4907393244016605e-05, + "loss": 0.0327, + "step": 92050 + }, + { + "epoch": 0.20302889291251408, + "grad_norm": 0.10195697844028473, + "learning_rate": 2.4906151705271075e-05, + "loss": 0.0332, + "step": 92060 + }, + { + "epoch": 0.20305094688741226, + "grad_norm": 0.08872830867767334, + "learning_rate": 2.4904910046156445e-05, + "loss": 0.0308, + "step": 92070 + }, + { + "epoch": 0.20307300086231042, + "grad_norm": 0.12917163968086243, + "learning_rate": 2.4903668266687808e-05, + "loss": 0.0333, + "step": 92080 + }, + { + "epoch": 0.20309505483720858, + "grad_norm": 0.10974337160587311, + "learning_rate": 2.490242636688025e-05, + "loss": 0.0323, + "step": 92090 + }, + { + "epoch": 0.20311710881210676, + "grad_norm": 0.1300838589668274, + "learning_rate": 2.4901184346748862e-05, + "loss": 0.0331, + "step": 92100 + }, + { + "epoch": 0.20313916278700492, + "grad_norm": 0.09201396256685257, + "learning_rate": 2.4899942206308742e-05, + "loss": 0.0326, + "step": 92110 + }, + { + "epoch": 0.20316121676190307, + "grad_norm": 0.09967531263828278, + "learning_rate": 2.4898699945574975e-05, + "loss": 0.033, + "step": 92120 + }, + { + "epoch": 0.20318327073680126, + "grad_norm": 0.10110315680503845, + "learning_rate": 2.489745756456265e-05, + "loss": 0.0323, + "step": 92130 + }, + { + "epoch": 0.20320532471169941, + "grad_norm": 0.12003074586391449, + "learning_rate": 2.4896215063286878e-05, + "loss": 0.0336, + "step": 92140 + }, + { + "epoch": 0.20322737868659757, + "grad_norm": 0.09512102603912354, + "learning_rate": 2.4894972441762747e-05, + "loss": 0.0345, + "step": 92150 + }, + { + "epoch": 0.20324943266149575, + "grad_norm": 0.10862942785024643, + "learning_rate": 2.489372970000536e-05, + "loss": 0.033, + "step": 92160 + }, + { + "epoch": 0.2032714866363939, + "grad_norm": 0.10557430982589722, + "learning_rate": 2.4892486838029815e-05, + "loss": 0.0328, + "step": 92170 + }, + { + "epoch": 0.20329354061129207, + "grad_norm": 0.10377911478281021, + "learning_rate": 2.4891243855851215e-05, + "loss": 0.033, + "step": 92180 + }, + { + "epoch": 0.20331559458619025, + "grad_norm": 0.1072826012969017, + "learning_rate": 2.4890000753484664e-05, + "loss": 0.0323, + "step": 92190 + }, + { + "epoch": 0.2033376485610884, + "grad_norm": 0.13127191364765167, + "learning_rate": 2.4888757530945263e-05, + "loss": 0.0334, + "step": 92200 + }, + { + "epoch": 0.20335970253598656, + "grad_norm": 0.12143534421920776, + "learning_rate": 2.4887514188248123e-05, + "loss": 0.0324, + "step": 92210 + }, + { + "epoch": 0.20338175651088475, + "grad_norm": 0.09116606414318085, + "learning_rate": 2.488627072540835e-05, + "loss": 0.0315, + "step": 92220 + }, + { + "epoch": 0.2034038104857829, + "grad_norm": 0.11583053320646286, + "learning_rate": 2.488502714244106e-05, + "loss": 0.0323, + "step": 92230 + }, + { + "epoch": 0.20342586446068106, + "grad_norm": 0.11124023050069809, + "learning_rate": 2.488378343936135e-05, + "loss": 0.0318, + "step": 92240 + }, + { + "epoch": 0.20344791843557924, + "grad_norm": 0.11221718788146973, + "learning_rate": 2.488253961618434e-05, + "loss": 0.0354, + "step": 92250 + }, + { + "epoch": 0.2034699724104774, + "grad_norm": 0.145563006401062, + "learning_rate": 2.488129567292514e-05, + "loss": 0.0314, + "step": 92260 + }, + { + "epoch": 0.20349202638537556, + "grad_norm": 0.12798507511615753, + "learning_rate": 2.4880051609598877e-05, + "loss": 0.0335, + "step": 92270 + }, + { + "epoch": 0.20351408036027374, + "grad_norm": 0.10710804909467697, + "learning_rate": 2.4878807426220656e-05, + "loss": 0.0328, + "step": 92280 + }, + { + "epoch": 0.2035361343351719, + "grad_norm": 0.09039487689733505, + "learning_rate": 2.487756312280559e-05, + "loss": 0.0324, + "step": 92290 + }, + { + "epoch": 0.20355818831007005, + "grad_norm": 0.12190763652324677, + "learning_rate": 2.4876318699368813e-05, + "loss": 0.0341, + "step": 92300 + }, + { + "epoch": 0.20358024228496824, + "grad_norm": 0.08915936201810837, + "learning_rate": 2.487507415592544e-05, + "loss": 0.0331, + "step": 92310 + }, + { + "epoch": 0.2036022962598664, + "grad_norm": 0.10379093140363693, + "learning_rate": 2.487382949249059e-05, + "loss": 0.0342, + "step": 92320 + }, + { + "epoch": 0.20362435023476455, + "grad_norm": 0.10044589638710022, + "learning_rate": 2.4872584709079387e-05, + "loss": 0.031, + "step": 92330 + }, + { + "epoch": 0.20364640420966273, + "grad_norm": 0.09626834839582443, + "learning_rate": 2.487133980570696e-05, + "loss": 0.0327, + "step": 92340 + }, + { + "epoch": 0.2036684581845609, + "grad_norm": 0.11704234778881073, + "learning_rate": 2.4870094782388437e-05, + "loss": 0.0331, + "step": 92350 + }, + { + "epoch": 0.20369051215945905, + "grad_norm": 0.13047917187213898, + "learning_rate": 2.4868849639138938e-05, + "loss": 0.0332, + "step": 92360 + }, + { + "epoch": 0.20371256613435723, + "grad_norm": 0.10607343167066574, + "learning_rate": 2.4867604375973605e-05, + "loss": 0.0332, + "step": 92370 + }, + { + "epoch": 0.2037346201092554, + "grad_norm": 0.11761074513196945, + "learning_rate": 2.4866358992907553e-05, + "loss": 0.0339, + "step": 92380 + }, + { + "epoch": 0.20375667408415354, + "grad_norm": 0.11777419596910477, + "learning_rate": 2.4865113489955935e-05, + "loss": 0.0316, + "step": 92390 + }, + { + "epoch": 0.20377872805905173, + "grad_norm": 0.08334068953990936, + "learning_rate": 2.4863867867133866e-05, + "loss": 0.0323, + "step": 92400 + }, + { + "epoch": 0.20380078203394988, + "grad_norm": 0.12873980402946472, + "learning_rate": 2.486262212445649e-05, + "loss": 0.032, + "step": 92410 + }, + { + "epoch": 0.20382283600884807, + "grad_norm": 0.08159096539020538, + "learning_rate": 2.4861376261938945e-05, + "loss": 0.0327, + "step": 92420 + }, + { + "epoch": 0.20384488998374622, + "grad_norm": 0.08773428201675415, + "learning_rate": 2.4860130279596372e-05, + "loss": 0.0347, + "step": 92430 + }, + { + "epoch": 0.20386694395864438, + "grad_norm": 0.12306006997823715, + "learning_rate": 2.4858884177443902e-05, + "loss": 0.0318, + "step": 92440 + }, + { + "epoch": 0.20388899793354257, + "grad_norm": 0.10646256804466248, + "learning_rate": 2.485763795549668e-05, + "loss": 0.0341, + "step": 92450 + }, + { + "epoch": 0.20391105190844072, + "grad_norm": 0.09461409598588943, + "learning_rate": 2.4856391613769852e-05, + "loss": 0.0304, + "step": 92460 + }, + { + "epoch": 0.20393310588333888, + "grad_norm": 0.1020943745970726, + "learning_rate": 2.485514515227856e-05, + "loss": 0.0341, + "step": 92470 + }, + { + "epoch": 0.20395515985823706, + "grad_norm": 0.13079705834388733, + "learning_rate": 2.4853898571037948e-05, + "loss": 0.0314, + "step": 92480 + }, + { + "epoch": 0.20397721383313522, + "grad_norm": 0.11465110629796982, + "learning_rate": 2.4852651870063163e-05, + "loss": 0.0331, + "step": 92490 + }, + { + "epoch": 0.20399926780803337, + "grad_norm": 0.10291781276464462, + "learning_rate": 2.485140504936936e-05, + "loss": 0.0335, + "step": 92500 + }, + { + "epoch": 0.20402132178293156, + "grad_norm": 0.10135488957166672, + "learning_rate": 2.4850158108971683e-05, + "loss": 0.0343, + "step": 92510 + }, + { + "epoch": 0.20404337575782971, + "grad_norm": 0.11505229771137238, + "learning_rate": 2.4848911048885282e-05, + "loss": 0.035, + "step": 92520 + }, + { + "epoch": 0.20406542973272787, + "grad_norm": 0.10446292161941528, + "learning_rate": 2.4847663869125316e-05, + "loss": 0.0327, + "step": 92530 + }, + { + "epoch": 0.20408748370762606, + "grad_norm": 0.12249580025672913, + "learning_rate": 2.4846416569706934e-05, + "loss": 0.0327, + "step": 92540 + }, + { + "epoch": 0.2041095376825242, + "grad_norm": 0.10015030205249786, + "learning_rate": 2.48451691506453e-05, + "loss": 0.0325, + "step": 92550 + }, + { + "epoch": 0.20413159165742237, + "grad_norm": 0.11387860029935837, + "learning_rate": 2.4843921611955556e-05, + "loss": 0.0319, + "step": 92560 + }, + { + "epoch": 0.20415364563232055, + "grad_norm": 0.09026794880628586, + "learning_rate": 2.4842673953652875e-05, + "loss": 0.0336, + "step": 92570 + }, + { + "epoch": 0.2041756996072187, + "grad_norm": 0.11002852767705917, + "learning_rate": 2.484142617575241e-05, + "loss": 0.034, + "step": 92580 + }, + { + "epoch": 0.20419775358211686, + "grad_norm": 0.09316661953926086, + "learning_rate": 2.484017827826933e-05, + "loss": 0.0319, + "step": 92590 + }, + { + "epoch": 0.20421980755701505, + "grad_norm": 0.0950249433517456, + "learning_rate": 2.4838930261218794e-05, + "loss": 0.0322, + "step": 92600 + }, + { + "epoch": 0.2042418615319132, + "grad_norm": 0.11101856827735901, + "learning_rate": 2.483768212461596e-05, + "loss": 0.0331, + "step": 92610 + }, + { + "epoch": 0.20426391550681136, + "grad_norm": 0.11025779694318771, + "learning_rate": 2.4836433868476e-05, + "loss": 0.033, + "step": 92620 + }, + { + "epoch": 0.20428596948170955, + "grad_norm": 0.12967468798160553, + "learning_rate": 2.4835185492814087e-05, + "loss": 0.0322, + "step": 92630 + }, + { + "epoch": 0.2043080234566077, + "grad_norm": 0.09934818744659424, + "learning_rate": 2.4833936997645385e-05, + "loss": 0.0338, + "step": 92640 + }, + { + "epoch": 0.20433007743150586, + "grad_norm": 0.09957930445671082, + "learning_rate": 2.4832688382985053e-05, + "loss": 0.0321, + "step": 92650 + }, + { + "epoch": 0.20435213140640404, + "grad_norm": 0.09682903438806534, + "learning_rate": 2.4831439648848286e-05, + "loss": 0.0346, + "step": 92660 + }, + { + "epoch": 0.2043741853813022, + "grad_norm": 0.10582512617111206, + "learning_rate": 2.4830190795250236e-05, + "loss": 0.0339, + "step": 92670 + }, + { + "epoch": 0.20439623935620035, + "grad_norm": 0.1211017593741417, + "learning_rate": 2.4828941822206087e-05, + "loss": 0.0327, + "step": 92680 + }, + { + "epoch": 0.20441829333109854, + "grad_norm": 0.10605838894844055, + "learning_rate": 2.482769272973102e-05, + "loss": 0.0325, + "step": 92690 + }, + { + "epoch": 0.2044403473059967, + "grad_norm": 0.13897863030433655, + "learning_rate": 2.4826443517840204e-05, + "loss": 0.0342, + "step": 92700 + }, + { + "epoch": 0.20446240128089485, + "grad_norm": 0.1276896893978119, + "learning_rate": 2.4825194186548826e-05, + "loss": 0.0338, + "step": 92710 + }, + { + "epoch": 0.20448445525579304, + "grad_norm": 0.13665005564689636, + "learning_rate": 2.482394473587206e-05, + "loss": 0.0356, + "step": 92720 + }, + { + "epoch": 0.2045065092306912, + "grad_norm": 0.1018124371767044, + "learning_rate": 2.4822695165825085e-05, + "loss": 0.0319, + "step": 92730 + }, + { + "epoch": 0.20452856320558935, + "grad_norm": 0.11475902050733566, + "learning_rate": 2.4821445476423096e-05, + "loss": 0.0314, + "step": 92740 + }, + { + "epoch": 0.20455061718048753, + "grad_norm": 0.10407754778862, + "learning_rate": 2.4820195667681266e-05, + "loss": 0.0315, + "step": 92750 + }, + { + "epoch": 0.2045726711553857, + "grad_norm": 0.09932471066713333, + "learning_rate": 2.4818945739614794e-05, + "loss": 0.0331, + "step": 92760 + }, + { + "epoch": 0.20459472513028384, + "grad_norm": 0.09962233155965805, + "learning_rate": 2.4817695692238853e-05, + "loss": 0.0327, + "step": 92770 + }, + { + "epoch": 0.20461677910518203, + "grad_norm": 0.10921268165111542, + "learning_rate": 2.4816445525568645e-05, + "loss": 0.034, + "step": 92780 + }, + { + "epoch": 0.20463883308008018, + "grad_norm": 0.12761487066745758, + "learning_rate": 2.481519523961935e-05, + "loss": 0.0319, + "step": 92790 + }, + { + "epoch": 0.20466088705497834, + "grad_norm": 0.14399217069149017, + "learning_rate": 2.4813944834406173e-05, + "loss": 0.0305, + "step": 92800 + }, + { + "epoch": 0.20468294102987653, + "grad_norm": 0.09398972988128662, + "learning_rate": 2.4812694309944294e-05, + "loss": 0.0302, + "step": 92810 + }, + { + "epoch": 0.20470499500477468, + "grad_norm": 0.13550299406051636, + "learning_rate": 2.481144366624892e-05, + "loss": 0.0321, + "step": 92820 + }, + { + "epoch": 0.20472704897967284, + "grad_norm": 0.11279541999101639, + "learning_rate": 2.4810192903335237e-05, + "loss": 0.0332, + "step": 92830 + }, + { + "epoch": 0.20474910295457102, + "grad_norm": 0.07651329785585403, + "learning_rate": 2.480894202121845e-05, + "loss": 0.0327, + "step": 92840 + }, + { + "epoch": 0.20477115692946918, + "grad_norm": 0.1257375180721283, + "learning_rate": 2.4807691019913757e-05, + "loss": 0.0338, + "step": 92850 + }, + { + "epoch": 0.20479321090436736, + "grad_norm": 0.1122603714466095, + "learning_rate": 2.480643989943636e-05, + "loss": 0.0321, + "step": 92860 + }, + { + "epoch": 0.20481526487926552, + "grad_norm": 0.13490255177021027, + "learning_rate": 2.4805188659801456e-05, + "loss": 0.033, + "step": 92870 + }, + { + "epoch": 0.20483731885416367, + "grad_norm": 0.10254748910665512, + "learning_rate": 2.4803937301024253e-05, + "loss": 0.0308, + "step": 92880 + }, + { + "epoch": 0.20485937282906186, + "grad_norm": 0.11979129910469055, + "learning_rate": 2.4802685823119955e-05, + "loss": 0.0326, + "step": 92890 + }, + { + "epoch": 0.20488142680396002, + "grad_norm": 0.18786121904850006, + "learning_rate": 2.480143422610377e-05, + "loss": 0.0325, + "step": 92900 + }, + { + "epoch": 0.20490348077885817, + "grad_norm": 0.10474720597267151, + "learning_rate": 2.4800182509990905e-05, + "loss": 0.0332, + "step": 92910 + }, + { + "epoch": 0.20492553475375636, + "grad_norm": 0.12091553956270218, + "learning_rate": 2.479893067479657e-05, + "loss": 0.032, + "step": 92920 + }, + { + "epoch": 0.2049475887286545, + "grad_norm": 0.10395849496126175, + "learning_rate": 2.4797678720535974e-05, + "loss": 0.0314, + "step": 92930 + }, + { + "epoch": 0.20496964270355267, + "grad_norm": 0.09662365913391113, + "learning_rate": 2.479642664722433e-05, + "loss": 0.0347, + "step": 92940 + }, + { + "epoch": 0.20499169667845085, + "grad_norm": 0.12220864742994308, + "learning_rate": 2.4795174454876854e-05, + "loss": 0.0356, + "step": 92950 + }, + { + "epoch": 0.205013750653349, + "grad_norm": 0.10538990795612335, + "learning_rate": 2.479392214350876e-05, + "loss": 0.0346, + "step": 92960 + }, + { + "epoch": 0.20503580462824716, + "grad_norm": 0.13089360296726227, + "learning_rate": 2.4792669713135266e-05, + "loss": 0.034, + "step": 92970 + }, + { + "epoch": 0.20505785860314535, + "grad_norm": 0.1309659779071808, + "learning_rate": 2.4791417163771592e-05, + "loss": 0.0325, + "step": 92980 + }, + { + "epoch": 0.2050799125780435, + "grad_norm": 0.10200563073158264, + "learning_rate": 2.479016449543295e-05, + "loss": 0.0317, + "step": 92990 + }, + { + "epoch": 0.20510196655294166, + "grad_norm": 0.11998362839221954, + "learning_rate": 2.478891170813457e-05, + "loss": 0.0331, + "step": 93000 + }, + { + "epoch": 0.20512402052783985, + "grad_norm": 0.10636097192764282, + "learning_rate": 2.478765880189167e-05, + "loss": 0.0312, + "step": 93010 + }, + { + "epoch": 0.205146074502738, + "grad_norm": 0.08069206029176712, + "learning_rate": 2.4786405776719466e-05, + "loss": 0.031, + "step": 93020 + }, + { + "epoch": 0.20516812847763616, + "grad_norm": 0.12314024567604065, + "learning_rate": 2.47851526326332e-05, + "loss": 0.0312, + "step": 93030 + }, + { + "epoch": 0.20519018245253434, + "grad_norm": 0.12046980857849121, + "learning_rate": 2.478389936964809e-05, + "loss": 0.0315, + "step": 93040 + }, + { + "epoch": 0.2052122364274325, + "grad_norm": 0.1113935112953186, + "learning_rate": 2.4782645987779366e-05, + "loss": 0.0336, + "step": 93050 + }, + { + "epoch": 0.20523429040233065, + "grad_norm": 0.09642335772514343, + "learning_rate": 2.4781392487042255e-05, + "loss": 0.031, + "step": 93060 + }, + { + "epoch": 0.20525634437722884, + "grad_norm": 0.10978860408067703, + "learning_rate": 2.478013886745199e-05, + "loss": 0.0316, + "step": 93070 + }, + { + "epoch": 0.205278398352127, + "grad_norm": 0.13530507683753967, + "learning_rate": 2.4778885129023806e-05, + "loss": 0.0324, + "step": 93080 + }, + { + "epoch": 0.20530045232702515, + "grad_norm": 0.09594376385211945, + "learning_rate": 2.4777631271772933e-05, + "loss": 0.0316, + "step": 93090 + }, + { + "epoch": 0.20532250630192334, + "grad_norm": 0.10889847576618195, + "learning_rate": 2.4776377295714607e-05, + "loss": 0.0326, + "step": 93100 + }, + { + "epoch": 0.2053445602768215, + "grad_norm": 0.11164015531539917, + "learning_rate": 2.4775123200864067e-05, + "loss": 0.0318, + "step": 93110 + }, + { + "epoch": 0.20536661425171965, + "grad_norm": 0.09252988547086716, + "learning_rate": 2.477386898723655e-05, + "loss": 0.033, + "step": 93120 + }, + { + "epoch": 0.20538866822661783, + "grad_norm": 0.0985056608915329, + "learning_rate": 2.47726146548473e-05, + "loss": 0.0317, + "step": 93130 + }, + { + "epoch": 0.205410722201516, + "grad_norm": 0.12001337856054306, + "learning_rate": 2.4771360203711555e-05, + "loss": 0.0329, + "step": 93140 + }, + { + "epoch": 0.20543277617641414, + "grad_norm": 0.10873810946941376, + "learning_rate": 2.4770105633844555e-05, + "loss": 0.0325, + "step": 93150 + }, + { + "epoch": 0.20545483015131233, + "grad_norm": 0.10451294481754303, + "learning_rate": 2.4768850945261548e-05, + "loss": 0.0318, + "step": 93160 + }, + { + "epoch": 0.20547688412621049, + "grad_norm": 0.11220868676900864, + "learning_rate": 2.476759613797778e-05, + "loss": 0.0338, + "step": 93170 + }, + { + "epoch": 0.20549893810110864, + "grad_norm": 0.1345169097185135, + "learning_rate": 2.47663412120085e-05, + "loss": 0.0322, + "step": 93180 + }, + { + "epoch": 0.20552099207600683, + "grad_norm": 0.13943246006965637, + "learning_rate": 2.4765086167368943e-05, + "loss": 0.0347, + "step": 93190 + }, + { + "epoch": 0.20554304605090498, + "grad_norm": 0.08716879785060883, + "learning_rate": 2.4763831004074377e-05, + "loss": 0.0326, + "step": 93200 + }, + { + "epoch": 0.20556510002580314, + "grad_norm": 0.12974654138088226, + "learning_rate": 2.4762575722140047e-05, + "loss": 0.0314, + "step": 93210 + }, + { + "epoch": 0.20558715400070132, + "grad_norm": 0.11669645458459854, + "learning_rate": 2.47613203215812e-05, + "loss": 0.0318, + "step": 93220 + }, + { + "epoch": 0.20560920797559948, + "grad_norm": 0.11775629222393036, + "learning_rate": 2.4760064802413097e-05, + "loss": 0.0339, + "step": 93230 + }, + { + "epoch": 0.20563126195049763, + "grad_norm": 0.10561256110668182, + "learning_rate": 2.4758809164650996e-05, + "loss": 0.0322, + "step": 93240 + }, + { + "epoch": 0.20565331592539582, + "grad_norm": 0.10636414587497711, + "learning_rate": 2.475755340831014e-05, + "loss": 0.0318, + "step": 93250 + }, + { + "epoch": 0.20567536990029398, + "grad_norm": 0.09293769299983978, + "learning_rate": 2.4756297533405806e-05, + "loss": 0.0329, + "step": 93260 + }, + { + "epoch": 0.20569742387519216, + "grad_norm": 0.14325177669525146, + "learning_rate": 2.475504153995324e-05, + "loss": 0.0324, + "step": 93270 + }, + { + "epoch": 0.20571947785009032, + "grad_norm": 0.1145470067858696, + "learning_rate": 2.475378542796771e-05, + "loss": 0.0354, + "step": 93280 + }, + { + "epoch": 0.20574153182498847, + "grad_norm": 0.09041720628738403, + "learning_rate": 2.475252919746448e-05, + "loss": 0.0326, + "step": 93290 + }, + { + "epoch": 0.20576358579988666, + "grad_norm": 0.11089645326137543, + "learning_rate": 2.4751272848458813e-05, + "loss": 0.03, + "step": 93300 + }, + { + "epoch": 0.2057856397747848, + "grad_norm": 0.11113569885492325, + "learning_rate": 2.475001638096597e-05, + "loss": 0.0327, + "step": 93310 + }, + { + "epoch": 0.20580769374968297, + "grad_norm": 0.11154169589281082, + "learning_rate": 2.4748759795001228e-05, + "loss": 0.0323, + "step": 93320 + }, + { + "epoch": 0.20582974772458115, + "grad_norm": 0.13530723750591278, + "learning_rate": 2.4747503090579848e-05, + "loss": 0.0311, + "step": 93330 + }, + { + "epoch": 0.2058518016994793, + "grad_norm": 0.09552799165248871, + "learning_rate": 2.47462462677171e-05, + "loss": 0.0327, + "step": 93340 + }, + { + "epoch": 0.20587385567437747, + "grad_norm": 0.10891269892454147, + "learning_rate": 2.474498932642826e-05, + "loss": 0.0337, + "step": 93350 + }, + { + "epoch": 0.20589590964927565, + "grad_norm": 0.13573239743709564, + "learning_rate": 2.4743732266728597e-05, + "loss": 0.0329, + "step": 93360 + }, + { + "epoch": 0.2059179636241738, + "grad_norm": 0.08986112475395203, + "learning_rate": 2.4742475088633386e-05, + "loss": 0.033, + "step": 93370 + }, + { + "epoch": 0.20594001759907196, + "grad_norm": 0.11353980004787445, + "learning_rate": 2.4741217792157907e-05, + "loss": 0.032, + "step": 93380 + }, + { + "epoch": 0.20596207157397015, + "grad_norm": 0.1246364563703537, + "learning_rate": 2.4739960377317437e-05, + "loss": 0.032, + "step": 93390 + }, + { + "epoch": 0.2059841255488683, + "grad_norm": 0.10727895051240921, + "learning_rate": 2.473870284412725e-05, + "loss": 0.0321, + "step": 93400 + }, + { + "epoch": 0.20600617952376646, + "grad_norm": 0.1355818212032318, + "learning_rate": 2.473744519260263e-05, + "loss": 0.0324, + "step": 93410 + }, + { + "epoch": 0.20602823349866464, + "grad_norm": 0.11595065891742706, + "learning_rate": 2.4736187422758853e-05, + "loss": 0.0319, + "step": 93420 + }, + { + "epoch": 0.2060502874735628, + "grad_norm": 0.1615189015865326, + "learning_rate": 2.473492953461121e-05, + "loss": 0.034, + "step": 93430 + }, + { + "epoch": 0.20607234144846096, + "grad_norm": 0.12536530196666718, + "learning_rate": 2.473367152817498e-05, + "loss": 0.0358, + "step": 93440 + }, + { + "epoch": 0.20609439542335914, + "grad_norm": 0.11427098512649536, + "learning_rate": 2.473241340346545e-05, + "loss": 0.033, + "step": 93450 + }, + { + "epoch": 0.2061164493982573, + "grad_norm": 0.09005957841873169, + "learning_rate": 2.4731155160497912e-05, + "loss": 0.032, + "step": 93460 + }, + { + "epoch": 0.20613850337315545, + "grad_norm": 0.12360551208257675, + "learning_rate": 2.472989679928765e-05, + "loss": 0.0325, + "step": 93470 + }, + { + "epoch": 0.20616055734805364, + "grad_norm": 0.11041063070297241, + "learning_rate": 2.472863831984995e-05, + "loss": 0.0341, + "step": 93480 + }, + { + "epoch": 0.2061826113229518, + "grad_norm": 0.12757931649684906, + "learning_rate": 2.4727379722200117e-05, + "loss": 0.0333, + "step": 93490 + }, + { + "epoch": 0.20620466529784995, + "grad_norm": 0.10364773124456406, + "learning_rate": 2.4726121006353433e-05, + "loss": 0.0318, + "step": 93500 + }, + { + "epoch": 0.20622671927274813, + "grad_norm": 0.13289350271224976, + "learning_rate": 2.4724862172325194e-05, + "loss": 0.0323, + "step": 93510 + }, + { + "epoch": 0.2062487732476463, + "grad_norm": 0.1138952299952507, + "learning_rate": 2.47236032201307e-05, + "loss": 0.0315, + "step": 93520 + }, + { + "epoch": 0.20627082722254445, + "grad_norm": 0.09997035562992096, + "learning_rate": 2.4722344149785247e-05, + "loss": 0.0322, + "step": 93530 + }, + { + "epoch": 0.20629288119744263, + "grad_norm": 0.09515323489904404, + "learning_rate": 2.472108496130413e-05, + "loss": 0.0328, + "step": 93540 + }, + { + "epoch": 0.20631493517234079, + "grad_norm": 0.09446843713521957, + "learning_rate": 2.4719825654702653e-05, + "loss": 0.0308, + "step": 93550 + }, + { + "epoch": 0.20633698914723894, + "grad_norm": 0.10473428666591644, + "learning_rate": 2.471856622999612e-05, + "loss": 0.0327, + "step": 93560 + }, + { + "epoch": 0.20635904312213713, + "grad_norm": 0.12594622373580933, + "learning_rate": 2.471730668719983e-05, + "loss": 0.0345, + "step": 93570 + }, + { + "epoch": 0.20638109709703528, + "grad_norm": 0.1662857085466385, + "learning_rate": 2.4716047026329088e-05, + "loss": 0.0344, + "step": 93580 + }, + { + "epoch": 0.20640315107193344, + "grad_norm": 0.11217349022626877, + "learning_rate": 2.4714787247399198e-05, + "loss": 0.0315, + "step": 93590 + }, + { + "epoch": 0.20642520504683162, + "grad_norm": 0.14259245991706848, + "learning_rate": 2.471352735042548e-05, + "loss": 0.0327, + "step": 93600 + }, + { + "epoch": 0.20644725902172978, + "grad_norm": 0.1268632560968399, + "learning_rate": 2.4712267335423225e-05, + "loss": 0.036, + "step": 93610 + }, + { + "epoch": 0.20646931299662794, + "grad_norm": 0.09629929810762405, + "learning_rate": 2.4711007202407756e-05, + "loss": 0.0324, + "step": 93620 + }, + { + "epoch": 0.20649136697152612, + "grad_norm": 0.11194787174463272, + "learning_rate": 2.470974695139438e-05, + "loss": 0.0331, + "step": 93630 + }, + { + "epoch": 0.20651342094642428, + "grad_norm": 0.09149140864610672, + "learning_rate": 2.4708486582398412e-05, + "loss": 0.0323, + "step": 93640 + }, + { + "epoch": 0.20653547492132243, + "grad_norm": 0.08147663623094559, + "learning_rate": 2.4707226095435166e-05, + "loss": 0.0324, + "step": 93650 + }, + { + "epoch": 0.20655752889622062, + "grad_norm": 0.1209348663687706, + "learning_rate": 2.4705965490519952e-05, + "loss": 0.0342, + "step": 93660 + }, + { + "epoch": 0.20657958287111877, + "grad_norm": 0.10416405647993088, + "learning_rate": 2.4704704767668103e-05, + "loss": 0.0333, + "step": 93670 + }, + { + "epoch": 0.20660163684601693, + "grad_norm": 0.13489268720149994, + "learning_rate": 2.4703443926894922e-05, + "loss": 0.0319, + "step": 93680 + }, + { + "epoch": 0.2066236908209151, + "grad_norm": 0.120379239320755, + "learning_rate": 2.4702182968215735e-05, + "loss": 0.0343, + "step": 93690 + }, + { + "epoch": 0.20664574479581327, + "grad_norm": 0.1447220891714096, + "learning_rate": 2.4700921891645868e-05, + "loss": 0.0313, + "step": 93700 + }, + { + "epoch": 0.20666779877071145, + "grad_norm": 0.0924718901515007, + "learning_rate": 2.4699660697200643e-05, + "loss": 0.0309, + "step": 93710 + }, + { + "epoch": 0.2066898527456096, + "grad_norm": 0.09635630995035172, + "learning_rate": 2.4698399384895374e-05, + "loss": 0.0341, + "step": 93720 + }, + { + "epoch": 0.20671190672050777, + "grad_norm": 0.16910865902900696, + "learning_rate": 2.46971379547454e-05, + "loss": 0.0311, + "step": 93730 + }, + { + "epoch": 0.20673396069540595, + "grad_norm": 0.09850926697254181, + "learning_rate": 2.4695876406766044e-05, + "loss": 0.0325, + "step": 93740 + }, + { + "epoch": 0.2067560146703041, + "grad_norm": 0.10133534669876099, + "learning_rate": 2.469461474097264e-05, + "loss": 0.0312, + "step": 93750 + }, + { + "epoch": 0.20677806864520226, + "grad_norm": 0.13343310356140137, + "learning_rate": 2.469335295738051e-05, + "loss": 0.0319, + "step": 93760 + }, + { + "epoch": 0.20680012262010045, + "grad_norm": 0.112240269780159, + "learning_rate": 2.4692091056004987e-05, + "loss": 0.0323, + "step": 93770 + }, + { + "epoch": 0.2068221765949986, + "grad_norm": 0.10129692405462265, + "learning_rate": 2.4690829036861407e-05, + "loss": 0.032, + "step": 93780 + }, + { + "epoch": 0.20684423056989676, + "grad_norm": 0.11326589435338974, + "learning_rate": 2.4689566899965105e-05, + "loss": 0.0324, + "step": 93790 + }, + { + "epoch": 0.20686628454479494, + "grad_norm": 0.13190917670726776, + "learning_rate": 2.468830464533142e-05, + "loss": 0.0344, + "step": 93800 + }, + { + "epoch": 0.2068883385196931, + "grad_norm": 0.10596867650747299, + "learning_rate": 2.4687042272975687e-05, + "loss": 0.0341, + "step": 93810 + }, + { + "epoch": 0.20691039249459126, + "grad_norm": 0.1156659722328186, + "learning_rate": 2.468577978291324e-05, + "loss": 0.0337, + "step": 93820 + }, + { + "epoch": 0.20693244646948944, + "grad_norm": 0.10498857498168945, + "learning_rate": 2.4684517175159424e-05, + "loss": 0.0325, + "step": 93830 + }, + { + "epoch": 0.2069545004443876, + "grad_norm": 0.09408575296401978, + "learning_rate": 2.4683254449729584e-05, + "loss": 0.0333, + "step": 93840 + }, + { + "epoch": 0.20697655441928575, + "grad_norm": 0.12011396139860153, + "learning_rate": 2.468199160663906e-05, + "loss": 0.0321, + "step": 93850 + }, + { + "epoch": 0.20699860839418394, + "grad_norm": 0.07995296269655228, + "learning_rate": 2.4680728645903194e-05, + "loss": 0.0331, + "step": 93860 + }, + { + "epoch": 0.2070206623690821, + "grad_norm": 0.16015233099460602, + "learning_rate": 2.4679465567537337e-05, + "loss": 0.0328, + "step": 93870 + }, + { + "epoch": 0.20704271634398025, + "grad_norm": 0.12041396647691727, + "learning_rate": 2.4678202371556832e-05, + "loss": 0.0326, + "step": 93880 + }, + { + "epoch": 0.20706477031887843, + "grad_norm": 0.07389061897993088, + "learning_rate": 2.467693905797703e-05, + "loss": 0.0316, + "step": 93890 + }, + { + "epoch": 0.2070868242937766, + "grad_norm": 0.09398093819618225, + "learning_rate": 2.4675675626813285e-05, + "loss": 0.0353, + "step": 93900 + }, + { + "epoch": 0.20710887826867475, + "grad_norm": 0.08704037964344025, + "learning_rate": 2.4674412078080942e-05, + "loss": 0.032, + "step": 93910 + }, + { + "epoch": 0.20713093224357293, + "grad_norm": 0.09951692074537277, + "learning_rate": 2.467314841179536e-05, + "loss": 0.0313, + "step": 93920 + }, + { + "epoch": 0.20715298621847109, + "grad_norm": 0.12770070135593414, + "learning_rate": 2.467188462797189e-05, + "loss": 0.0352, + "step": 93930 + }, + { + "epoch": 0.20717504019336924, + "grad_norm": 0.09295819699764252, + "learning_rate": 2.4670620726625894e-05, + "loss": 0.0322, + "step": 93940 + }, + { + "epoch": 0.20719709416826743, + "grad_norm": 0.10948274284601212, + "learning_rate": 2.4669356707772726e-05, + "loss": 0.0319, + "step": 93950 + }, + { + "epoch": 0.20721914814316558, + "grad_norm": 0.09800764918327332, + "learning_rate": 2.4668092571427742e-05, + "loss": 0.0343, + "step": 93960 + }, + { + "epoch": 0.20724120211806374, + "grad_norm": 0.10929037630558014, + "learning_rate": 2.4666828317606305e-05, + "loss": 0.0341, + "step": 93970 + }, + { + "epoch": 0.20726325609296192, + "grad_norm": 0.12273488938808441, + "learning_rate": 2.4665563946323774e-05, + "loss": 0.0342, + "step": 93980 + }, + { + "epoch": 0.20728531006786008, + "grad_norm": 0.10710562765598297, + "learning_rate": 2.4664299457595524e-05, + "loss": 0.0322, + "step": 93990 + }, + { + "epoch": 0.20730736404275824, + "grad_norm": 0.09578662365674973, + "learning_rate": 2.4663034851436906e-05, + "loss": 0.032, + "step": 94000 + }, + { + "epoch": 0.20732941801765642, + "grad_norm": 0.1063658595085144, + "learning_rate": 2.466177012786329e-05, + "loss": 0.033, + "step": 94010 + }, + { + "epoch": 0.20735147199255458, + "grad_norm": 0.10554424673318863, + "learning_rate": 2.4660505286890045e-05, + "loss": 0.0321, + "step": 94020 + }, + { + "epoch": 0.20737352596745273, + "grad_norm": 0.11693083494901657, + "learning_rate": 2.465924032853254e-05, + "loss": 0.0347, + "step": 94030 + }, + { + "epoch": 0.20739557994235092, + "grad_norm": 0.2150605320930481, + "learning_rate": 2.4657975252806146e-05, + "loss": 0.034, + "step": 94040 + }, + { + "epoch": 0.20741763391724907, + "grad_norm": 0.10584550350904465, + "learning_rate": 2.4656710059726232e-05, + "loss": 0.0337, + "step": 94050 + }, + { + "epoch": 0.20743968789214723, + "grad_norm": 0.12740486860275269, + "learning_rate": 2.4655444749308176e-05, + "loss": 0.0326, + "step": 94060 + }, + { + "epoch": 0.2074617418670454, + "grad_norm": 0.10383065044879913, + "learning_rate": 2.4654179321567345e-05, + "loss": 0.033, + "step": 94070 + }, + { + "epoch": 0.20748379584194357, + "grad_norm": 0.11254368722438812, + "learning_rate": 2.465291377651913e-05, + "loss": 0.0325, + "step": 94080 + }, + { + "epoch": 0.20750584981684173, + "grad_norm": 0.11704596132040024, + "learning_rate": 2.4651648114178887e-05, + "loss": 0.0319, + "step": 94090 + }, + { + "epoch": 0.2075279037917399, + "grad_norm": 0.09480378031730652, + "learning_rate": 2.4650382334562013e-05, + "loss": 0.0321, + "step": 94100 + }, + { + "epoch": 0.20754995776663807, + "grad_norm": 0.11734017729759216, + "learning_rate": 2.464911643768388e-05, + "loss": 0.0329, + "step": 94110 + }, + { + "epoch": 0.20757201174153622, + "grad_norm": 0.09335783869028091, + "learning_rate": 2.4647850423559877e-05, + "loss": 0.0323, + "step": 94120 + }, + { + "epoch": 0.2075940657164344, + "grad_norm": 0.10429974645376205, + "learning_rate": 2.4646584292205376e-05, + "loss": 0.0331, + "step": 94130 + }, + { + "epoch": 0.20761611969133256, + "grad_norm": 0.09263575822114944, + "learning_rate": 2.464531804363577e-05, + "loss": 0.0326, + "step": 94140 + }, + { + "epoch": 0.20763817366623075, + "grad_norm": 0.09663351625204086, + "learning_rate": 2.4644051677866447e-05, + "loss": 0.0317, + "step": 94150 + }, + { + "epoch": 0.2076602276411289, + "grad_norm": 0.12542861700057983, + "learning_rate": 2.4642785194912787e-05, + "loss": 0.033, + "step": 94160 + }, + { + "epoch": 0.20768228161602706, + "grad_norm": 0.12720037996768951, + "learning_rate": 2.464151859479018e-05, + "loss": 0.0337, + "step": 94170 + }, + { + "epoch": 0.20770433559092524, + "grad_norm": 0.11592501401901245, + "learning_rate": 2.4640251877514025e-05, + "loss": 0.032, + "step": 94180 + }, + { + "epoch": 0.2077263895658234, + "grad_norm": 0.09648431092500687, + "learning_rate": 2.4638985043099708e-05, + "loss": 0.0331, + "step": 94190 + }, + { + "epoch": 0.20774844354072156, + "grad_norm": 0.10318546742200851, + "learning_rate": 2.4637718091562617e-05, + "loss": 0.0323, + "step": 94200 + }, + { + "epoch": 0.20777049751561974, + "grad_norm": 0.08193888515233994, + "learning_rate": 2.4636451022918153e-05, + "loss": 0.0322, + "step": 94210 + }, + { + "epoch": 0.2077925514905179, + "grad_norm": 0.0973476693034172, + "learning_rate": 2.463518383718171e-05, + "loss": 0.0322, + "step": 94220 + }, + { + "epoch": 0.20781460546541605, + "grad_norm": 0.1114017441868782, + "learning_rate": 2.4633916534368687e-05, + "loss": 0.0333, + "step": 94230 + }, + { + "epoch": 0.20783665944031424, + "grad_norm": 0.13753961026668549, + "learning_rate": 2.4632649114494483e-05, + "loss": 0.0346, + "step": 94240 + }, + { + "epoch": 0.2078587134152124, + "grad_norm": 0.14860419929027557, + "learning_rate": 2.4631381577574498e-05, + "loss": 0.0327, + "step": 94250 + }, + { + "epoch": 0.20788076739011055, + "grad_norm": 0.10837607830762863, + "learning_rate": 2.4630113923624135e-05, + "loss": 0.0328, + "step": 94260 + }, + { + "epoch": 0.20790282136500873, + "grad_norm": 0.09582210332155228, + "learning_rate": 2.462884615265879e-05, + "loss": 0.0331, + "step": 94270 + }, + { + "epoch": 0.2079248753399069, + "grad_norm": 0.14334514737129211, + "learning_rate": 2.4627578264693874e-05, + "loss": 0.033, + "step": 94280 + }, + { + "epoch": 0.20794692931480505, + "grad_norm": 0.13251446187496185, + "learning_rate": 2.4626310259744797e-05, + "loss": 0.0325, + "step": 94290 + }, + { + "epoch": 0.20796898328970323, + "grad_norm": 0.0989748165011406, + "learning_rate": 2.4625042137826962e-05, + "loss": 0.0321, + "step": 94300 + }, + { + "epoch": 0.20799103726460139, + "grad_norm": 0.11365996301174164, + "learning_rate": 2.462377389895577e-05, + "loss": 0.0344, + "step": 94310 + }, + { + "epoch": 0.20801309123949954, + "grad_norm": 0.10984683781862259, + "learning_rate": 2.4622505543146644e-05, + "loss": 0.0322, + "step": 94320 + }, + { + "epoch": 0.20803514521439773, + "grad_norm": 0.09156902134418488, + "learning_rate": 2.4621237070414993e-05, + "loss": 0.0327, + "step": 94330 + }, + { + "epoch": 0.20805719918929588, + "grad_norm": 0.13324421644210815, + "learning_rate": 2.4619968480776223e-05, + "loss": 0.0316, + "step": 94340 + }, + { + "epoch": 0.20807925316419404, + "grad_norm": 0.10265195369720459, + "learning_rate": 2.4618699774245752e-05, + "loss": 0.032, + "step": 94350 + }, + { + "epoch": 0.20810130713909222, + "grad_norm": 0.11202874779701233, + "learning_rate": 2.4617430950839e-05, + "loss": 0.034, + "step": 94360 + }, + { + "epoch": 0.20812336111399038, + "grad_norm": 0.12778770923614502, + "learning_rate": 2.4616162010571384e-05, + "loss": 0.0326, + "step": 94370 + }, + { + "epoch": 0.20814541508888854, + "grad_norm": 0.13321201503276825, + "learning_rate": 2.4614892953458316e-05, + "loss": 0.034, + "step": 94380 + }, + { + "epoch": 0.20816746906378672, + "grad_norm": 0.10564904659986496, + "learning_rate": 2.4613623779515223e-05, + "loss": 0.0319, + "step": 94390 + }, + { + "epoch": 0.20818952303868488, + "grad_norm": 0.1277165412902832, + "learning_rate": 2.4612354488757522e-05, + "loss": 0.0349, + "step": 94400 + }, + { + "epoch": 0.20821157701358303, + "grad_norm": 0.11665582656860352, + "learning_rate": 2.461108508120064e-05, + "loss": 0.0344, + "step": 94410 + }, + { + "epoch": 0.20823363098848122, + "grad_norm": 0.0919012799859047, + "learning_rate": 2.460981555686e-05, + "loss": 0.0327, + "step": 94420 + }, + { + "epoch": 0.20825568496337937, + "grad_norm": 0.10742750763893127, + "learning_rate": 2.4608545915751026e-05, + "loss": 0.0335, + "step": 94430 + }, + { + "epoch": 0.20827773893827753, + "grad_norm": 0.1262253075838089, + "learning_rate": 2.4607276157889152e-05, + "loss": 0.0328, + "step": 94440 + }, + { + "epoch": 0.2082997929131757, + "grad_norm": 0.10200836509466171, + "learning_rate": 2.4606006283289798e-05, + "loss": 0.0324, + "step": 94450 + }, + { + "epoch": 0.20832184688807387, + "grad_norm": 0.11510809510946274, + "learning_rate": 2.4604736291968398e-05, + "loss": 0.033, + "step": 94460 + }, + { + "epoch": 0.20834390086297203, + "grad_norm": 0.14966239035129547, + "learning_rate": 2.4603466183940387e-05, + "loss": 0.0337, + "step": 94470 + }, + { + "epoch": 0.2083659548378702, + "grad_norm": 0.09023607522249222, + "learning_rate": 2.4602195959221193e-05, + "loss": 0.0342, + "step": 94480 + }, + { + "epoch": 0.20838800881276837, + "grad_norm": 0.09501863270998001, + "learning_rate": 2.4600925617826247e-05, + "loss": 0.0316, + "step": 94490 + }, + { + "epoch": 0.20841006278766652, + "grad_norm": 0.10634000599384308, + "learning_rate": 2.4599655159770998e-05, + "loss": 0.0311, + "step": 94500 + }, + { + "epoch": 0.2084321167625647, + "grad_norm": 0.08648189157247543, + "learning_rate": 2.4598384585070872e-05, + "loss": 0.0323, + "step": 94510 + }, + { + "epoch": 0.20845417073746286, + "grad_norm": 0.11456409096717834, + "learning_rate": 2.4597113893741306e-05, + "loss": 0.0326, + "step": 94520 + }, + { + "epoch": 0.20847622471236102, + "grad_norm": 0.13785549998283386, + "learning_rate": 2.459584308579775e-05, + "loss": 0.0331, + "step": 94530 + }, + { + "epoch": 0.2084982786872592, + "grad_norm": 0.10993741452693939, + "learning_rate": 2.4594572161255646e-05, + "loss": 0.0327, + "step": 94540 + }, + { + "epoch": 0.20852033266215736, + "grad_norm": 0.10477019846439362, + "learning_rate": 2.459330112013042e-05, + "loss": 0.0319, + "step": 94550 + }, + { + "epoch": 0.20854238663705554, + "grad_norm": 0.12995198369026184, + "learning_rate": 2.4592029962437534e-05, + "loss": 0.0329, + "step": 94560 + }, + { + "epoch": 0.2085644406119537, + "grad_norm": 0.11717270314693451, + "learning_rate": 2.4590758688192427e-05, + "loss": 0.0344, + "step": 94570 + }, + { + "epoch": 0.20858649458685186, + "grad_norm": 0.09676692634820938, + "learning_rate": 2.4589487297410547e-05, + "loss": 0.0317, + "step": 94580 + }, + { + "epoch": 0.20860854856175004, + "grad_norm": 0.12276332080364227, + "learning_rate": 2.458821579010734e-05, + "loss": 0.0319, + "step": 94590 + }, + { + "epoch": 0.2086306025366482, + "grad_norm": 0.11398537456989288, + "learning_rate": 2.4586944166298264e-05, + "loss": 0.0335, + "step": 94600 + }, + { + "epoch": 0.20865265651154635, + "grad_norm": 0.10378094017505646, + "learning_rate": 2.458567242599876e-05, + "loss": 0.0332, + "step": 94610 + }, + { + "epoch": 0.20867471048644454, + "grad_norm": 0.08803167939186096, + "learning_rate": 2.458440056922429e-05, + "loss": 0.0315, + "step": 94620 + }, + { + "epoch": 0.2086967644613427, + "grad_norm": 0.09383940696716309, + "learning_rate": 2.45831285959903e-05, + "loss": 0.0319, + "step": 94630 + }, + { + "epoch": 0.20871881843624085, + "grad_norm": 0.10914182662963867, + "learning_rate": 2.4581856506312246e-05, + "loss": 0.0362, + "step": 94640 + }, + { + "epoch": 0.20874087241113903, + "grad_norm": 0.11219676584005356, + "learning_rate": 2.4580584300205596e-05, + "loss": 0.0336, + "step": 94650 + }, + { + "epoch": 0.2087629263860372, + "grad_norm": 0.10065510123968124, + "learning_rate": 2.4579311977685798e-05, + "loss": 0.0306, + "step": 94660 + }, + { + "epoch": 0.20878498036093535, + "grad_norm": 0.09846159815788269, + "learning_rate": 2.4578039538768317e-05, + "loss": 0.0323, + "step": 94670 + }, + { + "epoch": 0.20880703433583353, + "grad_norm": 0.09188453108072281, + "learning_rate": 2.457676698346861e-05, + "loss": 0.0336, + "step": 94680 + }, + { + "epoch": 0.2088290883107317, + "grad_norm": 0.10823065787553787, + "learning_rate": 2.457549431180214e-05, + "loss": 0.031, + "step": 94690 + }, + { + "epoch": 0.20885114228562984, + "grad_norm": 0.12124798446893692, + "learning_rate": 2.457422152378438e-05, + "loss": 0.0323, + "step": 94700 + }, + { + "epoch": 0.20887319626052803, + "grad_norm": 0.10743936151266098, + "learning_rate": 2.4572948619430784e-05, + "loss": 0.0314, + "step": 94710 + }, + { + "epoch": 0.20889525023542618, + "grad_norm": 0.11497445404529572, + "learning_rate": 2.4571675598756825e-05, + "loss": 0.0343, + "step": 94720 + }, + { + "epoch": 0.20891730421032434, + "grad_norm": 0.1294305920600891, + "learning_rate": 2.457040246177797e-05, + "loss": 0.0325, + "step": 94730 + }, + { + "epoch": 0.20893935818522252, + "grad_norm": 0.09675118327140808, + "learning_rate": 2.456912920850969e-05, + "loss": 0.0327, + "step": 94740 + }, + { + "epoch": 0.20896141216012068, + "grad_norm": 0.14328213036060333, + "learning_rate": 2.4567855838967455e-05, + "loss": 0.0315, + "step": 94750 + }, + { + "epoch": 0.20898346613501884, + "grad_norm": 0.13004419207572937, + "learning_rate": 2.4566582353166735e-05, + "loss": 0.0338, + "step": 94760 + }, + { + "epoch": 0.20900552010991702, + "grad_norm": 0.09319063276052475, + "learning_rate": 2.4565308751123016e-05, + "loss": 0.0318, + "step": 94770 + }, + { + "epoch": 0.20902757408481518, + "grad_norm": 0.11163508147001266, + "learning_rate": 2.4564035032851752e-05, + "loss": 0.0329, + "step": 94780 + }, + { + "epoch": 0.20904962805971333, + "grad_norm": 0.1064860001206398, + "learning_rate": 2.4562761198368443e-05, + "loss": 0.033, + "step": 94790 + }, + { + "epoch": 0.20907168203461152, + "grad_norm": 0.1186908483505249, + "learning_rate": 2.456148724768855e-05, + "loss": 0.0322, + "step": 94800 + }, + { + "epoch": 0.20909373600950967, + "grad_norm": 0.09022221714258194, + "learning_rate": 2.4560213180827563e-05, + "loss": 0.0316, + "step": 94810 + }, + { + "epoch": 0.20911578998440783, + "grad_norm": 0.09958522766828537, + "learning_rate": 2.455893899780096e-05, + "loss": 0.0328, + "step": 94820 + }, + { + "epoch": 0.209137843959306, + "grad_norm": 0.12533284723758698, + "learning_rate": 2.4557664698624222e-05, + "loss": 0.0329, + "step": 94830 + }, + { + "epoch": 0.20915989793420417, + "grad_norm": 0.10495787113904953, + "learning_rate": 2.4556390283312833e-05, + "loss": 0.0321, + "step": 94840 + }, + { + "epoch": 0.20918195190910233, + "grad_norm": 0.09825848042964935, + "learning_rate": 2.455511575188228e-05, + "loss": 0.0322, + "step": 94850 + }, + { + "epoch": 0.2092040058840005, + "grad_norm": 0.10408658534288406, + "learning_rate": 2.455384110434805e-05, + "loss": 0.0318, + "step": 94860 + }, + { + "epoch": 0.20922605985889867, + "grad_norm": 0.09047435969114304, + "learning_rate": 2.4552566340725627e-05, + "loss": 0.0323, + "step": 94870 + }, + { + "epoch": 0.20924811383379682, + "grad_norm": 0.09999356418848038, + "learning_rate": 2.4551291461030504e-05, + "loss": 0.0338, + "step": 94880 + }, + { + "epoch": 0.209270167808695, + "grad_norm": 0.12363433092832565, + "learning_rate": 2.4550016465278177e-05, + "loss": 0.0328, + "step": 94890 + }, + { + "epoch": 0.20929222178359316, + "grad_norm": 0.11420723050832748, + "learning_rate": 2.454874135348413e-05, + "loss": 0.0323, + "step": 94900 + }, + { + "epoch": 0.20931427575849132, + "grad_norm": 0.13748914003372192, + "learning_rate": 2.454746612566386e-05, + "loss": 0.0331, + "step": 94910 + }, + { + "epoch": 0.2093363297333895, + "grad_norm": 0.0884779766201973, + "learning_rate": 2.454619078183286e-05, + "loss": 0.0334, + "step": 94920 + }, + { + "epoch": 0.20935838370828766, + "grad_norm": 0.09512154757976532, + "learning_rate": 2.4544915322006636e-05, + "loss": 0.0317, + "step": 94930 + }, + { + "epoch": 0.20938043768318582, + "grad_norm": 0.12289640307426453, + "learning_rate": 2.454363974620067e-05, + "loss": 0.0327, + "step": 94940 + }, + { + "epoch": 0.209402491658084, + "grad_norm": 0.11322551965713501, + "learning_rate": 2.454236405443048e-05, + "loss": 0.0331, + "step": 94950 + }, + { + "epoch": 0.20942454563298216, + "grad_norm": 0.10267560929059982, + "learning_rate": 2.4541088246711548e-05, + "loss": 0.0342, + "step": 94960 + }, + { + "epoch": 0.2094465996078803, + "grad_norm": 0.09364785999059677, + "learning_rate": 2.453981232305939e-05, + "loss": 0.0321, + "step": 94970 + }, + { + "epoch": 0.2094686535827785, + "grad_norm": 0.10514859855175018, + "learning_rate": 2.4538536283489504e-05, + "loss": 0.032, + "step": 94980 + }, + { + "epoch": 0.20949070755767665, + "grad_norm": 0.10411155968904495, + "learning_rate": 2.4537260128017395e-05, + "loss": 0.0309, + "step": 94990 + }, + { + "epoch": 0.20951276153257484, + "grad_norm": 0.12221334129571915, + "learning_rate": 2.4535983856658572e-05, + "loss": 0.031, + "step": 95000 + }, + { + "epoch": 0.209534815507473, + "grad_norm": 0.14025482535362244, + "learning_rate": 2.4534707469428543e-05, + "loss": 0.033, + "step": 95010 + }, + { + "epoch": 0.20955686948237115, + "grad_norm": 0.12691381573677063, + "learning_rate": 2.453343096634282e-05, + "loss": 0.0338, + "step": 95020 + }, + { + "epoch": 0.20957892345726933, + "grad_norm": 0.10709551721811295, + "learning_rate": 2.4532154347416905e-05, + "loss": 0.0323, + "step": 95030 + }, + { + "epoch": 0.2096009774321675, + "grad_norm": 0.10162655264139175, + "learning_rate": 2.4530877612666317e-05, + "loss": 0.0323, + "step": 95040 + }, + { + "epoch": 0.20962303140706565, + "grad_norm": 0.09435974061489105, + "learning_rate": 2.4529600762106557e-05, + "loss": 0.0335, + "step": 95050 + }, + { + "epoch": 0.20964508538196383, + "grad_norm": 0.08959546685218811, + "learning_rate": 2.4528323795753163e-05, + "loss": 0.0347, + "step": 95060 + }, + { + "epoch": 0.209667139356862, + "grad_norm": 0.10665048658847809, + "learning_rate": 2.4527046713621636e-05, + "loss": 0.0325, + "step": 95070 + }, + { + "epoch": 0.20968919333176014, + "grad_norm": 0.10187015682458878, + "learning_rate": 2.4525769515727492e-05, + "loss": 0.0319, + "step": 95080 + }, + { + "epoch": 0.20971124730665833, + "grad_norm": 0.11205917596817017, + "learning_rate": 2.452449220208626e-05, + "loss": 0.0325, + "step": 95090 + }, + { + "epoch": 0.20973330128155648, + "grad_norm": 0.13666017353534698, + "learning_rate": 2.452321477271345e-05, + "loss": 0.0298, + "step": 95100 + }, + { + "epoch": 0.20975535525645464, + "grad_norm": 0.12800350785255432, + "learning_rate": 2.4521937227624595e-05, + "loss": 0.0321, + "step": 95110 + }, + { + "epoch": 0.20977740923135282, + "grad_norm": 0.08726253360509872, + "learning_rate": 2.4520659566835206e-05, + "loss": 0.0338, + "step": 95120 + }, + { + "epoch": 0.20979946320625098, + "grad_norm": 0.11770449578762054, + "learning_rate": 2.4519381790360818e-05, + "loss": 0.0311, + "step": 95130 + }, + { + "epoch": 0.20982151718114914, + "grad_norm": 0.12055293470621109, + "learning_rate": 2.4518103898216953e-05, + "loss": 0.0313, + "step": 95140 + }, + { + "epoch": 0.20984357115604732, + "grad_norm": 0.12458613514900208, + "learning_rate": 2.4516825890419137e-05, + "loss": 0.0311, + "step": 95150 + }, + { + "epoch": 0.20986562513094548, + "grad_norm": 0.11049237102270126, + "learning_rate": 2.4515547766982904e-05, + "loss": 0.0331, + "step": 95160 + }, + { + "epoch": 0.20988767910584363, + "grad_norm": 0.10075824707746506, + "learning_rate": 2.4514269527923783e-05, + "loss": 0.0334, + "step": 95170 + }, + { + "epoch": 0.20990973308074182, + "grad_norm": 0.09876123815774918, + "learning_rate": 2.4512991173257302e-05, + "loss": 0.0316, + "step": 95180 + }, + { + "epoch": 0.20993178705563997, + "grad_norm": 0.11824635416269302, + "learning_rate": 2.4511712702998993e-05, + "loss": 0.0328, + "step": 95190 + }, + { + "epoch": 0.20995384103053813, + "grad_norm": 0.10986935347318649, + "learning_rate": 2.4510434117164394e-05, + "loss": 0.0319, + "step": 95200 + }, + { + "epoch": 0.2099758950054363, + "grad_norm": 0.13070984184741974, + "learning_rate": 2.4509155415769045e-05, + "loss": 0.0323, + "step": 95210 + }, + { + "epoch": 0.20999794898033447, + "grad_norm": 0.1021849662065506, + "learning_rate": 2.4507876598828483e-05, + "loss": 0.0327, + "step": 95220 + }, + { + "epoch": 0.21002000295523263, + "grad_norm": 0.10402462631464005, + "learning_rate": 2.4506597666358236e-05, + "loss": 0.0347, + "step": 95230 + }, + { + "epoch": 0.2100420569301308, + "grad_norm": 0.09859392791986465, + "learning_rate": 2.450531861837386e-05, + "loss": 0.0324, + "step": 95240 + }, + { + "epoch": 0.21006411090502897, + "grad_norm": 0.09880127012729645, + "learning_rate": 2.450403945489088e-05, + "loss": 0.0323, + "step": 95250 + }, + { + "epoch": 0.21008616487992712, + "grad_norm": 0.11049795150756836, + "learning_rate": 2.4502760175924846e-05, + "loss": 0.0304, + "step": 95260 + }, + { + "epoch": 0.2101082188548253, + "grad_norm": 0.0997227132320404, + "learning_rate": 2.450148078149131e-05, + "loss": 0.0318, + "step": 95270 + }, + { + "epoch": 0.21013027282972346, + "grad_norm": 0.1756352037191391, + "learning_rate": 2.4500201271605813e-05, + "loss": 0.0338, + "step": 95280 + }, + { + "epoch": 0.21015232680462162, + "grad_norm": 0.10617413371801376, + "learning_rate": 2.4498921646283897e-05, + "loss": 0.0313, + "step": 95290 + }, + { + "epoch": 0.2101743807795198, + "grad_norm": 0.08353354781866074, + "learning_rate": 2.4497641905541114e-05, + "loss": 0.0328, + "step": 95300 + }, + { + "epoch": 0.21019643475441796, + "grad_norm": 0.10446628928184509, + "learning_rate": 2.449636204939302e-05, + "loss": 0.0312, + "step": 95310 + }, + { + "epoch": 0.21021848872931612, + "grad_norm": 0.11082641035318375, + "learning_rate": 2.4495082077855155e-05, + "loss": 0.0328, + "step": 95320 + }, + { + "epoch": 0.2102405427042143, + "grad_norm": 0.16267932951450348, + "learning_rate": 2.449380199094308e-05, + "loss": 0.0318, + "step": 95330 + }, + { + "epoch": 0.21026259667911246, + "grad_norm": 0.12297394871711731, + "learning_rate": 2.4492521788672347e-05, + "loss": 0.0319, + "step": 95340 + }, + { + "epoch": 0.2102846506540106, + "grad_norm": 0.11176249384880066, + "learning_rate": 2.449124147105851e-05, + "loss": 0.0331, + "step": 95350 + }, + { + "epoch": 0.2103067046289088, + "grad_norm": 0.1388195902109146, + "learning_rate": 2.4489961038117134e-05, + "loss": 0.0325, + "step": 95360 + }, + { + "epoch": 0.21032875860380695, + "grad_norm": 0.09740517288446426, + "learning_rate": 2.4488680489863765e-05, + "loss": 0.0313, + "step": 95370 + }, + { + "epoch": 0.2103508125787051, + "grad_norm": 0.10032990574836731, + "learning_rate": 2.4487399826313972e-05, + "loss": 0.0319, + "step": 95380 + }, + { + "epoch": 0.2103728665536033, + "grad_norm": 0.10473112761974335, + "learning_rate": 2.4486119047483313e-05, + "loss": 0.0341, + "step": 95390 + }, + { + "epoch": 0.21039492052850145, + "grad_norm": 0.12388323247432709, + "learning_rate": 2.448483815338735e-05, + "loss": 0.0328, + "step": 95400 + }, + { + "epoch": 0.2104169745033996, + "grad_norm": 0.11767831444740295, + "learning_rate": 2.4483557144041652e-05, + "loss": 0.0339, + "step": 95410 + }, + { + "epoch": 0.2104390284782978, + "grad_norm": 0.12984444200992584, + "learning_rate": 2.4482276019461777e-05, + "loss": 0.0315, + "step": 95420 + }, + { + "epoch": 0.21046108245319595, + "grad_norm": 0.11436658352613449, + "learning_rate": 2.4480994779663296e-05, + "loss": 0.0328, + "step": 95430 + }, + { + "epoch": 0.21048313642809413, + "grad_norm": 0.10186909884214401, + "learning_rate": 2.4479713424661775e-05, + "loss": 0.033, + "step": 95440 + }, + { + "epoch": 0.2105051904029923, + "grad_norm": 0.08943182975053787, + "learning_rate": 2.4478431954472792e-05, + "loss": 0.032, + "step": 95450 + }, + { + "epoch": 0.21052724437789044, + "grad_norm": 0.09857654571533203, + "learning_rate": 2.4477150369111903e-05, + "loss": 0.0309, + "step": 95460 + }, + { + "epoch": 0.21054929835278863, + "grad_norm": 0.1046033725142479, + "learning_rate": 2.44758686685947e-05, + "loss": 0.0338, + "step": 95470 + }, + { + "epoch": 0.21057135232768678, + "grad_norm": 0.13051936030387878, + "learning_rate": 2.4474586852936736e-05, + "loss": 0.0336, + "step": 95480 + }, + { + "epoch": 0.21059340630258494, + "grad_norm": 0.12095806747674942, + "learning_rate": 2.44733049221536e-05, + "loss": 0.0317, + "step": 95490 + }, + { + "epoch": 0.21061546027748312, + "grad_norm": 0.13087214529514313, + "learning_rate": 2.447202287626087e-05, + "loss": 0.0333, + "step": 95500 + }, + { + "epoch": 0.21063751425238128, + "grad_norm": 0.1331114023923874, + "learning_rate": 2.447074071527411e-05, + "loss": 0.0332, + "step": 95510 + }, + { + "epoch": 0.21065956822727944, + "grad_norm": 0.10990997403860092, + "learning_rate": 2.4469458439208916e-05, + "loss": 0.0297, + "step": 95520 + }, + { + "epoch": 0.21068162220217762, + "grad_norm": 0.11233528703451157, + "learning_rate": 2.4468176048080858e-05, + "loss": 0.034, + "step": 95530 + }, + { + "epoch": 0.21070367617707578, + "grad_norm": 0.1162920817732811, + "learning_rate": 2.446689354190552e-05, + "loss": 0.0333, + "step": 95540 + }, + { + "epoch": 0.21072573015197393, + "grad_norm": 0.11639862507581711, + "learning_rate": 2.4465610920698487e-05, + "loss": 0.0341, + "step": 95550 + }, + { + "epoch": 0.21074778412687212, + "grad_norm": 0.09521403163671494, + "learning_rate": 2.4464328184475345e-05, + "loss": 0.032, + "step": 95560 + }, + { + "epoch": 0.21076983810177027, + "grad_norm": 0.12648816406726837, + "learning_rate": 2.4463045333251682e-05, + "loss": 0.0326, + "step": 95570 + }, + { + "epoch": 0.21079189207666843, + "grad_norm": 0.11474903672933578, + "learning_rate": 2.446176236704308e-05, + "loss": 0.0314, + "step": 95580 + }, + { + "epoch": 0.21081394605156661, + "grad_norm": 0.11200227588415146, + "learning_rate": 2.4460479285865132e-05, + "loss": 0.0342, + "step": 95590 + }, + { + "epoch": 0.21083600002646477, + "grad_norm": 0.1233963593840599, + "learning_rate": 2.4459196089733434e-05, + "loss": 0.0327, + "step": 95600 + }, + { + "epoch": 0.21085805400136293, + "grad_norm": 0.13420963287353516, + "learning_rate": 2.4457912778663573e-05, + "loss": 0.0302, + "step": 95610 + }, + { + "epoch": 0.2108801079762611, + "grad_norm": 0.1097707748413086, + "learning_rate": 2.4456629352671138e-05, + "loss": 0.0324, + "step": 95620 + }, + { + "epoch": 0.21090216195115927, + "grad_norm": 0.08576277643442154, + "learning_rate": 2.4455345811771724e-05, + "loss": 0.0323, + "step": 95630 + }, + { + "epoch": 0.21092421592605742, + "grad_norm": 0.11200440675020218, + "learning_rate": 2.4454062155980934e-05, + "loss": 0.0316, + "step": 95640 + }, + { + "epoch": 0.2109462699009556, + "grad_norm": 0.10165179520845413, + "learning_rate": 2.4452778385314367e-05, + "loss": 0.0324, + "step": 95650 + }, + { + "epoch": 0.21096832387585376, + "grad_norm": 0.13636650145053864, + "learning_rate": 2.4451494499787615e-05, + "loss": 0.0331, + "step": 95660 + }, + { + "epoch": 0.21099037785075192, + "grad_norm": 0.11919007450342178, + "learning_rate": 2.4450210499416276e-05, + "loss": 0.0336, + "step": 95670 + }, + { + "epoch": 0.2110124318256501, + "grad_norm": 0.11888550221920013, + "learning_rate": 2.4448926384215964e-05, + "loss": 0.0326, + "step": 95680 + }, + { + "epoch": 0.21103448580054826, + "grad_norm": 0.11734497547149658, + "learning_rate": 2.444764215420227e-05, + "loss": 0.0341, + "step": 95690 + }, + { + "epoch": 0.21105653977544642, + "grad_norm": 0.09580735117197037, + "learning_rate": 2.444635780939081e-05, + "loss": 0.033, + "step": 95700 + }, + { + "epoch": 0.2110785937503446, + "grad_norm": 0.09121724963188171, + "learning_rate": 2.444507334979718e-05, + "loss": 0.0329, + "step": 95710 + }, + { + "epoch": 0.21110064772524276, + "grad_norm": 0.09218105673789978, + "learning_rate": 2.444378877543699e-05, + "loss": 0.0329, + "step": 95720 + }, + { + "epoch": 0.2111227017001409, + "grad_norm": 0.12985600531101227, + "learning_rate": 2.444250408632585e-05, + "loss": 0.0324, + "step": 95730 + }, + { + "epoch": 0.2111447556750391, + "grad_norm": 0.09274030476808548, + "learning_rate": 2.4441219282479373e-05, + "loss": 0.0301, + "step": 95740 + }, + { + "epoch": 0.21116680964993725, + "grad_norm": 0.11173269897699356, + "learning_rate": 2.4439934363913167e-05, + "loss": 0.0318, + "step": 95750 + }, + { + "epoch": 0.2111888636248354, + "grad_norm": 0.11053602397441864, + "learning_rate": 2.4438649330642846e-05, + "loss": 0.032, + "step": 95760 + }, + { + "epoch": 0.2112109175997336, + "grad_norm": 0.10472354292869568, + "learning_rate": 2.4437364182684023e-05, + "loss": 0.032, + "step": 95770 + }, + { + "epoch": 0.21123297157463175, + "grad_norm": 0.12759573757648468, + "learning_rate": 2.4436078920052315e-05, + "loss": 0.0325, + "step": 95780 + }, + { + "epoch": 0.2112550255495299, + "grad_norm": 0.10863728821277618, + "learning_rate": 2.443479354276334e-05, + "loss": 0.0313, + "step": 95790 + }, + { + "epoch": 0.2112770795244281, + "grad_norm": 0.07069983333349228, + "learning_rate": 2.443350805083271e-05, + "loss": 0.034, + "step": 95800 + }, + { + "epoch": 0.21129913349932625, + "grad_norm": 0.0962286964058876, + "learning_rate": 2.443222244427606e-05, + "loss": 0.0327, + "step": 95810 + }, + { + "epoch": 0.2113211874742244, + "grad_norm": 0.1136631891131401, + "learning_rate": 2.4430936723108993e-05, + "loss": 0.037, + "step": 95820 + }, + { + "epoch": 0.2113432414491226, + "grad_norm": 0.12810707092285156, + "learning_rate": 2.442965088734715e-05, + "loss": 0.0331, + "step": 95830 + }, + { + "epoch": 0.21136529542402074, + "grad_norm": 0.09617310762405396, + "learning_rate": 2.4428364937006135e-05, + "loss": 0.0334, + "step": 95840 + }, + { + "epoch": 0.21138734939891893, + "grad_norm": 0.12206797301769257, + "learning_rate": 2.442707887210159e-05, + "loss": 0.033, + "step": 95850 + }, + { + "epoch": 0.21140940337381708, + "grad_norm": 0.1059115082025528, + "learning_rate": 2.4425792692649135e-05, + "loss": 0.0329, + "step": 95860 + }, + { + "epoch": 0.21143145734871524, + "grad_norm": 0.14329108595848083, + "learning_rate": 2.4424506398664398e-05, + "loss": 0.0329, + "step": 95870 + }, + { + "epoch": 0.21145351132361342, + "grad_norm": 0.14674362540245056, + "learning_rate": 2.4423219990163015e-05, + "loss": 0.0331, + "step": 95880 + }, + { + "epoch": 0.21147556529851158, + "grad_norm": 0.14132048189640045, + "learning_rate": 2.4421933467160603e-05, + "loss": 0.0324, + "step": 95890 + }, + { + "epoch": 0.21149761927340974, + "grad_norm": 0.1060526892542839, + "learning_rate": 2.442064682967281e-05, + "loss": 0.0315, + "step": 95900 + }, + { + "epoch": 0.21151967324830792, + "grad_norm": 0.13125334680080414, + "learning_rate": 2.4419360077715265e-05, + "loss": 0.0328, + "step": 95910 + }, + { + "epoch": 0.21154172722320608, + "grad_norm": 0.12693852186203003, + "learning_rate": 2.4418073211303602e-05, + "loss": 0.0339, + "step": 95920 + }, + { + "epoch": 0.21156378119810423, + "grad_norm": 0.10996479541063309, + "learning_rate": 2.441678623045345e-05, + "loss": 0.0334, + "step": 95930 + }, + { + "epoch": 0.21158583517300242, + "grad_norm": 0.10971380770206451, + "learning_rate": 2.4415499135180462e-05, + "loss": 0.0317, + "step": 95940 + }, + { + "epoch": 0.21160788914790057, + "grad_norm": 0.09799636900424957, + "learning_rate": 2.4414211925500264e-05, + "loss": 0.033, + "step": 95950 + }, + { + "epoch": 0.21162994312279873, + "grad_norm": 0.1243370994925499, + "learning_rate": 2.441292460142851e-05, + "loss": 0.0311, + "step": 95960 + }, + { + "epoch": 0.21165199709769691, + "grad_norm": 0.1478598415851593, + "learning_rate": 2.4411637162980828e-05, + "loss": 0.0316, + "step": 95970 + }, + { + "epoch": 0.21167405107259507, + "grad_norm": 0.1795307844877243, + "learning_rate": 2.4410349610172872e-05, + "loss": 0.0296, + "step": 95980 + }, + { + "epoch": 0.21169610504749323, + "grad_norm": 0.0982639268040657, + "learning_rate": 2.440906194302028e-05, + "loss": 0.0331, + "step": 95990 + }, + { + "epoch": 0.2117181590223914, + "grad_norm": 0.11606186628341675, + "learning_rate": 2.4407774161538706e-05, + "loss": 0.0337, + "step": 96000 + }, + { + "epoch": 0.21174021299728957, + "grad_norm": 0.10228821635246277, + "learning_rate": 2.440648626574379e-05, + "loss": 0.0332, + "step": 96010 + }, + { + "epoch": 0.21176226697218772, + "grad_norm": 0.0812671110033989, + "learning_rate": 2.4405198255651185e-05, + "loss": 0.0322, + "step": 96020 + }, + { + "epoch": 0.2117843209470859, + "grad_norm": 0.098365917801857, + "learning_rate": 2.440391013127654e-05, + "loss": 0.0329, + "step": 96030 + }, + { + "epoch": 0.21180637492198406, + "grad_norm": 0.11299674212932587, + "learning_rate": 2.4402621892635514e-05, + "loss": 0.0326, + "step": 96040 + }, + { + "epoch": 0.21182842889688222, + "grad_norm": 0.10015381127595901, + "learning_rate": 2.440133353974375e-05, + "loss": 0.0329, + "step": 96050 + }, + { + "epoch": 0.2118504828717804, + "grad_norm": 0.1029401645064354, + "learning_rate": 2.440004507261691e-05, + "loss": 0.0311, + "step": 96060 + }, + { + "epoch": 0.21187253684667856, + "grad_norm": 0.1160631999373436, + "learning_rate": 2.4398756491270643e-05, + "loss": 0.0323, + "step": 96070 + }, + { + "epoch": 0.21189459082157672, + "grad_norm": 0.12647323310375214, + "learning_rate": 2.4397467795720614e-05, + "loss": 0.0342, + "step": 96080 + }, + { + "epoch": 0.2119166447964749, + "grad_norm": 0.08560729771852493, + "learning_rate": 2.439617898598248e-05, + "loss": 0.0329, + "step": 96090 + }, + { + "epoch": 0.21193869877137306, + "grad_norm": 0.10600441694259644, + "learning_rate": 2.4394890062071898e-05, + "loss": 0.0319, + "step": 96100 + }, + { + "epoch": 0.2119607527462712, + "grad_norm": 0.10865170508623123, + "learning_rate": 2.439360102400453e-05, + "loss": 0.0301, + "step": 96110 + }, + { + "epoch": 0.2119828067211694, + "grad_norm": 0.12020042538642883, + "learning_rate": 2.439231187179604e-05, + "loss": 0.0317, + "step": 96120 + }, + { + "epoch": 0.21200486069606755, + "grad_norm": 0.126191183924675, + "learning_rate": 2.4391022605462096e-05, + "loss": 0.0321, + "step": 96130 + }, + { + "epoch": 0.2120269146709657, + "grad_norm": 0.10958744585514069, + "learning_rate": 2.438973322501836e-05, + "loss": 0.0314, + "step": 96140 + }, + { + "epoch": 0.2120489686458639, + "grad_norm": 0.10895536094903946, + "learning_rate": 2.4388443730480498e-05, + "loss": 0.0338, + "step": 96150 + }, + { + "epoch": 0.21207102262076205, + "grad_norm": 0.09159638732671738, + "learning_rate": 2.4387154121864182e-05, + "loss": 0.0316, + "step": 96160 + }, + { + "epoch": 0.2120930765956602, + "grad_norm": 0.11379586160182953, + "learning_rate": 2.4385864399185084e-05, + "loss": 0.0331, + "step": 96170 + }, + { + "epoch": 0.2121151305705584, + "grad_norm": 0.09182512760162354, + "learning_rate": 2.4384574562458865e-05, + "loss": 0.0321, + "step": 96180 + }, + { + "epoch": 0.21213718454545655, + "grad_norm": 0.11031973361968994, + "learning_rate": 2.4383284611701208e-05, + "loss": 0.0312, + "step": 96190 + }, + { + "epoch": 0.2121592385203547, + "grad_norm": 0.09746236354112625, + "learning_rate": 2.4381994546927783e-05, + "loss": 0.0333, + "step": 96200 + }, + { + "epoch": 0.2121812924952529, + "grad_norm": 0.10956739634275436, + "learning_rate": 2.4380704368154264e-05, + "loss": 0.0325, + "step": 96210 + }, + { + "epoch": 0.21220334647015104, + "grad_norm": 0.0953529104590416, + "learning_rate": 2.4379414075396335e-05, + "loss": 0.0311, + "step": 96220 + }, + { + "epoch": 0.2122254004450492, + "grad_norm": 0.09142123907804489, + "learning_rate": 2.4378123668669665e-05, + "loss": 0.0316, + "step": 96230 + }, + { + "epoch": 0.21224745441994738, + "grad_norm": 0.1112242341041565, + "learning_rate": 2.437683314798994e-05, + "loss": 0.0338, + "step": 96240 + }, + { + "epoch": 0.21226950839484554, + "grad_norm": 0.1122489720582962, + "learning_rate": 2.4375542513372838e-05, + "loss": 0.0316, + "step": 96250 + }, + { + "epoch": 0.2122915623697437, + "grad_norm": 0.10510194301605225, + "learning_rate": 2.437425176483405e-05, + "loss": 0.0323, + "step": 96260 + }, + { + "epoch": 0.21231361634464188, + "grad_norm": 0.1150180771946907, + "learning_rate": 2.4372960902389243e-05, + "loss": 0.0321, + "step": 96270 + }, + { + "epoch": 0.21233567031954004, + "grad_norm": 0.11868645250797272, + "learning_rate": 2.4371669926054112e-05, + "loss": 0.0316, + "step": 96280 + }, + { + "epoch": 0.21235772429443822, + "grad_norm": 0.09468545019626617, + "learning_rate": 2.4370378835844346e-05, + "loss": 0.0316, + "step": 96290 + }, + { + "epoch": 0.21237977826933638, + "grad_norm": 0.10999353229999542, + "learning_rate": 2.4369087631775632e-05, + "loss": 0.033, + "step": 96300 + }, + { + "epoch": 0.21240183224423453, + "grad_norm": 0.12795332074165344, + "learning_rate": 2.436779631386365e-05, + "loss": 0.0318, + "step": 96310 + }, + { + "epoch": 0.21242388621913272, + "grad_norm": 0.11349648982286453, + "learning_rate": 2.4366504882124105e-05, + "loss": 0.032, + "step": 96320 + }, + { + "epoch": 0.21244594019403087, + "grad_norm": 0.1002606451511383, + "learning_rate": 2.436521333657268e-05, + "loss": 0.0328, + "step": 96330 + }, + { + "epoch": 0.21246799416892903, + "grad_norm": 0.12752598524093628, + "learning_rate": 2.436392167722507e-05, + "loss": 0.0334, + "step": 96340 + }, + { + "epoch": 0.21249004814382721, + "grad_norm": 0.16182997822761536, + "learning_rate": 2.4362629904096973e-05, + "loss": 0.0318, + "step": 96350 + }, + { + "epoch": 0.21251210211872537, + "grad_norm": 0.11705461889505386, + "learning_rate": 2.4361338017204083e-05, + "loss": 0.0339, + "step": 96360 + }, + { + "epoch": 0.21253415609362353, + "grad_norm": 0.10375947505235672, + "learning_rate": 2.436004601656209e-05, + "loss": 0.0326, + "step": 96370 + }, + { + "epoch": 0.2125562100685217, + "grad_norm": 0.10363668203353882, + "learning_rate": 2.4358753902186708e-05, + "loss": 0.0328, + "step": 96380 + }, + { + "epoch": 0.21257826404341987, + "grad_norm": 0.10354333370923996, + "learning_rate": 2.435746167409363e-05, + "loss": 0.0332, + "step": 96390 + }, + { + "epoch": 0.21260031801831802, + "grad_norm": 0.13076412677764893, + "learning_rate": 2.4356169332298555e-05, + "loss": 0.0314, + "step": 96400 + }, + { + "epoch": 0.2126223719932162, + "grad_norm": 0.1326092630624771, + "learning_rate": 2.4354876876817194e-05, + "loss": 0.0327, + "step": 96410 + }, + { + "epoch": 0.21264442596811436, + "grad_norm": 0.13377155363559723, + "learning_rate": 2.435358430766524e-05, + "loss": 0.0343, + "step": 96420 + }, + { + "epoch": 0.21266647994301252, + "grad_norm": 0.1538924127817154, + "learning_rate": 2.4352291624858407e-05, + "loss": 0.0318, + "step": 96430 + }, + { + "epoch": 0.2126885339179107, + "grad_norm": 0.10949523746967316, + "learning_rate": 2.4350998828412404e-05, + "loss": 0.0351, + "step": 96440 + }, + { + "epoch": 0.21271058789280886, + "grad_norm": 0.16562344133853912, + "learning_rate": 2.4349705918342932e-05, + "loss": 0.0323, + "step": 96450 + }, + { + "epoch": 0.21273264186770702, + "grad_norm": 0.09682822227478027, + "learning_rate": 2.4348412894665708e-05, + "loss": 0.0308, + "step": 96460 + }, + { + "epoch": 0.2127546958426052, + "grad_norm": 0.13490231335163116, + "learning_rate": 2.434711975739644e-05, + "loss": 0.0327, + "step": 96470 + }, + { + "epoch": 0.21277674981750336, + "grad_norm": 0.09275829046964645, + "learning_rate": 2.4345826506550843e-05, + "loss": 0.0307, + "step": 96480 + }, + { + "epoch": 0.21279880379240151, + "grad_norm": 0.10664687305688858, + "learning_rate": 2.4344533142144628e-05, + "loss": 0.0332, + "step": 96490 + }, + { + "epoch": 0.2128208577672997, + "grad_norm": 0.10805784165859222, + "learning_rate": 2.4343239664193514e-05, + "loss": 0.0323, + "step": 96500 + }, + { + "epoch": 0.21284291174219785, + "grad_norm": 0.11508762091398239, + "learning_rate": 2.434194607271322e-05, + "loss": 0.0328, + "step": 96510 + }, + { + "epoch": 0.212864965717096, + "grad_norm": 0.11571390926837921, + "learning_rate": 2.4340652367719457e-05, + "loss": 0.0318, + "step": 96520 + }, + { + "epoch": 0.2128870196919942, + "grad_norm": 0.1001431941986084, + "learning_rate": 2.4339358549227945e-05, + "loss": 0.0344, + "step": 96530 + }, + { + "epoch": 0.21290907366689235, + "grad_norm": 0.10200539976358414, + "learning_rate": 2.4338064617254418e-05, + "loss": 0.0333, + "step": 96540 + }, + { + "epoch": 0.2129311276417905, + "grad_norm": 0.09859758615493774, + "learning_rate": 2.433677057181458e-05, + "loss": 0.0328, + "step": 96550 + }, + { + "epoch": 0.2129531816166887, + "grad_norm": 0.10358741879463196, + "learning_rate": 2.433547641292417e-05, + "loss": 0.0325, + "step": 96560 + }, + { + "epoch": 0.21297523559158685, + "grad_norm": 0.10669668763875961, + "learning_rate": 2.433418214059891e-05, + "loss": 0.0325, + "step": 96570 + }, + { + "epoch": 0.212997289566485, + "grad_norm": 0.10641072690486908, + "learning_rate": 2.4332887754854515e-05, + "loss": 0.0338, + "step": 96580 + }, + { + "epoch": 0.2130193435413832, + "grad_norm": 0.09528905153274536, + "learning_rate": 2.433159325570673e-05, + "loss": 0.0324, + "step": 96590 + }, + { + "epoch": 0.21304139751628134, + "grad_norm": 0.13623274862766266, + "learning_rate": 2.433029864317127e-05, + "loss": 0.032, + "step": 96600 + }, + { + "epoch": 0.2130634514911795, + "grad_norm": 0.13287001848220825, + "learning_rate": 2.432900391726388e-05, + "loss": 0.0333, + "step": 96610 + }, + { + "epoch": 0.21308550546607768, + "grad_norm": 0.09957504272460938, + "learning_rate": 2.4327709078000278e-05, + "loss": 0.0322, + "step": 96620 + }, + { + "epoch": 0.21310755944097584, + "grad_norm": 0.10219456255435944, + "learning_rate": 2.4326414125396207e-05, + "loss": 0.0325, + "step": 96630 + }, + { + "epoch": 0.213129613415874, + "grad_norm": 0.1097409576177597, + "learning_rate": 2.43251190594674e-05, + "loss": 0.0312, + "step": 96640 + }, + { + "epoch": 0.21315166739077218, + "grad_norm": 0.10107407718896866, + "learning_rate": 2.432382388022959e-05, + "loss": 0.0336, + "step": 96650 + }, + { + "epoch": 0.21317372136567034, + "grad_norm": 0.0971764400601387, + "learning_rate": 2.4322528587698516e-05, + "loss": 0.0314, + "step": 96660 + }, + { + "epoch": 0.2131957753405685, + "grad_norm": 0.09416045248508453, + "learning_rate": 2.4321233181889918e-05, + "loss": 0.0307, + "step": 96670 + }, + { + "epoch": 0.21321782931546668, + "grad_norm": 0.08363909274339676, + "learning_rate": 2.4319937662819536e-05, + "loss": 0.0326, + "step": 96680 + }, + { + "epoch": 0.21323988329036483, + "grad_norm": 0.12081082910299301, + "learning_rate": 2.431864203050311e-05, + "loss": 0.0342, + "step": 96690 + }, + { + "epoch": 0.21326193726526302, + "grad_norm": 0.1079774722456932, + "learning_rate": 2.431734628495639e-05, + "loss": 0.0336, + "step": 96700 + }, + { + "epoch": 0.21328399124016117, + "grad_norm": 0.08038952946662903, + "learning_rate": 2.4316050426195113e-05, + "loss": 0.032, + "step": 96710 + }, + { + "epoch": 0.21330604521505933, + "grad_norm": 0.0959298238158226, + "learning_rate": 2.4314754454235025e-05, + "loss": 0.033, + "step": 96720 + }, + { + "epoch": 0.21332809918995752, + "grad_norm": 0.10613872110843658, + "learning_rate": 2.431345836909188e-05, + "loss": 0.0304, + "step": 96730 + }, + { + "epoch": 0.21335015316485567, + "grad_norm": 0.14037716388702393, + "learning_rate": 2.431216217078142e-05, + "loss": 0.0342, + "step": 96740 + }, + { + "epoch": 0.21337220713975383, + "grad_norm": 0.10378660261631012, + "learning_rate": 2.4310865859319396e-05, + "loss": 0.0328, + "step": 96750 + }, + { + "epoch": 0.213394261114652, + "grad_norm": 0.11315564066171646, + "learning_rate": 2.4309569434721565e-05, + "loss": 0.0313, + "step": 96760 + }, + { + "epoch": 0.21341631508955017, + "grad_norm": 0.1266423612833023, + "learning_rate": 2.4308272897003675e-05, + "loss": 0.0325, + "step": 96770 + }, + { + "epoch": 0.21343836906444832, + "grad_norm": 0.1050196960568428, + "learning_rate": 2.430697624618148e-05, + "loss": 0.0329, + "step": 96780 + }, + { + "epoch": 0.2134604230393465, + "grad_norm": 0.11159010976552963, + "learning_rate": 2.4305679482270737e-05, + "loss": 0.0325, + "step": 96790 + }, + { + "epoch": 0.21348247701424466, + "grad_norm": 0.07666636258363724, + "learning_rate": 2.43043826052872e-05, + "loss": 0.0323, + "step": 96800 + }, + { + "epoch": 0.21350453098914282, + "grad_norm": 0.1400281935930252, + "learning_rate": 2.430308561524663e-05, + "loss": 0.0324, + "step": 96810 + }, + { + "epoch": 0.213526584964041, + "grad_norm": 0.09495547413825989, + "learning_rate": 2.430178851216479e-05, + "loss": 0.0332, + "step": 96820 + }, + { + "epoch": 0.21354863893893916, + "grad_norm": 0.10051233321428299, + "learning_rate": 2.4300491296057433e-05, + "loss": 0.0312, + "step": 96830 + }, + { + "epoch": 0.21357069291383732, + "grad_norm": 0.12368693202733994, + "learning_rate": 2.4299193966940324e-05, + "loss": 0.0322, + "step": 96840 + }, + { + "epoch": 0.2135927468887355, + "grad_norm": 0.11567328870296478, + "learning_rate": 2.4297896524829233e-05, + "loss": 0.0315, + "step": 96850 + }, + { + "epoch": 0.21361480086363366, + "grad_norm": 0.11202061176300049, + "learning_rate": 2.4296598969739922e-05, + "loss": 0.03, + "step": 96860 + }, + { + "epoch": 0.21363685483853181, + "grad_norm": 0.09032201021909714, + "learning_rate": 2.4295301301688152e-05, + "loss": 0.0318, + "step": 96870 + }, + { + "epoch": 0.21365890881343, + "grad_norm": 0.09095261991024017, + "learning_rate": 2.4294003520689696e-05, + "loss": 0.0316, + "step": 96880 + }, + { + "epoch": 0.21368096278832815, + "grad_norm": 0.08314230293035507, + "learning_rate": 2.429270562676032e-05, + "loss": 0.0333, + "step": 96890 + }, + { + "epoch": 0.2137030167632263, + "grad_norm": 0.12062038481235504, + "learning_rate": 2.42914076199158e-05, + "loss": 0.0329, + "step": 96900 + }, + { + "epoch": 0.2137250707381245, + "grad_norm": 0.1232779324054718, + "learning_rate": 2.4290109500171906e-05, + "loss": 0.0334, + "step": 96910 + }, + { + "epoch": 0.21374712471302265, + "grad_norm": 0.10358699411153793, + "learning_rate": 2.4288811267544406e-05, + "loss": 0.033, + "step": 96920 + }, + { + "epoch": 0.2137691786879208, + "grad_norm": 0.11516933888196945, + "learning_rate": 2.4287512922049086e-05, + "loss": 0.0336, + "step": 96930 + }, + { + "epoch": 0.213791232662819, + "grad_norm": 0.11853265017271042, + "learning_rate": 2.428621446370171e-05, + "loss": 0.0347, + "step": 96940 + }, + { + "epoch": 0.21381328663771715, + "grad_norm": 0.11381449550390244, + "learning_rate": 2.4284915892518057e-05, + "loss": 0.0332, + "step": 96950 + }, + { + "epoch": 0.2138353406126153, + "grad_norm": 0.10999186336994171, + "learning_rate": 2.4283617208513917e-05, + "loss": 0.0328, + "step": 96960 + }, + { + "epoch": 0.2138573945875135, + "grad_norm": 0.07689811289310455, + "learning_rate": 2.4282318411705055e-05, + "loss": 0.0321, + "step": 96970 + }, + { + "epoch": 0.21387944856241164, + "grad_norm": 0.10164255648851395, + "learning_rate": 2.4281019502107266e-05, + "loss": 0.0324, + "step": 96980 + }, + { + "epoch": 0.2139015025373098, + "grad_norm": 0.11166057735681534, + "learning_rate": 2.4279720479736326e-05, + "loss": 0.0318, + "step": 96990 + }, + { + "epoch": 0.21392355651220799, + "grad_norm": 0.10841410607099533, + "learning_rate": 2.427842134460802e-05, + "loss": 0.0322, + "step": 97000 + }, + { + "epoch": 0.21394561048710614, + "grad_norm": 0.13761794567108154, + "learning_rate": 2.4277122096738134e-05, + "loss": 0.0327, + "step": 97010 + }, + { + "epoch": 0.2139676644620043, + "grad_norm": 0.1070483848452568, + "learning_rate": 2.427582273614246e-05, + "loss": 0.0322, + "step": 97020 + }, + { + "epoch": 0.21398971843690248, + "grad_norm": 0.09787621349096298, + "learning_rate": 2.427452326283678e-05, + "loss": 0.033, + "step": 97030 + }, + { + "epoch": 0.21401177241180064, + "grad_norm": 0.13120700418949127, + "learning_rate": 2.427322367683688e-05, + "loss": 0.0339, + "step": 97040 + }, + { + "epoch": 0.2140338263866988, + "grad_norm": 0.09631303697824478, + "learning_rate": 2.4271923978158564e-05, + "loss": 0.0302, + "step": 97050 + }, + { + "epoch": 0.21405588036159698, + "grad_norm": 0.08634969592094421, + "learning_rate": 2.427062416681761e-05, + "loss": 0.0316, + "step": 97060 + }, + { + "epoch": 0.21407793433649513, + "grad_norm": 0.11639430373907089, + "learning_rate": 2.4269324242829825e-05, + "loss": 0.0329, + "step": 97070 + }, + { + "epoch": 0.2140999883113933, + "grad_norm": 0.13328254222869873, + "learning_rate": 2.4268024206210995e-05, + "loss": 0.0322, + "step": 97080 + }, + { + "epoch": 0.21412204228629148, + "grad_norm": 0.12315750122070312, + "learning_rate": 2.4266724056976926e-05, + "loss": 0.0352, + "step": 97090 + }, + { + "epoch": 0.21414409626118963, + "grad_norm": 0.13116511702537537, + "learning_rate": 2.4265423795143403e-05, + "loss": 0.0336, + "step": 97100 + }, + { + "epoch": 0.2141661502360878, + "grad_norm": 0.10644765198230743, + "learning_rate": 2.4264123420726242e-05, + "loss": 0.0313, + "step": 97110 + }, + { + "epoch": 0.21418820421098597, + "grad_norm": 0.16348537802696228, + "learning_rate": 2.426282293374123e-05, + "loss": 0.0325, + "step": 97120 + }, + { + "epoch": 0.21421025818588413, + "grad_norm": 0.13587340712547302, + "learning_rate": 2.4261522334204176e-05, + "loss": 0.0337, + "step": 97130 + }, + { + "epoch": 0.2142323121607823, + "grad_norm": 0.1301957070827484, + "learning_rate": 2.426022162213088e-05, + "loss": 0.0334, + "step": 97140 + }, + { + "epoch": 0.21425436613568047, + "grad_norm": 0.1303301602602005, + "learning_rate": 2.4258920797537147e-05, + "loss": 0.0341, + "step": 97150 + }, + { + "epoch": 0.21427642011057862, + "grad_norm": 0.10566990822553635, + "learning_rate": 2.4257619860438783e-05, + "loss": 0.0321, + "step": 97160 + }, + { + "epoch": 0.2142984740854768, + "grad_norm": 0.09219307452440262, + "learning_rate": 2.42563188108516e-05, + "loss": 0.033, + "step": 97170 + }, + { + "epoch": 0.21432052806037497, + "grad_norm": 0.1377437561750412, + "learning_rate": 2.4255017648791402e-05, + "loss": 0.0337, + "step": 97180 + }, + { + "epoch": 0.21434258203527312, + "grad_norm": 0.1076604500412941, + "learning_rate": 2.4253716374274e-05, + "loss": 0.0331, + "step": 97190 + }, + { + "epoch": 0.2143646360101713, + "grad_norm": 0.10557366907596588, + "learning_rate": 2.4252414987315213e-05, + "loss": 0.0312, + "step": 97200 + }, + { + "epoch": 0.21438668998506946, + "grad_norm": 0.0894191786646843, + "learning_rate": 2.4251113487930847e-05, + "loss": 0.0321, + "step": 97210 + }, + { + "epoch": 0.21440874395996762, + "grad_norm": 0.09119027853012085, + "learning_rate": 2.4249811876136714e-05, + "loss": 0.0309, + "step": 97220 + }, + { + "epoch": 0.2144307979348658, + "grad_norm": 0.10564063489437103, + "learning_rate": 2.4248510151948635e-05, + "loss": 0.0339, + "step": 97230 + }, + { + "epoch": 0.21445285190976396, + "grad_norm": 0.10193327814340591, + "learning_rate": 2.424720831538242e-05, + "loss": 0.0327, + "step": 97240 + }, + { + "epoch": 0.21447490588466211, + "grad_norm": 0.12090475857257843, + "learning_rate": 2.42459063664539e-05, + "loss": 0.0334, + "step": 97250 + }, + { + "epoch": 0.2144969598595603, + "grad_norm": 0.11569879949092865, + "learning_rate": 2.4244604305178886e-05, + "loss": 0.0335, + "step": 97260 + }, + { + "epoch": 0.21451901383445846, + "grad_norm": 0.09685914218425751, + "learning_rate": 2.4243302131573207e-05, + "loss": 0.0329, + "step": 97270 + }, + { + "epoch": 0.2145410678093566, + "grad_norm": 0.1091841533780098, + "learning_rate": 2.4241999845652673e-05, + "loss": 0.0322, + "step": 97280 + }, + { + "epoch": 0.2145631217842548, + "grad_norm": 0.10191944241523743, + "learning_rate": 2.4240697447433115e-05, + "loss": 0.0328, + "step": 97290 + }, + { + "epoch": 0.21458517575915295, + "grad_norm": 0.10699860006570816, + "learning_rate": 2.423939493693036e-05, + "loss": 0.0329, + "step": 97300 + }, + { + "epoch": 0.2146072297340511, + "grad_norm": 0.12185104936361313, + "learning_rate": 2.4238092314160233e-05, + "loss": 0.0318, + "step": 97310 + }, + { + "epoch": 0.2146292837089493, + "grad_norm": 0.11876045912504196, + "learning_rate": 2.4236789579138565e-05, + "loss": 0.0318, + "step": 97320 + }, + { + "epoch": 0.21465133768384745, + "grad_norm": 0.09169398993253708, + "learning_rate": 2.423548673188118e-05, + "loss": 0.0338, + "step": 97330 + }, + { + "epoch": 0.2146733916587456, + "grad_norm": 0.09216176718473434, + "learning_rate": 2.423418377240391e-05, + "loss": 0.0325, + "step": 97340 + }, + { + "epoch": 0.2146954456336438, + "grad_norm": 0.1285777986049652, + "learning_rate": 2.4232880700722593e-05, + "loss": 0.0335, + "step": 97350 + }, + { + "epoch": 0.21471749960854195, + "grad_norm": 0.09633796662092209, + "learning_rate": 2.4231577516853054e-05, + "loss": 0.0328, + "step": 97360 + }, + { + "epoch": 0.2147395535834401, + "grad_norm": 0.10232540220022202, + "learning_rate": 2.4230274220811135e-05, + "loss": 0.0314, + "step": 97370 + }, + { + "epoch": 0.21476160755833829, + "grad_norm": 0.11726104468107224, + "learning_rate": 2.4228970812612665e-05, + "loss": 0.0327, + "step": 97380 + }, + { + "epoch": 0.21478366153323644, + "grad_norm": 0.12283284217119217, + "learning_rate": 2.4227667292273487e-05, + "loss": 0.0329, + "step": 97390 + }, + { + "epoch": 0.2148057155081346, + "grad_norm": 0.10504401475191116, + "learning_rate": 2.4226363659809444e-05, + "loss": 0.0317, + "step": 97400 + }, + { + "epoch": 0.21482776948303278, + "grad_norm": 0.1400761902332306, + "learning_rate": 2.422505991523637e-05, + "loss": 0.0328, + "step": 97410 + }, + { + "epoch": 0.21484982345793094, + "grad_norm": 0.10725034028291702, + "learning_rate": 2.4223756058570105e-05, + "loss": 0.0333, + "step": 97420 + }, + { + "epoch": 0.2148718774328291, + "grad_norm": 0.09622856229543686, + "learning_rate": 2.4222452089826495e-05, + "loss": 0.0328, + "step": 97430 + }, + { + "epoch": 0.21489393140772728, + "grad_norm": 0.1183992475271225, + "learning_rate": 2.422114800902139e-05, + "loss": 0.032, + "step": 97440 + }, + { + "epoch": 0.21491598538262544, + "grad_norm": 0.12091764807701111, + "learning_rate": 2.421984381617062e-05, + "loss": 0.0332, + "step": 97450 + }, + { + "epoch": 0.2149380393575236, + "grad_norm": 0.14653868973255157, + "learning_rate": 2.4218539511290055e-05, + "loss": 0.0323, + "step": 97460 + }, + { + "epoch": 0.21496009333242178, + "grad_norm": 0.13664932548999786, + "learning_rate": 2.421723509439552e-05, + "loss": 0.03, + "step": 97470 + }, + { + "epoch": 0.21498214730731993, + "grad_norm": 0.11189582198858261, + "learning_rate": 2.4215930565502887e-05, + "loss": 0.0327, + "step": 97480 + }, + { + "epoch": 0.2150042012822181, + "grad_norm": 0.10719682276248932, + "learning_rate": 2.421462592462799e-05, + "loss": 0.0343, + "step": 97490 + }, + { + "epoch": 0.21502625525711627, + "grad_norm": 0.10997316241264343, + "learning_rate": 2.4213321171786693e-05, + "loss": 0.0317, + "step": 97500 + }, + { + "epoch": 0.21504830923201443, + "grad_norm": 0.09030533581972122, + "learning_rate": 2.421201630699484e-05, + "loss": 0.0323, + "step": 97510 + }, + { + "epoch": 0.21507036320691258, + "grad_norm": 0.11586705595254898, + "learning_rate": 2.421071133026829e-05, + "loss": 0.0339, + "step": 97520 + }, + { + "epoch": 0.21509241718181077, + "grad_norm": 0.11647433787584305, + "learning_rate": 2.4209406241622906e-05, + "loss": 0.0314, + "step": 97530 + }, + { + "epoch": 0.21511447115670893, + "grad_norm": 0.12436031550168991, + "learning_rate": 2.4208101041074537e-05, + "loss": 0.0328, + "step": 97540 + }, + { + "epoch": 0.21513652513160708, + "grad_norm": 0.11467593908309937, + "learning_rate": 2.4206795728639046e-05, + "loss": 0.034, + "step": 97550 + }, + { + "epoch": 0.21515857910650527, + "grad_norm": 0.12532831728458405, + "learning_rate": 2.4205490304332297e-05, + "loss": 0.0319, + "step": 97560 + }, + { + "epoch": 0.21518063308140342, + "grad_norm": 0.11081237345933914, + "learning_rate": 2.420418476817015e-05, + "loss": 0.0319, + "step": 97570 + }, + { + "epoch": 0.2152026870563016, + "grad_norm": 0.13666889071464539, + "learning_rate": 2.4202879120168467e-05, + "loss": 0.0341, + "step": 97580 + }, + { + "epoch": 0.21522474103119976, + "grad_norm": 0.1068260669708252, + "learning_rate": 2.420157336034311e-05, + "loss": 0.0317, + "step": 97590 + }, + { + "epoch": 0.21524679500609792, + "grad_norm": 0.10987827926874161, + "learning_rate": 2.4200267488709956e-05, + "loss": 0.0335, + "step": 97600 + }, + { + "epoch": 0.2152688489809961, + "grad_norm": 0.10186482965946198, + "learning_rate": 2.4198961505284857e-05, + "loss": 0.0307, + "step": 97610 + }, + { + "epoch": 0.21529090295589426, + "grad_norm": 0.10973434150218964, + "learning_rate": 2.4197655410083697e-05, + "loss": 0.0367, + "step": 97620 + }, + { + "epoch": 0.21531295693079242, + "grad_norm": 0.07685243338346481, + "learning_rate": 2.4196349203122338e-05, + "loss": 0.0327, + "step": 97630 + }, + { + "epoch": 0.2153350109056906, + "grad_norm": 0.11390472203493118, + "learning_rate": 2.4195042884416654e-05, + "loss": 0.0339, + "step": 97640 + }, + { + "epoch": 0.21535706488058876, + "grad_norm": 0.12265577912330627, + "learning_rate": 2.4193736453982514e-05, + "loss": 0.0315, + "step": 97650 + }, + { + "epoch": 0.2153791188554869, + "grad_norm": 0.10557478666305542, + "learning_rate": 2.4192429911835797e-05, + "loss": 0.0333, + "step": 97660 + }, + { + "epoch": 0.2154011728303851, + "grad_norm": 0.13089534640312195, + "learning_rate": 2.419112325799238e-05, + "loss": 0.0326, + "step": 97670 + }, + { + "epoch": 0.21542322680528325, + "grad_norm": 0.09446097910404205, + "learning_rate": 2.4189816492468137e-05, + "loss": 0.0331, + "step": 97680 + }, + { + "epoch": 0.2154452807801814, + "grad_norm": 0.11628615856170654, + "learning_rate": 2.4188509615278948e-05, + "loss": 0.0321, + "step": 97690 + }, + { + "epoch": 0.2154673347550796, + "grad_norm": 0.11452413350343704, + "learning_rate": 2.418720262644069e-05, + "loss": 0.0335, + "step": 97700 + }, + { + "epoch": 0.21548938872997775, + "grad_norm": 0.09581871330738068, + "learning_rate": 2.4185895525969247e-05, + "loss": 0.0315, + "step": 97710 + }, + { + "epoch": 0.2155114427048759, + "grad_norm": 0.08072064816951752, + "learning_rate": 2.41845883138805e-05, + "loss": 0.0328, + "step": 97720 + }, + { + "epoch": 0.2155334966797741, + "grad_norm": 0.09353280067443848, + "learning_rate": 2.418328099019033e-05, + "loss": 0.033, + "step": 97730 + }, + { + "epoch": 0.21555555065467225, + "grad_norm": 0.08626612275838852, + "learning_rate": 2.4181973554914627e-05, + "loss": 0.0328, + "step": 97740 + }, + { + "epoch": 0.2155776046295704, + "grad_norm": 0.10179411619901657, + "learning_rate": 2.418066600806928e-05, + "loss": 0.0323, + "step": 97750 + }, + { + "epoch": 0.21559965860446859, + "grad_norm": 0.1171877533197403, + "learning_rate": 2.4179358349670172e-05, + "loss": 0.0307, + "step": 97760 + }, + { + "epoch": 0.21562171257936674, + "grad_norm": 0.13505446910858154, + "learning_rate": 2.417805057973319e-05, + "loss": 0.0331, + "step": 97770 + }, + { + "epoch": 0.2156437665542649, + "grad_norm": 0.12192633748054504, + "learning_rate": 2.417674269827423e-05, + "loss": 0.0317, + "step": 97780 + }, + { + "epoch": 0.21566582052916308, + "grad_norm": 0.09859133511781693, + "learning_rate": 2.417543470530918e-05, + "loss": 0.0334, + "step": 97790 + }, + { + "epoch": 0.21568787450406124, + "grad_norm": 0.08132602274417877, + "learning_rate": 2.4174126600853935e-05, + "loss": 0.032, + "step": 97800 + }, + { + "epoch": 0.2157099284789594, + "grad_norm": 0.09098488092422485, + "learning_rate": 2.4172818384924395e-05, + "loss": 0.0314, + "step": 97810 + }, + { + "epoch": 0.21573198245385758, + "grad_norm": 0.10572498291730881, + "learning_rate": 2.417151005753645e-05, + "loss": 0.0335, + "step": 97820 + }, + { + "epoch": 0.21575403642875574, + "grad_norm": 0.09300991147756577, + "learning_rate": 2.4170201618705994e-05, + "loss": 0.0322, + "step": 97830 + }, + { + "epoch": 0.2157760904036539, + "grad_norm": 0.1146039143204689, + "learning_rate": 2.416889306844893e-05, + "loss": 0.0339, + "step": 97840 + }, + { + "epoch": 0.21579814437855208, + "grad_norm": 0.09045253694057465, + "learning_rate": 2.4167584406781164e-05, + "loss": 0.032, + "step": 97850 + }, + { + "epoch": 0.21582019835345023, + "grad_norm": 0.10281793773174286, + "learning_rate": 2.4166275633718587e-05, + "loss": 0.0329, + "step": 97860 + }, + { + "epoch": 0.2158422523283484, + "grad_norm": 0.10443246364593506, + "learning_rate": 2.416496674927711e-05, + "loss": 0.0315, + "step": 97870 + }, + { + "epoch": 0.21586430630324657, + "grad_norm": 0.10515588521957397, + "learning_rate": 2.416365775347263e-05, + "loss": 0.0338, + "step": 97880 + }, + { + "epoch": 0.21588636027814473, + "grad_norm": 0.11558127403259277, + "learning_rate": 2.4162348646321062e-05, + "loss": 0.033, + "step": 97890 + }, + { + "epoch": 0.21590841425304289, + "grad_norm": 0.11880907416343689, + "learning_rate": 2.41610394278383e-05, + "loss": 0.0327, + "step": 97900 + }, + { + "epoch": 0.21593046822794107, + "grad_norm": 0.10962024331092834, + "learning_rate": 2.4159730098040266e-05, + "loss": 0.0317, + "step": 97910 + }, + { + "epoch": 0.21595252220283923, + "grad_norm": 0.09921576827764511, + "learning_rate": 2.4158420656942864e-05, + "loss": 0.0324, + "step": 97920 + }, + { + "epoch": 0.21597457617773738, + "grad_norm": 0.0806787759065628, + "learning_rate": 2.4157111104561995e-05, + "loss": 0.0322, + "step": 97930 + }, + { + "epoch": 0.21599663015263557, + "grad_norm": 0.12619860470294952, + "learning_rate": 2.4155801440913588e-05, + "loss": 0.0336, + "step": 97940 + }, + { + "epoch": 0.21601868412753372, + "grad_norm": 0.09930717945098877, + "learning_rate": 2.415449166601355e-05, + "loss": 0.034, + "step": 97950 + }, + { + "epoch": 0.21604073810243188, + "grad_norm": 0.21190011501312256, + "learning_rate": 2.415318177987779e-05, + "loss": 0.0336, + "step": 97960 + }, + { + "epoch": 0.21606279207733006, + "grad_norm": 0.11551021784543991, + "learning_rate": 2.415187178252223e-05, + "loss": 0.0311, + "step": 97970 + }, + { + "epoch": 0.21608484605222822, + "grad_norm": 0.11621967703104019, + "learning_rate": 2.415056167396279e-05, + "loss": 0.0306, + "step": 97980 + }, + { + "epoch": 0.2161069000271264, + "grad_norm": 0.12283115088939667, + "learning_rate": 2.4149251454215382e-05, + "loss": 0.0349, + "step": 97990 + }, + { + "epoch": 0.21612895400202456, + "grad_norm": 0.09114693105220795, + "learning_rate": 2.4147941123295934e-05, + "loss": 0.0328, + "step": 98000 + }, + { + "epoch": 0.21615100797692272, + "grad_norm": 0.09320082515478134, + "learning_rate": 2.4146630681220365e-05, + "loss": 0.033, + "step": 98010 + }, + { + "epoch": 0.2161730619518209, + "grad_norm": 0.3789086639881134, + "learning_rate": 2.414532012800459e-05, + "loss": 0.0324, + "step": 98020 + }, + { + "epoch": 0.21619511592671906, + "grad_norm": 0.10585139691829681, + "learning_rate": 2.414400946366455e-05, + "loss": 0.0304, + "step": 98030 + }, + { + "epoch": 0.2162171699016172, + "grad_norm": 0.11789150536060333, + "learning_rate": 2.4142698688216156e-05, + "loss": 0.0323, + "step": 98040 + }, + { + "epoch": 0.2162392238765154, + "grad_norm": 0.10567814111709595, + "learning_rate": 2.4141387801675346e-05, + "loss": 0.0311, + "step": 98050 + }, + { + "epoch": 0.21626127785141355, + "grad_norm": 0.09852258116006851, + "learning_rate": 2.4140076804058037e-05, + "loss": 0.0305, + "step": 98060 + }, + { + "epoch": 0.2162833318263117, + "grad_norm": 0.1058579683303833, + "learning_rate": 2.4138765695380166e-05, + "loss": 0.0315, + "step": 98070 + }, + { + "epoch": 0.2163053858012099, + "grad_norm": 0.12109334021806717, + "learning_rate": 2.413745447565767e-05, + "loss": 0.0324, + "step": 98080 + }, + { + "epoch": 0.21632743977610805, + "grad_norm": 0.09205557405948639, + "learning_rate": 2.4136143144906468e-05, + "loss": 0.0326, + "step": 98090 + }, + { + "epoch": 0.2163494937510062, + "grad_norm": 0.11128013581037521, + "learning_rate": 2.4134831703142503e-05, + "loss": 0.0317, + "step": 98100 + }, + { + "epoch": 0.2163715477259044, + "grad_norm": 0.07480703294277191, + "learning_rate": 2.413352015038171e-05, + "loss": 0.0338, + "step": 98110 + }, + { + "epoch": 0.21639360170080255, + "grad_norm": 0.09960010647773743, + "learning_rate": 2.4132208486640023e-05, + "loss": 0.0318, + "step": 98120 + }, + { + "epoch": 0.2164156556757007, + "grad_norm": 0.11191418021917343, + "learning_rate": 2.4130896711933376e-05, + "loss": 0.0336, + "step": 98130 + }, + { + "epoch": 0.2164377096505989, + "grad_norm": 0.10080570727586746, + "learning_rate": 2.412958482627772e-05, + "loss": 0.0337, + "step": 98140 + }, + { + "epoch": 0.21645976362549704, + "grad_norm": 0.14153462648391724, + "learning_rate": 2.4128272829688983e-05, + "loss": 0.0313, + "step": 98150 + }, + { + "epoch": 0.2164818176003952, + "grad_norm": 0.1114189475774765, + "learning_rate": 2.4126960722183115e-05, + "loss": 0.0328, + "step": 98160 + }, + { + "epoch": 0.21650387157529338, + "grad_norm": 0.0896962583065033, + "learning_rate": 2.412564850377606e-05, + "loss": 0.0325, + "step": 98170 + }, + { + "epoch": 0.21652592555019154, + "grad_norm": 0.12666013836860657, + "learning_rate": 2.4124336174483753e-05, + "loss": 0.0309, + "step": 98180 + }, + { + "epoch": 0.2165479795250897, + "grad_norm": 0.1084151640534401, + "learning_rate": 2.412302373432215e-05, + "loss": 0.0326, + "step": 98190 + }, + { + "epoch": 0.21657003349998788, + "grad_norm": 0.11449480801820755, + "learning_rate": 2.4121711183307196e-05, + "loss": 0.0329, + "step": 98200 + }, + { + "epoch": 0.21659208747488604, + "grad_norm": 0.10444317013025284, + "learning_rate": 2.412039852145484e-05, + "loss": 0.0338, + "step": 98210 + }, + { + "epoch": 0.2166141414497842, + "grad_norm": 0.1107306256890297, + "learning_rate": 2.411908574878103e-05, + "loss": 0.0313, + "step": 98220 + }, + { + "epoch": 0.21663619542468238, + "grad_norm": 0.10185589641332626, + "learning_rate": 2.4117772865301715e-05, + "loss": 0.0298, + "step": 98230 + }, + { + "epoch": 0.21665824939958053, + "grad_norm": 0.10250863432884216, + "learning_rate": 2.4116459871032856e-05, + "loss": 0.033, + "step": 98240 + }, + { + "epoch": 0.2166803033744787, + "grad_norm": 0.08179095387458801, + "learning_rate": 2.4115146765990398e-05, + "loss": 0.0328, + "step": 98250 + }, + { + "epoch": 0.21670235734937687, + "grad_norm": 0.10361792147159576, + "learning_rate": 2.41138335501903e-05, + "loss": 0.0312, + "step": 98260 + }, + { + "epoch": 0.21672441132427503, + "grad_norm": 0.10723407566547394, + "learning_rate": 2.411252022364852e-05, + "loss": 0.0307, + "step": 98270 + }, + { + "epoch": 0.21674646529917319, + "grad_norm": 0.14088980853557587, + "learning_rate": 2.411120678638102e-05, + "loss": 0.0332, + "step": 98280 + }, + { + "epoch": 0.21676851927407137, + "grad_norm": 0.1188361793756485, + "learning_rate": 2.4109893238403754e-05, + "loss": 0.0319, + "step": 98290 + }, + { + "epoch": 0.21679057324896953, + "grad_norm": 0.09680058062076569, + "learning_rate": 2.4108579579732684e-05, + "loss": 0.0316, + "step": 98300 + }, + { + "epoch": 0.21681262722386768, + "grad_norm": 0.154350146651268, + "learning_rate": 2.4107265810383766e-05, + "loss": 0.0323, + "step": 98310 + }, + { + "epoch": 0.21683468119876587, + "grad_norm": 0.10004197061061859, + "learning_rate": 2.4105951930372978e-05, + "loss": 0.0326, + "step": 98320 + }, + { + "epoch": 0.21685673517366402, + "grad_norm": 0.08413590490818024, + "learning_rate": 2.4104637939716272e-05, + "loss": 0.0321, + "step": 98330 + }, + { + "epoch": 0.21687878914856218, + "grad_norm": 0.08360707759857178, + "learning_rate": 2.410332383842962e-05, + "loss": 0.0323, + "step": 98340 + }, + { + "epoch": 0.21690084312346036, + "grad_norm": 0.11336325854063034, + "learning_rate": 2.4102009626528992e-05, + "loss": 0.0322, + "step": 98350 + }, + { + "epoch": 0.21692289709835852, + "grad_norm": 0.0961323082447052, + "learning_rate": 2.4100695304030344e-05, + "loss": 0.0317, + "step": 98360 + }, + { + "epoch": 0.21694495107325668, + "grad_norm": 0.09176996350288391, + "learning_rate": 2.4099380870949665e-05, + "loss": 0.0317, + "step": 98370 + }, + { + "epoch": 0.21696700504815486, + "grad_norm": 0.12690791487693787, + "learning_rate": 2.4098066327302912e-05, + "loss": 0.0308, + "step": 98380 + }, + { + "epoch": 0.21698905902305302, + "grad_norm": 0.13262973725795746, + "learning_rate": 2.4096751673106064e-05, + "loss": 0.0338, + "step": 98390 + }, + { + "epoch": 0.21701111299795117, + "grad_norm": 0.11216600239276886, + "learning_rate": 2.40954369083751e-05, + "loss": 0.0317, + "step": 98400 + }, + { + "epoch": 0.21703316697284936, + "grad_norm": 0.11574287712574005, + "learning_rate": 2.409412203312598e-05, + "loss": 0.0318, + "step": 98410 + }, + { + "epoch": 0.2170552209477475, + "grad_norm": 0.11268088221549988, + "learning_rate": 2.4092807047374695e-05, + "loss": 0.0318, + "step": 98420 + }, + { + "epoch": 0.2170772749226457, + "grad_norm": 0.09975632280111313, + "learning_rate": 2.409149195113722e-05, + "loss": 0.0324, + "step": 98430 + }, + { + "epoch": 0.21709932889754385, + "grad_norm": 0.18542273342609406, + "learning_rate": 2.4090176744429535e-05, + "loss": 0.0323, + "step": 98440 + }, + { + "epoch": 0.217121382872442, + "grad_norm": 0.11009972542524338, + "learning_rate": 2.4088861427267616e-05, + "loss": 0.0335, + "step": 98450 + }, + { + "epoch": 0.2171434368473402, + "grad_norm": 0.09235366433858871, + "learning_rate": 2.4087545999667457e-05, + "loss": 0.0322, + "step": 98460 + }, + { + "epoch": 0.21716549082223835, + "grad_norm": 0.10602255165576935, + "learning_rate": 2.4086230461645027e-05, + "loss": 0.032, + "step": 98470 + }, + { + "epoch": 0.2171875447971365, + "grad_norm": 0.12096536159515381, + "learning_rate": 2.4084914813216318e-05, + "loss": 0.0336, + "step": 98480 + }, + { + "epoch": 0.2172095987720347, + "grad_norm": 0.14468304812908173, + "learning_rate": 2.4083599054397325e-05, + "loss": 0.0306, + "step": 98490 + }, + { + "epoch": 0.21723165274693285, + "grad_norm": 0.14454659819602966, + "learning_rate": 2.408228318520402e-05, + "loss": 0.0328, + "step": 98500 + }, + { + "epoch": 0.217253706721831, + "grad_norm": 0.11883177608251572, + "learning_rate": 2.40809672056524e-05, + "loss": 0.0334, + "step": 98510 + }, + { + "epoch": 0.2172757606967292, + "grad_norm": 0.11023129522800446, + "learning_rate": 2.4079651115758452e-05, + "loss": 0.0324, + "step": 98520 + }, + { + "epoch": 0.21729781467162734, + "grad_norm": 0.10020388662815094, + "learning_rate": 2.4078334915538176e-05, + "loss": 0.0324, + "step": 98530 + }, + { + "epoch": 0.2173198686465255, + "grad_norm": 0.09775055199861526, + "learning_rate": 2.4077018605007556e-05, + "loss": 0.0327, + "step": 98540 + }, + { + "epoch": 0.21734192262142368, + "grad_norm": 0.1072549968957901, + "learning_rate": 2.4075702184182594e-05, + "loss": 0.0314, + "step": 98550 + }, + { + "epoch": 0.21736397659632184, + "grad_norm": 0.1407739520072937, + "learning_rate": 2.407438565307928e-05, + "loss": 0.0323, + "step": 98560 + }, + { + "epoch": 0.21738603057122, + "grad_norm": 0.09822959452867508, + "learning_rate": 2.4073069011713605e-05, + "loss": 0.0314, + "step": 98570 + }, + { + "epoch": 0.21740808454611818, + "grad_norm": 0.0957733765244484, + "learning_rate": 2.4071752260101586e-05, + "loss": 0.0324, + "step": 98580 + }, + { + "epoch": 0.21743013852101634, + "grad_norm": 0.11534906923770905, + "learning_rate": 2.40704353982592e-05, + "loss": 0.0326, + "step": 98590 + }, + { + "epoch": 0.2174521924959145, + "grad_norm": 0.1327044665813446, + "learning_rate": 2.406911842620247e-05, + "loss": 0.0329, + "step": 98600 + }, + { + "epoch": 0.21747424647081268, + "grad_norm": 0.11869077384471893, + "learning_rate": 2.4067801343947387e-05, + "loss": 0.0325, + "step": 98610 + }, + { + "epoch": 0.21749630044571083, + "grad_norm": 0.15926805138587952, + "learning_rate": 2.4066484151509956e-05, + "loss": 0.0328, + "step": 98620 + }, + { + "epoch": 0.217518354420609, + "grad_norm": 0.1013006642460823, + "learning_rate": 2.406516684890618e-05, + "loss": 0.033, + "step": 98630 + }, + { + "epoch": 0.21754040839550717, + "grad_norm": 0.09602707624435425, + "learning_rate": 2.4063849436152066e-05, + "loss": 0.0343, + "step": 98640 + }, + { + "epoch": 0.21756246237040533, + "grad_norm": 0.09662611782550812, + "learning_rate": 2.406253191326363e-05, + "loss": 0.0334, + "step": 98650 + }, + { + "epoch": 0.21758451634530349, + "grad_norm": 0.11035275459289551, + "learning_rate": 2.4061214280256867e-05, + "loss": 0.0314, + "step": 98660 + }, + { + "epoch": 0.21760657032020167, + "grad_norm": 0.12488842010498047, + "learning_rate": 2.4059896537147802e-05, + "loss": 0.0319, + "step": 98670 + }, + { + "epoch": 0.21762862429509983, + "grad_norm": 0.10186272114515305, + "learning_rate": 2.405857868395244e-05, + "loss": 0.0321, + "step": 98680 + }, + { + "epoch": 0.21765067826999798, + "grad_norm": 0.10900271683931351, + "learning_rate": 2.405726072068679e-05, + "loss": 0.0324, + "step": 98690 + }, + { + "epoch": 0.21767273224489617, + "grad_norm": 0.07612539082765579, + "learning_rate": 2.4055942647366874e-05, + "loss": 0.0322, + "step": 98700 + }, + { + "epoch": 0.21769478621979432, + "grad_norm": 0.0929718092083931, + "learning_rate": 2.40546244640087e-05, + "loss": 0.0298, + "step": 98710 + }, + { + "epoch": 0.21771684019469248, + "grad_norm": 0.1546488255262375, + "learning_rate": 2.4053306170628298e-05, + "loss": 0.0321, + "step": 98720 + }, + { + "epoch": 0.21773889416959066, + "grad_norm": 0.11143248528242111, + "learning_rate": 2.4051987767241668e-05, + "loss": 0.0301, + "step": 98730 + }, + { + "epoch": 0.21776094814448882, + "grad_norm": 0.09490042179822922, + "learning_rate": 2.405066925386485e-05, + "loss": 0.0318, + "step": 98740 + }, + { + "epoch": 0.21778300211938698, + "grad_norm": 0.09598945081233978, + "learning_rate": 2.4049350630513847e-05, + "loss": 0.0323, + "step": 98750 + }, + { + "epoch": 0.21780505609428516, + "grad_norm": 0.09885899722576141, + "learning_rate": 2.4048031897204693e-05, + "loss": 0.0315, + "step": 98760 + }, + { + "epoch": 0.21782711006918332, + "grad_norm": 0.11098670214414597, + "learning_rate": 2.4046713053953414e-05, + "loss": 0.0317, + "step": 98770 + }, + { + "epoch": 0.21784916404408147, + "grad_norm": 0.11515165120363235, + "learning_rate": 2.4045394100776025e-05, + "loss": 0.0322, + "step": 98780 + }, + { + "epoch": 0.21787121801897966, + "grad_norm": 0.09341664612293243, + "learning_rate": 2.4044075037688552e-05, + "loss": 0.0333, + "step": 98790 + }, + { + "epoch": 0.2178932719938778, + "grad_norm": 0.09142998605966568, + "learning_rate": 2.4042755864707038e-05, + "loss": 0.0333, + "step": 98800 + }, + { + "epoch": 0.21791532596877597, + "grad_norm": 0.11298968642950058, + "learning_rate": 2.40414365818475e-05, + "loss": 0.0305, + "step": 98810 + }, + { + "epoch": 0.21793737994367415, + "grad_norm": 0.10449091345071793, + "learning_rate": 2.4040117189125967e-05, + "loss": 0.0328, + "step": 98820 + }, + { + "epoch": 0.2179594339185723, + "grad_norm": 0.10624836385250092, + "learning_rate": 2.403879768655848e-05, + "loss": 0.0345, + "step": 98830 + }, + { + "epoch": 0.21798148789347047, + "grad_norm": 0.09638579934835434, + "learning_rate": 2.4037478074161063e-05, + "loss": 0.0322, + "step": 98840 + }, + { + "epoch": 0.21800354186836865, + "grad_norm": 0.11316047608852386, + "learning_rate": 2.4036158351949756e-05, + "loss": 0.0336, + "step": 98850 + }, + { + "epoch": 0.2180255958432668, + "grad_norm": 0.14267697930335999, + "learning_rate": 2.4034838519940593e-05, + "loss": 0.0313, + "step": 98860 + }, + { + "epoch": 0.218047649818165, + "grad_norm": 0.1449456363916397, + "learning_rate": 2.4033518578149615e-05, + "loss": 0.031, + "step": 98870 + }, + { + "epoch": 0.21806970379306315, + "grad_norm": 0.11244285106658936, + "learning_rate": 2.403219852659285e-05, + "loss": 0.0313, + "step": 98880 + }, + { + "epoch": 0.2180917577679613, + "grad_norm": 0.10843009501695633, + "learning_rate": 2.403087836528635e-05, + "loss": 0.0314, + "step": 98890 + }, + { + "epoch": 0.2181138117428595, + "grad_norm": 0.09440027922391891, + "learning_rate": 2.4029558094246154e-05, + "loss": 0.0332, + "step": 98900 + }, + { + "epoch": 0.21813586571775764, + "grad_norm": 0.09838031977415085, + "learning_rate": 2.4028237713488296e-05, + "loss": 0.0312, + "step": 98910 + }, + { + "epoch": 0.2181579196926558, + "grad_norm": 0.1258450597524643, + "learning_rate": 2.402691722302883e-05, + "loss": 0.0328, + "step": 98920 + }, + { + "epoch": 0.21817997366755398, + "grad_norm": 0.08653740584850311, + "learning_rate": 2.4025596622883792e-05, + "loss": 0.0331, + "step": 98930 + }, + { + "epoch": 0.21820202764245214, + "grad_norm": 0.10153314471244812, + "learning_rate": 2.4024275913069236e-05, + "loss": 0.0299, + "step": 98940 + }, + { + "epoch": 0.2182240816173503, + "grad_norm": 0.11535097658634186, + "learning_rate": 2.4022955093601207e-05, + "loss": 0.0327, + "step": 98950 + }, + { + "epoch": 0.21824613559224848, + "grad_norm": 0.12187670171260834, + "learning_rate": 2.4021634164495757e-05, + "loss": 0.0313, + "step": 98960 + }, + { + "epoch": 0.21826818956714664, + "grad_norm": 0.10630020499229431, + "learning_rate": 2.4020313125768933e-05, + "loss": 0.0304, + "step": 98970 + }, + { + "epoch": 0.2182902435420448, + "grad_norm": 0.08708173036575317, + "learning_rate": 2.4018991977436787e-05, + "loss": 0.0316, + "step": 98980 + }, + { + "epoch": 0.21831229751694298, + "grad_norm": 0.12918774783611298, + "learning_rate": 2.4017670719515373e-05, + "loss": 0.0338, + "step": 98990 + }, + { + "epoch": 0.21833435149184113, + "grad_norm": 0.10369137674570084, + "learning_rate": 2.401634935202075e-05, + "loss": 0.0315, + "step": 99000 + }, + { + "epoch": 0.2183564054667393, + "grad_norm": 0.11066761612892151, + "learning_rate": 2.4015027874968968e-05, + "loss": 0.0311, + "step": 99010 + }, + { + "epoch": 0.21837845944163747, + "grad_norm": 0.10285905003547668, + "learning_rate": 2.4013706288376077e-05, + "loss": 0.0321, + "step": 99020 + }, + { + "epoch": 0.21840051341653563, + "grad_norm": 0.09368851780891418, + "learning_rate": 2.4012384592258153e-05, + "loss": 0.0334, + "step": 99030 + }, + { + "epoch": 0.2184225673914338, + "grad_norm": 0.10184131562709808, + "learning_rate": 2.4011062786631245e-05, + "loss": 0.0303, + "step": 99040 + }, + { + "epoch": 0.21844462136633197, + "grad_norm": 0.17008835077285767, + "learning_rate": 2.4009740871511418e-05, + "loss": 0.0341, + "step": 99050 + }, + { + "epoch": 0.21846667534123013, + "grad_norm": 0.09073378890752792, + "learning_rate": 2.4008418846914733e-05, + "loss": 0.0317, + "step": 99060 + }, + { + "epoch": 0.21848872931612828, + "grad_norm": 0.10786725580692291, + "learning_rate": 2.400709671285725e-05, + "loss": 0.0346, + "step": 99070 + }, + { + "epoch": 0.21851078329102647, + "grad_norm": 0.11285027861595154, + "learning_rate": 2.400577446935504e-05, + "loss": 0.0338, + "step": 99080 + }, + { + "epoch": 0.21853283726592462, + "grad_norm": 0.08022122830152512, + "learning_rate": 2.4004452116424166e-05, + "loss": 0.0335, + "step": 99090 + }, + { + "epoch": 0.21855489124082278, + "grad_norm": 0.0991043746471405, + "learning_rate": 2.40031296540807e-05, + "loss": 0.0322, + "step": 99100 + }, + { + "epoch": 0.21857694521572096, + "grad_norm": 0.130398690700531, + "learning_rate": 2.40018070823407e-05, + "loss": 0.0326, + "step": 99110 + }, + { + "epoch": 0.21859899919061912, + "grad_norm": 0.12930409610271454, + "learning_rate": 2.4000484401220257e-05, + "loss": 0.0323, + "step": 99120 + }, + { + "epoch": 0.21862105316551728, + "grad_norm": 0.09884236007928848, + "learning_rate": 2.3999161610735423e-05, + "loss": 0.0318, + "step": 99130 + }, + { + "epoch": 0.21864310714041546, + "grad_norm": 0.10519309341907501, + "learning_rate": 2.399783871090228e-05, + "loss": 0.0312, + "step": 99140 + }, + { + "epoch": 0.21866516111531362, + "grad_norm": 0.12845870852470398, + "learning_rate": 2.3996515701736904e-05, + "loss": 0.0319, + "step": 99150 + }, + { + "epoch": 0.21868721509021177, + "grad_norm": 0.09784797579050064, + "learning_rate": 2.3995192583255366e-05, + "loss": 0.0324, + "step": 99160 + }, + { + "epoch": 0.21870926906510996, + "grad_norm": 0.09664712846279144, + "learning_rate": 2.3993869355473747e-05, + "loss": 0.0322, + "step": 99170 + }, + { + "epoch": 0.2187313230400081, + "grad_norm": 0.11587876826524734, + "learning_rate": 2.3992546018408117e-05, + "loss": 0.0328, + "step": 99180 + }, + { + "epoch": 0.21875337701490627, + "grad_norm": 0.11610633879899979, + "learning_rate": 2.399122257207457e-05, + "loss": 0.034, + "step": 99190 + }, + { + "epoch": 0.21877543098980445, + "grad_norm": 0.1017080694437027, + "learning_rate": 2.3989899016489176e-05, + "loss": 0.0325, + "step": 99200 + }, + { + "epoch": 0.2187974849647026, + "grad_norm": 0.10083739459514618, + "learning_rate": 2.3988575351668023e-05, + "loss": 0.0313, + "step": 99210 + }, + { + "epoch": 0.21881953893960077, + "grad_norm": 0.09641417860984802, + "learning_rate": 2.3987251577627196e-05, + "loss": 0.031, + "step": 99220 + }, + { + "epoch": 0.21884159291449895, + "grad_norm": 0.08392437547445297, + "learning_rate": 2.398592769438277e-05, + "loss": 0.0325, + "step": 99230 + }, + { + "epoch": 0.2188636468893971, + "grad_norm": 0.12731941044330597, + "learning_rate": 2.398460370195084e-05, + "loss": 0.0306, + "step": 99240 + }, + { + "epoch": 0.21888570086429526, + "grad_norm": 0.09113696217536926, + "learning_rate": 2.398327960034749e-05, + "loss": 0.0311, + "step": 99250 + }, + { + "epoch": 0.21890775483919345, + "grad_norm": 0.09989709407091141, + "learning_rate": 2.3981955389588818e-05, + "loss": 0.0342, + "step": 99260 + }, + { + "epoch": 0.2189298088140916, + "grad_norm": 0.11907714605331421, + "learning_rate": 2.3980631069690903e-05, + "loss": 0.0328, + "step": 99270 + }, + { + "epoch": 0.2189518627889898, + "grad_norm": 0.0893833115696907, + "learning_rate": 2.397930664066985e-05, + "loss": 0.0333, + "step": 99280 + }, + { + "epoch": 0.21897391676388794, + "grad_norm": 0.11459476500749588, + "learning_rate": 2.397798210254173e-05, + "loss": 0.031, + "step": 99290 + }, + { + "epoch": 0.2189959707387861, + "grad_norm": 0.14455129206180573, + "learning_rate": 2.397665745532266e-05, + "loss": 0.033, + "step": 99300 + }, + { + "epoch": 0.21901802471368428, + "grad_norm": 0.10775403678417206, + "learning_rate": 2.3975332699028724e-05, + "loss": 0.0318, + "step": 99310 + }, + { + "epoch": 0.21904007868858244, + "grad_norm": 0.12182843685150146, + "learning_rate": 2.397400783367602e-05, + "loss": 0.0316, + "step": 99320 + }, + { + "epoch": 0.2190621326634806, + "grad_norm": 0.09395703673362732, + "learning_rate": 2.397268285928065e-05, + "loss": 0.0331, + "step": 99330 + }, + { + "epoch": 0.21908418663837878, + "grad_norm": 0.10499341040849686, + "learning_rate": 2.3971357775858708e-05, + "loss": 0.0318, + "step": 99340 + }, + { + "epoch": 0.21910624061327694, + "grad_norm": 0.13631704449653625, + "learning_rate": 2.3970032583426303e-05, + "loss": 0.0327, + "step": 99350 + }, + { + "epoch": 0.2191282945881751, + "grad_norm": 0.13656917214393616, + "learning_rate": 2.396870728199953e-05, + "loss": 0.0309, + "step": 99360 + }, + { + "epoch": 0.21915034856307328, + "grad_norm": 0.09275297820568085, + "learning_rate": 2.39673818715945e-05, + "loss": 0.0343, + "step": 99370 + }, + { + "epoch": 0.21917240253797143, + "grad_norm": 0.11614879220724106, + "learning_rate": 2.396605635222731e-05, + "loss": 0.0323, + "step": 99380 + }, + { + "epoch": 0.2191944565128696, + "grad_norm": 0.1145179346203804, + "learning_rate": 2.396473072391407e-05, + "loss": 0.0342, + "step": 99390 + }, + { + "epoch": 0.21921651048776777, + "grad_norm": 0.10317946970462799, + "learning_rate": 2.396340498667089e-05, + "loss": 0.0309, + "step": 99400 + }, + { + "epoch": 0.21923856446266593, + "grad_norm": 0.13485214114189148, + "learning_rate": 2.3962079140513874e-05, + "loss": 0.0319, + "step": 99410 + }, + { + "epoch": 0.2192606184375641, + "grad_norm": 0.1410234570503235, + "learning_rate": 2.3960753185459135e-05, + "loss": 0.0318, + "step": 99420 + }, + { + "epoch": 0.21928267241246227, + "grad_norm": 0.08794428408145905, + "learning_rate": 2.3959427121522784e-05, + "loss": 0.0319, + "step": 99430 + }, + { + "epoch": 0.21930472638736043, + "grad_norm": 0.09741934388875961, + "learning_rate": 2.3958100948720937e-05, + "loss": 0.0343, + "step": 99440 + }, + { + "epoch": 0.21932678036225858, + "grad_norm": 0.12778900563716888, + "learning_rate": 2.3956774667069704e-05, + "loss": 0.0321, + "step": 99450 + }, + { + "epoch": 0.21934883433715677, + "grad_norm": 0.08583890646696091, + "learning_rate": 2.39554482765852e-05, + "loss": 0.0324, + "step": 99460 + }, + { + "epoch": 0.21937088831205492, + "grad_norm": 0.08693958073854446, + "learning_rate": 2.395412177728355e-05, + "loss": 0.0313, + "step": 99470 + }, + { + "epoch": 0.21939294228695308, + "grad_norm": 0.1412392109632492, + "learning_rate": 2.395279516918086e-05, + "loss": 0.0331, + "step": 99480 + }, + { + "epoch": 0.21941499626185126, + "grad_norm": 0.13102805614471436, + "learning_rate": 2.3951468452293264e-05, + "loss": 0.0336, + "step": 99490 + }, + { + "epoch": 0.21943705023674942, + "grad_norm": 0.1253218650817871, + "learning_rate": 2.3950141626636867e-05, + "loss": 0.0336, + "step": 99500 + }, + { + "epoch": 0.21945910421164758, + "grad_norm": 0.12074201554059982, + "learning_rate": 2.39488146922278e-05, + "loss": 0.0327, + "step": 99510 + }, + { + "epoch": 0.21948115818654576, + "grad_norm": 0.09922632575035095, + "learning_rate": 2.3947487649082186e-05, + "loss": 0.0333, + "step": 99520 + }, + { + "epoch": 0.21950321216144392, + "grad_norm": 0.09716904163360596, + "learning_rate": 2.394616049721615e-05, + "loss": 0.0327, + "step": 99530 + }, + { + "epoch": 0.21952526613634207, + "grad_norm": 0.09022380411624908, + "learning_rate": 2.3944833236645818e-05, + "loss": 0.03, + "step": 99540 + }, + { + "epoch": 0.21954732011124026, + "grad_norm": 0.10856136679649353, + "learning_rate": 2.3943505867387317e-05, + "loss": 0.0316, + "step": 99550 + }, + { + "epoch": 0.2195693740861384, + "grad_norm": 0.19198837876319885, + "learning_rate": 2.3942178389456776e-05, + "loss": 0.0328, + "step": 99560 + }, + { + "epoch": 0.21959142806103657, + "grad_norm": 0.10084950923919678, + "learning_rate": 2.3940850802870323e-05, + "loss": 0.0314, + "step": 99570 + }, + { + "epoch": 0.21961348203593475, + "grad_norm": 0.10240191966295242, + "learning_rate": 2.393952310764409e-05, + "loss": 0.0348, + "step": 99580 + }, + { + "epoch": 0.2196355360108329, + "grad_norm": 0.13843898475170135, + "learning_rate": 2.393819530379421e-05, + "loss": 0.0329, + "step": 99590 + }, + { + "epoch": 0.21965758998573107, + "grad_norm": 0.10770637542009354, + "learning_rate": 2.3936867391336825e-05, + "loss": 0.0318, + "step": 99600 + }, + { + "epoch": 0.21967964396062925, + "grad_norm": 0.11022487282752991, + "learning_rate": 2.3935539370288057e-05, + "loss": 0.0327, + "step": 99610 + }, + { + "epoch": 0.2197016979355274, + "grad_norm": 0.12324310839176178, + "learning_rate": 2.393421124066405e-05, + "loss": 0.0334, + "step": 99620 + }, + { + "epoch": 0.21972375191042556, + "grad_norm": 0.10617078840732574, + "learning_rate": 2.3932883002480942e-05, + "loss": 0.031, + "step": 99630 + }, + { + "epoch": 0.21974580588532375, + "grad_norm": 0.11545966565608978, + "learning_rate": 2.3931554655754873e-05, + "loss": 0.0343, + "step": 99640 + }, + { + "epoch": 0.2197678598602219, + "grad_norm": 0.10651824623346329, + "learning_rate": 2.3930226200501983e-05, + "loss": 0.0325, + "step": 99650 + }, + { + "epoch": 0.21978991383512006, + "grad_norm": 0.1083463504910469, + "learning_rate": 2.392889763673841e-05, + "loss": 0.0306, + "step": 99660 + }, + { + "epoch": 0.21981196781001824, + "grad_norm": 0.12625357508659363, + "learning_rate": 2.3927568964480303e-05, + "loss": 0.0304, + "step": 99670 + }, + { + "epoch": 0.2198340217849164, + "grad_norm": 0.12138210982084274, + "learning_rate": 2.3926240183743802e-05, + "loss": 0.0328, + "step": 99680 + }, + { + "epoch": 0.21985607575981456, + "grad_norm": 0.10672394931316376, + "learning_rate": 2.3924911294545054e-05, + "loss": 0.0323, + "step": 99690 + }, + { + "epoch": 0.21987812973471274, + "grad_norm": 0.13913346827030182, + "learning_rate": 2.3923582296900213e-05, + "loss": 0.0327, + "step": 99700 + }, + { + "epoch": 0.2199001837096109, + "grad_norm": 0.09653976559638977, + "learning_rate": 2.3922253190825414e-05, + "loss": 0.0325, + "step": 99710 + }, + { + "epoch": 0.21992223768450908, + "grad_norm": 0.09664107859134674, + "learning_rate": 2.392092397633682e-05, + "loss": 0.0314, + "step": 99720 + }, + { + "epoch": 0.21994429165940724, + "grad_norm": 0.10540413111448288, + "learning_rate": 2.391959465345057e-05, + "loss": 0.0328, + "step": 99730 + }, + { + "epoch": 0.2199663456343054, + "grad_norm": 0.13295374810695648, + "learning_rate": 2.3918265222182833e-05, + "loss": 0.0324, + "step": 99740 + }, + { + "epoch": 0.21998839960920358, + "grad_norm": 0.1313531994819641, + "learning_rate": 2.3916935682549747e-05, + "loss": 0.0328, + "step": 99750 + }, + { + "epoch": 0.22001045358410173, + "grad_norm": 0.09646840393543243, + "learning_rate": 2.391560603456748e-05, + "loss": 0.0333, + "step": 99760 + }, + { + "epoch": 0.2200325075589999, + "grad_norm": 0.1269872635602951, + "learning_rate": 2.3914276278252177e-05, + "loss": 0.0321, + "step": 99770 + }, + { + "epoch": 0.22005456153389807, + "grad_norm": 0.09120209515094757, + "learning_rate": 2.3912946413619997e-05, + "loss": 0.0311, + "step": 99780 + }, + { + "epoch": 0.22007661550879623, + "grad_norm": 0.10703544318675995, + "learning_rate": 2.391161644068711e-05, + "loss": 0.0301, + "step": 99790 + }, + { + "epoch": 0.2200986694836944, + "grad_norm": 0.0959688201546669, + "learning_rate": 2.391028635946967e-05, + "loss": 0.0325, + "step": 99800 + }, + { + "epoch": 0.22012072345859257, + "grad_norm": 0.09782731533050537, + "learning_rate": 2.3908956169983833e-05, + "loss": 0.033, + "step": 99810 + }, + { + "epoch": 0.22014277743349073, + "grad_norm": 0.11651371419429779, + "learning_rate": 2.3907625872245767e-05, + "loss": 0.0313, + "step": 99820 + }, + { + "epoch": 0.22016483140838888, + "grad_norm": 0.11615761369466782, + "learning_rate": 2.390629546627164e-05, + "loss": 0.032, + "step": 99830 + }, + { + "epoch": 0.22018688538328707, + "grad_norm": 0.13565796613693237, + "learning_rate": 2.3904964952077607e-05, + "loss": 0.0337, + "step": 99840 + }, + { + "epoch": 0.22020893935818522, + "grad_norm": 0.10493867844343185, + "learning_rate": 2.390363432967985e-05, + "loss": 0.0347, + "step": 99850 + }, + { + "epoch": 0.22023099333308338, + "grad_norm": 0.09931568801403046, + "learning_rate": 2.3902303599094526e-05, + "loss": 0.0321, + "step": 99860 + }, + { + "epoch": 0.22025304730798156, + "grad_norm": 0.09743789583444595, + "learning_rate": 2.390097276033781e-05, + "loss": 0.0327, + "step": 99870 + }, + { + "epoch": 0.22027510128287972, + "grad_norm": 0.1107320785522461, + "learning_rate": 2.389964181342587e-05, + "loss": 0.0336, + "step": 99880 + }, + { + "epoch": 0.22029715525777788, + "grad_norm": 0.14111986756324768, + "learning_rate": 2.3898310758374878e-05, + "loss": 0.0347, + "step": 99890 + }, + { + "epoch": 0.22031920923267606, + "grad_norm": 0.1004650890827179, + "learning_rate": 2.389697959520101e-05, + "loss": 0.0336, + "step": 99900 + }, + { + "epoch": 0.22034126320757422, + "grad_norm": 0.11953889578580856, + "learning_rate": 2.3895648323920442e-05, + "loss": 0.0324, + "step": 99910 + }, + { + "epoch": 0.22036331718247237, + "grad_norm": 0.14909563958644867, + "learning_rate": 2.3894316944549346e-05, + "loss": 0.0308, + "step": 99920 + }, + { + "epoch": 0.22038537115737056, + "grad_norm": 0.1273273080587387, + "learning_rate": 2.38929854571039e-05, + "loss": 0.032, + "step": 99930 + }, + { + "epoch": 0.22040742513226871, + "grad_norm": 0.1192566528916359, + "learning_rate": 2.3891653861600286e-05, + "loss": 0.0331, + "step": 99940 + }, + { + "epoch": 0.22042947910716687, + "grad_norm": 0.09130678325891495, + "learning_rate": 2.3890322158054682e-05, + "loss": 0.0337, + "step": 99950 + }, + { + "epoch": 0.22045153308206505, + "grad_norm": 0.1125696524977684, + "learning_rate": 2.388899034648327e-05, + "loss": 0.0323, + "step": 99960 + }, + { + "epoch": 0.2204735870569632, + "grad_norm": 0.08989265561103821, + "learning_rate": 2.3887658426902234e-05, + "loss": 0.0319, + "step": 99970 + }, + { + "epoch": 0.22049564103186137, + "grad_norm": 0.1536608189344406, + "learning_rate": 2.3886326399327754e-05, + "loss": 0.0332, + "step": 99980 + }, + { + "epoch": 0.22051769500675955, + "grad_norm": 0.11047989875078201, + "learning_rate": 2.388499426377602e-05, + "loss": 0.0333, + "step": 99990 + }, + { + "epoch": 0.2205397489816577, + "grad_norm": 0.07719263434410095, + "learning_rate": 2.3883662020263216e-05, + "loss": 0.0347, + "step": 100000 + }, + { + "epoch": 0.22056180295655586, + "grad_norm": 0.11273082345724106, + "learning_rate": 2.3882329668805532e-05, + "loss": 0.0324, + "step": 100010 + }, + { + "epoch": 0.22058385693145405, + "grad_norm": 0.14840158820152283, + "learning_rate": 2.3880997209419154e-05, + "loss": 0.0321, + "step": 100020 + }, + { + "epoch": 0.2206059109063522, + "grad_norm": 0.09146811813116074, + "learning_rate": 2.3879664642120278e-05, + "loss": 0.0315, + "step": 100030 + }, + { + "epoch": 0.22062796488125036, + "grad_norm": 0.09947234392166138, + "learning_rate": 2.387833196692509e-05, + "loss": 0.0326, + "step": 100040 + }, + { + "epoch": 0.22065001885614854, + "grad_norm": 0.11472473293542862, + "learning_rate": 2.3876999183849788e-05, + "loss": 0.031, + "step": 100050 + }, + { + "epoch": 0.2206720728310467, + "grad_norm": 0.0988389328122139, + "learning_rate": 2.387566629291057e-05, + "loss": 0.0319, + "step": 100060 + }, + { + "epoch": 0.22069412680594486, + "grad_norm": 0.1148076206445694, + "learning_rate": 2.3874333294123615e-05, + "loss": 0.0334, + "step": 100070 + }, + { + "epoch": 0.22071618078084304, + "grad_norm": 0.131975457072258, + "learning_rate": 2.387300018750514e-05, + "loss": 0.0322, + "step": 100080 + }, + { + "epoch": 0.2207382347557412, + "grad_norm": 0.13489365577697754, + "learning_rate": 2.3871666973071332e-05, + "loss": 0.0322, + "step": 100090 + }, + { + "epoch": 0.22076028873063935, + "grad_norm": 0.11082693189382553, + "learning_rate": 2.3870333650838393e-05, + "loss": 0.0317, + "step": 100100 + }, + { + "epoch": 0.22078234270553754, + "grad_norm": 0.1718171238899231, + "learning_rate": 2.386900022082253e-05, + "loss": 0.0315, + "step": 100110 + }, + { + "epoch": 0.2208043966804357, + "grad_norm": 0.08534752577543259, + "learning_rate": 2.3867666683039937e-05, + "loss": 0.0337, + "step": 100120 + }, + { + "epoch": 0.22082645065533388, + "grad_norm": 0.10215164721012115, + "learning_rate": 2.386633303750682e-05, + "loss": 0.0317, + "step": 100130 + }, + { + "epoch": 0.22084850463023203, + "grad_norm": 0.09049231559038162, + "learning_rate": 2.3864999284239385e-05, + "loss": 0.0338, + "step": 100140 + }, + { + "epoch": 0.2208705586051302, + "grad_norm": 0.11413977295160294, + "learning_rate": 2.3863665423253843e-05, + "loss": 0.0313, + "step": 100150 + }, + { + "epoch": 0.22089261258002837, + "grad_norm": 0.10029693692922592, + "learning_rate": 2.3862331454566392e-05, + "loss": 0.0357, + "step": 100160 + }, + { + "epoch": 0.22091466655492653, + "grad_norm": 0.11889975517988205, + "learning_rate": 2.386099737819325e-05, + "loss": 0.0338, + "step": 100170 + }, + { + "epoch": 0.2209367205298247, + "grad_norm": 0.08769261837005615, + "learning_rate": 2.3859663194150628e-05, + "loss": 0.0319, + "step": 100180 + }, + { + "epoch": 0.22095877450472287, + "grad_norm": 0.11429008096456528, + "learning_rate": 2.3858328902454728e-05, + "loss": 0.034, + "step": 100190 + }, + { + "epoch": 0.22098082847962103, + "grad_norm": 0.1076032817363739, + "learning_rate": 2.3856994503121767e-05, + "loss": 0.0319, + "step": 100200 + }, + { + "epoch": 0.22100288245451918, + "grad_norm": 0.10852351039648056, + "learning_rate": 2.3855659996167962e-05, + "loss": 0.0325, + "step": 100210 + }, + { + "epoch": 0.22102493642941737, + "grad_norm": 0.10242648422718048, + "learning_rate": 2.3854325381609527e-05, + "loss": 0.0351, + "step": 100220 + }, + { + "epoch": 0.22104699040431552, + "grad_norm": 0.1265111267566681, + "learning_rate": 2.385299065946268e-05, + "loss": 0.0332, + "step": 100230 + }, + { + "epoch": 0.22106904437921368, + "grad_norm": 0.127615824341774, + "learning_rate": 2.3851655829743637e-05, + "loss": 0.0335, + "step": 100240 + }, + { + "epoch": 0.22109109835411186, + "grad_norm": 0.11227051913738251, + "learning_rate": 2.3850320892468617e-05, + "loss": 0.0337, + "step": 100250 + }, + { + "epoch": 0.22111315232901002, + "grad_norm": 0.09978631138801575, + "learning_rate": 2.3848985847653843e-05, + "loss": 0.0333, + "step": 100260 + }, + { + "epoch": 0.22113520630390818, + "grad_norm": 0.14472892880439758, + "learning_rate": 2.3847650695315535e-05, + "loss": 0.0316, + "step": 100270 + }, + { + "epoch": 0.22115726027880636, + "grad_norm": 0.09323039650917053, + "learning_rate": 2.384631543546992e-05, + "loss": 0.0317, + "step": 100280 + }, + { + "epoch": 0.22117931425370452, + "grad_norm": 0.12594017386436462, + "learning_rate": 2.3844980068133218e-05, + "loss": 0.033, + "step": 100290 + }, + { + "epoch": 0.22120136822860267, + "grad_norm": 0.09168526530265808, + "learning_rate": 2.384364459332166e-05, + "loss": 0.0316, + "step": 100300 + }, + { + "epoch": 0.22122342220350086, + "grad_norm": 0.09744597971439362, + "learning_rate": 2.3842309011051468e-05, + "loss": 0.0322, + "step": 100310 + }, + { + "epoch": 0.22124547617839901, + "grad_norm": 0.12237712740898132, + "learning_rate": 2.3840973321338873e-05, + "loss": 0.0331, + "step": 100320 + }, + { + "epoch": 0.22126753015329717, + "grad_norm": 0.10610274225473404, + "learning_rate": 2.3839637524200104e-05, + "loss": 0.0314, + "step": 100330 + }, + { + "epoch": 0.22128958412819535, + "grad_norm": 0.1231994479894638, + "learning_rate": 2.3838301619651396e-05, + "loss": 0.0337, + "step": 100340 + }, + { + "epoch": 0.2213116381030935, + "grad_norm": 0.1210222989320755, + "learning_rate": 2.383696560770898e-05, + "loss": 0.0312, + "step": 100350 + }, + { + "epoch": 0.22133369207799167, + "grad_norm": 0.1128256618976593, + "learning_rate": 2.3835629488389082e-05, + "loss": 0.0329, + "step": 100360 + }, + { + "epoch": 0.22135574605288985, + "grad_norm": 0.12339119613170624, + "learning_rate": 2.3834293261707943e-05, + "loss": 0.0313, + "step": 100370 + }, + { + "epoch": 0.221377800027788, + "grad_norm": 0.10154630243778229, + "learning_rate": 2.3832956927681807e-05, + "loss": 0.0317, + "step": 100380 + }, + { + "epoch": 0.22139985400268616, + "grad_norm": 0.12736254930496216, + "learning_rate": 2.38316204863269e-05, + "loss": 0.0344, + "step": 100390 + }, + { + "epoch": 0.22142190797758435, + "grad_norm": 0.1050921157002449, + "learning_rate": 2.3830283937659467e-05, + "loss": 0.0322, + "step": 100400 + }, + { + "epoch": 0.2214439619524825, + "grad_norm": 0.09598434716463089, + "learning_rate": 2.3828947281695745e-05, + "loss": 0.0331, + "step": 100410 + }, + { + "epoch": 0.22146601592738066, + "grad_norm": 0.10876026749610901, + "learning_rate": 2.3827610518451983e-05, + "loss": 0.0318, + "step": 100420 + }, + { + "epoch": 0.22148806990227884, + "grad_norm": 0.10468500107526779, + "learning_rate": 2.3826273647944418e-05, + "loss": 0.0336, + "step": 100430 + }, + { + "epoch": 0.221510123877177, + "grad_norm": 0.11397620290517807, + "learning_rate": 2.3824936670189288e-05, + "loss": 0.0321, + "step": 100440 + }, + { + "epoch": 0.22153217785207516, + "grad_norm": 0.12223204970359802, + "learning_rate": 2.3823599585202853e-05, + "loss": 0.0349, + "step": 100450 + }, + { + "epoch": 0.22155423182697334, + "grad_norm": 0.11828228086233139, + "learning_rate": 2.382226239300135e-05, + "loss": 0.0317, + "step": 100460 + }, + { + "epoch": 0.2215762858018715, + "grad_norm": 0.11006958782672882, + "learning_rate": 2.3820925093601026e-05, + "loss": 0.0336, + "step": 100470 + }, + { + "epoch": 0.22159833977676965, + "grad_norm": 0.11199901252985, + "learning_rate": 2.3819587687018133e-05, + "loss": 0.0316, + "step": 100480 + }, + { + "epoch": 0.22162039375166784, + "grad_norm": 0.10512904077768326, + "learning_rate": 2.3818250173268926e-05, + "loss": 0.0326, + "step": 100490 + }, + { + "epoch": 0.221642447726566, + "grad_norm": 0.1261397749185562, + "learning_rate": 2.3816912552369655e-05, + "loss": 0.0317, + "step": 100500 + }, + { + "epoch": 0.22166450170146415, + "grad_norm": 0.11366899311542511, + "learning_rate": 2.3815574824336566e-05, + "loss": 0.0332, + "step": 100510 + }, + { + "epoch": 0.22168655567636233, + "grad_norm": 0.09616101533174515, + "learning_rate": 2.3814236989185925e-05, + "loss": 0.0317, + "step": 100520 + }, + { + "epoch": 0.2217086096512605, + "grad_norm": 0.13152539730072021, + "learning_rate": 2.381289904693398e-05, + "loss": 0.0343, + "step": 100530 + }, + { + "epoch": 0.22173066362615865, + "grad_norm": 0.10741549730300903, + "learning_rate": 2.3811560997596992e-05, + "loss": 0.0318, + "step": 100540 + }, + { + "epoch": 0.22175271760105683, + "grad_norm": 0.10124623030424118, + "learning_rate": 2.3810222841191216e-05, + "loss": 0.0319, + "step": 100550 + }, + { + "epoch": 0.221774771575955, + "grad_norm": 0.1342909187078476, + "learning_rate": 2.3808884577732918e-05, + "loss": 0.0339, + "step": 100560 + }, + { + "epoch": 0.22179682555085317, + "grad_norm": 0.11202674359083176, + "learning_rate": 2.3807546207238355e-05, + "loss": 0.0314, + "step": 100570 + }, + { + "epoch": 0.22181887952575133, + "grad_norm": 0.10321125388145447, + "learning_rate": 2.380620772972379e-05, + "loss": 0.0321, + "step": 100580 + }, + { + "epoch": 0.22184093350064948, + "grad_norm": 0.12991246581077576, + "learning_rate": 2.3804869145205484e-05, + "loss": 0.0306, + "step": 100590 + }, + { + "epoch": 0.22186298747554767, + "grad_norm": 0.1016366109251976, + "learning_rate": 2.3803530453699705e-05, + "loss": 0.0324, + "step": 100600 + }, + { + "epoch": 0.22188504145044582, + "grad_norm": 0.10877645015716553, + "learning_rate": 2.3802191655222722e-05, + "loss": 0.0338, + "step": 100610 + }, + { + "epoch": 0.22190709542534398, + "grad_norm": 0.0944114550948143, + "learning_rate": 2.3800852749790797e-05, + "loss": 0.0309, + "step": 100620 + }, + { + "epoch": 0.22192914940024216, + "grad_norm": 0.1152220070362091, + "learning_rate": 2.3799513737420203e-05, + "loss": 0.0321, + "step": 100630 + }, + { + "epoch": 0.22195120337514032, + "grad_norm": 0.106789231300354, + "learning_rate": 2.379817461812721e-05, + "loss": 0.0313, + "step": 100640 + }, + { + "epoch": 0.22197325735003848, + "grad_norm": 0.12595611810684204, + "learning_rate": 2.3796835391928085e-05, + "loss": 0.0327, + "step": 100650 + }, + { + "epoch": 0.22199531132493666, + "grad_norm": 0.13080377876758575, + "learning_rate": 2.3795496058839108e-05, + "loss": 0.0308, + "step": 100660 + }, + { + "epoch": 0.22201736529983482, + "grad_norm": 0.09453918039798737, + "learning_rate": 2.3794156618876545e-05, + "loss": 0.0319, + "step": 100670 + }, + { + "epoch": 0.22203941927473297, + "grad_norm": 0.08174601197242737, + "learning_rate": 2.379281707205668e-05, + "loss": 0.0316, + "step": 100680 + }, + { + "epoch": 0.22206147324963116, + "grad_norm": 0.1373957246541977, + "learning_rate": 2.379147741839578e-05, + "loss": 0.031, + "step": 100690 + }, + { + "epoch": 0.22208352722452931, + "grad_norm": 0.11676212400197983, + "learning_rate": 2.379013765791014e-05, + "loss": 0.0348, + "step": 100700 + }, + { + "epoch": 0.22210558119942747, + "grad_norm": 0.11769616603851318, + "learning_rate": 2.378879779061602e-05, + "loss": 0.0335, + "step": 100710 + }, + { + "epoch": 0.22212763517432565, + "grad_norm": 0.10164742171764374, + "learning_rate": 2.3787457816529713e-05, + "loss": 0.0332, + "step": 100720 + }, + { + "epoch": 0.2221496891492238, + "grad_norm": 0.08626115322113037, + "learning_rate": 2.378611773566749e-05, + "loss": 0.0313, + "step": 100730 + }, + { + "epoch": 0.22217174312412197, + "grad_norm": 0.1085323691368103, + "learning_rate": 2.3784777548045648e-05, + "loss": 0.0335, + "step": 100740 + }, + { + "epoch": 0.22219379709902015, + "grad_norm": 0.18674512207508087, + "learning_rate": 2.3783437253680463e-05, + "loss": 0.0334, + "step": 100750 + }, + { + "epoch": 0.2222158510739183, + "grad_norm": 0.1348223239183426, + "learning_rate": 2.3782096852588223e-05, + "loss": 0.0325, + "step": 100760 + }, + { + "epoch": 0.22223790504881646, + "grad_norm": 0.10832666605710983, + "learning_rate": 2.3780756344785213e-05, + "loss": 0.0307, + "step": 100770 + }, + { + "epoch": 0.22225995902371465, + "grad_norm": 0.1212647557258606, + "learning_rate": 2.377941573028772e-05, + "loss": 0.0351, + "step": 100780 + }, + { + "epoch": 0.2222820129986128, + "grad_norm": 0.09721267968416214, + "learning_rate": 2.3778075009112042e-05, + "loss": 0.0312, + "step": 100790 + }, + { + "epoch": 0.22230406697351096, + "grad_norm": 0.08978412300348282, + "learning_rate": 2.3776734181274458e-05, + "loss": 0.0322, + "step": 100800 + }, + { + "epoch": 0.22232612094840914, + "grad_norm": 0.09998262673616409, + "learning_rate": 2.3775393246791272e-05, + "loss": 0.0314, + "step": 100810 + }, + { + "epoch": 0.2223481749233073, + "grad_norm": 0.08292088657617569, + "learning_rate": 2.377405220567878e-05, + "loss": 0.0321, + "step": 100820 + }, + { + "epoch": 0.22237022889820546, + "grad_norm": 0.10632981359958649, + "learning_rate": 2.377271105795326e-05, + "loss": 0.0322, + "step": 100830 + }, + { + "epoch": 0.22239228287310364, + "grad_norm": 0.08594202995300293, + "learning_rate": 2.377136980363102e-05, + "loss": 0.0315, + "step": 100840 + }, + { + "epoch": 0.2224143368480018, + "grad_norm": 0.11235519498586655, + "learning_rate": 2.3770028442728354e-05, + "loss": 0.0322, + "step": 100850 + }, + { + "epoch": 0.22243639082289995, + "grad_norm": 0.1120978593826294, + "learning_rate": 2.3768686975261562e-05, + "loss": 0.0311, + "step": 100860 + }, + { + "epoch": 0.22245844479779814, + "grad_norm": 0.12143310904502869, + "learning_rate": 2.3767345401246946e-05, + "loss": 0.0327, + "step": 100870 + }, + { + "epoch": 0.2224804987726963, + "grad_norm": 0.09728176146745682, + "learning_rate": 2.3766003720700803e-05, + "loss": 0.0321, + "step": 100880 + }, + { + "epoch": 0.22250255274759445, + "grad_norm": 0.10293128341436386, + "learning_rate": 2.3764661933639443e-05, + "loss": 0.0337, + "step": 100890 + }, + { + "epoch": 0.22252460672249263, + "grad_norm": 0.11595247685909271, + "learning_rate": 2.376332004007916e-05, + "loss": 0.0322, + "step": 100900 + }, + { + "epoch": 0.2225466606973908, + "grad_norm": 0.1460648626089096, + "learning_rate": 2.3761978040036266e-05, + "loss": 0.0307, + "step": 100910 + }, + { + "epoch": 0.22256871467228895, + "grad_norm": 0.11005956679582596, + "learning_rate": 2.376063593352707e-05, + "loss": 0.032, + "step": 100920 + }, + { + "epoch": 0.22259076864718713, + "grad_norm": 0.10523166507482529, + "learning_rate": 2.3759293720567874e-05, + "loss": 0.0329, + "step": 100930 + }, + { + "epoch": 0.2226128226220853, + "grad_norm": 0.08992468565702438, + "learning_rate": 2.3757951401174988e-05, + "loss": 0.0311, + "step": 100940 + }, + { + "epoch": 0.22263487659698344, + "grad_norm": 0.10813958197832108, + "learning_rate": 2.3756608975364725e-05, + "loss": 0.0314, + "step": 100950 + }, + { + "epoch": 0.22265693057188163, + "grad_norm": 0.1229824647307396, + "learning_rate": 2.3755266443153394e-05, + "loss": 0.0321, + "step": 100960 + }, + { + "epoch": 0.22267898454677978, + "grad_norm": 0.11176618188619614, + "learning_rate": 2.3753923804557307e-05, + "loss": 0.0321, + "step": 100970 + }, + { + "epoch": 0.22270103852167794, + "grad_norm": 0.10830859839916229, + "learning_rate": 2.3752581059592788e-05, + "loss": 0.0333, + "step": 100980 + }, + { + "epoch": 0.22272309249657612, + "grad_norm": 0.088863804936409, + "learning_rate": 2.375123820827614e-05, + "loss": 0.0318, + "step": 100990 + }, + { + "epoch": 0.22274514647147428, + "grad_norm": 0.09430859237909317, + "learning_rate": 2.3749895250623684e-05, + "loss": 0.0328, + "step": 101000 + }, + { + "epoch": 0.22276720044637247, + "grad_norm": 0.11383035033941269, + "learning_rate": 2.374855218665174e-05, + "loss": 0.0317, + "step": 101010 + }, + { + "epoch": 0.22278925442127062, + "grad_norm": 0.1343849003314972, + "learning_rate": 2.374720901637663e-05, + "loss": 0.0328, + "step": 101020 + }, + { + "epoch": 0.22281130839616878, + "grad_norm": 0.11437922716140747, + "learning_rate": 2.374586573981467e-05, + "loss": 0.0322, + "step": 101030 + }, + { + "epoch": 0.22283336237106696, + "grad_norm": 0.11098839342594147, + "learning_rate": 2.3744522356982184e-05, + "loss": 0.0321, + "step": 101040 + }, + { + "epoch": 0.22285541634596512, + "grad_norm": 0.10052047669887543, + "learning_rate": 2.3743178867895494e-05, + "loss": 0.0346, + "step": 101050 + }, + { + "epoch": 0.22287747032086327, + "grad_norm": 0.10050737112760544, + "learning_rate": 2.374183527257093e-05, + "loss": 0.032, + "step": 101060 + }, + { + "epoch": 0.22289952429576146, + "grad_norm": 0.09562349319458008, + "learning_rate": 2.3740491571024808e-05, + "loss": 0.0316, + "step": 101070 + }, + { + "epoch": 0.22292157827065961, + "grad_norm": 0.1164296418428421, + "learning_rate": 2.3739147763273467e-05, + "loss": 0.0326, + "step": 101080 + }, + { + "epoch": 0.22294363224555777, + "grad_norm": 0.11499057710170746, + "learning_rate": 2.3737803849333228e-05, + "loss": 0.033, + "step": 101090 + }, + { + "epoch": 0.22296568622045596, + "grad_norm": 0.10618041455745697, + "learning_rate": 2.3736459829220417e-05, + "loss": 0.0311, + "step": 101100 + }, + { + "epoch": 0.2229877401953541, + "grad_norm": 0.10536638647317886, + "learning_rate": 2.373511570295138e-05, + "loss": 0.0334, + "step": 101110 + }, + { + "epoch": 0.22300979417025227, + "grad_norm": 0.10274798423051834, + "learning_rate": 2.3733771470542425e-05, + "loss": 0.0324, + "step": 101120 + }, + { + "epoch": 0.22303184814515045, + "grad_norm": 0.10852192342281342, + "learning_rate": 2.3732427132009913e-05, + "loss": 0.0327, + "step": 101130 + }, + { + "epoch": 0.2230539021200486, + "grad_norm": 0.10031357407569885, + "learning_rate": 2.3731082687370165e-05, + "loss": 0.0306, + "step": 101140 + }, + { + "epoch": 0.22307595609494676, + "grad_norm": 0.10905183851718903, + "learning_rate": 2.3729738136639518e-05, + "loss": 0.0324, + "step": 101150 + }, + { + "epoch": 0.22309801006984495, + "grad_norm": 0.11635316163301468, + "learning_rate": 2.372839347983431e-05, + "loss": 0.0348, + "step": 101160 + }, + { + "epoch": 0.2231200640447431, + "grad_norm": 0.10771991312503815, + "learning_rate": 2.3727048716970875e-05, + "loss": 0.0318, + "step": 101170 + }, + { + "epoch": 0.22314211801964126, + "grad_norm": 0.08848545700311661, + "learning_rate": 2.372570384806557e-05, + "loss": 0.035, + "step": 101180 + }, + { + "epoch": 0.22316417199453945, + "grad_norm": 0.09660337120294571, + "learning_rate": 2.3724358873134716e-05, + "loss": 0.0335, + "step": 101190 + }, + { + "epoch": 0.2231862259694376, + "grad_norm": 0.10588014125823975, + "learning_rate": 2.3723013792194664e-05, + "loss": 0.0322, + "step": 101200 + }, + { + "epoch": 0.22320827994433576, + "grad_norm": 0.18243949115276337, + "learning_rate": 2.372166860526176e-05, + "loss": 0.0321, + "step": 101210 + }, + { + "epoch": 0.22323033391923394, + "grad_norm": 0.11476660519838333, + "learning_rate": 2.372032331235235e-05, + "loss": 0.0322, + "step": 101220 + }, + { + "epoch": 0.2232523878941321, + "grad_norm": 0.09072860330343246, + "learning_rate": 2.3718977913482782e-05, + "loss": 0.0328, + "step": 101230 + }, + { + "epoch": 0.22327444186903025, + "grad_norm": 0.09727354347705841, + "learning_rate": 2.371763240866939e-05, + "loss": 0.0305, + "step": 101240 + }, + { + "epoch": 0.22329649584392844, + "grad_norm": 0.12167803943157196, + "learning_rate": 2.3716286797928543e-05, + "loss": 0.0321, + "step": 101250 + }, + { + "epoch": 0.2233185498188266, + "grad_norm": 0.10389037430286407, + "learning_rate": 2.3714941081276577e-05, + "loss": 0.0317, + "step": 101260 + }, + { + "epoch": 0.22334060379372475, + "grad_norm": 0.09231109172105789, + "learning_rate": 2.3713595258729848e-05, + "loss": 0.0339, + "step": 101270 + }, + { + "epoch": 0.22336265776862294, + "grad_norm": 0.11218106746673584, + "learning_rate": 2.371224933030471e-05, + "loss": 0.0324, + "step": 101280 + }, + { + "epoch": 0.2233847117435211, + "grad_norm": 0.12512469291687012, + "learning_rate": 2.371090329601752e-05, + "loss": 0.0327, + "step": 101290 + }, + { + "epoch": 0.22340676571841925, + "grad_norm": 0.1264970600605011, + "learning_rate": 2.370955715588463e-05, + "loss": 0.0311, + "step": 101300 + }, + { + "epoch": 0.22342881969331743, + "grad_norm": 0.10109378397464752, + "learning_rate": 2.3708210909922394e-05, + "loss": 0.0302, + "step": 101310 + }, + { + "epoch": 0.2234508736682156, + "grad_norm": 0.1399964690208435, + "learning_rate": 2.3706864558147173e-05, + "loss": 0.0343, + "step": 101320 + }, + { + "epoch": 0.22347292764311374, + "grad_norm": 0.14634078741073608, + "learning_rate": 2.3705518100575328e-05, + "loss": 0.0326, + "step": 101330 + }, + { + "epoch": 0.22349498161801193, + "grad_norm": 0.09414448589086533, + "learning_rate": 2.3704171537223223e-05, + "loss": 0.0314, + "step": 101340 + }, + { + "epoch": 0.22351703559291008, + "grad_norm": 0.123480424284935, + "learning_rate": 2.3702824868107213e-05, + "loss": 0.0316, + "step": 101350 + }, + { + "epoch": 0.22353908956780824, + "grad_norm": 0.15956783294677734, + "learning_rate": 2.3701478093243663e-05, + "loss": 0.0326, + "step": 101360 + }, + { + "epoch": 0.22356114354270643, + "grad_norm": 0.12446223944425583, + "learning_rate": 2.370013121264894e-05, + "loss": 0.0312, + "step": 101370 + }, + { + "epoch": 0.22358319751760458, + "grad_norm": 0.1153874546289444, + "learning_rate": 2.3698784226339408e-05, + "loss": 0.0319, + "step": 101380 + }, + { + "epoch": 0.22360525149250274, + "grad_norm": 0.09268418699502945, + "learning_rate": 2.3697437134331435e-05, + "loss": 0.0324, + "step": 101390 + }, + { + "epoch": 0.22362730546740092, + "grad_norm": 0.12294536083936691, + "learning_rate": 2.369608993664139e-05, + "loss": 0.0306, + "step": 101400 + }, + { + "epoch": 0.22364935944229908, + "grad_norm": 0.10212094336748123, + "learning_rate": 2.3694742633285644e-05, + "loss": 0.0322, + "step": 101410 + }, + { + "epoch": 0.22367141341719726, + "grad_norm": 0.14288699626922607, + "learning_rate": 2.3693395224280557e-05, + "loss": 0.0314, + "step": 101420 + }, + { + "epoch": 0.22369346739209542, + "grad_norm": 0.10448028892278671, + "learning_rate": 2.3692047709642517e-05, + "loss": 0.031, + "step": 101430 + }, + { + "epoch": 0.22371552136699357, + "grad_norm": 0.0865260511636734, + "learning_rate": 2.3690700089387885e-05, + "loss": 0.03, + "step": 101440 + }, + { + "epoch": 0.22373757534189176, + "grad_norm": 0.11995472013950348, + "learning_rate": 2.368935236353305e-05, + "loss": 0.0313, + "step": 101450 + }, + { + "epoch": 0.22375962931678992, + "grad_norm": 0.09589361399412155, + "learning_rate": 2.3688004532094373e-05, + "loss": 0.0311, + "step": 101460 + }, + { + "epoch": 0.22378168329168807, + "grad_norm": 0.10702148079872131, + "learning_rate": 2.3686656595088243e-05, + "loss": 0.0309, + "step": 101470 + }, + { + "epoch": 0.22380373726658626, + "grad_norm": 0.10775991529226303, + "learning_rate": 2.368530855253103e-05, + "loss": 0.0341, + "step": 101480 + }, + { + "epoch": 0.2238257912414844, + "grad_norm": 0.09222877770662308, + "learning_rate": 2.368396040443912e-05, + "loss": 0.0317, + "step": 101490 + }, + { + "epoch": 0.22384784521638257, + "grad_norm": 0.08737903088331223, + "learning_rate": 2.368261215082889e-05, + "loss": 0.0306, + "step": 101500 + }, + { + "epoch": 0.22386989919128075, + "grad_norm": 0.08972238749265671, + "learning_rate": 2.3681263791716724e-05, + "loss": 0.0303, + "step": 101510 + }, + { + "epoch": 0.2238919531661789, + "grad_norm": 0.09662433713674545, + "learning_rate": 2.367991532711901e-05, + "loss": 0.0306, + "step": 101520 + }, + { + "epoch": 0.22391400714107706, + "grad_norm": 0.10993416607379913, + "learning_rate": 2.3678566757052127e-05, + "loss": 0.0316, + "step": 101530 + }, + { + "epoch": 0.22393606111597525, + "grad_norm": 0.08540076017379761, + "learning_rate": 2.3677218081532464e-05, + "loss": 0.0322, + "step": 101540 + }, + { + "epoch": 0.2239581150908734, + "grad_norm": 0.09791652113199234, + "learning_rate": 2.367586930057641e-05, + "loss": 0.0321, + "step": 101550 + }, + { + "epoch": 0.22398016906577156, + "grad_norm": 0.10480918735265732, + "learning_rate": 2.367452041420035e-05, + "loss": 0.031, + "step": 101560 + }, + { + "epoch": 0.22400222304066975, + "grad_norm": 0.08762586116790771, + "learning_rate": 2.3673171422420684e-05, + "loss": 0.0308, + "step": 101570 + }, + { + "epoch": 0.2240242770155679, + "grad_norm": 0.12995751202106476, + "learning_rate": 2.367182232525379e-05, + "loss": 0.0318, + "step": 101580 + }, + { + "epoch": 0.22404633099046606, + "grad_norm": 0.11302119493484497, + "learning_rate": 2.3670473122716066e-05, + "loss": 0.0315, + "step": 101590 + }, + { + "epoch": 0.22406838496536424, + "grad_norm": 0.11649908125400543, + "learning_rate": 2.366912381482391e-05, + "loss": 0.0301, + "step": 101600 + }, + { + "epoch": 0.2240904389402624, + "grad_norm": 0.1362299770116806, + "learning_rate": 2.3667774401593715e-05, + "loss": 0.0333, + "step": 101610 + }, + { + "epoch": 0.22411249291516055, + "grad_norm": 0.12716814875602722, + "learning_rate": 2.3666424883041876e-05, + "loss": 0.0323, + "step": 101620 + }, + { + "epoch": 0.22413454689005874, + "grad_norm": 0.12251546233892441, + "learning_rate": 2.366507525918479e-05, + "loss": 0.0339, + "step": 101630 + }, + { + "epoch": 0.2241566008649569, + "grad_norm": 0.16811923682689667, + "learning_rate": 2.3663725530038863e-05, + "loss": 0.0312, + "step": 101640 + }, + { + "epoch": 0.22417865483985505, + "grad_norm": 0.11898549646139145, + "learning_rate": 2.366237569562049e-05, + "loss": 0.0335, + "step": 101650 + }, + { + "epoch": 0.22420070881475324, + "grad_norm": 0.11610997468233109, + "learning_rate": 2.366102575594607e-05, + "loss": 0.032, + "step": 101660 + }, + { + "epoch": 0.2242227627896514, + "grad_norm": 0.11886286735534668, + "learning_rate": 2.3659675711032014e-05, + "loss": 0.0336, + "step": 101670 + }, + { + "epoch": 0.22424481676454955, + "grad_norm": 0.10875695943832397, + "learning_rate": 2.3658325560894723e-05, + "loss": 0.0326, + "step": 101680 + }, + { + "epoch": 0.22426687073944773, + "grad_norm": 0.1024208664894104, + "learning_rate": 2.3656975305550594e-05, + "loss": 0.0312, + "step": 101690 + }, + { + "epoch": 0.2242889247143459, + "grad_norm": 0.1073707863688469, + "learning_rate": 2.365562494501605e-05, + "loss": 0.0335, + "step": 101700 + }, + { + "epoch": 0.22431097868924404, + "grad_norm": 0.10568457841873169, + "learning_rate": 2.365427447930749e-05, + "loss": 0.0307, + "step": 101710 + }, + { + "epoch": 0.22433303266414223, + "grad_norm": 0.1024865061044693, + "learning_rate": 2.3652923908441316e-05, + "loss": 0.0309, + "step": 101720 + }, + { + "epoch": 0.22435508663904039, + "grad_norm": 0.1316179782152176, + "learning_rate": 2.3651573232433954e-05, + "loss": 0.0334, + "step": 101730 + }, + { + "epoch": 0.22437714061393854, + "grad_norm": 0.12763427197933197, + "learning_rate": 2.3650222451301807e-05, + "loss": 0.0339, + "step": 101740 + }, + { + "epoch": 0.22439919458883673, + "grad_norm": 0.09479925036430359, + "learning_rate": 2.364887156506129e-05, + "loss": 0.0314, + "step": 101750 + }, + { + "epoch": 0.22442124856373488, + "grad_norm": 0.11054974794387817, + "learning_rate": 2.3647520573728817e-05, + "loss": 0.0298, + "step": 101760 + }, + { + "epoch": 0.22444330253863304, + "grad_norm": 0.09657638520002365, + "learning_rate": 2.3646169477320805e-05, + "loss": 0.0333, + "step": 101770 + }, + { + "epoch": 0.22446535651353122, + "grad_norm": 0.09097421169281006, + "learning_rate": 2.364481827585367e-05, + "loss": 0.0317, + "step": 101780 + }, + { + "epoch": 0.22448741048842938, + "grad_norm": 0.07113207876682281, + "learning_rate": 2.3643466969343832e-05, + "loss": 0.0331, + "step": 101790 + }, + { + "epoch": 0.22450946446332753, + "grad_norm": 0.11546214669942856, + "learning_rate": 2.364211555780771e-05, + "loss": 0.0324, + "step": 101800 + }, + { + "epoch": 0.22453151843822572, + "grad_norm": 0.1287022978067398, + "learning_rate": 2.364076404126172e-05, + "loss": 0.0317, + "step": 101810 + }, + { + "epoch": 0.22455357241312388, + "grad_norm": 0.09058281779289246, + "learning_rate": 2.3639412419722293e-05, + "loss": 0.0318, + "step": 101820 + }, + { + "epoch": 0.22457562638802203, + "grad_norm": 0.1155899167060852, + "learning_rate": 2.363806069320585e-05, + "loss": 0.0329, + "step": 101830 + }, + { + "epoch": 0.22459768036292022, + "grad_norm": 0.12034756690263748, + "learning_rate": 2.3636708861728812e-05, + "loss": 0.0319, + "step": 101840 + }, + { + "epoch": 0.22461973433781837, + "grad_norm": 0.1178416982293129, + "learning_rate": 2.36353569253076e-05, + "loss": 0.0313, + "step": 101850 + }, + { + "epoch": 0.22464178831271656, + "grad_norm": 0.11021646857261658, + "learning_rate": 2.363400488395866e-05, + "loss": 0.034, + "step": 101860 + }, + { + "epoch": 0.2246638422876147, + "grad_norm": 0.08760526776313782, + "learning_rate": 2.3632652737698402e-05, + "loss": 0.033, + "step": 101870 + }, + { + "epoch": 0.22468589626251287, + "grad_norm": 0.09909743815660477, + "learning_rate": 2.3631300486543262e-05, + "loss": 0.0315, + "step": 101880 + }, + { + "epoch": 0.22470795023741105, + "grad_norm": 0.11501549184322357, + "learning_rate": 2.3629948130509675e-05, + "loss": 0.0339, + "step": 101890 + }, + { + "epoch": 0.2247300042123092, + "grad_norm": 0.14941677451133728, + "learning_rate": 2.3628595669614066e-05, + "loss": 0.0336, + "step": 101900 + }, + { + "epoch": 0.22475205818720737, + "grad_norm": 0.08946490287780762, + "learning_rate": 2.3627243103872878e-05, + "loss": 0.0312, + "step": 101910 + }, + { + "epoch": 0.22477411216210555, + "grad_norm": 0.10597004741430283, + "learning_rate": 2.3625890433302536e-05, + "loss": 0.0315, + "step": 101920 + }, + { + "epoch": 0.2247961661370037, + "grad_norm": 0.0920591801404953, + "learning_rate": 2.3624537657919488e-05, + "loss": 0.032, + "step": 101930 + }, + { + "epoch": 0.22481822011190186, + "grad_norm": 0.10333190113306046, + "learning_rate": 2.3623184777740154e-05, + "loss": 0.0329, + "step": 101940 + }, + { + "epoch": 0.22484027408680005, + "grad_norm": 0.12397344410419464, + "learning_rate": 2.362183179278099e-05, + "loss": 0.0326, + "step": 101950 + }, + { + "epoch": 0.2248623280616982, + "grad_norm": 0.1176285594701767, + "learning_rate": 2.362047870305843e-05, + "loss": 0.0316, + "step": 101960 + }, + { + "epoch": 0.22488438203659636, + "grad_norm": 0.09012740105390549, + "learning_rate": 2.3619125508588913e-05, + "loss": 0.0301, + "step": 101970 + }, + { + "epoch": 0.22490643601149454, + "grad_norm": 0.12180396169424057, + "learning_rate": 2.3617772209388885e-05, + "loss": 0.0321, + "step": 101980 + }, + { + "epoch": 0.2249284899863927, + "grad_norm": 0.1468665897846222, + "learning_rate": 2.3616418805474786e-05, + "loss": 0.0317, + "step": 101990 + }, + { + "epoch": 0.22495054396129086, + "grad_norm": 0.10143571346998215, + "learning_rate": 2.3615065296863067e-05, + "loss": 0.033, + "step": 102000 + }, + { + "epoch": 0.22497259793618904, + "grad_norm": 0.10116124898195267, + "learning_rate": 2.3613711683570165e-05, + "loss": 0.0307, + "step": 102010 + }, + { + "epoch": 0.2249946519110872, + "grad_norm": 0.11994962394237518, + "learning_rate": 2.3612357965612535e-05, + "loss": 0.0332, + "step": 102020 + }, + { + "epoch": 0.22501670588598535, + "grad_norm": 0.09596531093120575, + "learning_rate": 2.3611004143006628e-05, + "loss": 0.0328, + "step": 102030 + }, + { + "epoch": 0.22503875986088354, + "grad_norm": 0.10640431940555573, + "learning_rate": 2.3609650215768885e-05, + "loss": 0.0314, + "step": 102040 + }, + { + "epoch": 0.2250608138357817, + "grad_norm": 0.0988059863448143, + "learning_rate": 2.3608296183915764e-05, + "loss": 0.0328, + "step": 102050 + }, + { + "epoch": 0.22508286781067985, + "grad_norm": 0.1007164865732193, + "learning_rate": 2.3606942047463722e-05, + "loss": 0.0318, + "step": 102060 + }, + { + "epoch": 0.22510492178557803, + "grad_norm": 0.12237399071455002, + "learning_rate": 2.3605587806429204e-05, + "loss": 0.0327, + "step": 102070 + }, + { + "epoch": 0.2251269757604762, + "grad_norm": 0.09129608422517776, + "learning_rate": 2.360423346082867e-05, + "loss": 0.0316, + "step": 102080 + }, + { + "epoch": 0.22514902973537435, + "grad_norm": 0.11357727646827698, + "learning_rate": 2.3602879010678575e-05, + "loss": 0.0335, + "step": 102090 + }, + { + "epoch": 0.22517108371027253, + "grad_norm": 0.10254424065351486, + "learning_rate": 2.360152445599538e-05, + "loss": 0.0346, + "step": 102100 + }, + { + "epoch": 0.22519313768517069, + "grad_norm": 0.11089153587818146, + "learning_rate": 2.3600169796795535e-05, + "loss": 0.0314, + "step": 102110 + }, + { + "epoch": 0.22521519166006884, + "grad_norm": 0.10082009434700012, + "learning_rate": 2.3598815033095515e-05, + "loss": 0.0323, + "step": 102120 + }, + { + "epoch": 0.22523724563496703, + "grad_norm": 0.09733545035123825, + "learning_rate": 2.3597460164911768e-05, + "loss": 0.0325, + "step": 102130 + }, + { + "epoch": 0.22525929960986518, + "grad_norm": 0.09901858121156693, + "learning_rate": 2.3596105192260768e-05, + "loss": 0.0334, + "step": 102140 + }, + { + "epoch": 0.22528135358476334, + "grad_norm": 0.10380314290523529, + "learning_rate": 2.3594750115158967e-05, + "loss": 0.0328, + "step": 102150 + }, + { + "epoch": 0.22530340755966152, + "grad_norm": 0.1041189581155777, + "learning_rate": 2.3593394933622846e-05, + "loss": 0.0307, + "step": 102160 + }, + { + "epoch": 0.22532546153455968, + "grad_norm": 0.10010280460119247, + "learning_rate": 2.3592039647668856e-05, + "loss": 0.0315, + "step": 102170 + }, + { + "epoch": 0.22534751550945784, + "grad_norm": 0.12033851444721222, + "learning_rate": 2.3590684257313475e-05, + "loss": 0.0331, + "step": 102180 + }, + { + "epoch": 0.22536956948435602, + "grad_norm": 0.11278395354747772, + "learning_rate": 2.3589328762573166e-05, + "loss": 0.0315, + "step": 102190 + }, + { + "epoch": 0.22539162345925418, + "grad_norm": 0.09195645898580551, + "learning_rate": 2.3587973163464407e-05, + "loss": 0.0326, + "step": 102200 + }, + { + "epoch": 0.22541367743415233, + "grad_norm": 0.13594597578048706, + "learning_rate": 2.3586617460003663e-05, + "loss": 0.0345, + "step": 102210 + }, + { + "epoch": 0.22543573140905052, + "grad_norm": 0.10857740044593811, + "learning_rate": 2.358526165220741e-05, + "loss": 0.0306, + "step": 102220 + }, + { + "epoch": 0.22545778538394867, + "grad_norm": 0.11489304900169373, + "learning_rate": 2.3583905740092124e-05, + "loss": 0.0322, + "step": 102230 + }, + { + "epoch": 0.22547983935884683, + "grad_norm": 0.1365533173084259, + "learning_rate": 2.3582549723674275e-05, + "loss": 0.0321, + "step": 102240 + }, + { + "epoch": 0.225501893333745, + "grad_norm": 0.10693826526403427, + "learning_rate": 2.358119360297035e-05, + "loss": 0.0321, + "step": 102250 + }, + { + "epoch": 0.22552394730864317, + "grad_norm": 0.09949205815792084, + "learning_rate": 2.3579837377996813e-05, + "loss": 0.0343, + "step": 102260 + }, + { + "epoch": 0.22554600128354133, + "grad_norm": 0.11560427397489548, + "learning_rate": 2.357848104877016e-05, + "loss": 0.0308, + "step": 102270 + }, + { + "epoch": 0.2255680552584395, + "grad_norm": 0.12646503746509552, + "learning_rate": 2.3577124615306855e-05, + "loss": 0.0312, + "step": 102280 + }, + { + "epoch": 0.22559010923333767, + "grad_norm": 0.09874618798494339, + "learning_rate": 2.357576807762339e-05, + "loss": 0.0325, + "step": 102290 + }, + { + "epoch": 0.22561216320823585, + "grad_norm": 0.1073242574930191, + "learning_rate": 2.3574411435736248e-05, + "loss": 0.0318, + "step": 102300 + }, + { + "epoch": 0.225634217183134, + "grad_norm": 0.10025430470705032, + "learning_rate": 2.3573054689661907e-05, + "loss": 0.0297, + "step": 102310 + }, + { + "epoch": 0.22565627115803216, + "grad_norm": 0.10307583957910538, + "learning_rate": 2.3571697839416866e-05, + "loss": 0.0323, + "step": 102320 + }, + { + "epoch": 0.22567832513293035, + "grad_norm": 0.12617209553718567, + "learning_rate": 2.3570340885017597e-05, + "loss": 0.0324, + "step": 102330 + }, + { + "epoch": 0.2257003791078285, + "grad_norm": 0.1013433188199997, + "learning_rate": 2.356898382648059e-05, + "loss": 0.0321, + "step": 102340 + }, + { + "epoch": 0.22572243308272666, + "grad_norm": 0.10480498522520065, + "learning_rate": 2.356762666382235e-05, + "loss": 0.0321, + "step": 102350 + }, + { + "epoch": 0.22574448705762484, + "grad_norm": 0.1127684935927391, + "learning_rate": 2.356626939705935e-05, + "loss": 0.0325, + "step": 102360 + }, + { + "epoch": 0.225766541032523, + "grad_norm": 0.1509864330291748, + "learning_rate": 2.356491202620809e-05, + "loss": 0.0321, + "step": 102370 + }, + { + "epoch": 0.22578859500742116, + "grad_norm": 0.08301151543855667, + "learning_rate": 2.356355455128506e-05, + "loss": 0.0298, + "step": 102380 + }, + { + "epoch": 0.22581064898231934, + "grad_norm": 0.12599629163742065, + "learning_rate": 2.356219697230676e-05, + "loss": 0.0316, + "step": 102390 + }, + { + "epoch": 0.2258327029572175, + "grad_norm": 0.10809999704360962, + "learning_rate": 2.3560839289289683e-05, + "loss": 0.0328, + "step": 102400 + }, + { + "epoch": 0.22585475693211565, + "grad_norm": 0.10549362748861313, + "learning_rate": 2.3559481502250322e-05, + "loss": 0.0328, + "step": 102410 + }, + { + "epoch": 0.22587681090701384, + "grad_norm": 0.09212688356637955, + "learning_rate": 2.3558123611205182e-05, + "loss": 0.0302, + "step": 102420 + }, + { + "epoch": 0.225898864881912, + "grad_norm": 0.12066840380430222, + "learning_rate": 2.355676561617076e-05, + "loss": 0.0332, + "step": 102430 + }, + { + "epoch": 0.22592091885681015, + "grad_norm": 0.09951464831829071, + "learning_rate": 2.355540751716356e-05, + "loss": 0.0313, + "step": 102440 + }, + { + "epoch": 0.22594297283170833, + "grad_norm": 0.10801784694194794, + "learning_rate": 2.3554049314200074e-05, + "loss": 0.0344, + "step": 102450 + }, + { + "epoch": 0.2259650268066065, + "grad_norm": 0.10426915436983109, + "learning_rate": 2.355269100729682e-05, + "loss": 0.031, + "step": 102460 + }, + { + "epoch": 0.22598708078150465, + "grad_norm": 0.08763976395130157, + "learning_rate": 2.3551332596470288e-05, + "loss": 0.0315, + "step": 102470 + }, + { + "epoch": 0.22600913475640283, + "grad_norm": 0.12819284200668335, + "learning_rate": 2.3549974081736994e-05, + "loss": 0.0362, + "step": 102480 + }, + { + "epoch": 0.22603118873130099, + "grad_norm": 0.11421319842338562, + "learning_rate": 2.3548615463113446e-05, + "loss": 0.0319, + "step": 102490 + }, + { + "epoch": 0.22605324270619914, + "grad_norm": 0.10761517286300659, + "learning_rate": 2.3547256740616146e-05, + "loss": 0.0324, + "step": 102500 + }, + { + "epoch": 0.22607529668109733, + "grad_norm": 0.0969860777258873, + "learning_rate": 2.3545897914261603e-05, + "loss": 0.0308, + "step": 102510 + }, + { + "epoch": 0.22609735065599548, + "grad_norm": 0.08629832416772842, + "learning_rate": 2.3544538984066336e-05, + "loss": 0.0313, + "step": 102520 + }, + { + "epoch": 0.22611940463089364, + "grad_norm": 0.149607852101326, + "learning_rate": 2.3543179950046856e-05, + "loss": 0.0319, + "step": 102530 + }, + { + "epoch": 0.22614145860579182, + "grad_norm": 0.14429238438606262, + "learning_rate": 2.3541820812219666e-05, + "loss": 0.0303, + "step": 102540 + }, + { + "epoch": 0.22616351258068998, + "grad_norm": 0.11006046086549759, + "learning_rate": 2.354046157060129e-05, + "loss": 0.0337, + "step": 102550 + }, + { + "epoch": 0.22618556655558814, + "grad_norm": 0.11784235388040543, + "learning_rate": 2.3539102225208244e-05, + "loss": 0.032, + "step": 102560 + }, + { + "epoch": 0.22620762053048632, + "grad_norm": 0.11890636384487152, + "learning_rate": 2.3537742776057046e-05, + "loss": 0.0316, + "step": 102570 + }, + { + "epoch": 0.22622967450538448, + "grad_norm": 0.11460389941930771, + "learning_rate": 2.3536383223164207e-05, + "loss": 0.0304, + "step": 102580 + }, + { + "epoch": 0.22625172848028263, + "grad_norm": 0.11191119998693466, + "learning_rate": 2.353502356654626e-05, + "loss": 0.0338, + "step": 102590 + }, + { + "epoch": 0.22627378245518082, + "grad_norm": 0.1210194006562233, + "learning_rate": 2.3533663806219715e-05, + "loss": 0.0319, + "step": 102600 + }, + { + "epoch": 0.22629583643007897, + "grad_norm": 0.10785462707281113, + "learning_rate": 2.3532303942201093e-05, + "loss": 0.0313, + "step": 102610 + }, + { + "epoch": 0.22631789040497713, + "grad_norm": 0.09825388342142105, + "learning_rate": 2.353094397450693e-05, + "loss": 0.0345, + "step": 102620 + }, + { + "epoch": 0.2263399443798753, + "grad_norm": 0.1243959590792656, + "learning_rate": 2.3529583903153735e-05, + "loss": 0.0299, + "step": 102630 + }, + { + "epoch": 0.22636199835477347, + "grad_norm": 0.08383002877235413, + "learning_rate": 2.3528223728158045e-05, + "loss": 0.0308, + "step": 102640 + }, + { + "epoch": 0.22638405232967163, + "grad_norm": 0.11086123436689377, + "learning_rate": 2.3526863449536386e-05, + "loss": 0.031, + "step": 102650 + }, + { + "epoch": 0.2264061063045698, + "grad_norm": 0.10869248956441879, + "learning_rate": 2.3525503067305287e-05, + "loss": 0.0314, + "step": 102660 + }, + { + "epoch": 0.22642816027946797, + "grad_norm": 0.09690830856561661, + "learning_rate": 2.3524142581481276e-05, + "loss": 0.0316, + "step": 102670 + }, + { + "epoch": 0.22645021425436612, + "grad_norm": 0.12091946601867676, + "learning_rate": 2.3522781992080884e-05, + "loss": 0.0324, + "step": 102680 + }, + { + "epoch": 0.2264722682292643, + "grad_norm": 0.12429416924715042, + "learning_rate": 2.3521421299120645e-05, + "loss": 0.0334, + "step": 102690 + }, + { + "epoch": 0.22649432220416246, + "grad_norm": 0.13670945167541504, + "learning_rate": 2.3520060502617085e-05, + "loss": 0.0305, + "step": 102700 + }, + { + "epoch": 0.22651637617906065, + "grad_norm": 0.15082839131355286, + "learning_rate": 2.351869960258675e-05, + "loss": 0.0334, + "step": 102710 + }, + { + "epoch": 0.2265384301539588, + "grad_norm": 0.11749490350484848, + "learning_rate": 2.3517338599046173e-05, + "loss": 0.0312, + "step": 102720 + }, + { + "epoch": 0.22656048412885696, + "grad_norm": 0.10665799677371979, + "learning_rate": 2.3515977492011888e-05, + "loss": 0.0323, + "step": 102730 + }, + { + "epoch": 0.22658253810375514, + "grad_norm": 0.10827649384737015, + "learning_rate": 2.351461628150044e-05, + "loss": 0.0301, + "step": 102740 + }, + { + "epoch": 0.2266045920786533, + "grad_norm": 0.10691743344068527, + "learning_rate": 2.3513254967528363e-05, + "loss": 0.0329, + "step": 102750 + }, + { + "epoch": 0.22662664605355146, + "grad_norm": 0.10044308751821518, + "learning_rate": 2.3511893550112197e-05, + "loss": 0.0317, + "step": 102760 + }, + { + "epoch": 0.22664870002844964, + "grad_norm": 0.0795537680387497, + "learning_rate": 2.3510532029268494e-05, + "loss": 0.0321, + "step": 102770 + }, + { + "epoch": 0.2266707540033478, + "grad_norm": 0.12200673669576645, + "learning_rate": 2.3509170405013787e-05, + "loss": 0.0304, + "step": 102780 + }, + { + "epoch": 0.22669280797824595, + "grad_norm": 0.10103403031826019, + "learning_rate": 2.350780867736463e-05, + "loss": 0.0318, + "step": 102790 + }, + { + "epoch": 0.22671486195314414, + "grad_norm": 0.13623569905757904, + "learning_rate": 2.350644684633756e-05, + "loss": 0.0323, + "step": 102800 + }, + { + "epoch": 0.2267369159280423, + "grad_norm": 0.10481250286102295, + "learning_rate": 2.3505084911949128e-05, + "loss": 0.0305, + "step": 102810 + }, + { + "epoch": 0.22675896990294045, + "grad_norm": 0.11639219522476196, + "learning_rate": 2.3503722874215888e-05, + "loss": 0.032, + "step": 102820 + }, + { + "epoch": 0.22678102387783863, + "grad_norm": 0.10651643574237823, + "learning_rate": 2.3502360733154383e-05, + "loss": 0.0321, + "step": 102830 + }, + { + "epoch": 0.2268030778527368, + "grad_norm": 0.08265916258096695, + "learning_rate": 2.3500998488781174e-05, + "loss": 0.0308, + "step": 102840 + }, + { + "epoch": 0.22682513182763495, + "grad_norm": 0.11577749252319336, + "learning_rate": 2.3499636141112802e-05, + "loss": 0.0335, + "step": 102850 + }, + { + "epoch": 0.22684718580253313, + "grad_norm": 0.10847334563732147, + "learning_rate": 2.3498273690165828e-05, + "loss": 0.0342, + "step": 102860 + }, + { + "epoch": 0.2268692397774313, + "grad_norm": 0.1071823239326477, + "learning_rate": 2.3496911135956806e-05, + "loss": 0.0321, + "step": 102870 + }, + { + "epoch": 0.22689129375232944, + "grad_norm": 0.08947183191776276, + "learning_rate": 2.3495548478502285e-05, + "loss": 0.0312, + "step": 102880 + }, + { + "epoch": 0.22691334772722763, + "grad_norm": 0.11572843790054321, + "learning_rate": 2.3494185717818835e-05, + "loss": 0.0331, + "step": 102890 + }, + { + "epoch": 0.22693540170212578, + "grad_norm": 0.09324736893177032, + "learning_rate": 2.3492822853923005e-05, + "loss": 0.0331, + "step": 102900 + }, + { + "epoch": 0.22695745567702394, + "grad_norm": 0.08937633782625198, + "learning_rate": 2.349145988683136e-05, + "loss": 0.0333, + "step": 102910 + }, + { + "epoch": 0.22697950965192212, + "grad_norm": 0.10626555234193802, + "learning_rate": 2.3490096816560466e-05, + "loss": 0.031, + "step": 102920 + }, + { + "epoch": 0.22700156362682028, + "grad_norm": 0.11985796689987183, + "learning_rate": 2.3488733643126873e-05, + "loss": 0.0325, + "step": 102930 + }, + { + "epoch": 0.22702361760171844, + "grad_norm": 0.10438688844442368, + "learning_rate": 2.348737036654715e-05, + "loss": 0.0336, + "step": 102940 + }, + { + "epoch": 0.22704567157661662, + "grad_norm": 0.11187278479337692, + "learning_rate": 2.3486006986837868e-05, + "loss": 0.0309, + "step": 102950 + }, + { + "epoch": 0.22706772555151478, + "grad_norm": 0.13809309899806976, + "learning_rate": 2.3484643504015588e-05, + "loss": 0.0327, + "step": 102960 + }, + { + "epoch": 0.22708977952641293, + "grad_norm": 0.12180900573730469, + "learning_rate": 2.3483279918096875e-05, + "loss": 0.031, + "step": 102970 + }, + { + "epoch": 0.22711183350131112, + "grad_norm": 0.12891839444637299, + "learning_rate": 2.348191622909831e-05, + "loss": 0.0323, + "step": 102980 + }, + { + "epoch": 0.22713388747620927, + "grad_norm": 0.12139058858156204, + "learning_rate": 2.348055243703645e-05, + "loss": 0.03, + "step": 102990 + }, + { + "epoch": 0.22715594145110743, + "grad_norm": 0.1243610605597496, + "learning_rate": 2.347918854192787e-05, + "loss": 0.0339, + "step": 103000 + }, + { + "epoch": 0.2271779954260056, + "grad_norm": 0.11960775405168533, + "learning_rate": 2.347782454378914e-05, + "loss": 0.0334, + "step": 103010 + }, + { + "epoch": 0.22720004940090377, + "grad_norm": 0.13471488654613495, + "learning_rate": 2.3476460442636845e-05, + "loss": 0.0325, + "step": 103020 + }, + { + "epoch": 0.22722210337580193, + "grad_norm": 0.10465741157531738, + "learning_rate": 2.3475096238487545e-05, + "loss": 0.031, + "step": 103030 + }, + { + "epoch": 0.2272441573507001, + "grad_norm": 0.12654457986354828, + "learning_rate": 2.347373193135783e-05, + "loss": 0.0316, + "step": 103040 + }, + { + "epoch": 0.22726621132559827, + "grad_norm": 0.11668692529201508, + "learning_rate": 2.3472367521264268e-05, + "loss": 0.0333, + "step": 103050 + }, + { + "epoch": 0.22728826530049642, + "grad_norm": 0.13223616778850555, + "learning_rate": 2.3471003008223444e-05, + "loss": 0.0311, + "step": 103060 + }, + { + "epoch": 0.2273103192753946, + "grad_norm": 0.08319901674985886, + "learning_rate": 2.3469638392251932e-05, + "loss": 0.0306, + "step": 103070 + }, + { + "epoch": 0.22733237325029276, + "grad_norm": 0.10251093655824661, + "learning_rate": 2.346827367336632e-05, + "loss": 0.0328, + "step": 103080 + }, + { + "epoch": 0.22735442722519092, + "grad_norm": 0.1255602240562439, + "learning_rate": 2.3466908851583187e-05, + "loss": 0.0309, + "step": 103090 + }, + { + "epoch": 0.2273764812000891, + "grad_norm": 0.13912054896354675, + "learning_rate": 2.3465543926919113e-05, + "loss": 0.0316, + "step": 103100 + }, + { + "epoch": 0.22739853517498726, + "grad_norm": 0.09125299006700516, + "learning_rate": 2.346417889939069e-05, + "loss": 0.0318, + "step": 103110 + }, + { + "epoch": 0.22742058914988542, + "grad_norm": 0.09733790159225464, + "learning_rate": 2.3462813769014505e-05, + "loss": 0.0318, + "step": 103120 + }, + { + "epoch": 0.2274426431247836, + "grad_norm": 0.10589028149843216, + "learning_rate": 2.346144853580714e-05, + "loss": 0.0323, + "step": 103130 + }, + { + "epoch": 0.22746469709968176, + "grad_norm": 0.09724689275026321, + "learning_rate": 2.346008319978519e-05, + "loss": 0.0326, + "step": 103140 + }, + { + "epoch": 0.22748675107457994, + "grad_norm": 0.11398717015981674, + "learning_rate": 2.3458717760965238e-05, + "loss": 0.0319, + "step": 103150 + }, + { + "epoch": 0.2275088050494781, + "grad_norm": 0.103585384786129, + "learning_rate": 2.3457352219363877e-05, + "loss": 0.0323, + "step": 103160 + }, + { + "epoch": 0.22753085902437625, + "grad_norm": 0.10029201954603195, + "learning_rate": 2.3455986574997707e-05, + "loss": 0.0323, + "step": 103170 + }, + { + "epoch": 0.22755291299927444, + "grad_norm": 0.11559555679559708, + "learning_rate": 2.345462082788331e-05, + "loss": 0.0328, + "step": 103180 + }, + { + "epoch": 0.2275749669741726, + "grad_norm": 0.09301058202981949, + "learning_rate": 2.3453254978037294e-05, + "loss": 0.033, + "step": 103190 + }, + { + "epoch": 0.22759702094907075, + "grad_norm": 0.10532861202955246, + "learning_rate": 2.345188902547624e-05, + "loss": 0.0338, + "step": 103200 + }, + { + "epoch": 0.22761907492396893, + "grad_norm": 0.12461312860250473, + "learning_rate": 2.345052297021676e-05, + "loss": 0.0303, + "step": 103210 + }, + { + "epoch": 0.2276411288988671, + "grad_norm": 0.10990775376558304, + "learning_rate": 2.3449156812275447e-05, + "loss": 0.0323, + "step": 103220 + }, + { + "epoch": 0.22766318287376525, + "grad_norm": 0.09700795263051987, + "learning_rate": 2.3447790551668905e-05, + "loss": 0.032, + "step": 103230 + }, + { + "epoch": 0.22768523684866343, + "grad_norm": 0.08606813102960587, + "learning_rate": 2.3446424188413723e-05, + "loss": 0.0301, + "step": 103240 + }, + { + "epoch": 0.2277072908235616, + "grad_norm": 0.11103098839521408, + "learning_rate": 2.3445057722526523e-05, + "loss": 0.0309, + "step": 103250 + }, + { + "epoch": 0.22772934479845974, + "grad_norm": 0.09590814262628555, + "learning_rate": 2.344369115402389e-05, + "loss": 0.0319, + "step": 103260 + }, + { + "epoch": 0.22775139877335793, + "grad_norm": 0.11603248864412308, + "learning_rate": 2.344232448292244e-05, + "loss": 0.0316, + "step": 103270 + }, + { + "epoch": 0.22777345274825608, + "grad_norm": 0.12982676923274994, + "learning_rate": 2.3440957709238776e-05, + "loss": 0.0302, + "step": 103280 + }, + { + "epoch": 0.22779550672315424, + "grad_norm": 0.09872674942016602, + "learning_rate": 2.3439590832989508e-05, + "loss": 0.0323, + "step": 103290 + }, + { + "epoch": 0.22781756069805242, + "grad_norm": 0.1073322519659996, + "learning_rate": 2.343822385419124e-05, + "loss": 0.0334, + "step": 103300 + }, + { + "epoch": 0.22783961467295058, + "grad_norm": 0.10882237553596497, + "learning_rate": 2.343685677286059e-05, + "loss": 0.0325, + "step": 103310 + }, + { + "epoch": 0.22786166864784874, + "grad_norm": 0.12886016070842743, + "learning_rate": 2.343548958901416e-05, + "loss": 0.0318, + "step": 103320 + }, + { + "epoch": 0.22788372262274692, + "grad_norm": 0.0952286422252655, + "learning_rate": 2.343412230266857e-05, + "loss": 0.0321, + "step": 103330 + }, + { + "epoch": 0.22790577659764508, + "grad_norm": 0.11815380305051804, + "learning_rate": 2.343275491384043e-05, + "loss": 0.0313, + "step": 103340 + }, + { + "epoch": 0.22792783057254323, + "grad_norm": 0.11594989895820618, + "learning_rate": 2.3431387422546355e-05, + "loss": 0.0308, + "step": 103350 + }, + { + "epoch": 0.22794988454744142, + "grad_norm": 0.11828982830047607, + "learning_rate": 2.343001982880296e-05, + "loss": 0.0349, + "step": 103360 + }, + { + "epoch": 0.22797193852233957, + "grad_norm": 0.10738903284072876, + "learning_rate": 2.342865213262687e-05, + "loss": 0.0317, + "step": 103370 + }, + { + "epoch": 0.22799399249723773, + "grad_norm": 0.12346848100423813, + "learning_rate": 2.3427284334034695e-05, + "loss": 0.032, + "step": 103380 + }, + { + "epoch": 0.2280160464721359, + "grad_norm": 0.11955475062131882, + "learning_rate": 2.342591643304306e-05, + "loss": 0.0308, + "step": 103390 + }, + { + "epoch": 0.22803810044703407, + "grad_norm": 0.14933589100837708, + "learning_rate": 2.3424548429668586e-05, + "loss": 0.0323, + "step": 103400 + }, + { + "epoch": 0.22806015442193223, + "grad_norm": 0.08445478975772858, + "learning_rate": 2.3423180323927894e-05, + "loss": 0.0316, + "step": 103410 + }, + { + "epoch": 0.2280822083968304, + "grad_norm": 0.0888948142528534, + "learning_rate": 2.342181211583761e-05, + "loss": 0.0321, + "step": 103420 + }, + { + "epoch": 0.22810426237172857, + "grad_norm": 0.10994972288608551, + "learning_rate": 2.342044380541435e-05, + "loss": 0.0303, + "step": 103430 + }, + { + "epoch": 0.22812631634662672, + "grad_norm": 0.11647293716669083, + "learning_rate": 2.3419075392674756e-05, + "loss": 0.0312, + "step": 103440 + }, + { + "epoch": 0.2281483703215249, + "grad_norm": 0.10272403806447983, + "learning_rate": 2.3417706877635442e-05, + "loss": 0.0322, + "step": 103450 + }, + { + "epoch": 0.22817042429642306, + "grad_norm": 0.09626191109418869, + "learning_rate": 2.3416338260313047e-05, + "loss": 0.0303, + "step": 103460 + }, + { + "epoch": 0.22819247827132122, + "grad_norm": 0.10025914013385773, + "learning_rate": 2.3414969540724187e-05, + "loss": 0.0318, + "step": 103470 + }, + { + "epoch": 0.2282145322462194, + "grad_norm": 0.09897471964359283, + "learning_rate": 2.3413600718885508e-05, + "loss": 0.0328, + "step": 103480 + }, + { + "epoch": 0.22823658622111756, + "grad_norm": 0.1261129230260849, + "learning_rate": 2.3412231794813638e-05, + "loss": 0.0324, + "step": 103490 + }, + { + "epoch": 0.22825864019601572, + "grad_norm": 0.10520262271165848, + "learning_rate": 2.3410862768525206e-05, + "loss": 0.0317, + "step": 103500 + }, + { + "epoch": 0.2282806941709139, + "grad_norm": 0.09506290405988693, + "learning_rate": 2.340949364003685e-05, + "loss": 0.0333, + "step": 103510 + }, + { + "epoch": 0.22830274814581206, + "grad_norm": 0.11301840841770172, + "learning_rate": 2.340812440936521e-05, + "loss": 0.033, + "step": 103520 + }, + { + "epoch": 0.2283248021207102, + "grad_norm": 0.12225756794214249, + "learning_rate": 2.3406755076526914e-05, + "loss": 0.0321, + "step": 103530 + }, + { + "epoch": 0.2283468560956084, + "grad_norm": 0.10653489083051682, + "learning_rate": 2.340538564153861e-05, + "loss": 0.031, + "step": 103540 + }, + { + "epoch": 0.22836891007050655, + "grad_norm": 0.10154864192008972, + "learning_rate": 2.3404016104416935e-05, + "loss": 0.0348, + "step": 103550 + }, + { + "epoch": 0.22839096404540474, + "grad_norm": 0.11943327635526657, + "learning_rate": 2.340264646517853e-05, + "loss": 0.0322, + "step": 103560 + }, + { + "epoch": 0.2284130180203029, + "grad_norm": 0.12316185981035233, + "learning_rate": 2.340127672384004e-05, + "loss": 0.0326, + "step": 103570 + }, + { + "epoch": 0.22843507199520105, + "grad_norm": 0.12098555266857147, + "learning_rate": 2.3399906880418096e-05, + "loss": 0.0312, + "step": 103580 + }, + { + "epoch": 0.22845712597009923, + "grad_norm": 0.1053587794303894, + "learning_rate": 2.339853693492936e-05, + "loss": 0.0324, + "step": 103590 + }, + { + "epoch": 0.2284791799449974, + "grad_norm": 0.13293138146400452, + "learning_rate": 2.339716688739047e-05, + "loss": 0.0319, + "step": 103600 + }, + { + "epoch": 0.22850123391989555, + "grad_norm": 0.11991850286722183, + "learning_rate": 2.339579673781807e-05, + "loss": 0.0324, + "step": 103610 + }, + { + "epoch": 0.22852328789479373, + "grad_norm": 0.08947274088859558, + "learning_rate": 2.339442648622882e-05, + "loss": 0.0326, + "step": 103620 + }, + { + "epoch": 0.2285453418696919, + "grad_norm": 0.13561466336250305, + "learning_rate": 2.3393056132639353e-05, + "loss": 0.0331, + "step": 103630 + }, + { + "epoch": 0.22856739584459004, + "grad_norm": 0.12988802790641785, + "learning_rate": 2.3391685677066338e-05, + "loss": 0.0334, + "step": 103640 + }, + { + "epoch": 0.22858944981948823, + "grad_norm": 0.10775826871395111, + "learning_rate": 2.3390315119526415e-05, + "loss": 0.031, + "step": 103650 + }, + { + "epoch": 0.22861150379438638, + "grad_norm": 0.11424358189105988, + "learning_rate": 2.338894446003624e-05, + "loss": 0.0346, + "step": 103660 + }, + { + "epoch": 0.22863355776928454, + "grad_norm": 0.11725933104753494, + "learning_rate": 2.3387573698612476e-05, + "loss": 0.0318, + "step": 103670 + }, + { + "epoch": 0.22865561174418272, + "grad_norm": 0.09732314199209213, + "learning_rate": 2.3386202835271768e-05, + "loss": 0.0309, + "step": 103680 + }, + { + "epoch": 0.22867766571908088, + "grad_norm": 0.1040511429309845, + "learning_rate": 2.3384831870030778e-05, + "loss": 0.0312, + "step": 103690 + }, + { + "epoch": 0.22869971969397904, + "grad_norm": 0.11239119619131088, + "learning_rate": 2.3383460802906164e-05, + "loss": 0.0339, + "step": 103700 + }, + { + "epoch": 0.22872177366887722, + "grad_norm": 0.10194582492113113, + "learning_rate": 2.3382089633914583e-05, + "loss": 0.0317, + "step": 103710 + }, + { + "epoch": 0.22874382764377538, + "grad_norm": 0.13273385167121887, + "learning_rate": 2.3380718363072704e-05, + "loss": 0.0314, + "step": 103720 + }, + { + "epoch": 0.22876588161867353, + "grad_norm": 0.1168365404009819, + "learning_rate": 2.3379346990397177e-05, + "loss": 0.0324, + "step": 103730 + }, + { + "epoch": 0.22878793559357172, + "grad_norm": 0.10657237470149994, + "learning_rate": 2.3377975515904682e-05, + "loss": 0.0322, + "step": 103740 + }, + { + "epoch": 0.22880998956846987, + "grad_norm": 0.09964528679847717, + "learning_rate": 2.337660393961187e-05, + "loss": 0.0326, + "step": 103750 + }, + { + "epoch": 0.22883204354336803, + "grad_norm": 0.09375476092100143, + "learning_rate": 2.3375232261535412e-05, + "loss": 0.0326, + "step": 103760 + }, + { + "epoch": 0.22885409751826621, + "grad_norm": 0.10694378614425659, + "learning_rate": 2.337386048169197e-05, + "loss": 0.0327, + "step": 103770 + }, + { + "epoch": 0.22887615149316437, + "grad_norm": 0.10051742196083069, + "learning_rate": 2.3372488600098226e-05, + "loss": 0.0311, + "step": 103780 + }, + { + "epoch": 0.22889820546806253, + "grad_norm": 0.11554409563541412, + "learning_rate": 2.3371116616770833e-05, + "loss": 0.0317, + "step": 103790 + }, + { + "epoch": 0.2289202594429607, + "grad_norm": 0.08797440677881241, + "learning_rate": 2.3369744531726472e-05, + "loss": 0.0312, + "step": 103800 + }, + { + "epoch": 0.22894231341785887, + "grad_norm": 0.0910504087805748, + "learning_rate": 2.3368372344981813e-05, + "loss": 0.0325, + "step": 103810 + }, + { + "epoch": 0.22896436739275702, + "grad_norm": 0.09760827571153641, + "learning_rate": 2.3367000056553525e-05, + "loss": 0.0321, + "step": 103820 + }, + { + "epoch": 0.2289864213676552, + "grad_norm": 0.11430611461400986, + "learning_rate": 2.3365627666458292e-05, + "loss": 0.031, + "step": 103830 + }, + { + "epoch": 0.22900847534255336, + "grad_norm": 0.11541430652141571, + "learning_rate": 2.3364255174712783e-05, + "loss": 0.0304, + "step": 103840 + }, + { + "epoch": 0.22903052931745152, + "grad_norm": 0.10864097625017166, + "learning_rate": 2.3362882581333674e-05, + "loss": 0.0321, + "step": 103850 + }, + { + "epoch": 0.2290525832923497, + "grad_norm": 0.10409293323755264, + "learning_rate": 2.3361509886337652e-05, + "loss": 0.0318, + "step": 103860 + }, + { + "epoch": 0.22907463726724786, + "grad_norm": 0.11196743696928024, + "learning_rate": 2.3360137089741384e-05, + "loss": 0.0304, + "step": 103870 + }, + { + "epoch": 0.22909669124214602, + "grad_norm": 0.12729798257350922, + "learning_rate": 2.335876419156156e-05, + "loss": 0.0311, + "step": 103880 + }, + { + "epoch": 0.2291187452170442, + "grad_norm": 0.09710779786109924, + "learning_rate": 2.3357391191814853e-05, + "loss": 0.0338, + "step": 103890 + }, + { + "epoch": 0.22914079919194236, + "grad_norm": 0.10761500149965286, + "learning_rate": 2.335601809051796e-05, + "loss": 0.0329, + "step": 103900 + }, + { + "epoch": 0.2291628531668405, + "grad_norm": 0.1082330197095871, + "learning_rate": 2.3354644887687553e-05, + "loss": 0.0334, + "step": 103910 + }, + { + "epoch": 0.2291849071417387, + "grad_norm": 0.11569913476705551, + "learning_rate": 2.3353271583340328e-05, + "loss": 0.0334, + "step": 103920 + }, + { + "epoch": 0.22920696111663685, + "grad_norm": 0.08596152067184448, + "learning_rate": 2.335189817749296e-05, + "loss": 0.0303, + "step": 103930 + }, + { + "epoch": 0.229229015091535, + "grad_norm": 0.11833903938531876, + "learning_rate": 2.335052467016215e-05, + "loss": 0.0313, + "step": 103940 + }, + { + "epoch": 0.2292510690664332, + "grad_norm": 0.08288104832172394, + "learning_rate": 2.334915106136458e-05, + "loss": 0.0321, + "step": 103950 + }, + { + "epoch": 0.22927312304133135, + "grad_norm": 0.08924636244773865, + "learning_rate": 2.3347777351116936e-05, + "loss": 0.0314, + "step": 103960 + }, + { + "epoch": 0.2292951770162295, + "grad_norm": 0.11896565556526184, + "learning_rate": 2.334640353943592e-05, + "loss": 0.0312, + "step": 103970 + }, + { + "epoch": 0.2293172309911277, + "grad_norm": 0.09512006491422653, + "learning_rate": 2.334502962633822e-05, + "loss": 0.0308, + "step": 103980 + }, + { + "epoch": 0.22933928496602585, + "grad_norm": 0.1443399041891098, + "learning_rate": 2.334365561184053e-05, + "loss": 0.0307, + "step": 103990 + }, + { + "epoch": 0.22936133894092403, + "grad_norm": 0.10800939053297043, + "learning_rate": 2.3342281495959546e-05, + "loss": 0.0334, + "step": 104000 + }, + { + "epoch": 0.2293833929158222, + "grad_norm": 0.11542409658432007, + "learning_rate": 2.3340907278711968e-05, + "loss": 0.0311, + "step": 104010 + }, + { + "epoch": 0.22940544689072034, + "grad_norm": 0.129564568400383, + "learning_rate": 2.333953296011449e-05, + "loss": 0.0314, + "step": 104020 + }, + { + "epoch": 0.22942750086561853, + "grad_norm": 0.1152811124920845, + "learning_rate": 2.333815854018381e-05, + "loss": 0.0317, + "step": 104030 + }, + { + "epoch": 0.22944955484051668, + "grad_norm": 0.09936832636594772, + "learning_rate": 2.3336784018936634e-05, + "loss": 0.0327, + "step": 104040 + }, + { + "epoch": 0.22947160881541484, + "grad_norm": 0.1321345567703247, + "learning_rate": 2.3335409396389666e-05, + "loss": 0.0328, + "step": 104050 + }, + { + "epoch": 0.22949366279031302, + "grad_norm": 0.09583045542240143, + "learning_rate": 2.33340346725596e-05, + "loss": 0.0307, + "step": 104060 + }, + { + "epoch": 0.22951571676521118, + "grad_norm": 0.12063658982515335, + "learning_rate": 2.3332659847463137e-05, + "loss": 0.0317, + "step": 104070 + }, + { + "epoch": 0.22953777074010934, + "grad_norm": 0.11279677599668503, + "learning_rate": 2.3331284921117e-05, + "loss": 0.0336, + "step": 104080 + }, + { + "epoch": 0.22955982471500752, + "grad_norm": 0.12010116130113602, + "learning_rate": 2.3329909893537878e-05, + "loss": 0.0317, + "step": 104090 + }, + { + "epoch": 0.22958187868990568, + "grad_norm": 0.10074403136968613, + "learning_rate": 2.3328534764742492e-05, + "loss": 0.0346, + "step": 104100 + }, + { + "epoch": 0.22960393266480383, + "grad_norm": 0.11670207232236862, + "learning_rate": 2.3327159534747543e-05, + "loss": 0.0308, + "step": 104110 + }, + { + "epoch": 0.22962598663970202, + "grad_norm": 0.1079656332731247, + "learning_rate": 2.332578420356974e-05, + "loss": 0.0309, + "step": 104120 + }, + { + "epoch": 0.22964804061460017, + "grad_norm": 0.12194086611270905, + "learning_rate": 2.3324408771225804e-05, + "loss": 0.0312, + "step": 104130 + }, + { + "epoch": 0.22967009458949833, + "grad_norm": 0.08846980333328247, + "learning_rate": 2.3323033237732435e-05, + "loss": 0.0301, + "step": 104140 + }, + { + "epoch": 0.22969214856439651, + "grad_norm": 0.11187563836574554, + "learning_rate": 2.332165760310636e-05, + "loss": 0.0314, + "step": 104150 + }, + { + "epoch": 0.22971420253929467, + "grad_norm": 0.12022565305233002, + "learning_rate": 2.3320281867364285e-05, + "loss": 0.0319, + "step": 104160 + }, + { + "epoch": 0.22973625651419283, + "grad_norm": 0.13779887557029724, + "learning_rate": 2.331890603052293e-05, + "loss": 0.0318, + "step": 104170 + }, + { + "epoch": 0.229758310489091, + "grad_norm": 0.13024893403053284, + "learning_rate": 2.3317530092599014e-05, + "loss": 0.0323, + "step": 104180 + }, + { + "epoch": 0.22978036446398917, + "grad_norm": 0.09140783548355103, + "learning_rate": 2.3316154053609254e-05, + "loss": 0.031, + "step": 104190 + }, + { + "epoch": 0.22980241843888732, + "grad_norm": 0.126845583319664, + "learning_rate": 2.3314777913570363e-05, + "loss": 0.0336, + "step": 104200 + }, + { + "epoch": 0.2298244724137855, + "grad_norm": 0.11175443977117538, + "learning_rate": 2.331340167249908e-05, + "loss": 0.0334, + "step": 104210 + }, + { + "epoch": 0.22984652638868366, + "grad_norm": 0.09797593206167221, + "learning_rate": 2.331202533041211e-05, + "loss": 0.0312, + "step": 104220 + }, + { + "epoch": 0.22986858036358182, + "grad_norm": 0.11527755111455917, + "learning_rate": 2.3310648887326194e-05, + "loss": 0.0329, + "step": 104230 + }, + { + "epoch": 0.22989063433848, + "grad_norm": 0.10779409110546112, + "learning_rate": 2.330927234325804e-05, + "loss": 0.03, + "step": 104240 + }, + { + "epoch": 0.22991268831337816, + "grad_norm": 0.09317067265510559, + "learning_rate": 2.330789569822438e-05, + "loss": 0.0327, + "step": 104250 + }, + { + "epoch": 0.22993474228827632, + "grad_norm": 0.13248369097709656, + "learning_rate": 2.330651895224195e-05, + "loss": 0.0321, + "step": 104260 + }, + { + "epoch": 0.2299567962631745, + "grad_norm": 0.13156233727931976, + "learning_rate": 2.3305142105327466e-05, + "loss": 0.0316, + "step": 104270 + }, + { + "epoch": 0.22997885023807266, + "grad_norm": 0.13049781322479248, + "learning_rate": 2.330376515749767e-05, + "loss": 0.0323, + "step": 104280 + }, + { + "epoch": 0.2300009042129708, + "grad_norm": 0.09261825680732727, + "learning_rate": 2.3302388108769286e-05, + "loss": 0.0299, + "step": 104290 + }, + { + "epoch": 0.230022958187869, + "grad_norm": 0.09566082805395126, + "learning_rate": 2.3301010959159045e-05, + "loss": 0.0299, + "step": 104300 + }, + { + "epoch": 0.23004501216276715, + "grad_norm": 0.10632769763469696, + "learning_rate": 2.3299633708683685e-05, + "loss": 0.0336, + "step": 104310 + }, + { + "epoch": 0.2300670661376653, + "grad_norm": 0.10214035958051682, + "learning_rate": 2.3298256357359938e-05, + "loss": 0.0326, + "step": 104320 + }, + { + "epoch": 0.2300891201125635, + "grad_norm": 0.09315048158168793, + "learning_rate": 2.329687890520454e-05, + "loss": 0.0318, + "step": 104330 + }, + { + "epoch": 0.23011117408746165, + "grad_norm": 0.12029853463172913, + "learning_rate": 2.3295501352234233e-05, + "loss": 0.0333, + "step": 104340 + }, + { + "epoch": 0.2301332280623598, + "grad_norm": 0.09533173590898514, + "learning_rate": 2.329412369846575e-05, + "loss": 0.0307, + "step": 104350 + }, + { + "epoch": 0.230155282037258, + "grad_norm": 0.12400448322296143, + "learning_rate": 2.3292745943915835e-05, + "loss": 0.0331, + "step": 104360 + }, + { + "epoch": 0.23017733601215615, + "grad_norm": 0.11435138434171677, + "learning_rate": 2.3291368088601226e-05, + "loss": 0.033, + "step": 104370 + }, + { + "epoch": 0.2301993899870543, + "grad_norm": 0.0951264500617981, + "learning_rate": 2.328999013253867e-05, + "loss": 0.0319, + "step": 104380 + }, + { + "epoch": 0.2302214439619525, + "grad_norm": 0.09969854354858398, + "learning_rate": 2.32886120757449e-05, + "loss": 0.0319, + "step": 104390 + }, + { + "epoch": 0.23024349793685064, + "grad_norm": 0.14681211113929749, + "learning_rate": 2.3287233918236675e-05, + "loss": 0.0326, + "step": 104400 + }, + { + "epoch": 0.2302655519117488, + "grad_norm": 0.15061065554618835, + "learning_rate": 2.3285855660030723e-05, + "loss": 0.0307, + "step": 104410 + }, + { + "epoch": 0.23028760588664698, + "grad_norm": 0.12509968876838684, + "learning_rate": 2.3284477301143812e-05, + "loss": 0.0334, + "step": 104420 + }, + { + "epoch": 0.23030965986154514, + "grad_norm": 0.10674677044153214, + "learning_rate": 2.3283098841592673e-05, + "loss": 0.0323, + "step": 104430 + }, + { + "epoch": 0.23033171383644332, + "grad_norm": 0.13026876747608185, + "learning_rate": 2.328172028139407e-05, + "loss": 0.0338, + "step": 104440 + }, + { + "epoch": 0.23035376781134148, + "grad_norm": 0.10595429688692093, + "learning_rate": 2.3280341620564743e-05, + "loss": 0.0316, + "step": 104450 + }, + { + "epoch": 0.23037582178623964, + "grad_norm": 0.11591994017362595, + "learning_rate": 2.3278962859121444e-05, + "loss": 0.0317, + "step": 104460 + }, + { + "epoch": 0.23039787576113782, + "grad_norm": 0.10807543247938156, + "learning_rate": 2.3277583997080937e-05, + "loss": 0.033, + "step": 104470 + }, + { + "epoch": 0.23041992973603598, + "grad_norm": 0.12967267632484436, + "learning_rate": 2.327620503445996e-05, + "loss": 0.0323, + "step": 104480 + }, + { + "epoch": 0.23044198371093413, + "grad_norm": 0.12215133011341095, + "learning_rate": 2.3274825971275284e-05, + "loss": 0.0315, + "step": 104490 + }, + { + "epoch": 0.23046403768583232, + "grad_norm": 0.192729189991951, + "learning_rate": 2.3273446807543654e-05, + "loss": 0.0321, + "step": 104500 + }, + { + "epoch": 0.23048609166073047, + "grad_norm": 0.11337494850158691, + "learning_rate": 2.327206754328184e-05, + "loss": 0.0316, + "step": 104510 + }, + { + "epoch": 0.23050814563562863, + "grad_norm": 0.1592177003622055, + "learning_rate": 2.327068817850659e-05, + "loss": 0.0344, + "step": 104520 + }, + { + "epoch": 0.23053019961052681, + "grad_norm": 0.13346807658672333, + "learning_rate": 2.326930871323467e-05, + "loss": 0.0306, + "step": 104530 + }, + { + "epoch": 0.23055225358542497, + "grad_norm": 0.11229059845209122, + "learning_rate": 2.326792914748285e-05, + "loss": 0.0316, + "step": 104540 + }, + { + "epoch": 0.23057430756032313, + "grad_norm": 0.11487389355897903, + "learning_rate": 2.3266549481267877e-05, + "loss": 0.0328, + "step": 104550 + }, + { + "epoch": 0.2305963615352213, + "grad_norm": 0.09440631419420242, + "learning_rate": 2.3265169714606526e-05, + "loss": 0.0315, + "step": 104560 + }, + { + "epoch": 0.23061841551011947, + "grad_norm": 0.18768461048603058, + "learning_rate": 2.3263789847515555e-05, + "loss": 0.0318, + "step": 104570 + }, + { + "epoch": 0.23064046948501762, + "grad_norm": 0.10014647245407104, + "learning_rate": 2.326240988001174e-05, + "loss": 0.0307, + "step": 104580 + }, + { + "epoch": 0.2306625234599158, + "grad_norm": 0.12423102557659149, + "learning_rate": 2.326102981211184e-05, + "loss": 0.0304, + "step": 104590 + }, + { + "epoch": 0.23068457743481396, + "grad_norm": 0.12353377044200897, + "learning_rate": 2.325964964383263e-05, + "loss": 0.0311, + "step": 104600 + }, + { + "epoch": 0.23070663140971212, + "grad_norm": 0.11703632026910782, + "learning_rate": 2.325826937519088e-05, + "loss": 0.0327, + "step": 104610 + }, + { + "epoch": 0.2307286853846103, + "grad_norm": 0.13528577983379364, + "learning_rate": 2.325688900620336e-05, + "loss": 0.0316, + "step": 104620 + }, + { + "epoch": 0.23075073935950846, + "grad_norm": 0.14696654677391052, + "learning_rate": 2.325550853688684e-05, + "loss": 0.0333, + "step": 104630 + }, + { + "epoch": 0.23077279333440662, + "grad_norm": 0.10497304052114487, + "learning_rate": 2.3254127967258095e-05, + "loss": 0.0317, + "step": 104640 + }, + { + "epoch": 0.2307948473093048, + "grad_norm": 0.10494095832109451, + "learning_rate": 2.3252747297333908e-05, + "loss": 0.0324, + "step": 104650 + }, + { + "epoch": 0.23081690128420296, + "grad_norm": 0.09875338524580002, + "learning_rate": 2.3251366527131045e-05, + "loss": 0.0319, + "step": 104660 + }, + { + "epoch": 0.23083895525910111, + "grad_norm": 0.10833176225423813, + "learning_rate": 2.3249985656666292e-05, + "loss": 0.0332, + "step": 104670 + }, + { + "epoch": 0.2308610092339993, + "grad_norm": 0.11492910236120224, + "learning_rate": 2.324860468595642e-05, + "loss": 0.0317, + "step": 104680 + }, + { + "epoch": 0.23088306320889745, + "grad_norm": 0.11281700432300568, + "learning_rate": 2.3247223615018215e-05, + "loss": 0.0315, + "step": 104690 + }, + { + "epoch": 0.2309051171837956, + "grad_norm": 0.09164735674858093, + "learning_rate": 2.3245842443868458e-05, + "loss": 0.0308, + "step": 104700 + }, + { + "epoch": 0.2309271711586938, + "grad_norm": 0.09179840236902237, + "learning_rate": 2.324446117252393e-05, + "loss": 0.0316, + "step": 104710 + }, + { + "epoch": 0.23094922513359195, + "grad_norm": 0.11951129883527756, + "learning_rate": 2.324307980100141e-05, + "loss": 0.0302, + "step": 104720 + }, + { + "epoch": 0.2309712791084901, + "grad_norm": 0.10501231253147125, + "learning_rate": 2.3241698329317688e-05, + "loss": 0.0304, + "step": 104730 + }, + { + "epoch": 0.2309933330833883, + "grad_norm": 0.10792262852191925, + "learning_rate": 2.3240316757489553e-05, + "loss": 0.0336, + "step": 104740 + }, + { + "epoch": 0.23101538705828645, + "grad_norm": 0.12386612594127655, + "learning_rate": 2.323893508553379e-05, + "loss": 0.0334, + "step": 104750 + }, + { + "epoch": 0.2310374410331846, + "grad_norm": 0.11988073587417603, + "learning_rate": 2.3237553313467182e-05, + "loss": 0.0315, + "step": 104760 + }, + { + "epoch": 0.2310594950080828, + "grad_norm": 0.1142151728272438, + "learning_rate": 2.323617144130653e-05, + "loss": 0.0327, + "step": 104770 + }, + { + "epoch": 0.23108154898298094, + "grad_norm": 0.0860767588019371, + "learning_rate": 2.3234789469068614e-05, + "loss": 0.0297, + "step": 104780 + }, + { + "epoch": 0.2311036029578791, + "grad_norm": 0.10107779502868652, + "learning_rate": 2.3233407396770235e-05, + "loss": 0.0318, + "step": 104790 + }, + { + "epoch": 0.23112565693277728, + "grad_norm": 0.11127246171236038, + "learning_rate": 2.3232025224428178e-05, + "loss": 0.0322, + "step": 104800 + }, + { + "epoch": 0.23114771090767544, + "grad_norm": 0.14182694256305695, + "learning_rate": 2.3230642952059248e-05, + "loss": 0.0327, + "step": 104810 + }, + { + "epoch": 0.2311697648825736, + "grad_norm": 0.10124647617340088, + "learning_rate": 2.3229260579680227e-05, + "loss": 0.0302, + "step": 104820 + }, + { + "epoch": 0.23119181885747178, + "grad_norm": 0.11679087579250336, + "learning_rate": 2.3227878107307926e-05, + "loss": 0.0307, + "step": 104830 + }, + { + "epoch": 0.23121387283236994, + "grad_norm": 0.10938026756048203, + "learning_rate": 2.3226495534959136e-05, + "loss": 0.0338, + "step": 104840 + }, + { + "epoch": 0.23123592680726812, + "grad_norm": 0.07633727788925171, + "learning_rate": 2.3225112862650658e-05, + "loss": 0.0311, + "step": 104850 + }, + { + "epoch": 0.23125798078216628, + "grad_norm": 0.12037532031536102, + "learning_rate": 2.3223730090399298e-05, + "loss": 0.033, + "step": 104860 + }, + { + "epoch": 0.23128003475706443, + "grad_norm": 0.13084667921066284, + "learning_rate": 2.322234721822185e-05, + "loss": 0.0342, + "step": 104870 + }, + { + "epoch": 0.23130208873196262, + "grad_norm": 0.15452663600444794, + "learning_rate": 2.3220964246135115e-05, + "loss": 0.0323, + "step": 104880 + }, + { + "epoch": 0.23132414270686077, + "grad_norm": 0.08156508952379227, + "learning_rate": 2.321958117415591e-05, + "loss": 0.0304, + "step": 104890 + }, + { + "epoch": 0.23134619668175893, + "grad_norm": 0.10793130099773407, + "learning_rate": 2.3218198002301025e-05, + "loss": 0.0311, + "step": 104900 + }, + { + "epoch": 0.23136825065665712, + "grad_norm": 0.13503748178482056, + "learning_rate": 2.3216814730587282e-05, + "loss": 0.0313, + "step": 104910 + }, + { + "epoch": 0.23139030463155527, + "grad_norm": 0.11237988620996475, + "learning_rate": 2.3215431359031475e-05, + "loss": 0.0336, + "step": 104920 + }, + { + "epoch": 0.23141235860645343, + "grad_norm": 0.10143439471721649, + "learning_rate": 2.3214047887650427e-05, + "loss": 0.0321, + "step": 104930 + }, + { + "epoch": 0.2314344125813516, + "grad_norm": 0.1015612781047821, + "learning_rate": 2.321266431646094e-05, + "loss": 0.0311, + "step": 104940 + }, + { + "epoch": 0.23145646655624977, + "grad_norm": 0.09065181016921997, + "learning_rate": 2.321128064547983e-05, + "loss": 0.032, + "step": 104950 + }, + { + "epoch": 0.23147852053114792, + "grad_norm": 0.15992729365825653, + "learning_rate": 2.32098968747239e-05, + "loss": 0.0321, + "step": 104960 + }, + { + "epoch": 0.2315005745060461, + "grad_norm": 0.11172285676002502, + "learning_rate": 2.320851300420998e-05, + "loss": 0.0322, + "step": 104970 + }, + { + "epoch": 0.23152262848094426, + "grad_norm": 0.12144094705581665, + "learning_rate": 2.320712903395487e-05, + "loss": 0.0335, + "step": 104980 + }, + { + "epoch": 0.23154468245584242, + "grad_norm": 0.130933478474617, + "learning_rate": 2.3205744963975398e-05, + "loss": 0.0313, + "step": 104990 + }, + { + "epoch": 0.2315667364307406, + "grad_norm": 0.11932024359703064, + "learning_rate": 2.3204360794288373e-05, + "loss": 0.0314, + "step": 105000 + }, + { + "epoch": 0.23158879040563876, + "grad_norm": 0.10239723324775696, + "learning_rate": 2.320297652491062e-05, + "loss": 0.0314, + "step": 105010 + }, + { + "epoch": 0.23161084438053692, + "grad_norm": 0.11808434128761292, + "learning_rate": 2.320159215585896e-05, + "loss": 0.031, + "step": 105020 + }, + { + "epoch": 0.2316328983554351, + "grad_norm": 0.08520004898309708, + "learning_rate": 2.3200207687150208e-05, + "loss": 0.0308, + "step": 105030 + }, + { + "epoch": 0.23165495233033326, + "grad_norm": 0.10078664124011993, + "learning_rate": 2.3198823118801194e-05, + "loss": 0.0326, + "step": 105040 + }, + { + "epoch": 0.23167700630523141, + "grad_norm": 0.13815811276435852, + "learning_rate": 2.3197438450828736e-05, + "loss": 0.0337, + "step": 105050 + }, + { + "epoch": 0.2316990602801296, + "grad_norm": 0.09933508187532425, + "learning_rate": 2.319605368324966e-05, + "loss": 0.0323, + "step": 105060 + }, + { + "epoch": 0.23172111425502775, + "grad_norm": 0.11284727603197098, + "learning_rate": 2.3194668816080794e-05, + "loss": 0.0324, + "step": 105070 + }, + { + "epoch": 0.2317431682299259, + "grad_norm": 0.1124720647931099, + "learning_rate": 2.3193283849338968e-05, + "loss": 0.0313, + "step": 105080 + }, + { + "epoch": 0.2317652222048241, + "grad_norm": 0.09153497219085693, + "learning_rate": 2.3191898783041002e-05, + "loss": 0.0312, + "step": 105090 + }, + { + "epoch": 0.23178727617972225, + "grad_norm": 0.10719989240169525, + "learning_rate": 2.3190513617203737e-05, + "loss": 0.033, + "step": 105100 + }, + { + "epoch": 0.2318093301546204, + "grad_norm": 0.11180541664361954, + "learning_rate": 2.3189128351844e-05, + "loss": 0.0332, + "step": 105110 + }, + { + "epoch": 0.2318313841295186, + "grad_norm": 0.09448356181383133, + "learning_rate": 2.3187742986978613e-05, + "loss": 0.0329, + "step": 105120 + }, + { + "epoch": 0.23185343810441675, + "grad_norm": 0.10461216419935226, + "learning_rate": 2.3186357522624425e-05, + "loss": 0.0298, + "step": 105130 + }, + { + "epoch": 0.2318754920793149, + "grad_norm": 0.1179889664053917, + "learning_rate": 2.3184971958798257e-05, + "loss": 0.0309, + "step": 105140 + }, + { + "epoch": 0.2318975460542131, + "grad_norm": 0.12564118206501007, + "learning_rate": 2.3183586295516962e-05, + "loss": 0.0322, + "step": 105150 + }, + { + "epoch": 0.23191960002911124, + "grad_norm": 0.1397530734539032, + "learning_rate": 2.3182200532797357e-05, + "loss": 0.0313, + "step": 105160 + }, + { + "epoch": 0.2319416540040094, + "grad_norm": 0.13040634989738464, + "learning_rate": 2.3180814670656295e-05, + "loss": 0.0313, + "step": 105170 + }, + { + "epoch": 0.23196370797890759, + "grad_norm": 0.10660815984010696, + "learning_rate": 2.317942870911061e-05, + "loss": 0.0315, + "step": 105180 + }, + { + "epoch": 0.23198576195380574, + "grad_norm": 0.1123572513461113, + "learning_rate": 2.3178042648177145e-05, + "loss": 0.0311, + "step": 105190 + }, + { + "epoch": 0.2320078159287039, + "grad_norm": 0.10584692656993866, + "learning_rate": 2.3176656487872744e-05, + "loss": 0.0309, + "step": 105200 + }, + { + "epoch": 0.23202986990360208, + "grad_norm": 0.09309804439544678, + "learning_rate": 2.3175270228214237e-05, + "loss": 0.0336, + "step": 105210 + }, + { + "epoch": 0.23205192387850024, + "grad_norm": 0.11969555169343948, + "learning_rate": 2.3173883869218484e-05, + "loss": 0.0318, + "step": 105220 + }, + { + "epoch": 0.2320739778533984, + "grad_norm": 0.0912802666425705, + "learning_rate": 2.317249741090232e-05, + "loss": 0.0326, + "step": 105230 + }, + { + "epoch": 0.23209603182829658, + "grad_norm": 0.09833764284849167, + "learning_rate": 2.3171110853282603e-05, + "loss": 0.0332, + "step": 105240 + }, + { + "epoch": 0.23211808580319473, + "grad_norm": 0.08416245132684708, + "learning_rate": 2.316972419637617e-05, + "loss": 0.0321, + "step": 105250 + }, + { + "epoch": 0.2321401397780929, + "grad_norm": 0.11706036329269409, + "learning_rate": 2.3168337440199876e-05, + "loss": 0.0331, + "step": 105260 + }, + { + "epoch": 0.23216219375299108, + "grad_norm": 0.08041802793741226, + "learning_rate": 2.316695058477057e-05, + "loss": 0.0315, + "step": 105270 + }, + { + "epoch": 0.23218424772788923, + "grad_norm": 0.09585406631231308, + "learning_rate": 2.3165563630105102e-05, + "loss": 0.0318, + "step": 105280 + }, + { + "epoch": 0.23220630170278742, + "grad_norm": 0.13329865038394928, + "learning_rate": 2.316417657622033e-05, + "loss": 0.0327, + "step": 105290 + }, + { + "epoch": 0.23222835567768557, + "grad_norm": 0.1050349548459053, + "learning_rate": 2.31627894231331e-05, + "loss": 0.0325, + "step": 105300 + }, + { + "epoch": 0.23225040965258373, + "grad_norm": 0.14061574637889862, + "learning_rate": 2.3161402170860275e-05, + "loss": 0.0303, + "step": 105310 + }, + { + "epoch": 0.2322724636274819, + "grad_norm": 0.12580189108848572, + "learning_rate": 2.3160014819418708e-05, + "loss": 0.0329, + "step": 105320 + }, + { + "epoch": 0.23229451760238007, + "grad_norm": 0.0913093164563179, + "learning_rate": 2.3158627368825253e-05, + "loss": 0.033, + "step": 105330 + }, + { + "epoch": 0.23231657157727822, + "grad_norm": 0.10024352371692657, + "learning_rate": 2.3157239819096777e-05, + "loss": 0.0321, + "step": 105340 + }, + { + "epoch": 0.2323386255521764, + "grad_norm": 0.09928932040929794, + "learning_rate": 2.3155852170250128e-05, + "loss": 0.031, + "step": 105350 + }, + { + "epoch": 0.23236067952707457, + "grad_norm": 0.09436016529798508, + "learning_rate": 2.3154464422302182e-05, + "loss": 0.0312, + "step": 105360 + }, + { + "epoch": 0.23238273350197272, + "grad_norm": 0.10732758790254593, + "learning_rate": 2.315307657526979e-05, + "loss": 0.0329, + "step": 105370 + }, + { + "epoch": 0.2324047874768709, + "grad_norm": 0.08785690367221832, + "learning_rate": 2.3151688629169823e-05, + "loss": 0.0328, + "step": 105380 + }, + { + "epoch": 0.23242684145176906, + "grad_norm": 0.1029994860291481, + "learning_rate": 2.3150300584019142e-05, + "loss": 0.0326, + "step": 105390 + }, + { + "epoch": 0.23244889542666722, + "grad_norm": 0.11416453123092651, + "learning_rate": 2.314891243983461e-05, + "loss": 0.0314, + "step": 105400 + }, + { + "epoch": 0.2324709494015654, + "grad_norm": 0.1056009829044342, + "learning_rate": 2.3147524196633102e-05, + "loss": 0.0327, + "step": 105410 + }, + { + "epoch": 0.23249300337646356, + "grad_norm": 0.09338454902172089, + "learning_rate": 2.314613585443148e-05, + "loss": 0.0325, + "step": 105420 + }, + { + "epoch": 0.23251505735136171, + "grad_norm": 0.12186326831579208, + "learning_rate": 2.3144747413246623e-05, + "loss": 0.0322, + "step": 105430 + }, + { + "epoch": 0.2325371113262599, + "grad_norm": 0.10914605855941772, + "learning_rate": 2.3143358873095386e-05, + "loss": 0.0339, + "step": 105440 + }, + { + "epoch": 0.23255916530115806, + "grad_norm": 0.11282221227884293, + "learning_rate": 2.3141970233994655e-05, + "loss": 0.0316, + "step": 105450 + }, + { + "epoch": 0.2325812192760562, + "grad_norm": 0.12260251492261887, + "learning_rate": 2.3140581495961295e-05, + "loss": 0.0313, + "step": 105460 + }, + { + "epoch": 0.2326032732509544, + "grad_norm": 0.1006426215171814, + "learning_rate": 2.3139192659012187e-05, + "loss": 0.0304, + "step": 105470 + }, + { + "epoch": 0.23262532722585255, + "grad_norm": 0.11414986103773117, + "learning_rate": 2.3137803723164203e-05, + "loss": 0.0336, + "step": 105480 + }, + { + "epoch": 0.2326473812007507, + "grad_norm": 0.1466333568096161, + "learning_rate": 2.3136414688434223e-05, + "loss": 0.0325, + "step": 105490 + }, + { + "epoch": 0.2326694351756489, + "grad_norm": 0.14272847771644592, + "learning_rate": 2.313502555483912e-05, + "loss": 0.0332, + "step": 105500 + }, + { + "epoch": 0.23269148915054705, + "grad_norm": 0.09569748491048813, + "learning_rate": 2.3133636322395773e-05, + "loss": 0.0296, + "step": 105510 + }, + { + "epoch": 0.2327135431254452, + "grad_norm": 0.07760265469551086, + "learning_rate": 2.3132246991121066e-05, + "loss": 0.0333, + "step": 105520 + }, + { + "epoch": 0.2327355971003434, + "grad_norm": 0.10608190298080444, + "learning_rate": 2.3130857561031884e-05, + "loss": 0.0331, + "step": 105530 + }, + { + "epoch": 0.23275765107524155, + "grad_norm": 0.10325908660888672, + "learning_rate": 2.3129468032145106e-05, + "loss": 0.0337, + "step": 105540 + }, + { + "epoch": 0.2327797050501397, + "grad_norm": 0.09307468682527542, + "learning_rate": 2.312807840447761e-05, + "loss": 0.034, + "step": 105550 + }, + { + "epoch": 0.23280175902503789, + "grad_norm": 0.07408104091882706, + "learning_rate": 2.3126688678046287e-05, + "loss": 0.0308, + "step": 105560 + }, + { + "epoch": 0.23282381299993604, + "grad_norm": 0.09315890818834305, + "learning_rate": 2.312529885286803e-05, + "loss": 0.0311, + "step": 105570 + }, + { + "epoch": 0.2328458669748342, + "grad_norm": 0.08220427483320236, + "learning_rate": 2.3123908928959715e-05, + "loss": 0.0309, + "step": 105580 + }, + { + "epoch": 0.23286792094973238, + "grad_norm": 0.10106952488422394, + "learning_rate": 2.312251890633824e-05, + "loss": 0.0319, + "step": 105590 + }, + { + "epoch": 0.23288997492463054, + "grad_norm": 0.10517528653144836, + "learning_rate": 2.312112878502049e-05, + "loss": 0.0321, + "step": 105600 + }, + { + "epoch": 0.2329120288995287, + "grad_norm": 0.1297774463891983, + "learning_rate": 2.311973856502336e-05, + "loss": 0.0332, + "step": 105610 + }, + { + "epoch": 0.23293408287442688, + "grad_norm": 0.10139447450637817, + "learning_rate": 2.3118348246363734e-05, + "loss": 0.0324, + "step": 105620 + }, + { + "epoch": 0.23295613684932504, + "grad_norm": 0.12204854190349579, + "learning_rate": 2.3116957829058517e-05, + "loss": 0.0309, + "step": 105630 + }, + { + "epoch": 0.2329781908242232, + "grad_norm": 0.1294333040714264, + "learning_rate": 2.3115567313124597e-05, + "loss": 0.0316, + "step": 105640 + }, + { + "epoch": 0.23300024479912138, + "grad_norm": 0.10157672315835953, + "learning_rate": 2.3114176698578874e-05, + "loss": 0.032, + "step": 105650 + }, + { + "epoch": 0.23302229877401953, + "grad_norm": 0.10828105360269547, + "learning_rate": 2.311278598543824e-05, + "loss": 0.0303, + "step": 105660 + }, + { + "epoch": 0.2330443527489177, + "grad_norm": 0.13759297132492065, + "learning_rate": 2.3111395173719597e-05, + "loss": 0.0287, + "step": 105670 + }, + { + "epoch": 0.23306640672381587, + "grad_norm": 0.10678531974554062, + "learning_rate": 2.311000426343985e-05, + "loss": 0.0305, + "step": 105680 + }, + { + "epoch": 0.23308846069871403, + "grad_norm": 0.10761118680238724, + "learning_rate": 2.310861325461588e-05, + "loss": 0.03, + "step": 105690 + }, + { + "epoch": 0.23311051467361218, + "grad_norm": 0.11159837990999222, + "learning_rate": 2.3107222147264618e-05, + "loss": 0.0326, + "step": 105700 + }, + { + "epoch": 0.23313256864851037, + "grad_norm": 0.11104828119277954, + "learning_rate": 2.3105830941402945e-05, + "loss": 0.0306, + "step": 105710 + }, + { + "epoch": 0.23315462262340853, + "grad_norm": 0.1003512442111969, + "learning_rate": 2.3104439637047776e-05, + "loss": 0.0315, + "step": 105720 + }, + { + "epoch": 0.2331766765983067, + "grad_norm": 0.10670702904462814, + "learning_rate": 2.310304823421601e-05, + "loss": 0.0301, + "step": 105730 + }, + { + "epoch": 0.23319873057320487, + "grad_norm": 0.11746913939714432, + "learning_rate": 2.3101656732924564e-05, + "loss": 0.0311, + "step": 105740 + }, + { + "epoch": 0.23322078454810302, + "grad_norm": 0.10620938986539841, + "learning_rate": 2.3100265133190334e-05, + "loss": 0.0314, + "step": 105750 + }, + { + "epoch": 0.2332428385230012, + "grad_norm": 0.12871555984020233, + "learning_rate": 2.3098873435030236e-05, + "loss": 0.0304, + "step": 105760 + }, + { + "epoch": 0.23326489249789936, + "grad_norm": 0.12087985128164291, + "learning_rate": 2.309748163846118e-05, + "loss": 0.0305, + "step": 105770 + }, + { + "epoch": 0.23328694647279752, + "grad_norm": 0.09599947929382324, + "learning_rate": 2.3096089743500073e-05, + "loss": 0.0308, + "step": 105780 + }, + { + "epoch": 0.2333090004476957, + "grad_norm": 0.12870828807353973, + "learning_rate": 2.3094697750163836e-05, + "loss": 0.032, + "step": 105790 + }, + { + "epoch": 0.23333105442259386, + "grad_norm": 0.09907766431570053, + "learning_rate": 2.3093305658469378e-05, + "loss": 0.0346, + "step": 105800 + }, + { + "epoch": 0.23335310839749202, + "grad_norm": 0.11196300387382507, + "learning_rate": 2.3091913468433608e-05, + "loss": 0.0307, + "step": 105810 + }, + { + "epoch": 0.2333751623723902, + "grad_norm": 0.13002736866474152, + "learning_rate": 2.3090521180073458e-05, + "loss": 0.0322, + "step": 105820 + }, + { + "epoch": 0.23339721634728836, + "grad_norm": 0.1161700114607811, + "learning_rate": 2.3089128793405837e-05, + "loss": 0.032, + "step": 105830 + }, + { + "epoch": 0.2334192703221865, + "grad_norm": 0.12145193666219711, + "learning_rate": 2.3087736308447655e-05, + "loss": 0.0317, + "step": 105840 + }, + { + "epoch": 0.2334413242970847, + "grad_norm": 0.11356829106807709, + "learning_rate": 2.308634372521585e-05, + "loss": 0.0313, + "step": 105850 + }, + { + "epoch": 0.23346337827198285, + "grad_norm": 0.10409055650234222, + "learning_rate": 2.3084951043727326e-05, + "loss": 0.0305, + "step": 105860 + }, + { + "epoch": 0.233485432246881, + "grad_norm": 0.11106260865926743, + "learning_rate": 2.3083558263999023e-05, + "loss": 0.0305, + "step": 105870 + }, + { + "epoch": 0.2335074862217792, + "grad_norm": 0.0934879332780838, + "learning_rate": 2.3082165386047847e-05, + "loss": 0.0323, + "step": 105880 + }, + { + "epoch": 0.23352954019667735, + "grad_norm": 0.10292854905128479, + "learning_rate": 2.3080772409890733e-05, + "loss": 0.0332, + "step": 105890 + }, + { + "epoch": 0.2335515941715755, + "grad_norm": 0.08778931945562363, + "learning_rate": 2.3079379335544605e-05, + "loss": 0.0313, + "step": 105900 + }, + { + "epoch": 0.2335736481464737, + "grad_norm": 0.12370320409536362, + "learning_rate": 2.307798616302639e-05, + "loss": 0.0303, + "step": 105910 + }, + { + "epoch": 0.23359570212137185, + "grad_norm": 0.12574246525764465, + "learning_rate": 2.3076592892353018e-05, + "loss": 0.0319, + "step": 105920 + }, + { + "epoch": 0.23361775609627, + "grad_norm": 0.11546175926923752, + "learning_rate": 2.3075199523541415e-05, + "loss": 0.0321, + "step": 105930 + }, + { + "epoch": 0.23363981007116819, + "grad_norm": 0.11471384018659592, + "learning_rate": 2.3073806056608515e-05, + "loss": 0.0313, + "step": 105940 + }, + { + "epoch": 0.23366186404606634, + "grad_norm": 0.13852699100971222, + "learning_rate": 2.3072412491571247e-05, + "loss": 0.0325, + "step": 105950 + }, + { + "epoch": 0.2336839180209645, + "grad_norm": 0.1057979092001915, + "learning_rate": 2.3071018828446546e-05, + "loss": 0.0303, + "step": 105960 + }, + { + "epoch": 0.23370597199586268, + "grad_norm": 0.11961958557367325, + "learning_rate": 2.306962506725134e-05, + "loss": 0.0324, + "step": 105970 + }, + { + "epoch": 0.23372802597076084, + "grad_norm": 0.12249689549207687, + "learning_rate": 2.3068231208002583e-05, + "loss": 0.0329, + "step": 105980 + }, + { + "epoch": 0.233750079945659, + "grad_norm": 0.09420705586671829, + "learning_rate": 2.306683725071719e-05, + "loss": 0.0307, + "step": 105990 + }, + { + "epoch": 0.23377213392055718, + "grad_norm": 0.11348521709442139, + "learning_rate": 2.3065443195412112e-05, + "loss": 0.032, + "step": 106000 + }, + { + "epoch": 0.23379418789545534, + "grad_norm": 0.10037878900766373, + "learning_rate": 2.3064049042104282e-05, + "loss": 0.0337, + "step": 106010 + }, + { + "epoch": 0.2338162418703535, + "grad_norm": 0.09582982212305069, + "learning_rate": 2.3062654790810644e-05, + "loss": 0.0326, + "step": 106020 + }, + { + "epoch": 0.23383829584525168, + "grad_norm": 0.10754737257957458, + "learning_rate": 2.3061260441548134e-05, + "loss": 0.0316, + "step": 106030 + }, + { + "epoch": 0.23386034982014983, + "grad_norm": 0.08974204957485199, + "learning_rate": 2.30598659943337e-05, + "loss": 0.0306, + "step": 106040 + }, + { + "epoch": 0.233882403795048, + "grad_norm": 0.13832589983940125, + "learning_rate": 2.3058471449184286e-05, + "loss": 0.0318, + "step": 106050 + }, + { + "epoch": 0.23390445776994617, + "grad_norm": 0.11012551188468933, + "learning_rate": 2.305707680611684e-05, + "loss": 0.0322, + "step": 106060 + }, + { + "epoch": 0.23392651174484433, + "grad_norm": 0.11165925860404968, + "learning_rate": 2.30556820651483e-05, + "loss": 0.0313, + "step": 106070 + }, + { + "epoch": 0.23394856571974249, + "grad_norm": 0.1160162091255188, + "learning_rate": 2.3054287226295614e-05, + "loss": 0.0322, + "step": 106080 + }, + { + "epoch": 0.23397061969464067, + "grad_norm": 0.11876031011343002, + "learning_rate": 2.3052892289575735e-05, + "loss": 0.0324, + "step": 106090 + }, + { + "epoch": 0.23399267366953883, + "grad_norm": 0.22251303493976593, + "learning_rate": 2.305149725500561e-05, + "loss": 0.0318, + "step": 106100 + }, + { + "epoch": 0.23401472764443698, + "grad_norm": 0.10234124958515167, + "learning_rate": 2.3050102122602195e-05, + "loss": 0.0307, + "step": 106110 + }, + { + "epoch": 0.23403678161933517, + "grad_norm": 0.13170164823532104, + "learning_rate": 2.3048706892382435e-05, + "loss": 0.0314, + "step": 106120 + }, + { + "epoch": 0.23405883559423332, + "grad_norm": 0.09567677229642868, + "learning_rate": 2.3047311564363293e-05, + "loss": 0.0321, + "step": 106130 + }, + { + "epoch": 0.2340808895691315, + "grad_norm": 0.11700291186571121, + "learning_rate": 2.3045916138561715e-05, + "loss": 0.0309, + "step": 106140 + }, + { + "epoch": 0.23410294354402966, + "grad_norm": 0.09400789439678192, + "learning_rate": 2.3044520614994655e-05, + "loss": 0.0321, + "step": 106150 + }, + { + "epoch": 0.23412499751892782, + "grad_norm": 0.11221957206726074, + "learning_rate": 2.3043124993679082e-05, + "loss": 0.0333, + "step": 106160 + }, + { + "epoch": 0.234147051493826, + "grad_norm": 0.11146332323551178, + "learning_rate": 2.3041729274631937e-05, + "loss": 0.0324, + "step": 106170 + }, + { + "epoch": 0.23416910546872416, + "grad_norm": 0.12605924904346466, + "learning_rate": 2.3040333457870196e-05, + "loss": 0.031, + "step": 106180 + }, + { + "epoch": 0.23419115944362232, + "grad_norm": 0.10372164100408554, + "learning_rate": 2.3038937543410808e-05, + "loss": 0.0323, + "step": 106190 + }, + { + "epoch": 0.2342132134185205, + "grad_norm": 0.14337846636772156, + "learning_rate": 2.303754153127074e-05, + "loss": 0.0332, + "step": 106200 + }, + { + "epoch": 0.23423526739341866, + "grad_norm": 0.08914200961589813, + "learning_rate": 2.3036145421466956e-05, + "loss": 0.0314, + "step": 106210 + }, + { + "epoch": 0.2342573213683168, + "grad_norm": 0.10808426141738892, + "learning_rate": 2.3034749214016415e-05, + "loss": 0.0316, + "step": 106220 + }, + { + "epoch": 0.234279375343215, + "grad_norm": 0.12713786959648132, + "learning_rate": 2.303335290893609e-05, + "loss": 0.0333, + "step": 106230 + }, + { + "epoch": 0.23430142931811315, + "grad_norm": 0.09356168657541275, + "learning_rate": 2.303195650624293e-05, + "loss": 0.0316, + "step": 106240 + }, + { + "epoch": 0.2343234832930113, + "grad_norm": 0.09794659167528152, + "learning_rate": 2.303056000595393e-05, + "loss": 0.033, + "step": 106250 + }, + { + "epoch": 0.2343455372679095, + "grad_norm": 0.09564372897148132, + "learning_rate": 2.3029163408086036e-05, + "loss": 0.0307, + "step": 106260 + }, + { + "epoch": 0.23436759124280765, + "grad_norm": 0.127741739153862, + "learning_rate": 2.3027766712656226e-05, + "loss": 0.0325, + "step": 106270 + }, + { + "epoch": 0.2343896452177058, + "grad_norm": 0.13135230541229248, + "learning_rate": 2.3026369919681468e-05, + "loss": 0.0318, + "step": 106280 + }, + { + "epoch": 0.234411699192604, + "grad_norm": 0.11301112920045853, + "learning_rate": 2.3024973029178747e-05, + "loss": 0.0314, + "step": 106290 + }, + { + "epoch": 0.23443375316750215, + "grad_norm": 0.1042495146393776, + "learning_rate": 2.3023576041165017e-05, + "loss": 0.0315, + "step": 106300 + }, + { + "epoch": 0.2344558071424003, + "grad_norm": 0.11053220182657242, + "learning_rate": 2.302217895565727e-05, + "loss": 0.0317, + "step": 106310 + }, + { + "epoch": 0.2344778611172985, + "grad_norm": 0.11093910783529282, + "learning_rate": 2.302078177267247e-05, + "loss": 0.0328, + "step": 106320 + }, + { + "epoch": 0.23449991509219664, + "grad_norm": 0.10379043221473694, + "learning_rate": 2.30193844922276e-05, + "loss": 0.0302, + "step": 106330 + }, + { + "epoch": 0.2345219690670948, + "grad_norm": 0.11345064640045166, + "learning_rate": 2.301798711433964e-05, + "loss": 0.0318, + "step": 106340 + }, + { + "epoch": 0.23454402304199298, + "grad_norm": 0.1348675936460495, + "learning_rate": 2.3016589639025558e-05, + "loss": 0.0315, + "step": 106350 + }, + { + "epoch": 0.23456607701689114, + "grad_norm": 0.09971848875284195, + "learning_rate": 2.3015192066302352e-05, + "loss": 0.0325, + "step": 106360 + }, + { + "epoch": 0.2345881309917893, + "grad_norm": 0.11040432751178741, + "learning_rate": 2.301379439618699e-05, + "loss": 0.0324, + "step": 106370 + }, + { + "epoch": 0.23461018496668748, + "grad_norm": 0.08943665772676468, + "learning_rate": 2.3012396628696462e-05, + "loss": 0.0323, + "step": 106380 + }, + { + "epoch": 0.23463223894158564, + "grad_norm": 0.13880623877048492, + "learning_rate": 2.3010998763847747e-05, + "loss": 0.0308, + "step": 106390 + }, + { + "epoch": 0.2346542929164838, + "grad_norm": 0.1471836119890213, + "learning_rate": 2.3009600801657837e-05, + "loss": 0.0308, + "step": 106400 + }, + { + "epoch": 0.23467634689138198, + "grad_norm": 0.10773027688264847, + "learning_rate": 2.3008202742143712e-05, + "loss": 0.0328, + "step": 106410 + }, + { + "epoch": 0.23469840086628013, + "grad_norm": 0.10847275704145432, + "learning_rate": 2.300680458532236e-05, + "loss": 0.0316, + "step": 106420 + }, + { + "epoch": 0.2347204548411783, + "grad_norm": 0.0973619818687439, + "learning_rate": 2.3005406331210775e-05, + "loss": 0.0332, + "step": 106430 + }, + { + "epoch": 0.23474250881607647, + "grad_norm": 0.09707629680633545, + "learning_rate": 2.300400797982595e-05, + "loss": 0.0334, + "step": 106440 + }, + { + "epoch": 0.23476456279097463, + "grad_norm": 0.09786999970674515, + "learning_rate": 2.3002609531184868e-05, + "loss": 0.0307, + "step": 106450 + }, + { + "epoch": 0.23478661676587279, + "grad_norm": 0.12963485717773438, + "learning_rate": 2.300121098530452e-05, + "loss": 0.0319, + "step": 106460 + }, + { + "epoch": 0.23480867074077097, + "grad_norm": 0.10818665474653244, + "learning_rate": 2.2999812342201913e-05, + "loss": 0.0315, + "step": 106470 + }, + { + "epoch": 0.23483072471566913, + "grad_norm": 0.10664717108011246, + "learning_rate": 2.2998413601894025e-05, + "loss": 0.0324, + "step": 106480 + }, + { + "epoch": 0.23485277869056728, + "grad_norm": 0.09345410764217377, + "learning_rate": 2.299701476439786e-05, + "loss": 0.0322, + "step": 106490 + }, + { + "epoch": 0.23487483266546547, + "grad_norm": 0.08160144090652466, + "learning_rate": 2.299561582973042e-05, + "loss": 0.0313, + "step": 106500 + }, + { + "epoch": 0.23489688664036362, + "grad_norm": 0.08983688056468964, + "learning_rate": 2.2994216797908693e-05, + "loss": 0.0302, + "step": 106510 + }, + { + "epoch": 0.23491894061526178, + "grad_norm": 0.08804888278245926, + "learning_rate": 2.299281766894969e-05, + "loss": 0.0324, + "step": 106520 + }, + { + "epoch": 0.23494099459015996, + "grad_norm": 0.11383423954248428, + "learning_rate": 2.29914184428704e-05, + "loss": 0.0307, + "step": 106530 + }, + { + "epoch": 0.23496304856505812, + "grad_norm": 0.09444306045770645, + "learning_rate": 2.2990019119687836e-05, + "loss": 0.0312, + "step": 106540 + }, + { + "epoch": 0.23498510253995628, + "grad_norm": 0.11956573277711868, + "learning_rate": 2.2988619699418995e-05, + "loss": 0.0314, + "step": 106550 + }, + { + "epoch": 0.23500715651485446, + "grad_norm": 0.13939514756202698, + "learning_rate": 2.2987220182080883e-05, + "loss": 0.0318, + "step": 106560 + }, + { + "epoch": 0.23502921048975262, + "grad_norm": 0.11001434177160263, + "learning_rate": 2.2985820567690498e-05, + "loss": 0.0323, + "step": 106570 + }, + { + "epoch": 0.2350512644646508, + "grad_norm": 0.10281853377819061, + "learning_rate": 2.298442085626486e-05, + "loss": 0.0309, + "step": 106580 + }, + { + "epoch": 0.23507331843954896, + "grad_norm": 0.09792307019233704, + "learning_rate": 2.2983021047820967e-05, + "loss": 0.0306, + "step": 106590 + }, + { + "epoch": 0.2350953724144471, + "grad_norm": 0.11332414299249649, + "learning_rate": 2.2981621142375828e-05, + "loss": 0.0318, + "step": 106600 + }, + { + "epoch": 0.2351174263893453, + "grad_norm": 0.1002110168337822, + "learning_rate": 2.2980221139946463e-05, + "loss": 0.0321, + "step": 106610 + }, + { + "epoch": 0.23513948036424345, + "grad_norm": 0.10215619951486588, + "learning_rate": 2.297882104054987e-05, + "loss": 0.0316, + "step": 106620 + }, + { + "epoch": 0.2351615343391416, + "grad_norm": 0.11554306745529175, + "learning_rate": 2.2977420844203074e-05, + "loss": 0.0328, + "step": 106630 + }, + { + "epoch": 0.2351835883140398, + "grad_norm": 0.11027048528194427, + "learning_rate": 2.2976020550923083e-05, + "loss": 0.0338, + "step": 106640 + }, + { + "epoch": 0.23520564228893795, + "grad_norm": 0.10635291785001755, + "learning_rate": 2.2974620160726907e-05, + "loss": 0.031, + "step": 106650 + }, + { + "epoch": 0.2352276962638361, + "grad_norm": 0.10351341217756271, + "learning_rate": 2.2973219673631572e-05, + "loss": 0.0311, + "step": 106660 + }, + { + "epoch": 0.2352497502387343, + "grad_norm": 0.12345923483371735, + "learning_rate": 2.2971819089654085e-05, + "loss": 0.0322, + "step": 106670 + }, + { + "epoch": 0.23527180421363245, + "grad_norm": 0.10939707607030869, + "learning_rate": 2.2970418408811472e-05, + "loss": 0.0301, + "step": 106680 + }, + { + "epoch": 0.2352938581885306, + "grad_norm": 0.07278701663017273, + "learning_rate": 2.296901763112075e-05, + "loss": 0.0299, + "step": 106690 + }, + { + "epoch": 0.2353159121634288, + "grad_norm": 0.11034519225358963, + "learning_rate": 2.2967616756598936e-05, + "loss": 0.0341, + "step": 106700 + }, + { + "epoch": 0.23533796613832694, + "grad_norm": 0.09025789052248001, + "learning_rate": 2.296621578526306e-05, + "loss": 0.0317, + "step": 106710 + }, + { + "epoch": 0.2353600201132251, + "grad_norm": 0.08376682549715042, + "learning_rate": 2.2964814717130144e-05, + "loss": 0.0306, + "step": 106720 + }, + { + "epoch": 0.23538207408812328, + "grad_norm": 0.08384580165147781, + "learning_rate": 2.2963413552217204e-05, + "loss": 0.0329, + "step": 106730 + }, + { + "epoch": 0.23540412806302144, + "grad_norm": 0.0984063521027565, + "learning_rate": 2.2962012290541267e-05, + "loss": 0.0304, + "step": 106740 + }, + { + "epoch": 0.2354261820379196, + "grad_norm": 0.07908593118190765, + "learning_rate": 2.296061093211937e-05, + "loss": 0.0312, + "step": 106750 + }, + { + "epoch": 0.23544823601281778, + "grad_norm": 0.10270315408706665, + "learning_rate": 2.2959209476968535e-05, + "loss": 0.0306, + "step": 106760 + }, + { + "epoch": 0.23547028998771594, + "grad_norm": 0.11570020020008087, + "learning_rate": 2.2957807925105788e-05, + "loss": 0.0311, + "step": 106770 + }, + { + "epoch": 0.2354923439626141, + "grad_norm": 0.09771458059549332, + "learning_rate": 2.295640627654816e-05, + "loss": 0.0312, + "step": 106780 + }, + { + "epoch": 0.23551439793751228, + "grad_norm": 0.08725699782371521, + "learning_rate": 2.2955004531312682e-05, + "loss": 0.03, + "step": 106790 + }, + { + "epoch": 0.23553645191241043, + "grad_norm": 0.09034222364425659, + "learning_rate": 2.295360268941639e-05, + "loss": 0.0319, + "step": 106800 + }, + { + "epoch": 0.2355585058873086, + "grad_norm": 0.08177421241998672, + "learning_rate": 2.2952200750876317e-05, + "loss": 0.0333, + "step": 106810 + }, + { + "epoch": 0.23558055986220677, + "grad_norm": 0.1371113657951355, + "learning_rate": 2.2950798715709496e-05, + "loss": 0.0305, + "step": 106820 + }, + { + "epoch": 0.23560261383710493, + "grad_norm": 0.08352497220039368, + "learning_rate": 2.2949396583932964e-05, + "loss": 0.0314, + "step": 106830 + }, + { + "epoch": 0.23562466781200309, + "grad_norm": 0.08729856461286545, + "learning_rate": 2.294799435556376e-05, + "loss": 0.0321, + "step": 106840 + }, + { + "epoch": 0.23564672178690127, + "grad_norm": 0.0765826478600502, + "learning_rate": 2.2946592030618912e-05, + "loss": 0.032, + "step": 106850 + }, + { + "epoch": 0.23566877576179943, + "grad_norm": 0.07796107977628708, + "learning_rate": 2.294518960911548e-05, + "loss": 0.0327, + "step": 106860 + }, + { + "epoch": 0.23569082973669758, + "grad_norm": 0.1039363369345665, + "learning_rate": 2.2943787091070485e-05, + "loss": 0.0329, + "step": 106870 + }, + { + "epoch": 0.23571288371159577, + "grad_norm": 0.11066698282957077, + "learning_rate": 2.294238447650098e-05, + "loss": 0.0323, + "step": 106880 + }, + { + "epoch": 0.23573493768649392, + "grad_norm": 0.10570311546325684, + "learning_rate": 2.2940981765424004e-05, + "loss": 0.0307, + "step": 106890 + }, + { + "epoch": 0.23575699166139208, + "grad_norm": 0.10695348680019379, + "learning_rate": 2.29395789578566e-05, + "loss": 0.0303, + "step": 106900 + }, + { + "epoch": 0.23577904563629026, + "grad_norm": 0.1220068633556366, + "learning_rate": 2.293817605381582e-05, + "loss": 0.0311, + "step": 106910 + }, + { + "epoch": 0.23580109961118842, + "grad_norm": 0.0910036563873291, + "learning_rate": 2.2936773053318698e-05, + "loss": 0.0336, + "step": 106920 + }, + { + "epoch": 0.23582315358608658, + "grad_norm": 0.08993274718523026, + "learning_rate": 2.2935369956382294e-05, + "loss": 0.0307, + "step": 106930 + }, + { + "epoch": 0.23584520756098476, + "grad_norm": 0.11461775749921799, + "learning_rate": 2.2933966763023655e-05, + "loss": 0.0324, + "step": 106940 + }, + { + "epoch": 0.23586726153588292, + "grad_norm": 0.10503306239843369, + "learning_rate": 2.293256347325983e-05, + "loss": 0.032, + "step": 106950 + }, + { + "epoch": 0.23588931551078107, + "grad_norm": 0.10608717054128647, + "learning_rate": 2.2931160087107868e-05, + "loss": 0.0314, + "step": 106960 + }, + { + "epoch": 0.23591136948567926, + "grad_norm": 0.12411078065633774, + "learning_rate": 2.2929756604584818e-05, + "loss": 0.0323, + "step": 106970 + }, + { + "epoch": 0.2359334234605774, + "grad_norm": 0.09165401756763458, + "learning_rate": 2.292835302570774e-05, + "loss": 0.0332, + "step": 106980 + }, + { + "epoch": 0.23595547743547557, + "grad_norm": 0.1158190593123436, + "learning_rate": 2.2926949350493692e-05, + "loss": 0.0312, + "step": 106990 + }, + { + "epoch": 0.23597753141037375, + "grad_norm": 0.1361367106437683, + "learning_rate": 2.292554557895972e-05, + "loss": 0.0307, + "step": 107000 + }, + { + "epoch": 0.2359995853852719, + "grad_norm": 0.11257927864789963, + "learning_rate": 2.2924141711122887e-05, + "loss": 0.0307, + "step": 107010 + }, + { + "epoch": 0.2360216393601701, + "grad_norm": 0.09761131554841995, + "learning_rate": 2.2922737747000255e-05, + "loss": 0.0312, + "step": 107020 + }, + { + "epoch": 0.23604369333506825, + "grad_norm": 0.10312154144048691, + "learning_rate": 2.2921333686608875e-05, + "loss": 0.0309, + "step": 107030 + }, + { + "epoch": 0.2360657473099664, + "grad_norm": 0.09994916617870331, + "learning_rate": 2.2919929529965814e-05, + "loss": 0.0309, + "step": 107040 + }, + { + "epoch": 0.2360878012848646, + "grad_norm": 0.10760727524757385, + "learning_rate": 2.2918525277088125e-05, + "loss": 0.0328, + "step": 107050 + }, + { + "epoch": 0.23610985525976275, + "grad_norm": 0.12156983464956284, + "learning_rate": 2.2917120927992888e-05, + "loss": 0.0328, + "step": 107060 + }, + { + "epoch": 0.2361319092346609, + "grad_norm": 0.14363598823547363, + "learning_rate": 2.291571648269715e-05, + "loss": 0.032, + "step": 107070 + }, + { + "epoch": 0.2361539632095591, + "grad_norm": 0.0964093878865242, + "learning_rate": 2.2914311941217985e-05, + "loss": 0.0319, + "step": 107080 + }, + { + "epoch": 0.23617601718445724, + "grad_norm": 0.09262876957654953, + "learning_rate": 2.2912907303572456e-05, + "loss": 0.0329, + "step": 107090 + }, + { + "epoch": 0.2361980711593554, + "grad_norm": 0.12412004917860031, + "learning_rate": 2.291150256977763e-05, + "loss": 0.0311, + "step": 107100 + }, + { + "epoch": 0.23622012513425358, + "grad_norm": 0.14307251572608948, + "learning_rate": 2.2910097739850585e-05, + "loss": 0.0319, + "step": 107110 + }, + { + "epoch": 0.23624217910915174, + "grad_norm": 0.09048224985599518, + "learning_rate": 2.2908692813808383e-05, + "loss": 0.0324, + "step": 107120 + }, + { + "epoch": 0.2362642330840499, + "grad_norm": 0.131738543510437, + "learning_rate": 2.2907287791668093e-05, + "loss": 0.0318, + "step": 107130 + }, + { + "epoch": 0.23628628705894808, + "grad_norm": 0.096330426633358, + "learning_rate": 2.290588267344679e-05, + "loss": 0.0324, + "step": 107140 + }, + { + "epoch": 0.23630834103384624, + "grad_norm": 0.1169169694185257, + "learning_rate": 2.290447745916155e-05, + "loss": 0.0304, + "step": 107150 + }, + { + "epoch": 0.2363303950087444, + "grad_norm": 0.09885460138320923, + "learning_rate": 2.2903072148829446e-05, + "loss": 0.0318, + "step": 107160 + }, + { + "epoch": 0.23635244898364258, + "grad_norm": 0.10503090918064117, + "learning_rate": 2.2901666742467548e-05, + "loss": 0.0315, + "step": 107170 + }, + { + "epoch": 0.23637450295854073, + "grad_norm": 0.11568951606750488, + "learning_rate": 2.2900261240092947e-05, + "loss": 0.0331, + "step": 107180 + }, + { + "epoch": 0.2363965569334389, + "grad_norm": 0.1177721843123436, + "learning_rate": 2.2898855641722707e-05, + "loss": 0.0318, + "step": 107190 + }, + { + "epoch": 0.23641861090833707, + "grad_norm": 0.12148246169090271, + "learning_rate": 2.2897449947373916e-05, + "loss": 0.0306, + "step": 107200 + }, + { + "epoch": 0.23644066488323523, + "grad_norm": 0.12197412550449371, + "learning_rate": 2.289604415706365e-05, + "loss": 0.0305, + "step": 107210 + }, + { + "epoch": 0.23646271885813339, + "grad_norm": 0.11582177132368088, + "learning_rate": 2.2894638270808996e-05, + "loss": 0.0312, + "step": 107220 + }, + { + "epoch": 0.23648477283303157, + "grad_norm": 0.10097068548202515, + "learning_rate": 2.289323228862703e-05, + "loss": 0.0308, + "step": 107230 + }, + { + "epoch": 0.23650682680792973, + "grad_norm": 0.12311353534460068, + "learning_rate": 2.289182621053484e-05, + "loss": 0.0325, + "step": 107240 + }, + { + "epoch": 0.23652888078282788, + "grad_norm": 0.09494011849164963, + "learning_rate": 2.289042003654951e-05, + "loss": 0.0297, + "step": 107250 + }, + { + "epoch": 0.23655093475772607, + "grad_norm": 0.12216756492853165, + "learning_rate": 2.2889013766688128e-05, + "loss": 0.0311, + "step": 107260 + }, + { + "epoch": 0.23657298873262422, + "grad_norm": 0.08592353016138077, + "learning_rate": 2.288760740096778e-05, + "loss": 0.0322, + "step": 107270 + }, + { + "epoch": 0.23659504270752238, + "grad_norm": 0.12245608866214752, + "learning_rate": 2.2886200939405554e-05, + "loss": 0.0307, + "step": 107280 + }, + { + "epoch": 0.23661709668242056, + "grad_norm": 0.09961358457803726, + "learning_rate": 2.2884794382018542e-05, + "loss": 0.0305, + "step": 107290 + }, + { + "epoch": 0.23663915065731872, + "grad_norm": 0.12971459329128265, + "learning_rate": 2.2883387728823834e-05, + "loss": 0.0318, + "step": 107300 + }, + { + "epoch": 0.23666120463221688, + "grad_norm": 0.11066421121358871, + "learning_rate": 2.288198097983852e-05, + "loss": 0.0316, + "step": 107310 + }, + { + "epoch": 0.23668325860711506, + "grad_norm": 0.12732121348381042, + "learning_rate": 2.2880574135079696e-05, + "loss": 0.0318, + "step": 107320 + }, + { + "epoch": 0.23670531258201322, + "grad_norm": 0.08612451702356339, + "learning_rate": 2.2879167194564456e-05, + "loss": 0.0306, + "step": 107330 + }, + { + "epoch": 0.23672736655691137, + "grad_norm": 0.13644911348819733, + "learning_rate": 2.2877760158309896e-05, + "loss": 0.0326, + "step": 107340 + }, + { + "epoch": 0.23674942053180956, + "grad_norm": 0.09742983430624008, + "learning_rate": 2.287635302633311e-05, + "loss": 0.0334, + "step": 107350 + }, + { + "epoch": 0.2367714745067077, + "grad_norm": 0.12968873977661133, + "learning_rate": 2.2874945798651204e-05, + "loss": 0.0318, + "step": 107360 + }, + { + "epoch": 0.23679352848160587, + "grad_norm": 0.10095138847827911, + "learning_rate": 2.2873538475281267e-05, + "loss": 0.0309, + "step": 107370 + }, + { + "epoch": 0.23681558245650405, + "grad_norm": 0.11296439915895462, + "learning_rate": 2.2872131056240398e-05, + "loss": 0.0313, + "step": 107380 + }, + { + "epoch": 0.2368376364314022, + "grad_norm": 0.11608605086803436, + "learning_rate": 2.2870723541545718e-05, + "loss": 0.0343, + "step": 107390 + }, + { + "epoch": 0.23685969040630037, + "grad_norm": 0.10949330776929855, + "learning_rate": 2.2869315931214303e-05, + "loss": 0.03, + "step": 107400 + }, + { + "epoch": 0.23688174438119855, + "grad_norm": 0.10250579565763474, + "learning_rate": 2.2867908225263276e-05, + "loss": 0.031, + "step": 107410 + }, + { + "epoch": 0.2369037983560967, + "grad_norm": 0.10802718997001648, + "learning_rate": 2.286650042370973e-05, + "loss": 0.0299, + "step": 107420 + }, + { + "epoch": 0.2369258523309949, + "grad_norm": 0.10710498690605164, + "learning_rate": 2.2865092526570785e-05, + "loss": 0.0309, + "step": 107430 + }, + { + "epoch": 0.23694790630589305, + "grad_norm": 0.1433270126581192, + "learning_rate": 2.2863684533863537e-05, + "loss": 0.0314, + "step": 107440 + }, + { + "epoch": 0.2369699602807912, + "grad_norm": 0.1561373621225357, + "learning_rate": 2.2862276445605095e-05, + "loss": 0.0316, + "step": 107450 + }, + { + "epoch": 0.2369920142556894, + "grad_norm": 0.13507455587387085, + "learning_rate": 2.2860868261812574e-05, + "loss": 0.0337, + "step": 107460 + }, + { + "epoch": 0.23701406823058754, + "grad_norm": 0.10741894692182541, + "learning_rate": 2.2859459982503075e-05, + "loss": 0.0315, + "step": 107470 + }, + { + "epoch": 0.2370361222054857, + "grad_norm": 0.09505823254585266, + "learning_rate": 2.2858051607693725e-05, + "loss": 0.0321, + "step": 107480 + }, + { + "epoch": 0.23705817618038388, + "grad_norm": 0.12980982661247253, + "learning_rate": 2.2856643137401624e-05, + "loss": 0.0324, + "step": 107490 + }, + { + "epoch": 0.23708023015528204, + "grad_norm": 0.11425571888685226, + "learning_rate": 2.2855234571643892e-05, + "loss": 0.0315, + "step": 107500 + }, + { + "epoch": 0.2371022841301802, + "grad_norm": 0.15683631598949432, + "learning_rate": 2.2853825910437642e-05, + "loss": 0.0327, + "step": 107510 + }, + { + "epoch": 0.23712433810507838, + "grad_norm": 0.11418775469064713, + "learning_rate": 2.2852417153799995e-05, + "loss": 0.0303, + "step": 107520 + }, + { + "epoch": 0.23714639207997654, + "grad_norm": 0.11082691699266434, + "learning_rate": 2.2851008301748065e-05, + "loss": 0.0322, + "step": 107530 + }, + { + "epoch": 0.2371684460548747, + "grad_norm": 0.09221503138542175, + "learning_rate": 2.284959935429897e-05, + "loss": 0.0318, + "step": 107540 + }, + { + "epoch": 0.23719050002977288, + "grad_norm": 0.0807325541973114, + "learning_rate": 2.2848190311469832e-05, + "loss": 0.0306, + "step": 107550 + }, + { + "epoch": 0.23721255400467103, + "grad_norm": 0.11832374334335327, + "learning_rate": 2.284678117327777e-05, + "loss": 0.0316, + "step": 107560 + }, + { + "epoch": 0.2372346079795692, + "grad_norm": 0.11420333385467529, + "learning_rate": 2.2845371939739914e-05, + "loss": 0.0317, + "step": 107570 + }, + { + "epoch": 0.23725666195446737, + "grad_norm": 0.12620596587657928, + "learning_rate": 2.2843962610873375e-05, + "loss": 0.0288, + "step": 107580 + }, + { + "epoch": 0.23727871592936553, + "grad_norm": 0.08550392836332321, + "learning_rate": 2.2842553186695286e-05, + "loss": 0.0299, + "step": 107590 + }, + { + "epoch": 0.2373007699042637, + "grad_norm": 0.10249143838882446, + "learning_rate": 2.2841143667222775e-05, + "loss": 0.0329, + "step": 107600 + }, + { + "epoch": 0.23732282387916187, + "grad_norm": 0.088544562458992, + "learning_rate": 2.2839734052472963e-05, + "loss": 0.031, + "step": 107610 + }, + { + "epoch": 0.23734487785406003, + "grad_norm": 0.10989164561033249, + "learning_rate": 2.283832434246298e-05, + "loss": 0.0319, + "step": 107620 + }, + { + "epoch": 0.23736693182895818, + "grad_norm": 0.12845562398433685, + "learning_rate": 2.2836914537209955e-05, + "loss": 0.0311, + "step": 107630 + }, + { + "epoch": 0.23738898580385637, + "grad_norm": 0.10034406930208206, + "learning_rate": 2.283550463673102e-05, + "loss": 0.0312, + "step": 107640 + }, + { + "epoch": 0.23741103977875452, + "grad_norm": 0.1033424660563469, + "learning_rate": 2.2834094641043305e-05, + "loss": 0.0326, + "step": 107650 + }, + { + "epoch": 0.23743309375365268, + "grad_norm": 0.08993709087371826, + "learning_rate": 2.2832684550163944e-05, + "loss": 0.0313, + "step": 107660 + }, + { + "epoch": 0.23745514772855086, + "grad_norm": 0.10146308690309525, + "learning_rate": 2.2831274364110067e-05, + "loss": 0.0324, + "step": 107670 + }, + { + "epoch": 0.23747720170344902, + "grad_norm": 0.11614102125167847, + "learning_rate": 2.282986408289882e-05, + "loss": 0.0326, + "step": 107680 + }, + { + "epoch": 0.23749925567834718, + "grad_norm": 0.12178956717252731, + "learning_rate": 2.282845370654733e-05, + "loss": 0.032, + "step": 107690 + }, + { + "epoch": 0.23752130965324536, + "grad_norm": 0.09325957298278809, + "learning_rate": 2.2827043235072732e-05, + "loss": 0.0311, + "step": 107700 + }, + { + "epoch": 0.23754336362814352, + "grad_norm": 0.10329858213663101, + "learning_rate": 2.282563266849217e-05, + "loss": 0.0319, + "step": 107710 + }, + { + "epoch": 0.23756541760304167, + "grad_norm": 0.10362361371517181, + "learning_rate": 2.2824222006822784e-05, + "loss": 0.0324, + "step": 107720 + }, + { + "epoch": 0.23758747157793986, + "grad_norm": 0.09291195869445801, + "learning_rate": 2.282281125008171e-05, + "loss": 0.0315, + "step": 107730 + }, + { + "epoch": 0.237609525552838, + "grad_norm": 0.09332173317670822, + "learning_rate": 2.2821400398286094e-05, + "loss": 0.0323, + "step": 107740 + }, + { + "epoch": 0.23763157952773617, + "grad_norm": 0.09391487389802933, + "learning_rate": 2.2819989451453082e-05, + "loss": 0.0333, + "step": 107750 + }, + { + "epoch": 0.23765363350263435, + "grad_norm": 0.11736885458230972, + "learning_rate": 2.2818578409599816e-05, + "loss": 0.0307, + "step": 107760 + }, + { + "epoch": 0.2376756874775325, + "grad_norm": 0.11304439604282379, + "learning_rate": 2.2817167272743435e-05, + "loss": 0.0333, + "step": 107770 + }, + { + "epoch": 0.23769774145243067, + "grad_norm": 0.09383179247379303, + "learning_rate": 2.2815756040901096e-05, + "loss": 0.033, + "step": 107780 + }, + { + "epoch": 0.23771979542732885, + "grad_norm": 0.1253819465637207, + "learning_rate": 2.281434471408994e-05, + "loss": 0.0306, + "step": 107790 + }, + { + "epoch": 0.237741849402227, + "grad_norm": 0.11180933564901352, + "learning_rate": 2.2812933292327113e-05, + "loss": 0.0324, + "step": 107800 + }, + { + "epoch": 0.23776390337712516, + "grad_norm": 0.17707869410514832, + "learning_rate": 2.2811521775629775e-05, + "loss": 0.0327, + "step": 107810 + }, + { + "epoch": 0.23778595735202335, + "grad_norm": 0.09718573838472366, + "learning_rate": 2.281011016401507e-05, + "loss": 0.0304, + "step": 107820 + }, + { + "epoch": 0.2378080113269215, + "grad_norm": 0.09113263338804245, + "learning_rate": 2.2808698457500152e-05, + "loss": 0.0324, + "step": 107830 + }, + { + "epoch": 0.23783006530181966, + "grad_norm": 0.12102106213569641, + "learning_rate": 2.2807286656102183e-05, + "loss": 0.0313, + "step": 107840 + }, + { + "epoch": 0.23785211927671784, + "grad_norm": 0.09601317346096039, + "learning_rate": 2.2805874759838302e-05, + "loss": 0.0313, + "step": 107850 + }, + { + "epoch": 0.237874173251616, + "grad_norm": 0.08968763053417206, + "learning_rate": 2.2804462768725674e-05, + "loss": 0.0327, + "step": 107860 + }, + { + "epoch": 0.23789622722651418, + "grad_norm": 0.09998729825019836, + "learning_rate": 2.2803050682781453e-05, + "loss": 0.0304, + "step": 107870 + }, + { + "epoch": 0.23791828120141234, + "grad_norm": 0.1152249425649643, + "learning_rate": 2.28016385020228e-05, + "loss": 0.0327, + "step": 107880 + }, + { + "epoch": 0.2379403351763105, + "grad_norm": 0.13632017374038696, + "learning_rate": 2.2800226226466873e-05, + "loss": 0.0309, + "step": 107890 + }, + { + "epoch": 0.23796238915120868, + "grad_norm": 0.09438662976026535, + "learning_rate": 2.2798813856130835e-05, + "loss": 0.0315, + "step": 107900 + }, + { + "epoch": 0.23798444312610684, + "grad_norm": 0.15964144468307495, + "learning_rate": 2.2797401391031848e-05, + "loss": 0.032, + "step": 107910 + }, + { + "epoch": 0.238006497101005, + "grad_norm": 0.10478225350379944, + "learning_rate": 2.2795988831187062e-05, + "loss": 0.0335, + "step": 107920 + }, + { + "epoch": 0.23802855107590318, + "grad_norm": 0.13340845704078674, + "learning_rate": 2.2794576176613662e-05, + "loss": 0.032, + "step": 107930 + }, + { + "epoch": 0.23805060505080133, + "grad_norm": 0.11937874555587769, + "learning_rate": 2.2793163427328795e-05, + "loss": 0.0312, + "step": 107940 + }, + { + "epoch": 0.2380726590256995, + "grad_norm": 0.11226657032966614, + "learning_rate": 2.2791750583349637e-05, + "loss": 0.0315, + "step": 107950 + }, + { + "epoch": 0.23809471300059767, + "grad_norm": 0.10304015874862671, + "learning_rate": 2.2790337644693353e-05, + "loss": 0.0307, + "step": 107960 + }, + { + "epoch": 0.23811676697549583, + "grad_norm": 0.0884176641702652, + "learning_rate": 2.2788924611377113e-05, + "loss": 0.0296, + "step": 107970 + }, + { + "epoch": 0.238138820950394, + "grad_norm": 0.09228025376796722, + "learning_rate": 2.278751148341808e-05, + "loss": 0.0295, + "step": 107980 + }, + { + "epoch": 0.23816087492529217, + "grad_norm": 0.11560434103012085, + "learning_rate": 2.278609826083343e-05, + "loss": 0.0316, + "step": 107990 + }, + { + "epoch": 0.23818292890019033, + "grad_norm": 0.09517890214920044, + "learning_rate": 2.278468494364034e-05, + "loss": 0.0303, + "step": 108000 + }, + { + "epoch": 0.23820498287508848, + "grad_norm": 0.10899507254362106, + "learning_rate": 2.2783271531855975e-05, + "loss": 0.0324, + "step": 108010 + }, + { + "epoch": 0.23822703684998667, + "grad_norm": 0.11654604971408844, + "learning_rate": 2.2781858025497512e-05, + "loss": 0.0316, + "step": 108020 + }, + { + "epoch": 0.23824909082488482, + "grad_norm": 0.10020507127046585, + "learning_rate": 2.278044442458213e-05, + "loss": 0.0333, + "step": 108030 + }, + { + "epoch": 0.23827114479978298, + "grad_norm": 0.12266461551189423, + "learning_rate": 2.2779030729127e-05, + "loss": 0.0328, + "step": 108040 + }, + { + "epoch": 0.23829319877468116, + "grad_norm": 0.11973311752080917, + "learning_rate": 2.2777616939149302e-05, + "loss": 0.0318, + "step": 108050 + }, + { + "epoch": 0.23831525274957932, + "grad_norm": 0.09600472450256348, + "learning_rate": 2.277620305466621e-05, + "loss": 0.0336, + "step": 108060 + }, + { + "epoch": 0.23833730672447748, + "grad_norm": 0.10616684705018997, + "learning_rate": 2.2774789075694914e-05, + "loss": 0.0309, + "step": 108070 + }, + { + "epoch": 0.23835936069937566, + "grad_norm": 0.10728757083415985, + "learning_rate": 2.2773375002252595e-05, + "loss": 0.0306, + "step": 108080 + }, + { + "epoch": 0.23838141467427382, + "grad_norm": 0.08663395792245865, + "learning_rate": 2.2771960834356424e-05, + "loss": 0.0329, + "step": 108090 + }, + { + "epoch": 0.23840346864917197, + "grad_norm": 0.10003214329481125, + "learning_rate": 2.2770546572023593e-05, + "loss": 0.032, + "step": 108100 + }, + { + "epoch": 0.23842552262407016, + "grad_norm": 0.08569639176130295, + "learning_rate": 2.276913221527128e-05, + "loss": 0.0309, + "step": 108110 + }, + { + "epoch": 0.2384475765989683, + "grad_norm": 0.12100957334041595, + "learning_rate": 2.276771776411668e-05, + "loss": 0.0324, + "step": 108120 + }, + { + "epoch": 0.23846963057386647, + "grad_norm": 0.10457351803779602, + "learning_rate": 2.2766303218576975e-05, + "loss": 0.031, + "step": 108130 + }, + { + "epoch": 0.23849168454876465, + "grad_norm": 0.09484543651342392, + "learning_rate": 2.2764888578669352e-05, + "loss": 0.033, + "step": 108140 + }, + { + "epoch": 0.2385137385236628, + "grad_norm": 0.08710197359323502, + "learning_rate": 2.2763473844411e-05, + "loss": 0.029, + "step": 108150 + }, + { + "epoch": 0.23853579249856097, + "grad_norm": 0.09910663217306137, + "learning_rate": 2.2762059015819118e-05, + "loss": 0.0315, + "step": 108160 + }, + { + "epoch": 0.23855784647345915, + "grad_norm": 0.08800174295902252, + "learning_rate": 2.2760644092910885e-05, + "loss": 0.0319, + "step": 108170 + }, + { + "epoch": 0.2385799004483573, + "grad_norm": 0.09904712438583374, + "learning_rate": 2.27592290757035e-05, + "loss": 0.0326, + "step": 108180 + }, + { + "epoch": 0.23860195442325546, + "grad_norm": 0.09515896439552307, + "learning_rate": 2.2757813964214155e-05, + "loss": 0.0303, + "step": 108190 + }, + { + "epoch": 0.23862400839815365, + "grad_norm": 0.09469437599182129, + "learning_rate": 2.2756398758460043e-05, + "loss": 0.0325, + "step": 108200 + }, + { + "epoch": 0.2386460623730518, + "grad_norm": 0.08076363801956177, + "learning_rate": 2.2754983458458367e-05, + "loss": 0.031, + "step": 108210 + }, + { + "epoch": 0.23866811634794996, + "grad_norm": 0.10835697501897812, + "learning_rate": 2.2753568064226318e-05, + "loss": 0.0302, + "step": 108220 + }, + { + "epoch": 0.23869017032284814, + "grad_norm": 0.0909348726272583, + "learning_rate": 2.2752152575781096e-05, + "loss": 0.0319, + "step": 108230 + }, + { + "epoch": 0.2387122242977463, + "grad_norm": 0.08908133208751678, + "learning_rate": 2.2750736993139904e-05, + "loss": 0.0325, + "step": 108240 + }, + { + "epoch": 0.23873427827264446, + "grad_norm": 0.0923699364066124, + "learning_rate": 2.2749321316319938e-05, + "loss": 0.0314, + "step": 108250 + }, + { + "epoch": 0.23875633224754264, + "grad_norm": 0.08563825488090515, + "learning_rate": 2.2747905545338402e-05, + "loss": 0.0309, + "step": 108260 + }, + { + "epoch": 0.2387783862224408, + "grad_norm": 0.11003275960683823, + "learning_rate": 2.2746489680212495e-05, + "loss": 0.0303, + "step": 108270 + }, + { + "epoch": 0.23880044019733898, + "grad_norm": 0.09187556803226471, + "learning_rate": 2.274507372095943e-05, + "loss": 0.0304, + "step": 108280 + }, + { + "epoch": 0.23882249417223714, + "grad_norm": 0.09092625975608826, + "learning_rate": 2.2743657667596403e-05, + "loss": 0.033, + "step": 108290 + }, + { + "epoch": 0.2388445481471353, + "grad_norm": 0.1398540884256363, + "learning_rate": 2.2742241520140625e-05, + "loss": 0.0299, + "step": 108300 + }, + { + "epoch": 0.23886660212203348, + "grad_norm": 0.10484597831964493, + "learning_rate": 2.27408252786093e-05, + "loss": 0.0322, + "step": 108310 + }, + { + "epoch": 0.23888865609693163, + "grad_norm": 0.09974168986082077, + "learning_rate": 2.2739408943019642e-05, + "loss": 0.0302, + "step": 108320 + }, + { + "epoch": 0.2389107100718298, + "grad_norm": 0.10526382923126221, + "learning_rate": 2.273799251338886e-05, + "loss": 0.0331, + "step": 108330 + }, + { + "epoch": 0.23893276404672797, + "grad_norm": 0.11038640886545181, + "learning_rate": 2.273657598973416e-05, + "loss": 0.033, + "step": 108340 + }, + { + "epoch": 0.23895481802162613, + "grad_norm": 0.11015409976243973, + "learning_rate": 2.273515937207276e-05, + "loss": 0.0349, + "step": 108350 + }, + { + "epoch": 0.2389768719965243, + "grad_norm": 0.12357629835605621, + "learning_rate": 2.273374266042187e-05, + "loss": 0.0306, + "step": 108360 + }, + { + "epoch": 0.23899892597142247, + "grad_norm": 0.10541782528162003, + "learning_rate": 2.27323258547987e-05, + "loss": 0.0315, + "step": 108370 + }, + { + "epoch": 0.23902097994632063, + "grad_norm": 0.12067580223083496, + "learning_rate": 2.2730908955220472e-05, + "loss": 0.0317, + "step": 108380 + }, + { + "epoch": 0.23904303392121878, + "grad_norm": 0.1316176801919937, + "learning_rate": 2.2729491961704405e-05, + "loss": 0.0342, + "step": 108390 + }, + { + "epoch": 0.23906508789611697, + "grad_norm": 0.11309389024972916, + "learning_rate": 2.272807487426771e-05, + "loss": 0.0306, + "step": 108400 + }, + { + "epoch": 0.23908714187101512, + "grad_norm": 0.10426432639360428, + "learning_rate": 2.2726657692927608e-05, + "loss": 0.0327, + "step": 108410 + }, + { + "epoch": 0.23910919584591328, + "grad_norm": 0.09543004631996155, + "learning_rate": 2.272524041770132e-05, + "loss": 0.0315, + "step": 108420 + }, + { + "epoch": 0.23913124982081146, + "grad_norm": 0.07810644805431366, + "learning_rate": 2.272382304860607e-05, + "loss": 0.0326, + "step": 108430 + }, + { + "epoch": 0.23915330379570962, + "grad_norm": 0.10195877403020859, + "learning_rate": 2.272240558565908e-05, + "loss": 0.0294, + "step": 108440 + }, + { + "epoch": 0.23917535777060778, + "grad_norm": 0.10203016549348831, + "learning_rate": 2.2720988028877564e-05, + "loss": 0.032, + "step": 108450 + }, + { + "epoch": 0.23919741174550596, + "grad_norm": 0.11898981034755707, + "learning_rate": 2.2719570378278758e-05, + "loss": 0.0307, + "step": 108460 + }, + { + "epoch": 0.23921946572040412, + "grad_norm": 0.1141352429986, + "learning_rate": 2.271815263387988e-05, + "loss": 0.0313, + "step": 108470 + }, + { + "epoch": 0.23924151969530227, + "grad_norm": 0.10218891501426697, + "learning_rate": 2.2716734795698165e-05, + "loss": 0.0333, + "step": 108480 + }, + { + "epoch": 0.23926357367020046, + "grad_norm": 0.0867655947804451, + "learning_rate": 2.2715316863750836e-05, + "loss": 0.031, + "step": 108490 + }, + { + "epoch": 0.23928562764509861, + "grad_norm": 0.15758907794952393, + "learning_rate": 2.2713898838055117e-05, + "loss": 0.0333, + "step": 108500 + }, + { + "epoch": 0.23930768161999677, + "grad_norm": 0.16627104580402374, + "learning_rate": 2.2712480718628246e-05, + "loss": 0.0327, + "step": 108510 + }, + { + "epoch": 0.23932973559489495, + "grad_norm": 0.10136228054761887, + "learning_rate": 2.2711062505487453e-05, + "loss": 0.0321, + "step": 108520 + }, + { + "epoch": 0.2393517895697931, + "grad_norm": 0.10503413528203964, + "learning_rate": 2.2709644198649976e-05, + "loss": 0.0312, + "step": 108530 + }, + { + "epoch": 0.23937384354469127, + "grad_norm": 0.13346773386001587, + "learning_rate": 2.2708225798133033e-05, + "loss": 0.0316, + "step": 108540 + }, + { + "epoch": 0.23939589751958945, + "grad_norm": 0.10036268085241318, + "learning_rate": 2.2706807303953873e-05, + "loss": 0.0315, + "step": 108550 + }, + { + "epoch": 0.2394179514944876, + "grad_norm": 0.08769095689058304, + "learning_rate": 2.270538871612973e-05, + "loss": 0.031, + "step": 108560 + }, + { + "epoch": 0.23944000546938576, + "grad_norm": 0.08757755160331726, + "learning_rate": 2.2703970034677833e-05, + "loss": 0.0305, + "step": 108570 + }, + { + "epoch": 0.23946205944428395, + "grad_norm": 0.09217669069766998, + "learning_rate": 2.2702551259615433e-05, + "loss": 0.0317, + "step": 108580 + }, + { + "epoch": 0.2394841134191821, + "grad_norm": 0.11602780967950821, + "learning_rate": 2.270113239095976e-05, + "loss": 0.0315, + "step": 108590 + }, + { + "epoch": 0.23950616739408026, + "grad_norm": 0.09896925091743469, + "learning_rate": 2.269971342872806e-05, + "loss": 0.0315, + "step": 108600 + }, + { + "epoch": 0.23952822136897844, + "grad_norm": 0.10224414616823196, + "learning_rate": 2.269829437293757e-05, + "loss": 0.0329, + "step": 108610 + }, + { + "epoch": 0.2395502753438766, + "grad_norm": 0.13059626519680023, + "learning_rate": 2.2696875223605535e-05, + "loss": 0.0312, + "step": 108620 + }, + { + "epoch": 0.23957232931877476, + "grad_norm": 0.0908171534538269, + "learning_rate": 2.2695455980749197e-05, + "loss": 0.0289, + "step": 108630 + }, + { + "epoch": 0.23959438329367294, + "grad_norm": 0.14583741128444672, + "learning_rate": 2.2694036644385805e-05, + "loss": 0.0315, + "step": 108640 + }, + { + "epoch": 0.2396164372685711, + "grad_norm": 0.13553521037101746, + "learning_rate": 2.2692617214532602e-05, + "loss": 0.0313, + "step": 108650 + }, + { + "epoch": 0.23963849124346925, + "grad_norm": 0.19233761727809906, + "learning_rate": 2.269119769120684e-05, + "loss": 0.0313, + "step": 108660 + }, + { + "epoch": 0.23966054521836744, + "grad_norm": 0.12953777611255646, + "learning_rate": 2.2689778074425764e-05, + "loss": 0.0325, + "step": 108670 + }, + { + "epoch": 0.2396825991932656, + "grad_norm": 0.11869215220212936, + "learning_rate": 2.2688358364206623e-05, + "loss": 0.0319, + "step": 108680 + }, + { + "epoch": 0.23970465316816375, + "grad_norm": 0.11603298783302307, + "learning_rate": 2.2686938560566667e-05, + "loss": 0.0338, + "step": 108690 + }, + { + "epoch": 0.23972670714306193, + "grad_norm": 0.10184212028980255, + "learning_rate": 2.268551866352315e-05, + "loss": 0.0314, + "step": 108700 + }, + { + "epoch": 0.2397487611179601, + "grad_norm": 0.1273529976606369, + "learning_rate": 2.2684098673093325e-05, + "loss": 0.0325, + "step": 108710 + }, + { + "epoch": 0.23977081509285827, + "grad_norm": 0.10153628885746002, + "learning_rate": 2.268267858929445e-05, + "loss": 0.031, + "step": 108720 + }, + { + "epoch": 0.23979286906775643, + "grad_norm": 0.09762617945671082, + "learning_rate": 2.268125841214377e-05, + "loss": 0.0302, + "step": 108730 + }, + { + "epoch": 0.2398149230426546, + "grad_norm": 0.10129650682210922, + "learning_rate": 2.2679838141658556e-05, + "loss": 0.0311, + "step": 108740 + }, + { + "epoch": 0.23983697701755277, + "grad_norm": 0.10922272503376007, + "learning_rate": 2.267841777785605e-05, + "loss": 0.0345, + "step": 108750 + }, + { + "epoch": 0.23985903099245093, + "grad_norm": 0.11015833169221878, + "learning_rate": 2.2676997320753523e-05, + "loss": 0.0324, + "step": 108760 + }, + { + "epoch": 0.23988108496734908, + "grad_norm": 0.10541436821222305, + "learning_rate": 2.2675576770368227e-05, + "loss": 0.0315, + "step": 108770 + }, + { + "epoch": 0.23990313894224727, + "grad_norm": 0.15041421353816986, + "learning_rate": 2.267415612671743e-05, + "loss": 0.0307, + "step": 108780 + }, + { + "epoch": 0.23992519291714542, + "grad_norm": 0.09396962076425552, + "learning_rate": 2.267273538981839e-05, + "loss": 0.0307, + "step": 108790 + }, + { + "epoch": 0.23994724689204358, + "grad_norm": 0.11175744980573654, + "learning_rate": 2.267131455968837e-05, + "loss": 0.032, + "step": 108800 + }, + { + "epoch": 0.23996930086694176, + "grad_norm": 0.09101372957229614, + "learning_rate": 2.2669893636344634e-05, + "loss": 0.0331, + "step": 108810 + }, + { + "epoch": 0.23999135484183992, + "grad_norm": 0.11549112200737, + "learning_rate": 2.2668472619804448e-05, + "loss": 0.0327, + "step": 108820 + }, + { + "epoch": 0.24001340881673808, + "grad_norm": 0.15526896715164185, + "learning_rate": 2.2667051510085082e-05, + "loss": 0.0329, + "step": 108830 + }, + { + "epoch": 0.24003546279163626, + "grad_norm": 0.12071581929922104, + "learning_rate": 2.26656303072038e-05, + "loss": 0.032, + "step": 108840 + }, + { + "epoch": 0.24005751676653442, + "grad_norm": 0.11067420244216919, + "learning_rate": 2.2664209011177874e-05, + "loss": 0.0299, + "step": 108850 + }, + { + "epoch": 0.24007957074143257, + "grad_norm": 0.12180303037166595, + "learning_rate": 2.266278762202457e-05, + "loss": 0.0325, + "step": 108860 + }, + { + "epoch": 0.24010162471633076, + "grad_norm": 0.10901840776205063, + "learning_rate": 2.2661366139761158e-05, + "loss": 0.0295, + "step": 108870 + }, + { + "epoch": 0.24012367869122891, + "grad_norm": 0.09255290776491165, + "learning_rate": 2.265994456440492e-05, + "loss": 0.0311, + "step": 108880 + }, + { + "epoch": 0.24014573266612707, + "grad_norm": 0.12144103646278381, + "learning_rate": 2.2658522895973125e-05, + "loss": 0.0304, + "step": 108890 + }, + { + "epoch": 0.24016778664102525, + "grad_norm": 0.08501642942428589, + "learning_rate": 2.265710113448304e-05, + "loss": 0.0312, + "step": 108900 + }, + { + "epoch": 0.2401898406159234, + "grad_norm": 0.09963109344244003, + "learning_rate": 2.265567927995195e-05, + "loss": 0.0301, + "step": 108910 + }, + { + "epoch": 0.24021189459082157, + "grad_norm": 0.11248420178890228, + "learning_rate": 2.2654257332397128e-05, + "loss": 0.0316, + "step": 108920 + }, + { + "epoch": 0.24023394856571975, + "grad_norm": 0.122160404920578, + "learning_rate": 2.265283529183585e-05, + "loss": 0.0319, + "step": 108930 + }, + { + "epoch": 0.2402560025406179, + "grad_norm": 0.0990581214427948, + "learning_rate": 2.2651413158285403e-05, + "loss": 0.0317, + "step": 108940 + }, + { + "epoch": 0.24027805651551606, + "grad_norm": 0.12675261497497559, + "learning_rate": 2.2649990931763057e-05, + "loss": 0.0315, + "step": 108950 + }, + { + "epoch": 0.24030011049041425, + "grad_norm": 0.08963291347026825, + "learning_rate": 2.2648568612286102e-05, + "loss": 0.0315, + "step": 108960 + }, + { + "epoch": 0.2403221644653124, + "grad_norm": 0.10627420246601105, + "learning_rate": 2.2647146199871815e-05, + "loss": 0.0318, + "step": 108970 + }, + { + "epoch": 0.24034421844021056, + "grad_norm": 0.08704254031181335, + "learning_rate": 2.264572369453748e-05, + "loss": 0.032, + "step": 108980 + }, + { + "epoch": 0.24036627241510874, + "grad_norm": 0.11109604686498642, + "learning_rate": 2.2644301096300387e-05, + "loss": 0.032, + "step": 108990 + }, + { + "epoch": 0.2403883263900069, + "grad_norm": 0.09117249399423599, + "learning_rate": 2.2642878405177813e-05, + "loss": 0.0321, + "step": 109000 + }, + { + "epoch": 0.24041038036490506, + "grad_norm": 0.13058480620384216, + "learning_rate": 2.264145562118706e-05, + "loss": 0.0312, + "step": 109010 + }, + { + "epoch": 0.24043243433980324, + "grad_norm": 0.13916230201721191, + "learning_rate": 2.2640032744345396e-05, + "loss": 0.0316, + "step": 109020 + }, + { + "epoch": 0.2404544883147014, + "grad_norm": 0.13491295278072357, + "learning_rate": 2.2638609774670125e-05, + "loss": 0.0338, + "step": 109030 + }, + { + "epoch": 0.24047654228959955, + "grad_norm": 0.11507393419742584, + "learning_rate": 2.2637186712178535e-05, + "loss": 0.0321, + "step": 109040 + }, + { + "epoch": 0.24049859626449774, + "grad_norm": 0.08874484896659851, + "learning_rate": 2.2635763556887913e-05, + "loss": 0.0311, + "step": 109050 + }, + { + "epoch": 0.2405206502393959, + "grad_norm": 0.10202868282794952, + "learning_rate": 2.2634340308815555e-05, + "loss": 0.0321, + "step": 109060 + }, + { + "epoch": 0.24054270421429405, + "grad_norm": 0.12307354807853699, + "learning_rate": 2.2632916967978754e-05, + "loss": 0.0331, + "step": 109070 + }, + { + "epoch": 0.24056475818919223, + "grad_norm": 0.11505986005067825, + "learning_rate": 2.2631493534394807e-05, + "loss": 0.0288, + "step": 109080 + }, + { + "epoch": 0.2405868121640904, + "grad_norm": 0.09900099784135818, + "learning_rate": 2.2630070008081006e-05, + "loss": 0.031, + "step": 109090 + }, + { + "epoch": 0.24060886613898855, + "grad_norm": 0.09305808693170547, + "learning_rate": 2.2628646389054655e-05, + "loss": 0.031, + "step": 109100 + }, + { + "epoch": 0.24063092011388673, + "grad_norm": 0.14404450356960297, + "learning_rate": 2.262722267733304e-05, + "loss": 0.0311, + "step": 109110 + }, + { + "epoch": 0.2406529740887849, + "grad_norm": 0.10538724809885025, + "learning_rate": 2.2625798872933474e-05, + "loss": 0.0323, + "step": 109120 + }, + { + "epoch": 0.24067502806368304, + "grad_norm": 0.11037961393594742, + "learning_rate": 2.262437497587325e-05, + "loss": 0.0302, + "step": 109130 + }, + { + "epoch": 0.24069708203858123, + "grad_norm": 0.12491487711668015, + "learning_rate": 2.2622950986169672e-05, + "loss": 0.0325, + "step": 109140 + }, + { + "epoch": 0.24071913601347938, + "grad_norm": 0.10803695768117905, + "learning_rate": 2.2621526903840043e-05, + "loss": 0.0325, + "step": 109150 + }, + { + "epoch": 0.24074118998837757, + "grad_norm": 0.12384873628616333, + "learning_rate": 2.2620102728901666e-05, + "loss": 0.0323, + "step": 109160 + }, + { + "epoch": 0.24076324396327572, + "grad_norm": 0.11072047054767609, + "learning_rate": 2.2618678461371844e-05, + "loss": 0.0327, + "step": 109170 + }, + { + "epoch": 0.24078529793817388, + "grad_norm": 0.1199275478720665, + "learning_rate": 2.2617254101267887e-05, + "loss": 0.0335, + "step": 109180 + }, + { + "epoch": 0.24080735191307207, + "grad_norm": 0.13899485766887665, + "learning_rate": 2.26158296486071e-05, + "loss": 0.0316, + "step": 109190 + }, + { + "epoch": 0.24082940588797022, + "grad_norm": 0.09120022505521774, + "learning_rate": 2.261440510340679e-05, + "loss": 0.0312, + "step": 109200 + }, + { + "epoch": 0.24085145986286838, + "grad_norm": 0.08866582065820694, + "learning_rate": 2.2612980465684278e-05, + "loss": 0.03, + "step": 109210 + }, + { + "epoch": 0.24087351383776656, + "grad_norm": 0.11941158026456833, + "learning_rate": 2.2611555735456854e-05, + "loss": 0.0311, + "step": 109220 + }, + { + "epoch": 0.24089556781266472, + "grad_norm": 0.11336290836334229, + "learning_rate": 2.2610130912741848e-05, + "loss": 0.0301, + "step": 109230 + }, + { + "epoch": 0.24091762178756287, + "grad_norm": 0.08121887594461441, + "learning_rate": 2.2608705997556563e-05, + "loss": 0.0318, + "step": 109240 + }, + { + "epoch": 0.24093967576246106, + "grad_norm": 0.09619005769491196, + "learning_rate": 2.260728098991832e-05, + "loss": 0.031, + "step": 109250 + }, + { + "epoch": 0.24096172973735921, + "grad_norm": 0.08568660914897919, + "learning_rate": 2.2605855889844425e-05, + "loss": 0.0307, + "step": 109260 + }, + { + "epoch": 0.24098378371225737, + "grad_norm": 0.13721229135990143, + "learning_rate": 2.2604430697352205e-05, + "loss": 0.0318, + "step": 109270 + }, + { + "epoch": 0.24100583768715556, + "grad_norm": 0.10191433131694794, + "learning_rate": 2.260300541245897e-05, + "loss": 0.0344, + "step": 109280 + }, + { + "epoch": 0.2410278916620537, + "grad_norm": 0.08932653814554214, + "learning_rate": 2.2601580035182045e-05, + "loss": 0.0309, + "step": 109290 + }, + { + "epoch": 0.24104994563695187, + "grad_norm": 0.10261253267526627, + "learning_rate": 2.260015456553874e-05, + "loss": 0.0308, + "step": 109300 + }, + { + "epoch": 0.24107199961185005, + "grad_norm": 0.10743702948093414, + "learning_rate": 2.2598729003546385e-05, + "loss": 0.0314, + "step": 109310 + }, + { + "epoch": 0.2410940535867482, + "grad_norm": 0.1021539643406868, + "learning_rate": 2.2597303349222295e-05, + "loss": 0.0332, + "step": 109320 + }, + { + "epoch": 0.24111610756164636, + "grad_norm": 0.09761461615562439, + "learning_rate": 2.25958776025838e-05, + "loss": 0.0321, + "step": 109330 + }, + { + "epoch": 0.24113816153654455, + "grad_norm": 0.113893523812294, + "learning_rate": 2.2594451763648213e-05, + "loss": 0.0314, + "step": 109340 + }, + { + "epoch": 0.2411602155114427, + "grad_norm": 0.10144145786762238, + "learning_rate": 2.2593025832432878e-05, + "loss": 0.0324, + "step": 109350 + }, + { + "epoch": 0.24118226948634086, + "grad_norm": 0.10810887813568115, + "learning_rate": 2.2591599808955102e-05, + "loss": 0.0313, + "step": 109360 + }, + { + "epoch": 0.24120432346123905, + "grad_norm": 0.09510137885808945, + "learning_rate": 2.259017369323222e-05, + "loss": 0.03, + "step": 109370 + }, + { + "epoch": 0.2412263774361372, + "grad_norm": 0.10319114476442337, + "learning_rate": 2.2588747485281562e-05, + "loss": 0.0311, + "step": 109380 + }, + { + "epoch": 0.24124843141103536, + "grad_norm": 0.1575128734111786, + "learning_rate": 2.2587321185120464e-05, + "loss": 0.0299, + "step": 109390 + }, + { + "epoch": 0.24127048538593354, + "grad_norm": 0.12925802171230316, + "learning_rate": 2.258589479276624e-05, + "loss": 0.0312, + "step": 109400 + }, + { + "epoch": 0.2412925393608317, + "grad_norm": 0.10241184383630753, + "learning_rate": 2.258446830823624e-05, + "loss": 0.0317, + "step": 109410 + }, + { + "epoch": 0.24131459333572985, + "grad_norm": 0.09900642931461334, + "learning_rate": 2.2583041731547783e-05, + "loss": 0.0339, + "step": 109420 + }, + { + "epoch": 0.24133664731062804, + "grad_norm": 0.12090496718883514, + "learning_rate": 2.2581615062718214e-05, + "loss": 0.0321, + "step": 109430 + }, + { + "epoch": 0.2413587012855262, + "grad_norm": 0.11253990232944489, + "learning_rate": 2.2580188301764864e-05, + "loss": 0.0319, + "step": 109440 + }, + { + "epoch": 0.24138075526042435, + "grad_norm": 0.09154220670461655, + "learning_rate": 2.2578761448705066e-05, + "loss": 0.0323, + "step": 109450 + }, + { + "epoch": 0.24140280923532254, + "grad_norm": 0.10467905551195145, + "learning_rate": 2.257733450355616e-05, + "loss": 0.0309, + "step": 109460 + }, + { + "epoch": 0.2414248632102207, + "grad_norm": 0.19358183443546295, + "learning_rate": 2.257590746633549e-05, + "loss": 0.0314, + "step": 109470 + }, + { + "epoch": 0.24144691718511885, + "grad_norm": 0.09834194928407669, + "learning_rate": 2.257448033706039e-05, + "loss": 0.032, + "step": 109480 + }, + { + "epoch": 0.24146897116001703, + "grad_norm": 0.09226780384778976, + "learning_rate": 2.25730531157482e-05, + "loss": 0.0327, + "step": 109490 + }, + { + "epoch": 0.2414910251349152, + "grad_norm": 0.11600949615240097, + "learning_rate": 2.2571625802416267e-05, + "loss": 0.031, + "step": 109500 + }, + { + "epoch": 0.24151307910981334, + "grad_norm": 0.12622161209583282, + "learning_rate": 2.2570198397081928e-05, + "loss": 0.0313, + "step": 109510 + }, + { + "epoch": 0.24153513308471153, + "grad_norm": 0.11531076580286026, + "learning_rate": 2.2568770899762538e-05, + "loss": 0.0312, + "step": 109520 + }, + { + "epoch": 0.24155718705960968, + "grad_norm": 0.10944631695747375, + "learning_rate": 2.256734331047543e-05, + "loss": 0.0336, + "step": 109530 + }, + { + "epoch": 0.24157924103450784, + "grad_norm": 0.11712174862623215, + "learning_rate": 2.256591562923796e-05, + "loss": 0.0308, + "step": 109540 + }, + { + "epoch": 0.24160129500940603, + "grad_norm": 0.1318526566028595, + "learning_rate": 2.2564487856067467e-05, + "loss": 0.0316, + "step": 109550 + }, + { + "epoch": 0.24162334898430418, + "grad_norm": 0.11409370601177216, + "learning_rate": 2.256305999098131e-05, + "loss": 0.0324, + "step": 109560 + }, + { + "epoch": 0.24164540295920237, + "grad_norm": 0.10841178148984909, + "learning_rate": 2.2561632033996828e-05, + "loss": 0.0296, + "step": 109570 + }, + { + "epoch": 0.24166745693410052, + "grad_norm": 0.09922206401824951, + "learning_rate": 2.256020398513138e-05, + "loss": 0.031, + "step": 109580 + }, + { + "epoch": 0.24168951090899868, + "grad_norm": 0.1550060510635376, + "learning_rate": 2.2558775844402313e-05, + "loss": 0.0311, + "step": 109590 + }, + { + "epoch": 0.24171156488389686, + "grad_norm": 0.1632089614868164, + "learning_rate": 2.255734761182699e-05, + "loss": 0.0326, + "step": 109600 + }, + { + "epoch": 0.24173361885879502, + "grad_norm": 0.12886406481266022, + "learning_rate": 2.2555919287422745e-05, + "loss": 0.032, + "step": 109610 + }, + { + "epoch": 0.24175567283369317, + "grad_norm": 0.15874023735523224, + "learning_rate": 2.255449087120696e-05, + "loss": 0.0333, + "step": 109620 + }, + { + "epoch": 0.24177772680859136, + "grad_norm": 0.12095945328474045, + "learning_rate": 2.2553062363196976e-05, + "loss": 0.0318, + "step": 109630 + }, + { + "epoch": 0.24179978078348952, + "grad_norm": 0.11387994140386581, + "learning_rate": 2.2551633763410148e-05, + "loss": 0.0308, + "step": 109640 + }, + { + "epoch": 0.24182183475838767, + "grad_norm": 0.09553263336420059, + "learning_rate": 2.255020507186384e-05, + "loss": 0.0285, + "step": 109650 + }, + { + "epoch": 0.24184388873328586, + "grad_norm": 0.08699674904346466, + "learning_rate": 2.2548776288575412e-05, + "loss": 0.0338, + "step": 109660 + }, + { + "epoch": 0.241865942708184, + "grad_norm": 0.09014532715082169, + "learning_rate": 2.2547347413562226e-05, + "loss": 0.0318, + "step": 109670 + }, + { + "epoch": 0.24188799668308217, + "grad_norm": 0.11097118258476257, + "learning_rate": 2.2545918446841646e-05, + "loss": 0.033, + "step": 109680 + }, + { + "epoch": 0.24191005065798035, + "grad_norm": 0.09789393097162247, + "learning_rate": 2.2544489388431028e-05, + "loss": 0.032, + "step": 109690 + }, + { + "epoch": 0.2419321046328785, + "grad_norm": 0.109989233314991, + "learning_rate": 2.2543060238347744e-05, + "loss": 0.0332, + "step": 109700 + }, + { + "epoch": 0.24195415860777666, + "grad_norm": 0.10109523683786392, + "learning_rate": 2.2541630996609157e-05, + "loss": 0.0317, + "step": 109710 + }, + { + "epoch": 0.24197621258267485, + "grad_norm": 0.1430622637271881, + "learning_rate": 2.254020166323263e-05, + "loss": 0.0316, + "step": 109720 + }, + { + "epoch": 0.241998266557573, + "grad_norm": 0.10499340295791626, + "learning_rate": 2.2538772238235533e-05, + "loss": 0.032, + "step": 109730 + }, + { + "epoch": 0.24202032053247116, + "grad_norm": 0.10322141647338867, + "learning_rate": 2.2537342721635242e-05, + "loss": 0.0325, + "step": 109740 + }, + { + "epoch": 0.24204237450736935, + "grad_norm": 0.10387340933084488, + "learning_rate": 2.2535913113449114e-05, + "loss": 0.0317, + "step": 109750 + }, + { + "epoch": 0.2420644284822675, + "grad_norm": 0.10201581567525864, + "learning_rate": 2.2534483413694526e-05, + "loss": 0.0317, + "step": 109760 + }, + { + "epoch": 0.24208648245716566, + "grad_norm": 0.08892524242401123, + "learning_rate": 2.2533053622388853e-05, + "loss": 0.0352, + "step": 109770 + }, + { + "epoch": 0.24210853643206384, + "grad_norm": 0.12541653215885162, + "learning_rate": 2.2531623739549466e-05, + "loss": 0.031, + "step": 109780 + }, + { + "epoch": 0.242130590406962, + "grad_norm": 0.14333918690681458, + "learning_rate": 2.253019376519374e-05, + "loss": 0.0323, + "step": 109790 + }, + { + "epoch": 0.24215264438186015, + "grad_norm": 0.11745836585760117, + "learning_rate": 2.2528763699339054e-05, + "loss": 0.0319, + "step": 109800 + }, + { + "epoch": 0.24217469835675834, + "grad_norm": 0.12292186915874481, + "learning_rate": 2.2527333542002773e-05, + "loss": 0.0323, + "step": 109810 + }, + { + "epoch": 0.2421967523316565, + "grad_norm": 0.11351130902767181, + "learning_rate": 2.2525903293202286e-05, + "loss": 0.032, + "step": 109820 + }, + { + "epoch": 0.24221880630655465, + "grad_norm": 0.0972914919257164, + "learning_rate": 2.252447295295497e-05, + "loss": 0.0323, + "step": 109830 + }, + { + "epoch": 0.24224086028145284, + "grad_norm": 0.09735989570617676, + "learning_rate": 2.25230425212782e-05, + "loss": 0.0319, + "step": 109840 + }, + { + "epoch": 0.242262914256351, + "grad_norm": 0.10970556735992432, + "learning_rate": 2.2521611998189363e-05, + "loss": 0.032, + "step": 109850 + }, + { + "epoch": 0.24228496823124915, + "grad_norm": 0.11255261301994324, + "learning_rate": 2.252018138370584e-05, + "loss": 0.0319, + "step": 109860 + }, + { + "epoch": 0.24230702220614733, + "grad_norm": 0.13937419652938843, + "learning_rate": 2.2518750677845006e-05, + "loss": 0.0316, + "step": 109870 + }, + { + "epoch": 0.2423290761810455, + "grad_norm": 0.11168672889471054, + "learning_rate": 2.2517319880624262e-05, + "loss": 0.0311, + "step": 109880 + }, + { + "epoch": 0.24235113015594364, + "grad_norm": 0.14619898796081543, + "learning_rate": 2.251588899206098e-05, + "loss": 0.0314, + "step": 109890 + }, + { + "epoch": 0.24237318413084183, + "grad_norm": 0.10259231925010681, + "learning_rate": 2.251445801217255e-05, + "loss": 0.0314, + "step": 109900 + }, + { + "epoch": 0.24239523810573999, + "grad_norm": 0.09376991540193558, + "learning_rate": 2.2513026940976356e-05, + "loss": 0.0306, + "step": 109910 + }, + { + "epoch": 0.24241729208063814, + "grad_norm": 0.10302625596523285, + "learning_rate": 2.2511595778489794e-05, + "loss": 0.0302, + "step": 109920 + }, + { + "epoch": 0.24243934605553633, + "grad_norm": 0.09602642804384232, + "learning_rate": 2.251016452473025e-05, + "loss": 0.0299, + "step": 109930 + }, + { + "epoch": 0.24246140003043448, + "grad_norm": 0.09042048454284668, + "learning_rate": 2.250873317971512e-05, + "loss": 0.0309, + "step": 109940 + }, + { + "epoch": 0.24248345400533264, + "grad_norm": 0.12042590230703354, + "learning_rate": 2.2507301743461794e-05, + "loss": 0.0322, + "step": 109950 + }, + { + "epoch": 0.24250550798023082, + "grad_norm": 0.12044086307287216, + "learning_rate": 2.250587021598766e-05, + "loss": 0.0307, + "step": 109960 + }, + { + "epoch": 0.24252756195512898, + "grad_norm": 0.1087445467710495, + "learning_rate": 2.2504438597310116e-05, + "loss": 0.0307, + "step": 109970 + }, + { + "epoch": 0.24254961593002713, + "grad_norm": 0.11101029813289642, + "learning_rate": 2.250300688744656e-05, + "loss": 0.0323, + "step": 109980 + }, + { + "epoch": 0.24257166990492532, + "grad_norm": 0.10020171105861664, + "learning_rate": 2.2501575086414382e-05, + "loss": 0.0318, + "step": 109990 + }, + { + "epoch": 0.24259372387982348, + "grad_norm": 0.1121123805642128, + "learning_rate": 2.2500143194230986e-05, + "loss": 0.0318, + "step": 110000 + }, + { + "epoch": 0.24261577785472166, + "grad_norm": 0.12509912252426147, + "learning_rate": 2.2498711210913774e-05, + "loss": 0.032, + "step": 110010 + }, + { + "epoch": 0.24263783182961982, + "grad_norm": 0.17119595408439636, + "learning_rate": 2.2497279136480137e-05, + "loss": 0.0299, + "step": 110020 + }, + { + "epoch": 0.24265988580451797, + "grad_norm": 0.10593778640031815, + "learning_rate": 2.249584697094748e-05, + "loss": 0.0334, + "step": 110030 + }, + { + "epoch": 0.24268193977941616, + "grad_norm": 0.12121311575174332, + "learning_rate": 2.2494414714333202e-05, + "loss": 0.0312, + "step": 110040 + }, + { + "epoch": 0.2427039937543143, + "grad_norm": 0.09692730009555817, + "learning_rate": 2.2492982366654712e-05, + "loss": 0.0308, + "step": 110050 + }, + { + "epoch": 0.24272604772921247, + "grad_norm": 0.10793842375278473, + "learning_rate": 2.2491549927929412e-05, + "loss": 0.0311, + "step": 110060 + }, + { + "epoch": 0.24274810170411065, + "grad_norm": 0.09802250564098358, + "learning_rate": 2.2490117398174708e-05, + "loss": 0.0299, + "step": 110070 + }, + { + "epoch": 0.2427701556790088, + "grad_norm": 0.09980472177267075, + "learning_rate": 2.2488684777408004e-05, + "loss": 0.0307, + "step": 110080 + }, + { + "epoch": 0.24279220965390697, + "grad_norm": 0.09619171917438507, + "learning_rate": 2.2487252065646706e-05, + "loss": 0.0329, + "step": 110090 + }, + { + "epoch": 0.24281426362880515, + "grad_norm": 0.11580938845872879, + "learning_rate": 2.248581926290823e-05, + "loss": 0.033, + "step": 110100 + }, + { + "epoch": 0.2428363176037033, + "grad_norm": 0.13992521166801453, + "learning_rate": 2.2484386369209986e-05, + "loss": 0.032, + "step": 110110 + }, + { + "epoch": 0.24285837157860146, + "grad_norm": 0.09871982038021088, + "learning_rate": 2.2482953384569376e-05, + "loss": 0.0342, + "step": 110120 + }, + { + "epoch": 0.24288042555349965, + "grad_norm": 0.09650692343711853, + "learning_rate": 2.248152030900382e-05, + "loss": 0.0326, + "step": 110130 + }, + { + "epoch": 0.2429024795283978, + "grad_norm": 0.16390953958034515, + "learning_rate": 2.2480087142530728e-05, + "loss": 0.0325, + "step": 110140 + }, + { + "epoch": 0.24292453350329596, + "grad_norm": 0.1458076387643814, + "learning_rate": 2.2478653885167513e-05, + "loss": 0.0309, + "step": 110150 + }, + { + "epoch": 0.24294658747819414, + "grad_norm": 0.1126527339220047, + "learning_rate": 2.2477220536931594e-05, + "loss": 0.0327, + "step": 110160 + }, + { + "epoch": 0.2429686414530923, + "grad_norm": 0.13130757212638855, + "learning_rate": 2.247578709784038e-05, + "loss": 0.0309, + "step": 110170 + }, + { + "epoch": 0.24299069542799046, + "grad_norm": 0.11819963157176971, + "learning_rate": 2.2474353567911306e-05, + "loss": 0.0305, + "step": 110180 + }, + { + "epoch": 0.24301274940288864, + "grad_norm": 0.09191866219043732, + "learning_rate": 2.2472919947161767e-05, + "loss": 0.0312, + "step": 110190 + }, + { + "epoch": 0.2430348033777868, + "grad_norm": 0.09273470938205719, + "learning_rate": 2.2471486235609206e-05, + "loss": 0.0312, + "step": 110200 + }, + { + "epoch": 0.24305685735268495, + "grad_norm": 0.0896439254283905, + "learning_rate": 2.2470052433271026e-05, + "loss": 0.0293, + "step": 110210 + }, + { + "epoch": 0.24307891132758314, + "grad_norm": 0.10492819547653198, + "learning_rate": 2.2468618540164657e-05, + "loss": 0.0302, + "step": 110220 + }, + { + "epoch": 0.2431009653024813, + "grad_norm": 0.12040137499570847, + "learning_rate": 2.246718455630752e-05, + "loss": 0.0312, + "step": 110230 + }, + { + "epoch": 0.24312301927737945, + "grad_norm": 0.11890129745006561, + "learning_rate": 2.2465750481717047e-05, + "loss": 0.0339, + "step": 110240 + }, + { + "epoch": 0.24314507325227763, + "grad_norm": 0.09514401853084564, + "learning_rate": 2.246431631641065e-05, + "loss": 0.0323, + "step": 110250 + }, + { + "epoch": 0.2431671272271758, + "grad_norm": 0.11482686549425125, + "learning_rate": 2.2462882060405767e-05, + "loss": 0.0309, + "step": 110260 + }, + { + "epoch": 0.24318918120207395, + "grad_norm": 0.10775961726903915, + "learning_rate": 2.246144771371982e-05, + "loss": 0.0321, + "step": 110270 + }, + { + "epoch": 0.24321123517697213, + "grad_norm": 0.11211598664522171, + "learning_rate": 2.2460013276370236e-05, + "loss": 0.0327, + "step": 110280 + }, + { + "epoch": 0.24323328915187029, + "grad_norm": 0.1196356862783432, + "learning_rate": 2.245857874837445e-05, + "loss": 0.0324, + "step": 110290 + }, + { + "epoch": 0.24325534312676844, + "grad_norm": 0.12895716726779938, + "learning_rate": 2.245714412974989e-05, + "loss": 0.0325, + "step": 110300 + }, + { + "epoch": 0.24327739710166663, + "grad_norm": 0.10566353797912598, + "learning_rate": 2.2455709420513983e-05, + "loss": 0.0316, + "step": 110310 + }, + { + "epoch": 0.24329945107656478, + "grad_norm": 0.10783635079860687, + "learning_rate": 2.245427462068417e-05, + "loss": 0.032, + "step": 110320 + }, + { + "epoch": 0.24332150505146294, + "grad_norm": 0.08294882625341415, + "learning_rate": 2.2452839730277885e-05, + "loss": 0.032, + "step": 110330 + }, + { + "epoch": 0.24334355902636112, + "grad_norm": 0.0899156853556633, + "learning_rate": 2.245140474931256e-05, + "loss": 0.0315, + "step": 110340 + }, + { + "epoch": 0.24336561300125928, + "grad_norm": 0.13051727414131165, + "learning_rate": 2.2449969677805626e-05, + "loss": 0.031, + "step": 110350 + }, + { + "epoch": 0.24338766697615744, + "grad_norm": 0.1179908737540245, + "learning_rate": 2.2448534515774533e-05, + "loss": 0.0327, + "step": 110360 + }, + { + "epoch": 0.24340972095105562, + "grad_norm": 0.12340063601732254, + "learning_rate": 2.244709926323671e-05, + "loss": 0.0319, + "step": 110370 + }, + { + "epoch": 0.24343177492595378, + "grad_norm": 0.09136735647916794, + "learning_rate": 2.2445663920209604e-05, + "loss": 0.0321, + "step": 110380 + }, + { + "epoch": 0.24345382890085193, + "grad_norm": 0.11384539306163788, + "learning_rate": 2.2444228486710645e-05, + "loss": 0.0304, + "step": 110390 + }, + { + "epoch": 0.24347588287575012, + "grad_norm": 0.10221001505851746, + "learning_rate": 2.2442792962757288e-05, + "loss": 0.0329, + "step": 110400 + }, + { + "epoch": 0.24349793685064827, + "grad_norm": 0.14172804355621338, + "learning_rate": 2.2441357348366966e-05, + "loss": 0.0295, + "step": 110410 + }, + { + "epoch": 0.24351999082554643, + "grad_norm": 0.13217447698116302, + "learning_rate": 2.2439921643557127e-05, + "loss": 0.0313, + "step": 110420 + }, + { + "epoch": 0.2435420448004446, + "grad_norm": 0.10000849515199661, + "learning_rate": 2.2438485848345216e-05, + "loss": 0.032, + "step": 110430 + }, + { + "epoch": 0.24356409877534277, + "grad_norm": 0.1190701350569725, + "learning_rate": 2.2437049962748676e-05, + "loss": 0.0312, + "step": 110440 + }, + { + "epoch": 0.24358615275024095, + "grad_norm": 0.1095859557390213, + "learning_rate": 2.2435613986784962e-05, + "loss": 0.0321, + "step": 110450 + }, + { + "epoch": 0.2436082067251391, + "grad_norm": 0.10491825640201569, + "learning_rate": 2.2434177920471513e-05, + "loss": 0.0309, + "step": 110460 + }, + { + "epoch": 0.24363026070003727, + "grad_norm": 0.12656524777412415, + "learning_rate": 2.2432741763825785e-05, + "loss": 0.032, + "step": 110470 + }, + { + "epoch": 0.24365231467493545, + "grad_norm": 0.099948950111866, + "learning_rate": 2.2431305516865227e-05, + "loss": 0.0317, + "step": 110480 + }, + { + "epoch": 0.2436743686498336, + "grad_norm": 0.09640856087207794, + "learning_rate": 2.2429869179607294e-05, + "loss": 0.0325, + "step": 110490 + }, + { + "epoch": 0.24369642262473176, + "grad_norm": 0.13156183063983917, + "learning_rate": 2.2428432752069432e-05, + "loss": 0.0303, + "step": 110500 + }, + { + "epoch": 0.24371847659962995, + "grad_norm": 0.13492275774478912, + "learning_rate": 2.24269962342691e-05, + "loss": 0.0313, + "step": 110510 + }, + { + "epoch": 0.2437405305745281, + "grad_norm": 0.1400054395198822, + "learning_rate": 2.2425559626223752e-05, + "loss": 0.0314, + "step": 110520 + }, + { + "epoch": 0.24376258454942626, + "grad_norm": 0.13416258990764618, + "learning_rate": 2.2424122927950836e-05, + "loss": 0.0329, + "step": 110530 + }, + { + "epoch": 0.24378463852432444, + "grad_norm": 0.1333494931459427, + "learning_rate": 2.2422686139467827e-05, + "loss": 0.0301, + "step": 110540 + }, + { + "epoch": 0.2438066924992226, + "grad_norm": 0.12575586140155792, + "learning_rate": 2.2421249260792167e-05, + "loss": 0.0314, + "step": 110550 + }, + { + "epoch": 0.24382874647412076, + "grad_norm": 0.09709053486585617, + "learning_rate": 2.2419812291941325e-05, + "loss": 0.0341, + "step": 110560 + }, + { + "epoch": 0.24385080044901894, + "grad_norm": 0.11037062108516693, + "learning_rate": 2.2418375232932756e-05, + "loss": 0.0316, + "step": 110570 + }, + { + "epoch": 0.2438728544239171, + "grad_norm": 0.11041757464408875, + "learning_rate": 2.241693808378392e-05, + "loss": 0.0299, + "step": 110580 + }, + { + "epoch": 0.24389490839881525, + "grad_norm": 0.10262307524681091, + "learning_rate": 2.2415500844512295e-05, + "loss": 0.0315, + "step": 110590 + }, + { + "epoch": 0.24391696237371344, + "grad_norm": 0.08237063139677048, + "learning_rate": 2.241406351513532e-05, + "loss": 0.0317, + "step": 110600 + }, + { + "epoch": 0.2439390163486116, + "grad_norm": 0.10579921305179596, + "learning_rate": 2.2412626095670485e-05, + "loss": 0.034, + "step": 110610 + }, + { + "epoch": 0.24396107032350975, + "grad_norm": 0.14292794466018677, + "learning_rate": 2.241118858613524e-05, + "loss": 0.0316, + "step": 110620 + }, + { + "epoch": 0.24398312429840793, + "grad_norm": 0.11435599625110626, + "learning_rate": 2.2409750986547053e-05, + "loss": 0.0334, + "step": 110630 + }, + { + "epoch": 0.2440051782733061, + "grad_norm": 0.10634224861860275, + "learning_rate": 2.2408313296923394e-05, + "loss": 0.0318, + "step": 110640 + }, + { + "epoch": 0.24402723224820425, + "grad_norm": 0.09262220561504364, + "learning_rate": 2.2406875517281746e-05, + "loss": 0.0334, + "step": 110650 + }, + { + "epoch": 0.24404928622310243, + "grad_norm": 0.10385343432426453, + "learning_rate": 2.2405437647639557e-05, + "loss": 0.0306, + "step": 110660 + }, + { + "epoch": 0.24407134019800059, + "grad_norm": 0.10151569545269012, + "learning_rate": 2.2403999688014307e-05, + "loss": 0.0333, + "step": 110670 + }, + { + "epoch": 0.24409339417289874, + "grad_norm": 0.0940270870923996, + "learning_rate": 2.2402561638423475e-05, + "loss": 0.0322, + "step": 110680 + }, + { + "epoch": 0.24411544814779693, + "grad_norm": 0.1060807853937149, + "learning_rate": 2.240112349888453e-05, + "loss": 0.0317, + "step": 110690 + }, + { + "epoch": 0.24413750212269508, + "grad_norm": 0.11708325892686844, + "learning_rate": 2.2399685269414943e-05, + "loss": 0.0319, + "step": 110700 + }, + { + "epoch": 0.24415955609759324, + "grad_norm": 0.11111146211624146, + "learning_rate": 2.23982469500322e-05, + "loss": 0.032, + "step": 110710 + }, + { + "epoch": 0.24418161007249142, + "grad_norm": 0.11873152107000351, + "learning_rate": 2.239680854075377e-05, + "loss": 0.0317, + "step": 110720 + }, + { + "epoch": 0.24420366404738958, + "grad_norm": 0.08669726550579071, + "learning_rate": 2.2395370041597124e-05, + "loss": 0.0316, + "step": 110730 + }, + { + "epoch": 0.24422571802228774, + "grad_norm": 0.09262480586767197, + "learning_rate": 2.239393145257976e-05, + "loss": 0.0327, + "step": 110740 + }, + { + "epoch": 0.24424777199718592, + "grad_norm": 0.12137378752231598, + "learning_rate": 2.2392492773719143e-05, + "loss": 0.0341, + "step": 110750 + }, + { + "epoch": 0.24426982597208408, + "grad_norm": 0.12481832504272461, + "learning_rate": 2.239105400503276e-05, + "loss": 0.0306, + "step": 110760 + }, + { + "epoch": 0.24429187994698223, + "grad_norm": 0.10413845628499985, + "learning_rate": 2.238961514653809e-05, + "loss": 0.031, + "step": 110770 + }, + { + "epoch": 0.24431393392188042, + "grad_norm": 0.10889163613319397, + "learning_rate": 2.238817619825262e-05, + "loss": 0.0301, + "step": 110780 + }, + { + "epoch": 0.24433598789677857, + "grad_norm": 0.11622245609760284, + "learning_rate": 2.2386737160193837e-05, + "loss": 0.0309, + "step": 110790 + }, + { + "epoch": 0.24435804187167673, + "grad_norm": 0.1495746523141861, + "learning_rate": 2.238529803237922e-05, + "loss": 0.0322, + "step": 110800 + }, + { + "epoch": 0.2443800958465749, + "grad_norm": 0.11894071847200394, + "learning_rate": 2.238385881482626e-05, + "loss": 0.0328, + "step": 110810 + }, + { + "epoch": 0.24440214982147307, + "grad_norm": 0.1342863291501999, + "learning_rate": 2.238241950755244e-05, + "loss": 0.0309, + "step": 110820 + }, + { + "epoch": 0.24442420379637123, + "grad_norm": 0.10522786527872086, + "learning_rate": 2.238098011057526e-05, + "loss": 0.0293, + "step": 110830 + }, + { + "epoch": 0.2444462577712694, + "grad_norm": 0.10648920387029648, + "learning_rate": 2.2379540623912198e-05, + "loss": 0.0309, + "step": 110840 + }, + { + "epoch": 0.24446831174616757, + "grad_norm": 0.10382199287414551, + "learning_rate": 2.2378101047580752e-05, + "loss": 0.032, + "step": 110850 + }, + { + "epoch": 0.24449036572106575, + "grad_norm": 0.14713099598884583, + "learning_rate": 2.237666138159841e-05, + "loss": 0.0294, + "step": 110860 + }, + { + "epoch": 0.2445124196959639, + "grad_norm": 0.12532131373882294, + "learning_rate": 2.2375221625982674e-05, + "loss": 0.0322, + "step": 110870 + }, + { + "epoch": 0.24453447367086206, + "grad_norm": 0.11358509957790375, + "learning_rate": 2.2373781780751024e-05, + "loss": 0.0308, + "step": 110880 + }, + { + "epoch": 0.24455652764576025, + "grad_norm": 0.11390987038612366, + "learning_rate": 2.2372341845920967e-05, + "loss": 0.0328, + "step": 110890 + }, + { + "epoch": 0.2445785816206584, + "grad_norm": 0.10779125243425369, + "learning_rate": 2.2370901821509994e-05, + "loss": 0.0309, + "step": 110900 + }, + { + "epoch": 0.24460063559555656, + "grad_norm": 0.08984389156103134, + "learning_rate": 2.236946170753561e-05, + "loss": 0.0312, + "step": 110910 + }, + { + "epoch": 0.24462268957045474, + "grad_norm": 0.10729391872882843, + "learning_rate": 2.23680215040153e-05, + "loss": 0.0308, + "step": 110920 + }, + { + "epoch": 0.2446447435453529, + "grad_norm": 0.10822179168462753, + "learning_rate": 2.2366581210966578e-05, + "loss": 0.0327, + "step": 110930 + }, + { + "epoch": 0.24466679752025106, + "grad_norm": 0.10345632582902908, + "learning_rate": 2.2365140828406936e-05, + "loss": 0.0306, + "step": 110940 + }, + { + "epoch": 0.24468885149514924, + "grad_norm": 0.11183522641658783, + "learning_rate": 2.2363700356353883e-05, + "loss": 0.0299, + "step": 110950 + }, + { + "epoch": 0.2447109054700474, + "grad_norm": 0.12164581567049026, + "learning_rate": 2.2362259794824915e-05, + "loss": 0.0307, + "step": 110960 + }, + { + "epoch": 0.24473295944494555, + "grad_norm": 0.10209998488426208, + "learning_rate": 2.236081914383754e-05, + "loss": 0.0332, + "step": 110970 + }, + { + "epoch": 0.24475501341984374, + "grad_norm": 0.13785824179649353, + "learning_rate": 2.2359378403409265e-05, + "loss": 0.0318, + "step": 110980 + }, + { + "epoch": 0.2447770673947419, + "grad_norm": 0.10662771016359329, + "learning_rate": 2.2357937573557592e-05, + "loss": 0.03, + "step": 110990 + }, + { + "epoch": 0.24479912136964005, + "grad_norm": 0.14458003640174866, + "learning_rate": 2.235649665430003e-05, + "loss": 0.0331, + "step": 111000 + }, + { + "epoch": 0.24482117534453823, + "grad_norm": 0.09656701236963272, + "learning_rate": 2.235505564565409e-05, + "loss": 0.0298, + "step": 111010 + }, + { + "epoch": 0.2448432293194364, + "grad_norm": 0.11878858506679535, + "learning_rate": 2.2353614547637277e-05, + "loss": 0.0319, + "step": 111020 + }, + { + "epoch": 0.24486528329433455, + "grad_norm": 0.11423993855714798, + "learning_rate": 2.2352173360267103e-05, + "loss": 0.0305, + "step": 111030 + }, + { + "epoch": 0.24488733726923273, + "grad_norm": 0.10375256836414337, + "learning_rate": 2.2350732083561085e-05, + "loss": 0.0326, + "step": 111040 + }, + { + "epoch": 0.2449093912441309, + "grad_norm": 0.08458012342453003, + "learning_rate": 2.2349290717536726e-05, + "loss": 0.0305, + "step": 111050 + }, + { + "epoch": 0.24493144521902904, + "grad_norm": 0.12179186940193176, + "learning_rate": 2.234784926221155e-05, + "loss": 0.0304, + "step": 111060 + }, + { + "epoch": 0.24495349919392723, + "grad_norm": 0.09926604479551315, + "learning_rate": 2.234640771760307e-05, + "loss": 0.0305, + "step": 111070 + }, + { + "epoch": 0.24497555316882538, + "grad_norm": 0.0912691205739975, + "learning_rate": 2.23449660837288e-05, + "loss": 0.0303, + "step": 111080 + }, + { + "epoch": 0.24499760714372354, + "grad_norm": 0.12375453859567642, + "learning_rate": 2.2343524360606252e-05, + "loss": 0.0313, + "step": 111090 + }, + { + "epoch": 0.24501966111862172, + "grad_norm": 0.12573209404945374, + "learning_rate": 2.2342082548252956e-05, + "loss": 0.0305, + "step": 111100 + }, + { + "epoch": 0.24504171509351988, + "grad_norm": 0.10352395474910736, + "learning_rate": 2.2340640646686423e-05, + "loss": 0.0315, + "step": 111110 + }, + { + "epoch": 0.24506376906841804, + "grad_norm": 0.10599818080663681, + "learning_rate": 2.233919865592417e-05, + "loss": 0.0317, + "step": 111120 + }, + { + "epoch": 0.24508582304331622, + "grad_norm": 0.10101918876171112, + "learning_rate": 2.233775657598373e-05, + "loss": 0.0302, + "step": 111130 + }, + { + "epoch": 0.24510787701821438, + "grad_norm": 0.13706238567829132, + "learning_rate": 2.2336314406882617e-05, + "loss": 0.0304, + "step": 111140 + }, + { + "epoch": 0.24512993099311253, + "grad_norm": 0.09070711582899094, + "learning_rate": 2.233487214863836e-05, + "loss": 0.0339, + "step": 111150 + }, + { + "epoch": 0.24515198496801072, + "grad_norm": 0.08352136611938477, + "learning_rate": 2.233342980126848e-05, + "loss": 0.0317, + "step": 111160 + }, + { + "epoch": 0.24517403894290887, + "grad_norm": 0.09616174548864365, + "learning_rate": 2.23319873647905e-05, + "loss": 0.0332, + "step": 111170 + }, + { + "epoch": 0.24519609291780703, + "grad_norm": 0.10218177735805511, + "learning_rate": 2.2330544839221958e-05, + "loss": 0.033, + "step": 111180 + }, + { + "epoch": 0.2452181468927052, + "grad_norm": 0.1069832369685173, + "learning_rate": 2.232910222458037e-05, + "loss": 0.0306, + "step": 111190 + }, + { + "epoch": 0.24524020086760337, + "grad_norm": 0.10912994295358658, + "learning_rate": 2.232765952088327e-05, + "loss": 0.032, + "step": 111200 + }, + { + "epoch": 0.24526225484250153, + "grad_norm": 0.10892214626073837, + "learning_rate": 2.232621672814819e-05, + "loss": 0.0337, + "step": 111210 + }, + { + "epoch": 0.2452843088173997, + "grad_norm": 0.09709000587463379, + "learning_rate": 2.232477384639266e-05, + "loss": 0.0306, + "step": 111220 + }, + { + "epoch": 0.24530636279229787, + "grad_norm": 0.08550593256950378, + "learning_rate": 2.2323330875634213e-05, + "loss": 0.0306, + "step": 111230 + }, + { + "epoch": 0.24532841676719602, + "grad_norm": 0.10672490298748016, + "learning_rate": 2.232188781589038e-05, + "loss": 0.0294, + "step": 111240 + }, + { + "epoch": 0.2453504707420942, + "grad_norm": 0.11587899178266525, + "learning_rate": 2.23204446671787e-05, + "loss": 0.0313, + "step": 111250 + }, + { + "epoch": 0.24537252471699236, + "grad_norm": 0.09754220396280289, + "learning_rate": 2.23190014295167e-05, + "loss": 0.0329, + "step": 111260 + }, + { + "epoch": 0.24539457869189052, + "grad_norm": 0.08261764794588089, + "learning_rate": 2.2317558102921923e-05, + "loss": 0.0316, + "step": 111270 + }, + { + "epoch": 0.2454166326667887, + "grad_norm": 0.13583670556545258, + "learning_rate": 2.231611468741191e-05, + "loss": 0.0316, + "step": 111280 + }, + { + "epoch": 0.24543868664168686, + "grad_norm": 0.10056144744157791, + "learning_rate": 2.2314671183004195e-05, + "loss": 0.0312, + "step": 111290 + }, + { + "epoch": 0.24546074061658504, + "grad_norm": 0.12079363316297531, + "learning_rate": 2.2313227589716314e-05, + "loss": 0.0334, + "step": 111300 + }, + { + "epoch": 0.2454827945914832, + "grad_norm": 0.10689440369606018, + "learning_rate": 2.231178390756582e-05, + "loss": 0.0311, + "step": 111310 + }, + { + "epoch": 0.24550484856638136, + "grad_norm": 0.1258450448513031, + "learning_rate": 2.2310340136570248e-05, + "loss": 0.0321, + "step": 111320 + }, + { + "epoch": 0.24552690254127954, + "grad_norm": 0.11017140001058578, + "learning_rate": 2.2308896276747136e-05, + "loss": 0.0298, + "step": 111330 + }, + { + "epoch": 0.2455489565161777, + "grad_norm": 0.11189096421003342, + "learning_rate": 2.2307452328114036e-05, + "loss": 0.0306, + "step": 111340 + }, + { + "epoch": 0.24557101049107585, + "grad_norm": 0.10771305114030838, + "learning_rate": 2.2306008290688492e-05, + "loss": 0.0311, + "step": 111350 + }, + { + "epoch": 0.24559306446597404, + "grad_norm": 0.08776649832725525, + "learning_rate": 2.2304564164488047e-05, + "loss": 0.0318, + "step": 111360 + }, + { + "epoch": 0.2456151184408722, + "grad_norm": 0.1132480576634407, + "learning_rate": 2.230311994953025e-05, + "loss": 0.032, + "step": 111370 + }, + { + "epoch": 0.24563717241577035, + "grad_norm": 0.10264861583709717, + "learning_rate": 2.2301675645832653e-05, + "loss": 0.0308, + "step": 111380 + }, + { + "epoch": 0.24565922639066853, + "grad_norm": 0.10525768250226974, + "learning_rate": 2.2300231253412805e-05, + "loss": 0.0329, + "step": 111390 + }, + { + "epoch": 0.2456812803655667, + "grad_norm": 0.10041461884975433, + "learning_rate": 2.229878677228825e-05, + "loss": 0.0337, + "step": 111400 + }, + { + "epoch": 0.24570333434046485, + "grad_norm": 0.10839971899986267, + "learning_rate": 2.2297342202476544e-05, + "loss": 0.0297, + "step": 111410 + }, + { + "epoch": 0.24572538831536303, + "grad_norm": 0.10606397688388824, + "learning_rate": 2.229589754399524e-05, + "loss": 0.0314, + "step": 111420 + }, + { + "epoch": 0.2457474422902612, + "grad_norm": 0.11696070432662964, + "learning_rate": 2.2294452796861902e-05, + "loss": 0.0329, + "step": 111430 + }, + { + "epoch": 0.24576949626515934, + "grad_norm": 0.1047847792506218, + "learning_rate": 2.2293007961094062e-05, + "loss": 0.0309, + "step": 111440 + }, + { + "epoch": 0.24579155024005753, + "grad_norm": 0.0944553017616272, + "learning_rate": 2.22915630367093e-05, + "loss": 0.0322, + "step": 111450 + }, + { + "epoch": 0.24581360421495568, + "grad_norm": 0.09842425584793091, + "learning_rate": 2.2290118023725157e-05, + "loss": 0.0326, + "step": 111460 + }, + { + "epoch": 0.24583565818985384, + "grad_norm": 0.10835151374340057, + "learning_rate": 2.22886729221592e-05, + "loss": 0.0327, + "step": 111470 + }, + { + "epoch": 0.24585771216475202, + "grad_norm": 0.09827754646539688, + "learning_rate": 2.2287227732028986e-05, + "loss": 0.0302, + "step": 111480 + }, + { + "epoch": 0.24587976613965018, + "grad_norm": 0.10920864343643188, + "learning_rate": 2.228578245335207e-05, + "loss": 0.032, + "step": 111490 + }, + { + "epoch": 0.24590182011454834, + "grad_norm": 0.10406506806612015, + "learning_rate": 2.2284337086146023e-05, + "loss": 0.032, + "step": 111500 + }, + { + "epoch": 0.24592387408944652, + "grad_norm": 0.09690041095018387, + "learning_rate": 2.2282891630428403e-05, + "loss": 0.0319, + "step": 111510 + }, + { + "epoch": 0.24594592806434468, + "grad_norm": 0.11416218429803848, + "learning_rate": 2.228144608621677e-05, + "loss": 0.0337, + "step": 111520 + }, + { + "epoch": 0.24596798203924283, + "grad_norm": 0.0874997153878212, + "learning_rate": 2.2280000453528694e-05, + "loss": 0.0304, + "step": 111530 + }, + { + "epoch": 0.24599003601414102, + "grad_norm": 0.09595749527215958, + "learning_rate": 2.227855473238174e-05, + "loss": 0.0323, + "step": 111540 + }, + { + "epoch": 0.24601208998903917, + "grad_norm": 0.1270628422498703, + "learning_rate": 2.227710892279347e-05, + "loss": 0.032, + "step": 111550 + }, + { + "epoch": 0.24603414396393733, + "grad_norm": 0.11419206857681274, + "learning_rate": 2.227566302478146e-05, + "loss": 0.0323, + "step": 111560 + }, + { + "epoch": 0.2460561979388355, + "grad_norm": 0.09873327612876892, + "learning_rate": 2.227421703836327e-05, + "loss": 0.0309, + "step": 111570 + }, + { + "epoch": 0.24607825191373367, + "grad_norm": 0.12284641712903976, + "learning_rate": 2.227277096355648e-05, + "loss": 0.0347, + "step": 111580 + }, + { + "epoch": 0.24610030588863183, + "grad_norm": 0.09901832044124603, + "learning_rate": 2.227132480037865e-05, + "loss": 0.0297, + "step": 111590 + }, + { + "epoch": 0.24612235986353, + "grad_norm": 0.1056702584028244, + "learning_rate": 2.2269878548847366e-05, + "loss": 0.0331, + "step": 111600 + }, + { + "epoch": 0.24614441383842817, + "grad_norm": 0.11417072266340256, + "learning_rate": 2.226843220898019e-05, + "loss": 0.0311, + "step": 111610 + }, + { + "epoch": 0.24616646781332632, + "grad_norm": 0.13444453477859497, + "learning_rate": 2.2266985780794704e-05, + "loss": 0.0308, + "step": 111620 + }, + { + "epoch": 0.2461885217882245, + "grad_norm": 0.10780493915081024, + "learning_rate": 2.2265539264308475e-05, + "loss": 0.0313, + "step": 111630 + }, + { + "epoch": 0.24621057576312266, + "grad_norm": 0.09591228514909744, + "learning_rate": 2.2264092659539085e-05, + "loss": 0.0311, + "step": 111640 + }, + { + "epoch": 0.24623262973802082, + "grad_norm": 0.11817760020494461, + "learning_rate": 2.226264596650411e-05, + "loss": 0.0314, + "step": 111650 + }, + { + "epoch": 0.246254683712919, + "grad_norm": 0.15483108162879944, + "learning_rate": 2.2261199185221132e-05, + "loss": 0.031, + "step": 111660 + }, + { + "epoch": 0.24627673768781716, + "grad_norm": 0.13142496347427368, + "learning_rate": 2.2259752315707725e-05, + "loss": 0.0315, + "step": 111670 + }, + { + "epoch": 0.24629879166271532, + "grad_norm": 0.12795668840408325, + "learning_rate": 2.2258305357981474e-05, + "loss": 0.032, + "step": 111680 + }, + { + "epoch": 0.2463208456376135, + "grad_norm": 0.09866978973150253, + "learning_rate": 2.225685831205996e-05, + "loss": 0.0313, + "step": 111690 + }, + { + "epoch": 0.24634289961251166, + "grad_norm": 0.09956430643796921, + "learning_rate": 2.2255411177960766e-05, + "loss": 0.0309, + "step": 111700 + }, + { + "epoch": 0.24636495358740984, + "grad_norm": 0.10205665975809097, + "learning_rate": 2.225396395570148e-05, + "loss": 0.0328, + "step": 111710 + }, + { + "epoch": 0.246387007562308, + "grad_norm": 0.10596700012683868, + "learning_rate": 2.225251664529968e-05, + "loss": 0.031, + "step": 111720 + }, + { + "epoch": 0.24640906153720615, + "grad_norm": 0.0968056470155716, + "learning_rate": 2.2251069246772956e-05, + "loss": 0.032, + "step": 111730 + }, + { + "epoch": 0.24643111551210434, + "grad_norm": 0.08745091408491135, + "learning_rate": 2.224962176013889e-05, + "loss": 0.0299, + "step": 111740 + }, + { + "epoch": 0.2464531694870025, + "grad_norm": 0.10311703383922577, + "learning_rate": 2.224817418541508e-05, + "loss": 0.0313, + "step": 111750 + }, + { + "epoch": 0.24647522346190065, + "grad_norm": 0.09319330006837845, + "learning_rate": 2.2246726522619108e-05, + "loss": 0.0313, + "step": 111760 + }, + { + "epoch": 0.24649727743679883, + "grad_norm": 0.08830424398183823, + "learning_rate": 2.224527877176857e-05, + "loss": 0.0298, + "step": 111770 + }, + { + "epoch": 0.246519331411697, + "grad_norm": 0.10691413283348083, + "learning_rate": 2.224383093288105e-05, + "loss": 0.0304, + "step": 111780 + }, + { + "epoch": 0.24654138538659515, + "grad_norm": 0.10988463461399078, + "learning_rate": 2.2242383005974146e-05, + "loss": 0.0299, + "step": 111790 + }, + { + "epoch": 0.24656343936149333, + "grad_norm": 0.13231317698955536, + "learning_rate": 2.2240934991065454e-05, + "loss": 0.0324, + "step": 111800 + }, + { + "epoch": 0.2465854933363915, + "grad_norm": 0.11686889827251434, + "learning_rate": 2.223948688817256e-05, + "loss": 0.031, + "step": 111810 + }, + { + "epoch": 0.24660754731128964, + "grad_norm": 0.09424937516450882, + "learning_rate": 2.2238038697313067e-05, + "loss": 0.0302, + "step": 111820 + }, + { + "epoch": 0.24662960128618783, + "grad_norm": 0.10809896141290665, + "learning_rate": 2.223659041850457e-05, + "loss": 0.0313, + "step": 111830 + }, + { + "epoch": 0.24665165526108598, + "grad_norm": 0.14708039164543152, + "learning_rate": 2.223514205176467e-05, + "loss": 0.0318, + "step": 111840 + }, + { + "epoch": 0.24667370923598414, + "grad_norm": 0.1078837662935257, + "learning_rate": 2.2233693597110954e-05, + "loss": 0.0327, + "step": 111850 + }, + { + "epoch": 0.24669576321088232, + "grad_norm": 0.0971042737364769, + "learning_rate": 2.223224505456104e-05, + "loss": 0.0325, + "step": 111860 + }, + { + "epoch": 0.24671781718578048, + "grad_norm": 0.12711381912231445, + "learning_rate": 2.2230796424132517e-05, + "loss": 0.0321, + "step": 111870 + }, + { + "epoch": 0.24673987116067864, + "grad_norm": 0.1286446899175644, + "learning_rate": 2.2229347705842993e-05, + "loss": 0.0318, + "step": 111880 + }, + { + "epoch": 0.24676192513557682, + "grad_norm": 0.08240140974521637, + "learning_rate": 2.2227898899710067e-05, + "loss": 0.0328, + "step": 111890 + }, + { + "epoch": 0.24678397911047498, + "grad_norm": 0.10454419255256653, + "learning_rate": 2.222645000575134e-05, + "loss": 0.0315, + "step": 111900 + }, + { + "epoch": 0.24680603308537313, + "grad_norm": 0.1035684123635292, + "learning_rate": 2.222500102398443e-05, + "loss": 0.0327, + "step": 111910 + }, + { + "epoch": 0.24682808706027132, + "grad_norm": 0.11476291716098785, + "learning_rate": 2.2223551954426934e-05, + "loss": 0.0322, + "step": 111920 + }, + { + "epoch": 0.24685014103516947, + "grad_norm": 0.10820548981428146, + "learning_rate": 2.222210279709646e-05, + "loss": 0.0308, + "step": 111930 + }, + { + "epoch": 0.24687219501006763, + "grad_norm": 0.09383448213338852, + "learning_rate": 2.2220653552010616e-05, + "loss": 0.0312, + "step": 111940 + }, + { + "epoch": 0.24689424898496581, + "grad_norm": 0.10372285544872284, + "learning_rate": 2.221920421918702e-05, + "loss": 0.0317, + "step": 111950 + }, + { + "epoch": 0.24691630295986397, + "grad_norm": 0.11486402153968811, + "learning_rate": 2.2217754798643276e-05, + "loss": 0.0328, + "step": 111960 + }, + { + "epoch": 0.24693835693476213, + "grad_norm": 0.13266617059707642, + "learning_rate": 2.2216305290396996e-05, + "loss": 0.032, + "step": 111970 + }, + { + "epoch": 0.2469604109096603, + "grad_norm": 0.14451603591442108, + "learning_rate": 2.221485569446579e-05, + "loss": 0.0316, + "step": 111980 + }, + { + "epoch": 0.24698246488455847, + "grad_norm": 0.11078909784555435, + "learning_rate": 2.2213406010867276e-05, + "loss": 0.0328, + "step": 111990 + }, + { + "epoch": 0.24700451885945662, + "grad_norm": 0.1043790727853775, + "learning_rate": 2.221195623961907e-05, + "loss": 0.0319, + "step": 112000 + }, + { + "epoch": 0.2470265728343548, + "grad_norm": 0.10204831510782242, + "learning_rate": 2.2210506380738782e-05, + "loss": 0.0329, + "step": 112010 + }, + { + "epoch": 0.24704862680925296, + "grad_norm": 0.09573155641555786, + "learning_rate": 2.220905643424404e-05, + "loss": 0.0329, + "step": 112020 + }, + { + "epoch": 0.24707068078415112, + "grad_norm": 0.10426578670740128, + "learning_rate": 2.2207606400152453e-05, + "loss": 0.0315, + "step": 112030 + }, + { + "epoch": 0.2470927347590493, + "grad_norm": 0.10808269679546356, + "learning_rate": 2.220615627848164e-05, + "loss": 0.0321, + "step": 112040 + }, + { + "epoch": 0.24711478873394746, + "grad_norm": 0.11514066904783249, + "learning_rate": 2.2204706069249234e-05, + "loss": 0.0324, + "step": 112050 + }, + { + "epoch": 0.24713684270884562, + "grad_norm": 0.07799530774354935, + "learning_rate": 2.2203255772472836e-05, + "loss": 0.0313, + "step": 112060 + }, + { + "epoch": 0.2471588966837438, + "grad_norm": 0.10632971674203873, + "learning_rate": 2.220180538817009e-05, + "loss": 0.0323, + "step": 112070 + }, + { + "epoch": 0.24718095065864196, + "grad_norm": 0.10179588943719864, + "learning_rate": 2.22003549163586e-05, + "loss": 0.031, + "step": 112080 + }, + { + "epoch": 0.2472030046335401, + "grad_norm": 0.09964463114738464, + "learning_rate": 2.2198904357056e-05, + "loss": 0.0303, + "step": 112090 + }, + { + "epoch": 0.2472250586084383, + "grad_norm": 0.10418087244033813, + "learning_rate": 2.219745371027992e-05, + "loss": 0.0314, + "step": 112100 + }, + { + "epoch": 0.24724711258333645, + "grad_norm": 0.10210828483104706, + "learning_rate": 2.219600297604798e-05, + "loss": 0.03, + "step": 112110 + }, + { + "epoch": 0.2472691665582346, + "grad_norm": 0.08901410549879074, + "learning_rate": 2.219455215437781e-05, + "loss": 0.0306, + "step": 112120 + }, + { + "epoch": 0.2472912205331328, + "grad_norm": 0.10904128104448318, + "learning_rate": 2.2193101245287035e-05, + "loss": 0.0322, + "step": 112130 + }, + { + "epoch": 0.24731327450803095, + "grad_norm": 0.09831032902002335, + "learning_rate": 2.2191650248793295e-05, + "loss": 0.0335, + "step": 112140 + }, + { + "epoch": 0.24733532848292913, + "grad_norm": 0.10269573330879211, + "learning_rate": 2.2190199164914208e-05, + "loss": 0.0337, + "step": 112150 + }, + { + "epoch": 0.2473573824578273, + "grad_norm": 0.10694152861833572, + "learning_rate": 2.2188747993667417e-05, + "loss": 0.0318, + "step": 112160 + }, + { + "epoch": 0.24737943643272545, + "grad_norm": 0.12684495747089386, + "learning_rate": 2.2187296735070546e-05, + "loss": 0.0306, + "step": 112170 + }, + { + "epoch": 0.24740149040762363, + "grad_norm": 0.12491792440414429, + "learning_rate": 2.218584538914124e-05, + "loss": 0.0295, + "step": 112180 + }, + { + "epoch": 0.2474235443825218, + "grad_norm": 0.10221649706363678, + "learning_rate": 2.218439395589712e-05, + "loss": 0.0295, + "step": 112190 + }, + { + "epoch": 0.24744559835741994, + "grad_norm": 0.10895255953073502, + "learning_rate": 2.2182942435355838e-05, + "loss": 0.0298, + "step": 112200 + }, + { + "epoch": 0.24746765233231813, + "grad_norm": 0.09553427994251251, + "learning_rate": 2.218149082753502e-05, + "loss": 0.0313, + "step": 112210 + }, + { + "epoch": 0.24748970630721628, + "grad_norm": 0.09167218208312988, + "learning_rate": 2.2180039132452308e-05, + "loss": 0.0301, + "step": 112220 + }, + { + "epoch": 0.24751176028211444, + "grad_norm": 0.09772979468107224, + "learning_rate": 2.2178587350125344e-05, + "loss": 0.03, + "step": 112230 + }, + { + "epoch": 0.24753381425701262, + "grad_norm": 0.11919453740119934, + "learning_rate": 2.217713548057176e-05, + "loss": 0.0301, + "step": 112240 + }, + { + "epoch": 0.24755586823191078, + "grad_norm": 0.10921463370323181, + "learning_rate": 2.217568352380921e-05, + "loss": 0.0311, + "step": 112250 + }, + { + "epoch": 0.24757792220680894, + "grad_norm": 0.09790568053722382, + "learning_rate": 2.217423147985533e-05, + "loss": 0.0325, + "step": 112260 + }, + { + "epoch": 0.24759997618170712, + "grad_norm": 0.09328174591064453, + "learning_rate": 2.217277934872776e-05, + "loss": 0.0302, + "step": 112270 + }, + { + "epoch": 0.24762203015660528, + "grad_norm": 0.10333195328712463, + "learning_rate": 2.2171327130444147e-05, + "loss": 0.0327, + "step": 112280 + }, + { + "epoch": 0.24764408413150343, + "grad_norm": 0.07733944803476334, + "learning_rate": 2.2169874825022144e-05, + "loss": 0.0303, + "step": 112290 + }, + { + "epoch": 0.24766613810640162, + "grad_norm": 0.0820804238319397, + "learning_rate": 2.2168422432479386e-05, + "loss": 0.0299, + "step": 112300 + }, + { + "epoch": 0.24768819208129977, + "grad_norm": 0.11487966775894165, + "learning_rate": 2.2166969952833537e-05, + "loss": 0.0297, + "step": 112310 + }, + { + "epoch": 0.24771024605619793, + "grad_norm": 0.1288120597600937, + "learning_rate": 2.2165517386102225e-05, + "loss": 0.0294, + "step": 112320 + }, + { + "epoch": 0.24773230003109611, + "grad_norm": 0.11734277009963989, + "learning_rate": 2.2164064732303122e-05, + "loss": 0.0322, + "step": 112330 + }, + { + "epoch": 0.24775435400599427, + "grad_norm": 0.09643345326185226, + "learning_rate": 2.2162611991453857e-05, + "loss": 0.0284, + "step": 112340 + }, + { + "epoch": 0.24777640798089243, + "grad_norm": 0.11831586062908173, + "learning_rate": 2.2161159163572103e-05, + "loss": 0.0309, + "step": 112350 + }, + { + "epoch": 0.2477984619557906, + "grad_norm": 0.12988118827342987, + "learning_rate": 2.21597062486755e-05, + "loss": 0.0319, + "step": 112360 + }, + { + "epoch": 0.24782051593068877, + "grad_norm": 0.07885953038930893, + "learning_rate": 2.215825324678171e-05, + "loss": 0.0314, + "step": 112370 + }, + { + "epoch": 0.24784256990558692, + "grad_norm": 0.09116274118423462, + "learning_rate": 2.215680015790838e-05, + "loss": 0.0337, + "step": 112380 + }, + { + "epoch": 0.2478646238804851, + "grad_norm": 0.11580519378185272, + "learning_rate": 2.215534698207317e-05, + "loss": 0.0308, + "step": 112390 + }, + { + "epoch": 0.24788667785538326, + "grad_norm": 0.12444151937961578, + "learning_rate": 2.2153893719293744e-05, + "loss": 0.0312, + "step": 112400 + }, + { + "epoch": 0.24790873183028142, + "grad_norm": 0.12528857588768005, + "learning_rate": 2.2152440369587754e-05, + "loss": 0.0312, + "step": 112410 + }, + { + "epoch": 0.2479307858051796, + "grad_norm": 0.09924133867025375, + "learning_rate": 2.215098693297286e-05, + "loss": 0.0323, + "step": 112420 + }, + { + "epoch": 0.24795283978007776, + "grad_norm": 0.11413095891475677, + "learning_rate": 2.214953340946672e-05, + "loss": 0.0295, + "step": 112430 + }, + { + "epoch": 0.24797489375497592, + "grad_norm": 0.09661343693733215, + "learning_rate": 2.2148079799087e-05, + "loss": 0.0311, + "step": 112440 + }, + { + "epoch": 0.2479969477298741, + "grad_norm": 0.11317600309848785, + "learning_rate": 2.2146626101851365e-05, + "loss": 0.0316, + "step": 112450 + }, + { + "epoch": 0.24801900170477226, + "grad_norm": 0.08374744653701782, + "learning_rate": 2.214517231777747e-05, + "loss": 0.0325, + "step": 112460 + }, + { + "epoch": 0.2480410556796704, + "grad_norm": 0.11274775117635727, + "learning_rate": 2.2143718446882984e-05, + "loss": 0.0331, + "step": 112470 + }, + { + "epoch": 0.2480631096545686, + "grad_norm": 0.13057579100131989, + "learning_rate": 2.2142264489185584e-05, + "loss": 0.0332, + "step": 112480 + }, + { + "epoch": 0.24808516362946675, + "grad_norm": 0.1337442249059677, + "learning_rate": 2.214081044470292e-05, + "loss": 0.0312, + "step": 112490 + }, + { + "epoch": 0.2481072176043649, + "grad_norm": 0.08364835381507874, + "learning_rate": 2.2139356313452667e-05, + "loss": 0.034, + "step": 112500 + }, + { + "epoch": 0.2481292715792631, + "grad_norm": 0.11355037987232208, + "learning_rate": 2.213790209545249e-05, + "loss": 0.0313, + "step": 112510 + }, + { + "epoch": 0.24815132555416125, + "grad_norm": 0.11309272795915604, + "learning_rate": 2.213644779072007e-05, + "loss": 0.0313, + "step": 112520 + }, + { + "epoch": 0.2481733795290594, + "grad_norm": 0.12533244490623474, + "learning_rate": 2.213499339927307e-05, + "loss": 0.0337, + "step": 112530 + }, + { + "epoch": 0.2481954335039576, + "grad_norm": 0.13109058141708374, + "learning_rate": 2.2133538921129165e-05, + "loss": 0.0313, + "step": 112540 + }, + { + "epoch": 0.24821748747885575, + "grad_norm": 0.08890869468450546, + "learning_rate": 2.2132084356306026e-05, + "loss": 0.0318, + "step": 112550 + }, + { + "epoch": 0.2482395414537539, + "grad_norm": 0.09219896048307419, + "learning_rate": 2.2130629704821327e-05, + "loss": 0.0317, + "step": 112560 + }, + { + "epoch": 0.2482615954286521, + "grad_norm": 0.2127709835767746, + "learning_rate": 2.2129174966692747e-05, + "loss": 0.032, + "step": 112570 + }, + { + "epoch": 0.24828364940355024, + "grad_norm": 0.10247084498405457, + "learning_rate": 2.2127720141937954e-05, + "loss": 0.0326, + "step": 112580 + }, + { + "epoch": 0.24830570337844843, + "grad_norm": 0.12212857604026794, + "learning_rate": 2.212626523057464e-05, + "loss": 0.0306, + "step": 112590 + }, + { + "epoch": 0.24832775735334658, + "grad_norm": 0.12749642133712769, + "learning_rate": 2.212481023262047e-05, + "loss": 0.0329, + "step": 112600 + }, + { + "epoch": 0.24834981132824474, + "grad_norm": 0.1244833767414093, + "learning_rate": 2.212335514809313e-05, + "loss": 0.032, + "step": 112610 + }, + { + "epoch": 0.24837186530314292, + "grad_norm": 0.08349262923002243, + "learning_rate": 2.21218999770103e-05, + "loss": 0.0298, + "step": 112620 + }, + { + "epoch": 0.24839391927804108, + "grad_norm": 0.10562796145677567, + "learning_rate": 2.2120444719389656e-05, + "loss": 0.0315, + "step": 112630 + }, + { + "epoch": 0.24841597325293924, + "grad_norm": 0.12418871372938156, + "learning_rate": 2.2118989375248893e-05, + "loss": 0.032, + "step": 112640 + }, + { + "epoch": 0.24843802722783742, + "grad_norm": 0.10020160675048828, + "learning_rate": 2.211753394460568e-05, + "loss": 0.0303, + "step": 112650 + }, + { + "epoch": 0.24846008120273558, + "grad_norm": 0.12274988740682602, + "learning_rate": 2.2116078427477718e-05, + "loss": 0.0304, + "step": 112660 + }, + { + "epoch": 0.24848213517763373, + "grad_norm": 0.08925649523735046, + "learning_rate": 2.2114622823882676e-05, + "loss": 0.0319, + "step": 112670 + }, + { + "epoch": 0.24850418915253192, + "grad_norm": 0.12208971381187439, + "learning_rate": 2.211316713383826e-05, + "loss": 0.0321, + "step": 112680 + }, + { + "epoch": 0.24852624312743007, + "grad_norm": 0.12489902973175049, + "learning_rate": 2.211171135736214e-05, + "loss": 0.0304, + "step": 112690 + }, + { + "epoch": 0.24854829710232823, + "grad_norm": 0.1108132004737854, + "learning_rate": 2.211025549447201e-05, + "loss": 0.0317, + "step": 112700 + }, + { + "epoch": 0.24857035107722641, + "grad_norm": 0.1198689267039299, + "learning_rate": 2.210879954518557e-05, + "loss": 0.0302, + "step": 112710 + }, + { + "epoch": 0.24859240505212457, + "grad_norm": 0.109065942466259, + "learning_rate": 2.2107343509520498e-05, + "loss": 0.0324, + "step": 112720 + }, + { + "epoch": 0.24861445902702273, + "grad_norm": 0.08958204835653305, + "learning_rate": 2.2105887387494496e-05, + "loss": 0.0309, + "step": 112730 + }, + { + "epoch": 0.2486365130019209, + "grad_norm": 0.10043814778327942, + "learning_rate": 2.210443117912525e-05, + "loss": 0.0311, + "step": 112740 + }, + { + "epoch": 0.24865856697681907, + "grad_norm": 0.10949725657701492, + "learning_rate": 2.210297488443046e-05, + "loss": 0.0327, + "step": 112750 + }, + { + "epoch": 0.24868062095171722, + "grad_norm": 0.08450862765312195, + "learning_rate": 2.210151850342782e-05, + "loss": 0.0298, + "step": 112760 + }, + { + "epoch": 0.2487026749266154, + "grad_norm": 0.10891145467758179, + "learning_rate": 2.210006203613502e-05, + "loss": 0.0318, + "step": 112770 + }, + { + "epoch": 0.24872472890151356, + "grad_norm": 0.13290099799633026, + "learning_rate": 2.2098605482569767e-05, + "loss": 0.031, + "step": 112780 + }, + { + "epoch": 0.24874678287641172, + "grad_norm": 0.11835776269435883, + "learning_rate": 2.209714884274975e-05, + "loss": 0.032, + "step": 112790 + }, + { + "epoch": 0.2487688368513099, + "grad_norm": 0.12675435841083527, + "learning_rate": 2.209569211669268e-05, + "loss": 0.0314, + "step": 112800 + }, + { + "epoch": 0.24879089082620806, + "grad_norm": 0.12595809996128082, + "learning_rate": 2.2094235304416247e-05, + "loss": 0.0326, + "step": 112810 + }, + { + "epoch": 0.24881294480110622, + "grad_norm": 0.09319046139717102, + "learning_rate": 2.2092778405938158e-05, + "loss": 0.0328, + "step": 112820 + }, + { + "epoch": 0.2488349987760044, + "grad_norm": 0.12049796432256699, + "learning_rate": 2.2091321421276113e-05, + "loss": 0.0315, + "step": 112830 + }, + { + "epoch": 0.24885705275090256, + "grad_norm": 0.09338364005088806, + "learning_rate": 2.2089864350447822e-05, + "loss": 0.0313, + "step": 112840 + }, + { + "epoch": 0.24887910672580071, + "grad_norm": 0.09794564545154572, + "learning_rate": 2.2088407193470988e-05, + "loss": 0.0318, + "step": 112850 + }, + { + "epoch": 0.2489011607006989, + "grad_norm": 0.10525710880756378, + "learning_rate": 2.2086949950363304e-05, + "loss": 0.0315, + "step": 112860 + }, + { + "epoch": 0.24892321467559705, + "grad_norm": 0.09427187591791153, + "learning_rate": 2.2085492621142493e-05, + "loss": 0.0305, + "step": 112870 + }, + { + "epoch": 0.2489452686504952, + "grad_norm": 0.07996798306703568, + "learning_rate": 2.2084035205826255e-05, + "loss": 0.0315, + "step": 112880 + }, + { + "epoch": 0.2489673226253934, + "grad_norm": 0.09795763343572617, + "learning_rate": 2.2082577704432302e-05, + "loss": 0.0314, + "step": 112890 + }, + { + "epoch": 0.24898937660029155, + "grad_norm": 0.11319242417812347, + "learning_rate": 2.208112011697834e-05, + "loss": 0.0309, + "step": 112900 + }, + { + "epoch": 0.2490114305751897, + "grad_norm": 0.09859359264373779, + "learning_rate": 2.2079662443482088e-05, + "loss": 0.0298, + "step": 112910 + }, + { + "epoch": 0.2490334845500879, + "grad_norm": 0.1309007704257965, + "learning_rate": 2.2078204683961252e-05, + "loss": 0.031, + "step": 112920 + }, + { + "epoch": 0.24905553852498605, + "grad_norm": 0.09515130519866943, + "learning_rate": 2.2076746838433544e-05, + "loss": 0.029, + "step": 112930 + }, + { + "epoch": 0.2490775924998842, + "grad_norm": 0.13851189613342285, + "learning_rate": 2.207528890691668e-05, + "loss": 0.0312, + "step": 112940 + }, + { + "epoch": 0.2490996464747824, + "grad_norm": 0.0868101492524147, + "learning_rate": 2.2073830889428373e-05, + "loss": 0.0292, + "step": 112950 + }, + { + "epoch": 0.24912170044968054, + "grad_norm": 0.09568862617015839, + "learning_rate": 2.2072372785986347e-05, + "loss": 0.0327, + "step": 112960 + }, + { + "epoch": 0.2491437544245787, + "grad_norm": 0.12850919365882874, + "learning_rate": 2.207091459660831e-05, + "loss": 0.0314, + "step": 112970 + }, + { + "epoch": 0.24916580839947688, + "grad_norm": 0.11316817998886108, + "learning_rate": 2.2069456321311993e-05, + "loss": 0.0315, + "step": 112980 + }, + { + "epoch": 0.24918786237437504, + "grad_norm": 0.12469907850027084, + "learning_rate": 2.2067997960115096e-05, + "loss": 0.0306, + "step": 112990 + }, + { + "epoch": 0.24920991634927323, + "grad_norm": 0.11875645071268082, + "learning_rate": 2.206653951303536e-05, + "loss": 0.0293, + "step": 113000 + }, + { + "epoch": 0.24923197032417138, + "grad_norm": 0.10581813007593155, + "learning_rate": 2.206508098009049e-05, + "loss": 0.0307, + "step": 113010 + }, + { + "epoch": 0.24925402429906954, + "grad_norm": 0.1118616908788681, + "learning_rate": 2.2063622361298218e-05, + "loss": 0.0337, + "step": 113020 + }, + { + "epoch": 0.24927607827396772, + "grad_norm": 0.11116945743560791, + "learning_rate": 2.2062163656676264e-05, + "loss": 0.0316, + "step": 113030 + }, + { + "epoch": 0.24929813224886588, + "grad_norm": 0.10492727160453796, + "learning_rate": 2.2060704866242355e-05, + "loss": 0.0305, + "step": 113040 + }, + { + "epoch": 0.24932018622376403, + "grad_norm": 0.10853967815637589, + "learning_rate": 2.2059245990014213e-05, + "loss": 0.0311, + "step": 113050 + }, + { + "epoch": 0.24934224019866222, + "grad_norm": 0.12353550642728806, + "learning_rate": 2.2057787028009572e-05, + "loss": 0.0329, + "step": 113060 + }, + { + "epoch": 0.24936429417356037, + "grad_norm": 0.10083655267953873, + "learning_rate": 2.2056327980246152e-05, + "loss": 0.0303, + "step": 113070 + }, + { + "epoch": 0.24938634814845853, + "grad_norm": 0.12296049296855927, + "learning_rate": 2.2054868846741687e-05, + "loss": 0.0316, + "step": 113080 + }, + { + "epoch": 0.24940840212335671, + "grad_norm": 0.12531597912311554, + "learning_rate": 2.2053409627513902e-05, + "loss": 0.03, + "step": 113090 + }, + { + "epoch": 0.24943045609825487, + "grad_norm": 0.10348258167505264, + "learning_rate": 2.2051950322580533e-05, + "loss": 0.0298, + "step": 113100 + }, + { + "epoch": 0.24945251007315303, + "grad_norm": 0.09802243113517761, + "learning_rate": 2.2050490931959304e-05, + "loss": 0.0327, + "step": 113110 + }, + { + "epoch": 0.2494745640480512, + "grad_norm": 0.09566515684127808, + "learning_rate": 2.204903145566796e-05, + "loss": 0.0324, + "step": 113120 + }, + { + "epoch": 0.24949661802294937, + "grad_norm": 0.09896201640367508, + "learning_rate": 2.2047571893724225e-05, + "loss": 0.0312, + "step": 113130 + }, + { + "epoch": 0.24951867199784752, + "grad_norm": 0.08943653851747513, + "learning_rate": 2.2046112246145838e-05, + "loss": 0.0302, + "step": 113140 + }, + { + "epoch": 0.2495407259727457, + "grad_norm": 0.11989510804414749, + "learning_rate": 2.2044652512950537e-05, + "loss": 0.0318, + "step": 113150 + }, + { + "epoch": 0.24956277994764386, + "grad_norm": 0.11727046966552734, + "learning_rate": 2.2043192694156054e-05, + "loss": 0.03, + "step": 113160 + }, + { + "epoch": 0.24958483392254202, + "grad_norm": 0.10532213747501373, + "learning_rate": 2.204173278978013e-05, + "loss": 0.0305, + "step": 113170 + }, + { + "epoch": 0.2496068878974402, + "grad_norm": 0.1041654497385025, + "learning_rate": 2.2040272799840503e-05, + "loss": 0.0307, + "step": 113180 + }, + { + "epoch": 0.24962894187233836, + "grad_norm": 0.10268518328666687, + "learning_rate": 2.203881272435492e-05, + "loss": 0.0315, + "step": 113190 + }, + { + "epoch": 0.24965099584723652, + "grad_norm": 0.10093271732330322, + "learning_rate": 2.203735256334111e-05, + "loss": 0.0314, + "step": 113200 + }, + { + "epoch": 0.2496730498221347, + "grad_norm": 0.10933147370815277, + "learning_rate": 2.2035892316816826e-05, + "loss": 0.0322, + "step": 113210 + }, + { + "epoch": 0.24969510379703286, + "grad_norm": 0.09870216995477676, + "learning_rate": 2.2034431984799802e-05, + "loss": 0.0337, + "step": 113220 + }, + { + "epoch": 0.24971715777193101, + "grad_norm": 0.1223694309592247, + "learning_rate": 2.2032971567307795e-05, + "loss": 0.0326, + "step": 113230 + }, + { + "epoch": 0.2497392117468292, + "grad_norm": 0.09735895693302155, + "learning_rate": 2.2031511064358537e-05, + "loss": 0.0309, + "step": 113240 + }, + { + "epoch": 0.24976126572172735, + "grad_norm": 0.13583926856517792, + "learning_rate": 2.2030050475969786e-05, + "loss": 0.0305, + "step": 113250 + }, + { + "epoch": 0.2497833196966255, + "grad_norm": 0.11669376492500305, + "learning_rate": 2.2028589802159284e-05, + "loss": 0.0314, + "step": 113260 + }, + { + "epoch": 0.2498053736715237, + "grad_norm": 0.09583140909671783, + "learning_rate": 2.2027129042944774e-05, + "loss": 0.032, + "step": 113270 + }, + { + "epoch": 0.24982742764642185, + "grad_norm": 0.09520569443702698, + "learning_rate": 2.2025668198344014e-05, + "loss": 0.0319, + "step": 113280 + }, + { + "epoch": 0.24984948162132, + "grad_norm": 0.0956408753991127, + "learning_rate": 2.2024207268374757e-05, + "loss": 0.0334, + "step": 113290 + }, + { + "epoch": 0.2498715355962182, + "grad_norm": 0.12665994465351105, + "learning_rate": 2.2022746253054743e-05, + "loss": 0.0309, + "step": 113300 + }, + { + "epoch": 0.24989358957111635, + "grad_norm": 0.09436522424221039, + "learning_rate": 2.2021285152401733e-05, + "loss": 0.0314, + "step": 113310 + }, + { + "epoch": 0.2499156435460145, + "grad_norm": 0.09372605383396149, + "learning_rate": 2.2019823966433485e-05, + "loss": 0.0316, + "step": 113320 + }, + { + "epoch": 0.2499376975209127, + "grad_norm": 0.0945248007774353, + "learning_rate": 2.2018362695167745e-05, + "loss": 0.0306, + "step": 113330 + }, + { + "epoch": 0.24995975149581084, + "grad_norm": 0.21695587038993835, + "learning_rate": 2.2016901338622266e-05, + "loss": 0.0319, + "step": 113340 + }, + { + "epoch": 0.249981805470709, + "grad_norm": 0.09442199021577835, + "learning_rate": 2.2015439896814818e-05, + "loss": 0.033, + "step": 113350 + }, + { + "epoch": 0.25000385944560716, + "grad_norm": 0.08777438849210739, + "learning_rate": 2.2013978369763146e-05, + "loss": 0.0331, + "step": 113360 + }, + { + "epoch": 0.25002591342050534, + "grad_norm": 0.10282040387392044, + "learning_rate": 2.201251675748502e-05, + "loss": 0.0329, + "step": 113370 + }, + { + "epoch": 0.2500479673954035, + "grad_norm": 0.1244610846042633, + "learning_rate": 2.201105505999819e-05, + "loss": 0.0311, + "step": 113380 + }, + { + "epoch": 0.25007002137030165, + "grad_norm": 0.10258863121271133, + "learning_rate": 2.2009593277320424e-05, + "loss": 0.0301, + "step": 113390 + }, + { + "epoch": 0.25009207534519984, + "grad_norm": 0.12389423698186874, + "learning_rate": 2.200813140946948e-05, + "loss": 0.0302, + "step": 113400 + }, + { + "epoch": 0.250114129320098, + "grad_norm": 0.10199328511953354, + "learning_rate": 2.2006669456463124e-05, + "loss": 0.0314, + "step": 113410 + }, + { + "epoch": 0.25013618329499615, + "grad_norm": 0.0852317288517952, + "learning_rate": 2.200520741831912e-05, + "loss": 0.0318, + "step": 113420 + }, + { + "epoch": 0.25015823726989433, + "grad_norm": 0.11228495091199875, + "learning_rate": 2.2003745295055227e-05, + "loss": 0.0291, + "step": 113430 + }, + { + "epoch": 0.2501802912447925, + "grad_norm": 0.09999741613864899, + "learning_rate": 2.2002283086689222e-05, + "loss": 0.031, + "step": 113440 + }, + { + "epoch": 0.25020234521969065, + "grad_norm": 0.12569309771060944, + "learning_rate": 2.2000820793238863e-05, + "loss": 0.0306, + "step": 113450 + }, + { + "epoch": 0.25022439919458883, + "grad_norm": 0.13028742372989655, + "learning_rate": 2.199935841472192e-05, + "loss": 0.0328, + "step": 113460 + }, + { + "epoch": 0.250246453169487, + "grad_norm": 0.10748746246099472, + "learning_rate": 2.1997895951156164e-05, + "loss": 0.0315, + "step": 113470 + }, + { + "epoch": 0.25026850714438514, + "grad_norm": 0.10535633563995361, + "learning_rate": 2.1996433402559366e-05, + "loss": 0.0311, + "step": 113480 + }, + { + "epoch": 0.25029056111928333, + "grad_norm": 0.08214156329631805, + "learning_rate": 2.19949707689493e-05, + "loss": 0.0305, + "step": 113490 + }, + { + "epoch": 0.2503126150941815, + "grad_norm": 0.1293841153383255, + "learning_rate": 2.1993508050343733e-05, + "loss": 0.0325, + "step": 113500 + }, + { + "epoch": 0.25033466906907964, + "grad_norm": 0.1030232384800911, + "learning_rate": 2.199204524676044e-05, + "loss": 0.0317, + "step": 113510 + }, + { + "epoch": 0.2503567230439778, + "grad_norm": 0.09168174117803574, + "learning_rate": 2.1990582358217193e-05, + "loss": 0.0323, + "step": 113520 + }, + { + "epoch": 0.250378777018876, + "grad_norm": 0.11979128420352936, + "learning_rate": 2.198911938473177e-05, + "loss": 0.0326, + "step": 113530 + }, + { + "epoch": 0.25040083099377414, + "grad_norm": 0.10255920886993408, + "learning_rate": 2.198765632632195e-05, + "loss": 0.0329, + "step": 113540 + }, + { + "epoch": 0.2504228849686723, + "grad_norm": 0.11524035781621933, + "learning_rate": 2.198619318300551e-05, + "loss": 0.0346, + "step": 113550 + }, + { + "epoch": 0.2504449389435705, + "grad_norm": 0.13337905704975128, + "learning_rate": 2.1984729954800223e-05, + "loss": 0.0308, + "step": 113560 + }, + { + "epoch": 0.25046699291846863, + "grad_norm": 0.1127438098192215, + "learning_rate": 2.198326664172387e-05, + "loss": 0.0315, + "step": 113570 + }, + { + "epoch": 0.2504890468933668, + "grad_norm": 0.0993148609995842, + "learning_rate": 2.1981803243794244e-05, + "loss": 0.0311, + "step": 113580 + }, + { + "epoch": 0.250511100868265, + "grad_norm": 0.11025545746088028, + "learning_rate": 2.1980339761029105e-05, + "loss": 0.0333, + "step": 113590 + }, + { + "epoch": 0.25053315484316313, + "grad_norm": 0.12559932470321655, + "learning_rate": 2.1978876193446257e-05, + "loss": 0.0311, + "step": 113600 + }, + { + "epoch": 0.2505552088180613, + "grad_norm": 0.1335582286119461, + "learning_rate": 2.1977412541063468e-05, + "loss": 0.03, + "step": 113610 + }, + { + "epoch": 0.2505772627929595, + "grad_norm": 0.0815945491194725, + "learning_rate": 2.1975948803898532e-05, + "loss": 0.0314, + "step": 113620 + }, + { + "epoch": 0.2505993167678576, + "grad_norm": 0.10749666392803192, + "learning_rate": 2.197448498196923e-05, + "loss": 0.0308, + "step": 113630 + }, + { + "epoch": 0.2506213707427558, + "grad_norm": 0.11356215924024582, + "learning_rate": 2.197302107529335e-05, + "loss": 0.0308, + "step": 113640 + }, + { + "epoch": 0.250643424717654, + "grad_norm": 0.09958262741565704, + "learning_rate": 2.197155708388868e-05, + "loss": 0.0326, + "step": 113650 + }, + { + "epoch": 0.2506654786925522, + "grad_norm": 0.10750313848257065, + "learning_rate": 2.1970093007773014e-05, + "loss": 0.0321, + "step": 113660 + }, + { + "epoch": 0.2506875326674503, + "grad_norm": 0.09283614158630371, + "learning_rate": 2.1968628846964133e-05, + "loss": 0.0307, + "step": 113670 + }, + { + "epoch": 0.2507095866423485, + "grad_norm": 0.11069151014089584, + "learning_rate": 2.1967164601479827e-05, + "loss": 0.0285, + "step": 113680 + }, + { + "epoch": 0.2507316406172467, + "grad_norm": 0.12056678533554077, + "learning_rate": 2.1965700271337903e-05, + "loss": 0.0342, + "step": 113690 + }, + { + "epoch": 0.2507536945921448, + "grad_norm": 0.10851876437664032, + "learning_rate": 2.196423585655614e-05, + "loss": 0.0327, + "step": 113700 + }, + { + "epoch": 0.250775748567043, + "grad_norm": 0.08197210729122162, + "learning_rate": 2.1962771357152333e-05, + "loss": 0.03, + "step": 113710 + }, + { + "epoch": 0.2507978025419412, + "grad_norm": 0.09779569506645203, + "learning_rate": 2.196130677314428e-05, + "loss": 0.0297, + "step": 113720 + }, + { + "epoch": 0.2508198565168393, + "grad_norm": 0.09934984147548676, + "learning_rate": 2.1959842104549786e-05, + "loss": 0.0308, + "step": 113730 + }, + { + "epoch": 0.2508419104917375, + "grad_norm": 0.11880633234977722, + "learning_rate": 2.1958377351386628e-05, + "loss": 0.0303, + "step": 113740 + }, + { + "epoch": 0.25086396446663567, + "grad_norm": 0.08035260438919067, + "learning_rate": 2.1956912513672624e-05, + "loss": 0.032, + "step": 113750 + }, + { + "epoch": 0.2508860184415338, + "grad_norm": 0.0983971655368805, + "learning_rate": 2.1955447591425558e-05, + "loss": 0.0316, + "step": 113760 + }, + { + "epoch": 0.250908072416432, + "grad_norm": 0.11009899526834488, + "learning_rate": 2.1953982584663243e-05, + "loss": 0.0295, + "step": 113770 + }, + { + "epoch": 0.25093012639133017, + "grad_norm": 0.10402300208806992, + "learning_rate": 2.1952517493403468e-05, + "loss": 0.0311, + "step": 113780 + }, + { + "epoch": 0.2509521803662283, + "grad_norm": 0.1177208349108696, + "learning_rate": 2.1951052317664043e-05, + "loss": 0.0307, + "step": 113790 + }, + { + "epoch": 0.2509742343411265, + "grad_norm": 0.1169840469956398, + "learning_rate": 2.194958705746277e-05, + "loss": 0.0314, + "step": 113800 + }, + { + "epoch": 0.25099628831602466, + "grad_norm": 0.1321556121110916, + "learning_rate": 2.194812171281745e-05, + "loss": 0.0325, + "step": 113810 + }, + { + "epoch": 0.2510183422909228, + "grad_norm": 0.12033676356077194, + "learning_rate": 2.1946656283745897e-05, + "loss": 0.033, + "step": 113820 + }, + { + "epoch": 0.251040396265821, + "grad_norm": 0.09937474876642227, + "learning_rate": 2.1945190770265908e-05, + "loss": 0.0308, + "step": 113830 + }, + { + "epoch": 0.25106245024071916, + "grad_norm": 0.12809787690639496, + "learning_rate": 2.1943725172395294e-05, + "loss": 0.0313, + "step": 113840 + }, + { + "epoch": 0.2510845042156173, + "grad_norm": 0.10136200487613678, + "learning_rate": 2.1942259490151864e-05, + "loss": 0.031, + "step": 113850 + }, + { + "epoch": 0.25110655819051547, + "grad_norm": 0.1347384750843048, + "learning_rate": 2.1940793723553425e-05, + "loss": 0.031, + "step": 113860 + }, + { + "epoch": 0.25112861216541366, + "grad_norm": 0.11311633139848709, + "learning_rate": 2.193932787261779e-05, + "loss": 0.0298, + "step": 113870 + }, + { + "epoch": 0.2511506661403118, + "grad_norm": 0.09534313529729843, + "learning_rate": 2.193786193736277e-05, + "loss": 0.0309, + "step": 113880 + }, + { + "epoch": 0.25117272011520997, + "grad_norm": 0.0851626843214035, + "learning_rate": 2.1936395917806177e-05, + "loss": 0.0334, + "step": 113890 + }, + { + "epoch": 0.25119477409010815, + "grad_norm": 0.1151747927069664, + "learning_rate": 2.1934929813965826e-05, + "loss": 0.0308, + "step": 113900 + }, + { + "epoch": 0.2512168280650063, + "grad_norm": 0.08813676983118057, + "learning_rate": 2.1933463625859527e-05, + "loss": 0.0327, + "step": 113910 + }, + { + "epoch": 0.25123888203990447, + "grad_norm": 0.09503420442342758, + "learning_rate": 2.19319973535051e-05, + "loss": 0.0305, + "step": 113920 + }, + { + "epoch": 0.25126093601480265, + "grad_norm": 0.11315681040287018, + "learning_rate": 2.1930530996920362e-05, + "loss": 0.0323, + "step": 113930 + }, + { + "epoch": 0.2512829899897008, + "grad_norm": 0.11468111723661423, + "learning_rate": 2.192906455612313e-05, + "loss": 0.034, + "step": 113940 + }, + { + "epoch": 0.25130504396459896, + "grad_norm": 0.10258354991674423, + "learning_rate": 2.1927598031131213e-05, + "loss": 0.0315, + "step": 113950 + }, + { + "epoch": 0.25132709793949715, + "grad_norm": 0.11414618790149689, + "learning_rate": 2.1926131421962448e-05, + "loss": 0.031, + "step": 113960 + }, + { + "epoch": 0.2513491519143953, + "grad_norm": 0.12083218991756439, + "learning_rate": 2.1924664728634643e-05, + "loss": 0.0301, + "step": 113970 + }, + { + "epoch": 0.25137120588929346, + "grad_norm": 0.1018025279045105, + "learning_rate": 2.1923197951165625e-05, + "loss": 0.0329, + "step": 113980 + }, + { + "epoch": 0.25139325986419164, + "grad_norm": 0.11619285494089127, + "learning_rate": 2.1921731089573214e-05, + "loss": 0.0337, + "step": 113990 + }, + { + "epoch": 0.25141531383908977, + "grad_norm": 0.12127204984426498, + "learning_rate": 2.1920264143875233e-05, + "loss": 0.033, + "step": 114000 + }, + { + "epoch": 0.25143736781398796, + "grad_norm": 0.0986177995800972, + "learning_rate": 2.1918797114089513e-05, + "loss": 0.0327, + "step": 114010 + }, + { + "epoch": 0.25145942178888614, + "grad_norm": 0.08639520406723022, + "learning_rate": 2.1917330000233872e-05, + "loss": 0.0319, + "step": 114020 + }, + { + "epoch": 0.25148147576378427, + "grad_norm": 0.11285662651062012, + "learning_rate": 2.1915862802326146e-05, + "loss": 0.0312, + "step": 114030 + }, + { + "epoch": 0.25150352973868245, + "grad_norm": 0.1079176664352417, + "learning_rate": 2.1914395520384152e-05, + "loss": 0.0302, + "step": 114040 + }, + { + "epoch": 0.25152558371358064, + "grad_norm": 0.12473388016223907, + "learning_rate": 2.191292815442573e-05, + "loss": 0.0312, + "step": 114050 + }, + { + "epoch": 0.25154763768847876, + "grad_norm": 0.09241124987602234, + "learning_rate": 2.19114607044687e-05, + "loss": 0.0311, + "step": 114060 + }, + { + "epoch": 0.25156969166337695, + "grad_norm": 0.11384374648332596, + "learning_rate": 2.1909993170530896e-05, + "loss": 0.0323, + "step": 114070 + }, + { + "epoch": 0.25159174563827513, + "grad_norm": 0.09775324165821075, + "learning_rate": 2.1908525552630153e-05, + "loss": 0.0292, + "step": 114080 + }, + { + "epoch": 0.25161379961317326, + "grad_norm": 0.10398375988006592, + "learning_rate": 2.19070578507843e-05, + "loss": 0.0311, + "step": 114090 + }, + { + "epoch": 0.25163585358807145, + "grad_norm": 0.11564446240663528, + "learning_rate": 2.1905590065011173e-05, + "loss": 0.0309, + "step": 114100 + }, + { + "epoch": 0.25165790756296963, + "grad_norm": 0.12057414650917053, + "learning_rate": 2.1904122195328606e-05, + "loss": 0.0308, + "step": 114110 + }, + { + "epoch": 0.25167996153786776, + "grad_norm": 0.14577244222164154, + "learning_rate": 2.190265424175444e-05, + "loss": 0.031, + "step": 114120 + }, + { + "epoch": 0.25170201551276594, + "grad_norm": 0.10928186774253845, + "learning_rate": 2.1901186204306503e-05, + "loss": 0.0345, + "step": 114130 + }, + { + "epoch": 0.2517240694876641, + "grad_norm": 0.13825255632400513, + "learning_rate": 2.189971808300264e-05, + "loss": 0.0322, + "step": 114140 + }, + { + "epoch": 0.25174612346256225, + "grad_norm": 0.12309122085571289, + "learning_rate": 2.1898249877860688e-05, + "loss": 0.0301, + "step": 114150 + }, + { + "epoch": 0.25176817743746044, + "grad_norm": 0.10161100327968597, + "learning_rate": 2.1896781588898486e-05, + "loss": 0.0312, + "step": 114160 + }, + { + "epoch": 0.2517902314123586, + "grad_norm": 0.07774903625249863, + "learning_rate": 2.1895313216133876e-05, + "loss": 0.0322, + "step": 114170 + }, + { + "epoch": 0.25181228538725675, + "grad_norm": 0.10898172110319138, + "learning_rate": 2.1893844759584697e-05, + "loss": 0.0318, + "step": 114180 + }, + { + "epoch": 0.25183433936215494, + "grad_norm": 0.08494061231613159, + "learning_rate": 2.1892376219268802e-05, + "loss": 0.0307, + "step": 114190 + }, + { + "epoch": 0.2518563933370531, + "grad_norm": 0.09949354082345963, + "learning_rate": 2.1890907595204024e-05, + "loss": 0.0312, + "step": 114200 + }, + { + "epoch": 0.25187844731195125, + "grad_norm": 0.11950787156820297, + "learning_rate": 2.1889438887408218e-05, + "loss": 0.0323, + "step": 114210 + }, + { + "epoch": 0.25190050128684943, + "grad_norm": 0.10029125213623047, + "learning_rate": 2.188797009589922e-05, + "loss": 0.0305, + "step": 114220 + }, + { + "epoch": 0.2519225552617476, + "grad_norm": 0.1209351047873497, + "learning_rate": 2.1886501220694882e-05, + "loss": 0.0296, + "step": 114230 + }, + { + "epoch": 0.25194460923664574, + "grad_norm": 0.10669350624084473, + "learning_rate": 2.1885032261813055e-05, + "loss": 0.031, + "step": 114240 + }, + { + "epoch": 0.25196666321154393, + "grad_norm": 0.10745289921760559, + "learning_rate": 2.1883563219271582e-05, + "loss": 0.0317, + "step": 114250 + }, + { + "epoch": 0.2519887171864421, + "grad_norm": 0.10752129554748535, + "learning_rate": 2.188209409308832e-05, + "loss": 0.031, + "step": 114260 + }, + { + "epoch": 0.25201077116134024, + "grad_norm": 0.13625793159008026, + "learning_rate": 2.1880624883281113e-05, + "loss": 0.0306, + "step": 114270 + }, + { + "epoch": 0.2520328251362384, + "grad_norm": 0.07717485725879669, + "learning_rate": 2.187915558986782e-05, + "loss": 0.0322, + "step": 114280 + }, + { + "epoch": 0.2520548791111366, + "grad_norm": 0.0996759682893753, + "learning_rate": 2.187768621286629e-05, + "loss": 0.0307, + "step": 114290 + }, + { + "epoch": 0.25207693308603474, + "grad_norm": 0.09886803478002548, + "learning_rate": 2.1876216752294382e-05, + "loss": 0.0306, + "step": 114300 + }, + { + "epoch": 0.2520989870609329, + "grad_norm": 0.12344193458557129, + "learning_rate": 2.187474720816995e-05, + "loss": 0.032, + "step": 114310 + }, + { + "epoch": 0.2521210410358311, + "grad_norm": 0.11657712608575821, + "learning_rate": 2.187327758051084e-05, + "loss": 0.0315, + "step": 114320 + }, + { + "epoch": 0.25214309501072923, + "grad_norm": 0.10004575550556183, + "learning_rate": 2.1871807869334927e-05, + "loss": 0.0316, + "step": 114330 + }, + { + "epoch": 0.2521651489856274, + "grad_norm": 0.11235535144805908, + "learning_rate": 2.1870338074660052e-05, + "loss": 0.0308, + "step": 114340 + }, + { + "epoch": 0.2521872029605256, + "grad_norm": 0.09594377875328064, + "learning_rate": 2.186886819650409e-05, + "loss": 0.0308, + "step": 114350 + }, + { + "epoch": 0.25220925693542373, + "grad_norm": 0.1098836287856102, + "learning_rate": 2.186739823488489e-05, + "loss": 0.0309, + "step": 114360 + }, + { + "epoch": 0.2522313109103219, + "grad_norm": 0.10821134597063065, + "learning_rate": 2.1865928189820317e-05, + "loss": 0.031, + "step": 114370 + }, + { + "epoch": 0.2522533648852201, + "grad_norm": 0.16173683106899261, + "learning_rate": 2.1864458061328237e-05, + "loss": 0.0324, + "step": 114380 + }, + { + "epoch": 0.25227541886011823, + "grad_norm": 0.1358807384967804, + "learning_rate": 2.186298784942651e-05, + "loss": 0.0317, + "step": 114390 + }, + { + "epoch": 0.2522974728350164, + "grad_norm": 0.09179292619228363, + "learning_rate": 2.1861517554133e-05, + "loss": 0.0316, + "step": 114400 + }, + { + "epoch": 0.2523195268099146, + "grad_norm": 0.09310874342918396, + "learning_rate": 2.186004717546557e-05, + "loss": 0.0307, + "step": 114410 + }, + { + "epoch": 0.2523415807848127, + "grad_norm": 0.0952346995472908, + "learning_rate": 2.1858576713442098e-05, + "loss": 0.0291, + "step": 114420 + }, + { + "epoch": 0.2523636347597109, + "grad_norm": 0.10150076448917389, + "learning_rate": 2.1857106168080435e-05, + "loss": 0.0307, + "step": 114430 + }, + { + "epoch": 0.2523856887346091, + "grad_norm": 0.09825316816568375, + "learning_rate": 2.1855635539398462e-05, + "loss": 0.0302, + "step": 114440 + }, + { + "epoch": 0.2524077427095072, + "grad_norm": 0.08212681114673615, + "learning_rate": 2.185416482741404e-05, + "loss": 0.0318, + "step": 114450 + }, + { + "epoch": 0.2524297966844054, + "grad_norm": 0.07744347304105759, + "learning_rate": 2.185269403214505e-05, + "loss": 0.0304, + "step": 114460 + }, + { + "epoch": 0.2524518506593036, + "grad_norm": 0.11457616090774536, + "learning_rate": 2.1851223153609355e-05, + "loss": 0.0296, + "step": 114470 + }, + { + "epoch": 0.2524739046342017, + "grad_norm": 0.11040705442428589, + "learning_rate": 2.184975219182483e-05, + "loss": 0.0313, + "step": 114480 + }, + { + "epoch": 0.2524959586090999, + "grad_norm": 0.10970032215118408, + "learning_rate": 2.184828114680935e-05, + "loss": 0.0324, + "step": 114490 + }, + { + "epoch": 0.2525180125839981, + "grad_norm": 0.1127898171544075, + "learning_rate": 2.1846810018580785e-05, + "loss": 0.0313, + "step": 114500 + }, + { + "epoch": 0.25254006655889627, + "grad_norm": 0.09007002413272858, + "learning_rate": 2.1845338807157018e-05, + "loss": 0.0319, + "step": 114510 + }, + { + "epoch": 0.2525621205337944, + "grad_norm": 0.08977295458316803, + "learning_rate": 2.184386751255592e-05, + "loss": 0.0311, + "step": 114520 + }, + { + "epoch": 0.2525841745086926, + "grad_norm": 0.09303408861160278, + "learning_rate": 2.1842396134795365e-05, + "loss": 0.0308, + "step": 114530 + }, + { + "epoch": 0.25260622848359077, + "grad_norm": 0.13866020739078522, + "learning_rate": 2.1840924673893245e-05, + "loss": 0.0311, + "step": 114540 + }, + { + "epoch": 0.2526282824584889, + "grad_norm": 0.13915707170963287, + "learning_rate": 2.1839453129867425e-05, + "loss": 0.0306, + "step": 114550 + }, + { + "epoch": 0.2526503364333871, + "grad_norm": 0.08612528443336487, + "learning_rate": 2.1837981502735797e-05, + "loss": 0.0309, + "step": 114560 + }, + { + "epoch": 0.25267239040828526, + "grad_norm": 0.11974870413541794, + "learning_rate": 2.183650979251623e-05, + "loss": 0.0312, + "step": 114570 + }, + { + "epoch": 0.2526944443831834, + "grad_norm": 0.12229549139738083, + "learning_rate": 2.1835037999226622e-05, + "loss": 0.0308, + "step": 114580 + }, + { + "epoch": 0.2527164983580816, + "grad_norm": 0.10445661842823029, + "learning_rate": 2.1833566122884844e-05, + "loss": 0.0316, + "step": 114590 + }, + { + "epoch": 0.25273855233297976, + "grad_norm": 0.11244067549705505, + "learning_rate": 2.183209416350879e-05, + "loss": 0.0299, + "step": 114600 + }, + { + "epoch": 0.2527606063078779, + "grad_norm": 0.11572545766830444, + "learning_rate": 2.1830622121116337e-05, + "loss": 0.0311, + "step": 114610 + }, + { + "epoch": 0.2527826602827761, + "grad_norm": 0.13093312084674835, + "learning_rate": 2.182914999572538e-05, + "loss": 0.0299, + "step": 114620 + }, + { + "epoch": 0.25280471425767426, + "grad_norm": 0.13721473515033722, + "learning_rate": 2.1827677787353803e-05, + "loss": 0.0307, + "step": 114630 + }, + { + "epoch": 0.2528267682325724, + "grad_norm": 0.150931715965271, + "learning_rate": 2.182620549601949e-05, + "loss": 0.0333, + "step": 114640 + }, + { + "epoch": 0.25284882220747057, + "grad_norm": 0.1019149050116539, + "learning_rate": 2.182473312174034e-05, + "loss": 0.0321, + "step": 114650 + }, + { + "epoch": 0.25287087618236875, + "grad_norm": 0.10247036814689636, + "learning_rate": 2.1823260664534233e-05, + "loss": 0.0334, + "step": 114660 + }, + { + "epoch": 0.2528929301572669, + "grad_norm": 0.11407900601625443, + "learning_rate": 2.1821788124419073e-05, + "loss": 0.0313, + "step": 114670 + }, + { + "epoch": 0.25291498413216507, + "grad_norm": 0.10144476592540741, + "learning_rate": 2.1820315501412743e-05, + "loss": 0.0298, + "step": 114680 + }, + { + "epoch": 0.25293703810706325, + "grad_norm": 0.08795277029275894, + "learning_rate": 2.181884279553314e-05, + "loss": 0.0319, + "step": 114690 + }, + { + "epoch": 0.2529590920819614, + "grad_norm": 0.1005740687251091, + "learning_rate": 2.1817370006798165e-05, + "loss": 0.0299, + "step": 114700 + }, + { + "epoch": 0.25298114605685956, + "grad_norm": 0.08346503227949142, + "learning_rate": 2.1815897135225697e-05, + "loss": 0.0303, + "step": 114710 + }, + { + "epoch": 0.25300320003175775, + "grad_norm": 0.12915712594985962, + "learning_rate": 2.1814424180833655e-05, + "loss": 0.032, + "step": 114720 + }, + { + "epoch": 0.2530252540066559, + "grad_norm": 0.08847486972808838, + "learning_rate": 2.1812951143639917e-05, + "loss": 0.0302, + "step": 114730 + }, + { + "epoch": 0.25304730798155406, + "grad_norm": 0.1258164495229721, + "learning_rate": 2.1811478023662393e-05, + "loss": 0.0325, + "step": 114740 + }, + { + "epoch": 0.25306936195645224, + "grad_norm": 0.11783280968666077, + "learning_rate": 2.1810004820918976e-05, + "loss": 0.031, + "step": 114750 + }, + { + "epoch": 0.25309141593135037, + "grad_norm": 0.10106790065765381, + "learning_rate": 2.1808531535427576e-05, + "loss": 0.0304, + "step": 114760 + }, + { + "epoch": 0.25311346990624856, + "grad_norm": 0.10699032247066498, + "learning_rate": 2.1807058167206088e-05, + "loss": 0.0314, + "step": 114770 + }, + { + "epoch": 0.25313552388114674, + "grad_norm": 0.09878172725439072, + "learning_rate": 2.180558471627242e-05, + "loss": 0.0309, + "step": 114780 + }, + { + "epoch": 0.25315757785604487, + "grad_norm": 0.16267770528793335, + "learning_rate": 2.1804111182644468e-05, + "loss": 0.03, + "step": 114790 + }, + { + "epoch": 0.25317963183094305, + "grad_norm": 0.11094743758440018, + "learning_rate": 2.1802637566340142e-05, + "loss": 0.0322, + "step": 114800 + }, + { + "epoch": 0.25320168580584124, + "grad_norm": 0.13460244238376617, + "learning_rate": 2.1801163867377346e-05, + "loss": 0.0305, + "step": 114810 + }, + { + "epoch": 0.25322373978073937, + "grad_norm": 0.10316652804613113, + "learning_rate": 2.1799690085773985e-05, + "loss": 0.0316, + "step": 114820 + }, + { + "epoch": 0.25324579375563755, + "grad_norm": 0.0918070450425148, + "learning_rate": 2.1798216221547974e-05, + "loss": 0.0295, + "step": 114830 + }, + { + "epoch": 0.25326784773053573, + "grad_norm": 0.08481357246637344, + "learning_rate": 2.1796742274717212e-05, + "loss": 0.0315, + "step": 114840 + }, + { + "epoch": 0.25328990170543386, + "grad_norm": 0.12733887135982513, + "learning_rate": 2.179526824529962e-05, + "loss": 0.0313, + "step": 114850 + }, + { + "epoch": 0.25331195568033205, + "grad_norm": 0.1248573511838913, + "learning_rate": 2.17937941333131e-05, + "loss": 0.0309, + "step": 114860 + }, + { + "epoch": 0.25333400965523023, + "grad_norm": 0.13698376715183258, + "learning_rate": 2.1792319938775567e-05, + "loss": 0.0314, + "step": 114870 + }, + { + "epoch": 0.25335606363012836, + "grad_norm": 0.11272931843996048, + "learning_rate": 2.1790845661704938e-05, + "loss": 0.031, + "step": 114880 + }, + { + "epoch": 0.25337811760502654, + "grad_norm": 0.1364738792181015, + "learning_rate": 2.1789371302119118e-05, + "loss": 0.0332, + "step": 114890 + }, + { + "epoch": 0.2534001715799247, + "grad_norm": 0.110087089240551, + "learning_rate": 2.1787896860036025e-05, + "loss": 0.0307, + "step": 114900 + }, + { + "epoch": 0.25342222555482286, + "grad_norm": 0.09956547617912292, + "learning_rate": 2.1786422335473578e-05, + "loss": 0.0328, + "step": 114910 + }, + { + "epoch": 0.25344427952972104, + "grad_norm": 0.09604593366384506, + "learning_rate": 2.1784947728449697e-05, + "loss": 0.0327, + "step": 114920 + }, + { + "epoch": 0.2534663335046192, + "grad_norm": 0.08879868686199188, + "learning_rate": 2.1783473038982288e-05, + "loss": 0.0295, + "step": 114930 + }, + { + "epoch": 0.25348838747951735, + "grad_norm": 0.10392535477876663, + "learning_rate": 2.1781998267089282e-05, + "loss": 0.032, + "step": 114940 + }, + { + "epoch": 0.25351044145441554, + "grad_norm": 0.1052778884768486, + "learning_rate": 2.178052341278859e-05, + "loss": 0.0303, + "step": 114950 + }, + { + "epoch": 0.2535324954293137, + "grad_norm": 0.10291354358196259, + "learning_rate": 2.177904847609814e-05, + "loss": 0.0311, + "step": 114960 + }, + { + "epoch": 0.25355454940421185, + "grad_norm": 0.130459725856781, + "learning_rate": 2.177757345703585e-05, + "loss": 0.0303, + "step": 114970 + }, + { + "epoch": 0.25357660337911003, + "grad_norm": 0.1400015652179718, + "learning_rate": 2.177609835561964e-05, + "loss": 0.0321, + "step": 114980 + }, + { + "epoch": 0.2535986573540082, + "grad_norm": 0.11044872552156448, + "learning_rate": 2.1774623171867443e-05, + "loss": 0.0315, + "step": 114990 + }, + { + "epoch": 0.25362071132890635, + "grad_norm": 0.10864163190126419, + "learning_rate": 2.177314790579717e-05, + "loss": 0.0318, + "step": 115000 + }, + { + "epoch": 0.25364276530380453, + "grad_norm": 0.12494561821222305, + "learning_rate": 2.1771672557426762e-05, + "loss": 0.0317, + "step": 115010 + }, + { + "epoch": 0.2536648192787027, + "grad_norm": 0.10262156277894974, + "learning_rate": 2.177019712677414e-05, + "loss": 0.0308, + "step": 115020 + }, + { + "epoch": 0.25368687325360084, + "grad_norm": 0.0832386463880539, + "learning_rate": 2.1768721613857223e-05, + "loss": 0.0306, + "step": 115030 + }, + { + "epoch": 0.253708927228499, + "grad_norm": 0.1166423112154007, + "learning_rate": 2.1767246018693956e-05, + "loss": 0.0305, + "step": 115040 + }, + { + "epoch": 0.2537309812033972, + "grad_norm": 0.11728297919034958, + "learning_rate": 2.1765770341302255e-05, + "loss": 0.0316, + "step": 115050 + }, + { + "epoch": 0.25375303517829534, + "grad_norm": 0.10350209474563599, + "learning_rate": 2.1764294581700057e-05, + "loss": 0.0308, + "step": 115060 + }, + { + "epoch": 0.2537750891531935, + "grad_norm": 0.14872610569000244, + "learning_rate": 2.1762818739905296e-05, + "loss": 0.033, + "step": 115070 + }, + { + "epoch": 0.2537971431280917, + "grad_norm": 0.10969244688749313, + "learning_rate": 2.17613428159359e-05, + "loss": 0.0305, + "step": 115080 + }, + { + "epoch": 0.25381919710298984, + "grad_norm": 0.1159178838133812, + "learning_rate": 2.1759866809809805e-05, + "loss": 0.0311, + "step": 115090 + }, + { + "epoch": 0.253841251077888, + "grad_norm": 0.11308258026838303, + "learning_rate": 2.1758390721544946e-05, + "loss": 0.0304, + "step": 115100 + }, + { + "epoch": 0.2538633050527862, + "grad_norm": 0.12140003591775894, + "learning_rate": 2.175691455115926e-05, + "loss": 0.0317, + "step": 115110 + }, + { + "epoch": 0.25388535902768433, + "grad_norm": 0.5087446570396423, + "learning_rate": 2.175543829867068e-05, + "loss": 0.0324, + "step": 115120 + }, + { + "epoch": 0.2539074130025825, + "grad_norm": 0.09885453432798386, + "learning_rate": 2.175396196409715e-05, + "loss": 0.0309, + "step": 115130 + }, + { + "epoch": 0.2539294669774807, + "grad_norm": 0.11098803579807281, + "learning_rate": 2.1752485547456606e-05, + "loss": 0.0309, + "step": 115140 + }, + { + "epoch": 0.25395152095237883, + "grad_norm": 0.09572884440422058, + "learning_rate": 2.1751009048766984e-05, + "loss": 0.0318, + "step": 115150 + }, + { + "epoch": 0.253973574927277, + "grad_norm": 0.11226936429738998, + "learning_rate": 2.1749532468046225e-05, + "loss": 0.0304, + "step": 115160 + }, + { + "epoch": 0.2539956289021752, + "grad_norm": 0.13191568851470947, + "learning_rate": 2.174805580531228e-05, + "loss": 0.0331, + "step": 115170 + }, + { + "epoch": 0.2540176828770733, + "grad_norm": 0.19935137033462524, + "learning_rate": 2.174657906058308e-05, + "loss": 0.0307, + "step": 115180 + }, + { + "epoch": 0.2540397368519715, + "grad_norm": 0.10515942424535751, + "learning_rate": 2.1745102233876583e-05, + "loss": 0.0308, + "step": 115190 + }, + { + "epoch": 0.2540617908268697, + "grad_norm": 0.10639963299036026, + "learning_rate": 2.1743625325210715e-05, + "loss": 0.0295, + "step": 115200 + }, + { + "epoch": 0.2540838448017678, + "grad_norm": 0.12075111269950867, + "learning_rate": 2.174214833460344e-05, + "loss": 0.0323, + "step": 115210 + }, + { + "epoch": 0.254105898776666, + "grad_norm": 0.11062844842672348, + "learning_rate": 2.1740671262072696e-05, + "loss": 0.0317, + "step": 115220 + }, + { + "epoch": 0.2541279527515642, + "grad_norm": 0.10603611171245575, + "learning_rate": 2.173919410763643e-05, + "loss": 0.0323, + "step": 115230 + }, + { + "epoch": 0.2541500067264623, + "grad_norm": 0.09471849352121353, + "learning_rate": 2.1737716871312592e-05, + "loss": 0.0299, + "step": 115240 + }, + { + "epoch": 0.2541720607013605, + "grad_norm": 0.12263087183237076, + "learning_rate": 2.1736239553119135e-05, + "loss": 0.0325, + "step": 115250 + }, + { + "epoch": 0.2541941146762587, + "grad_norm": 0.12062036246061325, + "learning_rate": 2.173476215307401e-05, + "loss": 0.0308, + "step": 115260 + }, + { + "epoch": 0.2542161686511568, + "grad_norm": 0.1277134269475937, + "learning_rate": 2.1733284671195162e-05, + "loss": 0.0307, + "step": 115270 + }, + { + "epoch": 0.254238222626055, + "grad_norm": 0.1520276814699173, + "learning_rate": 2.1731807107500546e-05, + "loss": 0.0301, + "step": 115280 + }, + { + "epoch": 0.2542602766009532, + "grad_norm": 0.11779041588306427, + "learning_rate": 2.173032946200812e-05, + "loss": 0.0317, + "step": 115290 + }, + { + "epoch": 0.2542823305758513, + "grad_norm": 0.10048606246709824, + "learning_rate": 2.1728851734735837e-05, + "loss": 0.0315, + "step": 115300 + }, + { + "epoch": 0.2543043845507495, + "grad_norm": 0.11678210645914078, + "learning_rate": 2.1727373925701655e-05, + "loss": 0.0323, + "step": 115310 + }, + { + "epoch": 0.2543264385256477, + "grad_norm": 0.17252370715141296, + "learning_rate": 2.172589603492352e-05, + "loss": 0.0311, + "step": 115320 + }, + { + "epoch": 0.2543484925005458, + "grad_norm": 0.11883030086755753, + "learning_rate": 2.1724418062419407e-05, + "loss": 0.0322, + "step": 115330 + }, + { + "epoch": 0.254370546475444, + "grad_norm": 0.11877518147230148, + "learning_rate": 2.172294000820726e-05, + "loss": 0.032, + "step": 115340 + }, + { + "epoch": 0.2543926004503422, + "grad_norm": 0.11571282893419266, + "learning_rate": 2.1721461872305048e-05, + "loss": 0.0323, + "step": 115350 + }, + { + "epoch": 0.25441465442524036, + "grad_norm": 0.10543373972177505, + "learning_rate": 2.1719983654730727e-05, + "loss": 0.0315, + "step": 115360 + }, + { + "epoch": 0.2544367084001385, + "grad_norm": 0.10340816527605057, + "learning_rate": 2.1718505355502258e-05, + "loss": 0.0294, + "step": 115370 + }, + { + "epoch": 0.2544587623750367, + "grad_norm": 0.13133519887924194, + "learning_rate": 2.1717026974637607e-05, + "loss": 0.0302, + "step": 115380 + }, + { + "epoch": 0.25448081634993486, + "grad_norm": 0.13413238525390625, + "learning_rate": 2.1715548512154738e-05, + "loss": 0.0307, + "step": 115390 + }, + { + "epoch": 0.254502870324833, + "grad_norm": 0.11235867440700531, + "learning_rate": 2.1714069968071612e-05, + "loss": 0.031, + "step": 115400 + }, + { + "epoch": 0.25452492429973117, + "grad_norm": 0.12221550941467285, + "learning_rate": 2.1712591342406196e-05, + "loss": 0.0312, + "step": 115410 + }, + { + "epoch": 0.25454697827462935, + "grad_norm": 0.1058635413646698, + "learning_rate": 2.171111263517646e-05, + "loss": 0.0303, + "step": 115420 + }, + { + "epoch": 0.2545690322495275, + "grad_norm": 0.1608448624610901, + "learning_rate": 2.1709633846400366e-05, + "loss": 0.0322, + "step": 115430 + }, + { + "epoch": 0.25459108622442567, + "grad_norm": 0.09260676801204681, + "learning_rate": 2.1708154976095887e-05, + "loss": 0.032, + "step": 115440 + }, + { + "epoch": 0.25461314019932385, + "grad_norm": 0.10211312025785446, + "learning_rate": 2.1706676024280993e-05, + "loss": 0.0298, + "step": 115450 + }, + { + "epoch": 0.254635194174222, + "grad_norm": 0.11831084638834, + "learning_rate": 2.1705196990973653e-05, + "loss": 0.029, + "step": 115460 + }, + { + "epoch": 0.25465724814912016, + "grad_norm": 0.11865711212158203, + "learning_rate": 2.1703717876191836e-05, + "loss": 0.03, + "step": 115470 + }, + { + "epoch": 0.25467930212401835, + "grad_norm": 0.11774901300668716, + "learning_rate": 2.1702238679953517e-05, + "loss": 0.0318, + "step": 115480 + }, + { + "epoch": 0.2547013560989165, + "grad_norm": 0.10701985657215118, + "learning_rate": 2.1700759402276674e-05, + "loss": 0.0301, + "step": 115490 + }, + { + "epoch": 0.25472341007381466, + "grad_norm": 0.0951569601893425, + "learning_rate": 2.1699280043179276e-05, + "loss": 0.0314, + "step": 115500 + }, + { + "epoch": 0.25474546404871284, + "grad_norm": 0.12017031013965607, + "learning_rate": 2.16978006026793e-05, + "loss": 0.0319, + "step": 115510 + }, + { + "epoch": 0.254767518023611, + "grad_norm": 0.1298651248216629, + "learning_rate": 2.1696321080794727e-05, + "loss": 0.0321, + "step": 115520 + }, + { + "epoch": 0.25478957199850916, + "grad_norm": 0.0813543051481247, + "learning_rate": 2.1694841477543525e-05, + "loss": 0.0324, + "step": 115530 + }, + { + "epoch": 0.25481162597340734, + "grad_norm": 0.10513744503259659, + "learning_rate": 2.169336179294368e-05, + "loss": 0.0316, + "step": 115540 + }, + { + "epoch": 0.25483367994830547, + "grad_norm": 0.1131024956703186, + "learning_rate": 2.169188202701317e-05, + "loss": 0.0303, + "step": 115550 + }, + { + "epoch": 0.25485573392320365, + "grad_norm": 0.0843314453959465, + "learning_rate": 2.1690402179769974e-05, + "loss": 0.0297, + "step": 115560 + }, + { + "epoch": 0.25487778789810184, + "grad_norm": 0.14163652062416077, + "learning_rate": 2.168892225123207e-05, + "loss": 0.0307, + "step": 115570 + }, + { + "epoch": 0.25489984187299997, + "grad_norm": 0.11286497861146927, + "learning_rate": 2.1687442241417458e-05, + "loss": 0.0319, + "step": 115580 + }, + { + "epoch": 0.25492189584789815, + "grad_norm": 0.111368328332901, + "learning_rate": 2.16859621503441e-05, + "loss": 0.0305, + "step": 115590 + }, + { + "epoch": 0.25494394982279633, + "grad_norm": 0.11509132385253906, + "learning_rate": 2.168448197802999e-05, + "loss": 0.0308, + "step": 115600 + }, + { + "epoch": 0.25496600379769446, + "grad_norm": 0.14421187341213226, + "learning_rate": 2.1683001724493116e-05, + "loss": 0.031, + "step": 115610 + }, + { + "epoch": 0.25498805777259265, + "grad_norm": 0.12847165763378143, + "learning_rate": 2.1681521389751457e-05, + "loss": 0.0315, + "step": 115620 + }, + { + "epoch": 0.25501011174749083, + "grad_norm": 0.08998920768499374, + "learning_rate": 2.1680040973823007e-05, + "loss": 0.0323, + "step": 115630 + }, + { + "epoch": 0.25503216572238896, + "grad_norm": 0.1113322526216507, + "learning_rate": 2.167856047672575e-05, + "loss": 0.0329, + "step": 115640 + }, + { + "epoch": 0.25505421969728714, + "grad_norm": 0.11768337339162827, + "learning_rate": 2.1677079898477682e-05, + "loss": 0.03, + "step": 115650 + }, + { + "epoch": 0.2550762736721853, + "grad_norm": 0.10436466336250305, + "learning_rate": 2.167559923909678e-05, + "loss": 0.0317, + "step": 115660 + }, + { + "epoch": 0.25509832764708346, + "grad_norm": 0.09975368529558182, + "learning_rate": 2.1674118498601058e-05, + "loss": 0.0322, + "step": 115670 + }, + { + "epoch": 0.25512038162198164, + "grad_norm": 0.10126952081918716, + "learning_rate": 2.1672637677008486e-05, + "loss": 0.0314, + "step": 115680 + }, + { + "epoch": 0.2551424355968798, + "grad_norm": 0.0926395058631897, + "learning_rate": 2.1671156774337066e-05, + "loss": 0.0305, + "step": 115690 + }, + { + "epoch": 0.25516448957177795, + "grad_norm": 0.09895976632833481, + "learning_rate": 2.1669675790604793e-05, + "loss": 0.0319, + "step": 115700 + }, + { + "epoch": 0.25518654354667614, + "grad_norm": 0.14350229501724243, + "learning_rate": 2.1668194725829665e-05, + "loss": 0.0319, + "step": 115710 + }, + { + "epoch": 0.2552085975215743, + "grad_norm": 0.11550368368625641, + "learning_rate": 2.1666713580029676e-05, + "loss": 0.0313, + "step": 115720 + }, + { + "epoch": 0.25523065149647245, + "grad_norm": 0.14181160926818848, + "learning_rate": 2.1665232353222817e-05, + "loss": 0.03, + "step": 115730 + }, + { + "epoch": 0.25525270547137063, + "grad_norm": 0.07500646263360977, + "learning_rate": 2.1663751045427095e-05, + "loss": 0.0312, + "step": 115740 + }, + { + "epoch": 0.2552747594462688, + "grad_norm": 0.13875748217105865, + "learning_rate": 2.1662269656660506e-05, + "loss": 0.0319, + "step": 115750 + }, + { + "epoch": 0.25529681342116695, + "grad_norm": 0.10959939658641815, + "learning_rate": 2.166078818694105e-05, + "loss": 0.0317, + "step": 115760 + }, + { + "epoch": 0.25531886739606513, + "grad_norm": 0.12116367369890213, + "learning_rate": 2.1659306636286733e-05, + "loss": 0.0304, + "step": 115770 + }, + { + "epoch": 0.2553409213709633, + "grad_norm": 0.10861089825630188, + "learning_rate": 2.1657825004715547e-05, + "loss": 0.0315, + "step": 115780 + }, + { + "epoch": 0.25536297534586144, + "grad_norm": 0.12064886838197708, + "learning_rate": 2.1656343292245505e-05, + "loss": 0.0324, + "step": 115790 + }, + { + "epoch": 0.2553850293207596, + "grad_norm": 0.09191902726888657, + "learning_rate": 2.16548614988946e-05, + "loss": 0.0306, + "step": 115800 + }, + { + "epoch": 0.2554070832956578, + "grad_norm": 0.10305358469486237, + "learning_rate": 2.1653379624680853e-05, + "loss": 0.031, + "step": 115810 + }, + { + "epoch": 0.25542913727055594, + "grad_norm": 0.11322835087776184, + "learning_rate": 2.165189766962226e-05, + "loss": 0.0324, + "step": 115820 + }, + { + "epoch": 0.2554511912454541, + "grad_norm": 0.10190965980291367, + "learning_rate": 2.165041563373683e-05, + "loss": 0.0328, + "step": 115830 + }, + { + "epoch": 0.2554732452203523, + "grad_norm": 0.07967300713062286, + "learning_rate": 2.164893351704257e-05, + "loss": 0.0295, + "step": 115840 + }, + { + "epoch": 0.25549529919525044, + "grad_norm": 0.12663251161575317, + "learning_rate": 2.1647451319557492e-05, + "loss": 0.0311, + "step": 115850 + }, + { + "epoch": 0.2555173531701486, + "grad_norm": 0.09191219508647919, + "learning_rate": 2.16459690412996e-05, + "loss": 0.031, + "step": 115860 + }, + { + "epoch": 0.2555394071450468, + "grad_norm": 0.09841077029705048, + "learning_rate": 2.1644486682286914e-05, + "loss": 0.0323, + "step": 115870 + }, + { + "epoch": 0.25556146111994493, + "grad_norm": 0.11428816616535187, + "learning_rate": 2.1643004242537436e-05, + "loss": 0.03, + "step": 115880 + }, + { + "epoch": 0.2555835150948431, + "grad_norm": 0.10623813420534134, + "learning_rate": 2.1641521722069187e-05, + "loss": 0.0324, + "step": 115890 + }, + { + "epoch": 0.2556055690697413, + "grad_norm": 0.13233527541160583, + "learning_rate": 2.1640039120900177e-05, + "loss": 0.0295, + "step": 115900 + }, + { + "epoch": 0.25562762304463943, + "grad_norm": 0.12271816283464432, + "learning_rate": 2.1638556439048428e-05, + "loss": 0.0321, + "step": 115910 + }, + { + "epoch": 0.2556496770195376, + "grad_norm": 0.11583175510168076, + "learning_rate": 2.1637073676531943e-05, + "loss": 0.032, + "step": 115920 + }, + { + "epoch": 0.2556717309944358, + "grad_norm": 0.10569348186254501, + "learning_rate": 2.1635590833368753e-05, + "loss": 0.0319, + "step": 115930 + }, + { + "epoch": 0.2556937849693339, + "grad_norm": 0.11212943494319916, + "learning_rate": 2.1634107909576863e-05, + "loss": 0.0326, + "step": 115940 + }, + { + "epoch": 0.2557158389442321, + "grad_norm": 0.12077503651380539, + "learning_rate": 2.1632624905174304e-05, + "loss": 0.032, + "step": 115950 + }, + { + "epoch": 0.2557378929191303, + "grad_norm": 0.08884641528129578, + "learning_rate": 2.1631141820179088e-05, + "loss": 0.0294, + "step": 115960 + }, + { + "epoch": 0.2557599468940284, + "grad_norm": 0.12192438542842865, + "learning_rate": 2.1629658654609236e-05, + "loss": 0.0347, + "step": 115970 + }, + { + "epoch": 0.2557820008689266, + "grad_norm": 0.12571148574352264, + "learning_rate": 2.1628175408482776e-05, + "loss": 0.0319, + "step": 115980 + }, + { + "epoch": 0.2558040548438248, + "grad_norm": 0.10887912660837173, + "learning_rate": 2.1626692081817723e-05, + "loss": 0.031, + "step": 115990 + }, + { + "epoch": 0.2558261088187229, + "grad_norm": 0.10732638090848923, + "learning_rate": 2.162520867463211e-05, + "loss": 0.0307, + "step": 116000 + }, + { + "epoch": 0.2558481627936211, + "grad_norm": 0.10322722047567368, + "learning_rate": 2.162372518694395e-05, + "loss": 0.0315, + "step": 116010 + }, + { + "epoch": 0.2558702167685193, + "grad_norm": 0.11480791866779327, + "learning_rate": 2.1622241618771276e-05, + "loss": 0.0316, + "step": 116020 + }, + { + "epoch": 0.2558922707434174, + "grad_norm": 0.15840990841388702, + "learning_rate": 2.1620757970132117e-05, + "loss": 0.0323, + "step": 116030 + }, + { + "epoch": 0.2559143247183156, + "grad_norm": 0.1547463834285736, + "learning_rate": 2.1619274241044496e-05, + "loss": 0.0316, + "step": 116040 + }, + { + "epoch": 0.2559363786932138, + "grad_norm": 0.10226622223854065, + "learning_rate": 2.161779043152644e-05, + "loss": 0.0315, + "step": 116050 + }, + { + "epoch": 0.2559584326681119, + "grad_norm": 0.11419064551591873, + "learning_rate": 2.1616306541595992e-05, + "loss": 0.0327, + "step": 116060 + }, + { + "epoch": 0.2559804866430101, + "grad_norm": 0.12543252110481262, + "learning_rate": 2.1614822571271165e-05, + "loss": 0.0299, + "step": 116070 + }, + { + "epoch": 0.2560025406179083, + "grad_norm": 0.07554096728563309, + "learning_rate": 2.161333852057e-05, + "loss": 0.0284, + "step": 116080 + }, + { + "epoch": 0.2560245945928064, + "grad_norm": 0.09227050840854645, + "learning_rate": 2.161185438951053e-05, + "loss": 0.0314, + "step": 116090 + }, + { + "epoch": 0.2560466485677046, + "grad_norm": 0.11544354259967804, + "learning_rate": 2.1610370178110778e-05, + "loss": 0.0302, + "step": 116100 + }, + { + "epoch": 0.2560687025426028, + "grad_norm": 0.10343107581138611, + "learning_rate": 2.16088858863888e-05, + "loss": 0.0318, + "step": 116110 + }, + { + "epoch": 0.2560907565175009, + "grad_norm": 0.08325204253196716, + "learning_rate": 2.1607401514362606e-05, + "loss": 0.0321, + "step": 116120 + }, + { + "epoch": 0.2561128104923991, + "grad_norm": 0.08449484407901764, + "learning_rate": 2.1605917062050255e-05, + "loss": 0.0308, + "step": 116130 + }, + { + "epoch": 0.2561348644672973, + "grad_norm": 0.10410110652446747, + "learning_rate": 2.1604432529469765e-05, + "loss": 0.0296, + "step": 116140 + }, + { + "epoch": 0.2561569184421954, + "grad_norm": 0.11976588517427444, + "learning_rate": 2.1602947916639196e-05, + "loss": 0.0324, + "step": 116150 + }, + { + "epoch": 0.2561789724170936, + "grad_norm": 0.10287873446941376, + "learning_rate": 2.160146322357657e-05, + "loss": 0.03, + "step": 116160 + }, + { + "epoch": 0.25620102639199177, + "grad_norm": 0.12546831369400024, + "learning_rate": 2.159997845029993e-05, + "loss": 0.0317, + "step": 116170 + }, + { + "epoch": 0.2562230803668899, + "grad_norm": 0.1294054538011551, + "learning_rate": 2.1598493596827323e-05, + "loss": 0.03, + "step": 116180 + }, + { + "epoch": 0.2562451343417881, + "grad_norm": 0.12211424112319946, + "learning_rate": 2.159700866317679e-05, + "loss": 0.0319, + "step": 116190 + }, + { + "epoch": 0.25626718831668627, + "grad_norm": 0.14411459863185883, + "learning_rate": 2.1595523649366374e-05, + "loss": 0.031, + "step": 116200 + }, + { + "epoch": 0.25628924229158445, + "grad_norm": 0.10641425102949142, + "learning_rate": 2.159403855541411e-05, + "loss": 0.0308, + "step": 116210 + }, + { + "epoch": 0.2563112962664826, + "grad_norm": 0.09422562271356583, + "learning_rate": 2.159255338133806e-05, + "loss": 0.0312, + "step": 116220 + }, + { + "epoch": 0.25633335024138076, + "grad_norm": 0.10435150563716888, + "learning_rate": 2.1591068127156264e-05, + "loss": 0.0305, + "step": 116230 + }, + { + "epoch": 0.25635540421627895, + "grad_norm": 0.124556764960289, + "learning_rate": 2.158958279288676e-05, + "loss": 0.0293, + "step": 116240 + }, + { + "epoch": 0.2563774581911771, + "grad_norm": 0.08306588977575302, + "learning_rate": 2.158809737854761e-05, + "loss": 0.0312, + "step": 116250 + }, + { + "epoch": 0.25639951216607526, + "grad_norm": 0.11149510741233826, + "learning_rate": 2.158661188415685e-05, + "loss": 0.0321, + "step": 116260 + }, + { + "epoch": 0.25642156614097344, + "grad_norm": 0.1247522234916687, + "learning_rate": 2.158512630973254e-05, + "loss": 0.0298, + "step": 116270 + }, + { + "epoch": 0.2564436201158716, + "grad_norm": 0.11483614891767502, + "learning_rate": 2.158364065529273e-05, + "loss": 0.0331, + "step": 116280 + }, + { + "epoch": 0.25646567409076976, + "grad_norm": 0.10068359971046448, + "learning_rate": 2.158215492085547e-05, + "loss": 0.0313, + "step": 116290 + }, + { + "epoch": 0.25648772806566794, + "grad_norm": 0.10318564623594284, + "learning_rate": 2.1580669106438807e-05, + "loss": 0.0319, + "step": 116300 + }, + { + "epoch": 0.25650978204056607, + "grad_norm": 0.10998103022575378, + "learning_rate": 2.157918321206081e-05, + "loss": 0.029, + "step": 116310 + }, + { + "epoch": 0.25653183601546425, + "grad_norm": 0.12104432284832001, + "learning_rate": 2.157769723773952e-05, + "loss": 0.0322, + "step": 116320 + }, + { + "epoch": 0.25655388999036244, + "grad_norm": 0.10915949195623398, + "learning_rate": 2.1576211183493e-05, + "loss": 0.0303, + "step": 116330 + }, + { + "epoch": 0.25657594396526057, + "grad_norm": 0.12060054391622543, + "learning_rate": 2.1574725049339304e-05, + "loss": 0.0318, + "step": 116340 + }, + { + "epoch": 0.25659799794015875, + "grad_norm": 0.10293268412351608, + "learning_rate": 2.157323883529649e-05, + "loss": 0.0314, + "step": 116350 + }, + { + "epoch": 0.25662005191505693, + "grad_norm": 0.09364505857229233, + "learning_rate": 2.157175254138262e-05, + "loss": 0.0326, + "step": 116360 + }, + { + "epoch": 0.25664210588995506, + "grad_norm": 0.1536128968000412, + "learning_rate": 2.1570266167615748e-05, + "loss": 0.0317, + "step": 116370 + }, + { + "epoch": 0.25666415986485325, + "grad_norm": 0.0858713835477829, + "learning_rate": 2.1568779714013943e-05, + "loss": 0.0322, + "step": 116380 + }, + { + "epoch": 0.25668621383975143, + "grad_norm": 0.0878705084323883, + "learning_rate": 2.1567293180595265e-05, + "loss": 0.0308, + "step": 116390 + }, + { + "epoch": 0.25670826781464956, + "grad_norm": 0.10882771015167236, + "learning_rate": 2.1565806567377768e-05, + "loss": 0.0314, + "step": 116400 + }, + { + "epoch": 0.25673032178954774, + "grad_norm": 0.10677066445350647, + "learning_rate": 2.1564319874379526e-05, + "loss": 0.0304, + "step": 116410 + }, + { + "epoch": 0.25675237576444593, + "grad_norm": 0.1410665065050125, + "learning_rate": 2.1562833101618597e-05, + "loss": 0.0327, + "step": 116420 + }, + { + "epoch": 0.25677442973934406, + "grad_norm": 0.12469101697206497, + "learning_rate": 2.1561346249113053e-05, + "loss": 0.03, + "step": 116430 + }, + { + "epoch": 0.25679648371424224, + "grad_norm": 0.11105108261108398, + "learning_rate": 2.155985931688095e-05, + "loss": 0.0317, + "step": 116440 + }, + { + "epoch": 0.2568185376891404, + "grad_norm": 0.10358908772468567, + "learning_rate": 2.1558372304940372e-05, + "loss": 0.0304, + "step": 116450 + }, + { + "epoch": 0.25684059166403855, + "grad_norm": 0.13735218346118927, + "learning_rate": 2.1556885213309374e-05, + "loss": 0.0325, + "step": 116460 + }, + { + "epoch": 0.25686264563893674, + "grad_norm": 0.09235337376594543, + "learning_rate": 2.1555398042006026e-05, + "loss": 0.0293, + "step": 116470 + }, + { + "epoch": 0.2568846996138349, + "grad_norm": 0.10297362506389618, + "learning_rate": 2.155391079104841e-05, + "loss": 0.0325, + "step": 116480 + }, + { + "epoch": 0.25690675358873305, + "grad_norm": 0.10137047618627548, + "learning_rate": 2.1552423460454588e-05, + "loss": 0.0314, + "step": 116490 + }, + { + "epoch": 0.25692880756363123, + "grad_norm": 0.09018386155366898, + "learning_rate": 2.1550936050242632e-05, + "loss": 0.0315, + "step": 116500 + }, + { + "epoch": 0.2569508615385294, + "grad_norm": 0.11237619817256927, + "learning_rate": 2.1549448560430617e-05, + "loss": 0.0316, + "step": 116510 + }, + { + "epoch": 0.25697291551342755, + "grad_norm": 0.09523331373929977, + "learning_rate": 2.154796099103662e-05, + "loss": 0.0301, + "step": 116520 + }, + { + "epoch": 0.25699496948832573, + "grad_norm": 0.11705475300550461, + "learning_rate": 2.1546473342078715e-05, + "loss": 0.0295, + "step": 116530 + }, + { + "epoch": 0.2570170234632239, + "grad_norm": 0.1022297814488411, + "learning_rate": 2.1544985613574977e-05, + "loss": 0.0312, + "step": 116540 + }, + { + "epoch": 0.25703907743812204, + "grad_norm": 0.11095552891492844, + "learning_rate": 2.1543497805543488e-05, + "loss": 0.0322, + "step": 116550 + }, + { + "epoch": 0.2570611314130202, + "grad_norm": 0.11108297109603882, + "learning_rate": 2.154200991800232e-05, + "loss": 0.0324, + "step": 116560 + }, + { + "epoch": 0.2570831853879184, + "grad_norm": 0.14749334752559662, + "learning_rate": 2.154052195096955e-05, + "loss": 0.0337, + "step": 116570 + }, + { + "epoch": 0.25710523936281654, + "grad_norm": 0.12360712140798569, + "learning_rate": 2.153903390446327e-05, + "loss": 0.0325, + "step": 116580 + }, + { + "epoch": 0.2571272933377147, + "grad_norm": 0.12009771913290024, + "learning_rate": 2.153754577850155e-05, + "loss": 0.0328, + "step": 116590 + }, + { + "epoch": 0.2571493473126129, + "grad_norm": 0.12142251431941986, + "learning_rate": 2.153605757310248e-05, + "loss": 0.0301, + "step": 116600 + }, + { + "epoch": 0.25717140128751104, + "grad_norm": 0.10396026074886322, + "learning_rate": 2.1534569288284134e-05, + "loss": 0.0317, + "step": 116610 + }, + { + "epoch": 0.2571934552624092, + "grad_norm": 0.11326926946640015, + "learning_rate": 2.1533080924064608e-05, + "loss": 0.0319, + "step": 116620 + }, + { + "epoch": 0.2572155092373074, + "grad_norm": 0.1133095771074295, + "learning_rate": 2.1531592480461977e-05, + "loss": 0.0319, + "step": 116630 + }, + { + "epoch": 0.25723756321220553, + "grad_norm": 0.11489646136760712, + "learning_rate": 2.153010395749433e-05, + "loss": 0.03, + "step": 116640 + }, + { + "epoch": 0.2572596171871037, + "grad_norm": 0.09844665974378586, + "learning_rate": 2.1528615355179753e-05, + "loss": 0.0319, + "step": 116650 + }, + { + "epoch": 0.2572816711620019, + "grad_norm": 0.10272539407014847, + "learning_rate": 2.152712667353634e-05, + "loss": 0.0321, + "step": 116660 + }, + { + "epoch": 0.25730372513690003, + "grad_norm": 0.11637445539236069, + "learning_rate": 2.152563791258217e-05, + "loss": 0.0308, + "step": 116670 + }, + { + "epoch": 0.2573257791117982, + "grad_norm": 0.12011292576789856, + "learning_rate": 2.1524149072335343e-05, + "loss": 0.0291, + "step": 116680 + }, + { + "epoch": 0.2573478330866964, + "grad_norm": 0.11037812381982803, + "learning_rate": 2.1522660152813943e-05, + "loss": 0.0321, + "step": 116690 + }, + { + "epoch": 0.2573698870615945, + "grad_norm": 0.09857414662837982, + "learning_rate": 2.1521171154036066e-05, + "loss": 0.0307, + "step": 116700 + }, + { + "epoch": 0.2573919410364927, + "grad_norm": 0.0940391942858696, + "learning_rate": 2.1519682076019798e-05, + "loss": 0.0316, + "step": 116710 + }, + { + "epoch": 0.2574139950113909, + "grad_norm": 0.11077467352151871, + "learning_rate": 2.151819291878324e-05, + "loss": 0.0288, + "step": 116720 + }, + { + "epoch": 0.257436048986289, + "grad_norm": 0.10346116125583649, + "learning_rate": 2.1516703682344486e-05, + "loss": 0.0319, + "step": 116730 + }, + { + "epoch": 0.2574581029611872, + "grad_norm": 0.09734645485877991, + "learning_rate": 2.1515214366721626e-05, + "loss": 0.0293, + "step": 116740 + }, + { + "epoch": 0.2574801569360854, + "grad_norm": 0.11849391460418701, + "learning_rate": 2.1513724971932762e-05, + "loss": 0.0322, + "step": 116750 + }, + { + "epoch": 0.2575022109109835, + "grad_norm": 0.1194595918059349, + "learning_rate": 2.1512235497995985e-05, + "loss": 0.0325, + "step": 116760 + }, + { + "epoch": 0.2575242648858817, + "grad_norm": 0.09646983444690704, + "learning_rate": 2.1510745944929404e-05, + "loss": 0.0314, + "step": 116770 + }, + { + "epoch": 0.2575463188607799, + "grad_norm": 0.10445265471935272, + "learning_rate": 2.150925631275111e-05, + "loss": 0.0307, + "step": 116780 + }, + { + "epoch": 0.257568372835678, + "grad_norm": 0.18427135050296783, + "learning_rate": 2.1507766601479205e-05, + "loss": 0.0342, + "step": 116790 + }, + { + "epoch": 0.2575904268105762, + "grad_norm": 0.09949778020381927, + "learning_rate": 2.1506276811131797e-05, + "loss": 0.03, + "step": 116800 + }, + { + "epoch": 0.2576124807854744, + "grad_norm": 0.10255077481269836, + "learning_rate": 2.1504786941726975e-05, + "loss": 0.0312, + "step": 116810 + }, + { + "epoch": 0.2576345347603725, + "grad_norm": 0.1346878856420517, + "learning_rate": 2.1503296993282856e-05, + "loss": 0.0302, + "step": 116820 + }, + { + "epoch": 0.2576565887352707, + "grad_norm": 0.08805837482213974, + "learning_rate": 2.1501806965817535e-05, + "loss": 0.0302, + "step": 116830 + }, + { + "epoch": 0.2576786427101689, + "grad_norm": 0.09925834089517593, + "learning_rate": 2.1500316859349123e-05, + "loss": 0.0311, + "step": 116840 + }, + { + "epoch": 0.257700696685067, + "grad_norm": 0.10750583559274673, + "learning_rate": 2.1498826673895722e-05, + "loss": 0.0307, + "step": 116850 + }, + { + "epoch": 0.2577227506599652, + "grad_norm": 0.09301239997148514, + "learning_rate": 2.1497336409475442e-05, + "loss": 0.0322, + "step": 116860 + }, + { + "epoch": 0.2577448046348634, + "grad_norm": 0.09020615369081497, + "learning_rate": 2.1495846066106386e-05, + "loss": 0.0323, + "step": 116870 + }, + { + "epoch": 0.2577668586097615, + "grad_norm": 0.09007574617862701, + "learning_rate": 2.1494355643806676e-05, + "loss": 0.0316, + "step": 116880 + }, + { + "epoch": 0.2577889125846597, + "grad_norm": 0.09800287336111069, + "learning_rate": 2.149286514259441e-05, + "loss": 0.0307, + "step": 116890 + }, + { + "epoch": 0.2578109665595579, + "grad_norm": 0.10550026595592499, + "learning_rate": 2.1491374562487697e-05, + "loss": 0.0318, + "step": 116900 + }, + { + "epoch": 0.257833020534456, + "grad_norm": 0.08404088020324707, + "learning_rate": 2.1489883903504657e-05, + "loss": 0.0312, + "step": 116910 + }, + { + "epoch": 0.2578550745093542, + "grad_norm": 0.1020020991563797, + "learning_rate": 2.1488393165663402e-05, + "loss": 0.0313, + "step": 116920 + }, + { + "epoch": 0.25787712848425237, + "grad_norm": 0.13089726865291595, + "learning_rate": 2.1486902348982045e-05, + "loss": 0.0306, + "step": 116930 + }, + { + "epoch": 0.2578991824591505, + "grad_norm": 0.10715187340974808, + "learning_rate": 2.1485411453478695e-05, + "loss": 0.0314, + "step": 116940 + }, + { + "epoch": 0.2579212364340487, + "grad_norm": 0.08164553344249725, + "learning_rate": 2.1483920479171472e-05, + "loss": 0.0311, + "step": 116950 + }, + { + "epoch": 0.25794329040894687, + "grad_norm": 0.1053028404712677, + "learning_rate": 2.1482429426078503e-05, + "loss": 0.0314, + "step": 116960 + }, + { + "epoch": 0.257965344383845, + "grad_norm": 0.13233830034732819, + "learning_rate": 2.1480938294217887e-05, + "loss": 0.0303, + "step": 116970 + }, + { + "epoch": 0.2579873983587432, + "grad_norm": 0.0792071595788002, + "learning_rate": 2.1479447083607754e-05, + "loss": 0.0305, + "step": 116980 + }, + { + "epoch": 0.25800945233364136, + "grad_norm": 0.1305806189775467, + "learning_rate": 2.147795579426622e-05, + "loss": 0.0314, + "step": 116990 + }, + { + "epoch": 0.2580315063085395, + "grad_norm": 0.10285646468400955, + "learning_rate": 2.147646442621141e-05, + "loss": 0.0334, + "step": 117000 + }, + { + "epoch": 0.2580535602834377, + "grad_norm": 0.08795229345560074, + "learning_rate": 2.147497297946144e-05, + "loss": 0.0294, + "step": 117010 + }, + { + "epoch": 0.25807561425833586, + "grad_norm": 0.09326492995023727, + "learning_rate": 2.1473481454034435e-05, + "loss": 0.0327, + "step": 117020 + }, + { + "epoch": 0.258097668233234, + "grad_norm": 0.12335341423749924, + "learning_rate": 2.1471989849948516e-05, + "loss": 0.0316, + "step": 117030 + }, + { + "epoch": 0.2581197222081322, + "grad_norm": 0.09285347163677216, + "learning_rate": 2.1470498167221815e-05, + "loss": 0.0313, + "step": 117040 + }, + { + "epoch": 0.25814177618303036, + "grad_norm": 0.08217335492372513, + "learning_rate": 2.1469006405872452e-05, + "loss": 0.0299, + "step": 117050 + }, + { + "epoch": 0.2581638301579285, + "grad_norm": 0.08438222855329514, + "learning_rate": 2.146751456591855e-05, + "loss": 0.0303, + "step": 117060 + }, + { + "epoch": 0.25818588413282667, + "grad_norm": 0.09355907887220383, + "learning_rate": 2.1466022647378243e-05, + "loss": 0.0296, + "step": 117070 + }, + { + "epoch": 0.25820793810772485, + "grad_norm": 0.1010032519698143, + "learning_rate": 2.1464530650269653e-05, + "loss": 0.0295, + "step": 117080 + }, + { + "epoch": 0.25822999208262304, + "grad_norm": 0.12026940286159515, + "learning_rate": 2.1463038574610914e-05, + "loss": 0.0319, + "step": 117090 + }, + { + "epoch": 0.25825204605752117, + "grad_norm": 0.11406579613685608, + "learning_rate": 2.1461546420420152e-05, + "loss": 0.0316, + "step": 117100 + }, + { + "epoch": 0.25827410003241935, + "grad_norm": 0.12283612042665482, + "learning_rate": 2.1460054187715504e-05, + "loss": 0.032, + "step": 117110 + }, + { + "epoch": 0.25829615400731754, + "grad_norm": 0.1358014941215515, + "learning_rate": 2.1458561876515096e-05, + "loss": 0.0293, + "step": 117120 + }, + { + "epoch": 0.25831820798221566, + "grad_norm": 0.11161170899868011, + "learning_rate": 2.145706948683706e-05, + "loss": 0.0327, + "step": 117130 + }, + { + "epoch": 0.25834026195711385, + "grad_norm": 0.10009866952896118, + "learning_rate": 2.145557701869954e-05, + "loss": 0.0311, + "step": 117140 + }, + { + "epoch": 0.25836231593201203, + "grad_norm": 0.10819307714700699, + "learning_rate": 2.145408447212066e-05, + "loss": 0.0312, + "step": 117150 + }, + { + "epoch": 0.25838436990691016, + "grad_norm": 0.09105570614337921, + "learning_rate": 2.145259184711856e-05, + "loss": 0.031, + "step": 117160 + }, + { + "epoch": 0.25840642388180834, + "grad_norm": 0.09613770246505737, + "learning_rate": 2.1451099143711377e-05, + "loss": 0.0307, + "step": 117170 + }, + { + "epoch": 0.25842847785670653, + "grad_norm": 0.10560280084609985, + "learning_rate": 2.1449606361917248e-05, + "loss": 0.0321, + "step": 117180 + }, + { + "epoch": 0.25845053183160466, + "grad_norm": 0.10380670428276062, + "learning_rate": 2.1448113501754313e-05, + "loss": 0.0305, + "step": 117190 + }, + { + "epoch": 0.25847258580650284, + "grad_norm": 0.1170431450009346, + "learning_rate": 2.144662056324071e-05, + "loss": 0.0303, + "step": 117200 + }, + { + "epoch": 0.258494639781401, + "grad_norm": 0.10424255579710007, + "learning_rate": 2.1445127546394584e-05, + "loss": 0.0304, + "step": 117210 + }, + { + "epoch": 0.25851669375629915, + "grad_norm": 0.1453678160905838, + "learning_rate": 2.144363445123407e-05, + "loss": 0.0317, + "step": 117220 + }, + { + "epoch": 0.25853874773119734, + "grad_norm": 0.10704400390386581, + "learning_rate": 2.1442141277777314e-05, + "loss": 0.03, + "step": 117230 + }, + { + "epoch": 0.2585608017060955, + "grad_norm": 0.13437600433826447, + "learning_rate": 2.144064802604246e-05, + "loss": 0.0327, + "step": 117240 + }, + { + "epoch": 0.25858285568099365, + "grad_norm": 0.09635960310697556, + "learning_rate": 2.143915469604765e-05, + "loss": 0.0321, + "step": 117250 + }, + { + "epoch": 0.25860490965589183, + "grad_norm": 0.1382509469985962, + "learning_rate": 2.143766128781103e-05, + "loss": 0.033, + "step": 117260 + }, + { + "epoch": 0.25862696363079, + "grad_norm": 0.08322847634553909, + "learning_rate": 2.1436167801350752e-05, + "loss": 0.0308, + "step": 117270 + }, + { + "epoch": 0.25864901760568815, + "grad_norm": 0.1166536808013916, + "learning_rate": 2.1434674236684955e-05, + "loss": 0.0312, + "step": 117280 + }, + { + "epoch": 0.25867107158058633, + "grad_norm": 0.14394615590572357, + "learning_rate": 2.143318059383179e-05, + "loss": 0.0305, + "step": 117290 + }, + { + "epoch": 0.2586931255554845, + "grad_norm": 0.13232313096523285, + "learning_rate": 2.143168687280941e-05, + "loss": 0.0306, + "step": 117300 + }, + { + "epoch": 0.25871517953038264, + "grad_norm": 0.10971954464912415, + "learning_rate": 2.1430193073635957e-05, + "loss": 0.0289, + "step": 117310 + }, + { + "epoch": 0.25873723350528083, + "grad_norm": 0.08146152645349503, + "learning_rate": 2.142869919632959e-05, + "loss": 0.0307, + "step": 117320 + }, + { + "epoch": 0.258759287480179, + "grad_norm": 0.10380669683218002, + "learning_rate": 2.142720524090846e-05, + "loss": 0.0305, + "step": 117330 + }, + { + "epoch": 0.25878134145507714, + "grad_norm": 0.1322152465581894, + "learning_rate": 2.1425711207390712e-05, + "loss": 0.0325, + "step": 117340 + }, + { + "epoch": 0.2588033954299753, + "grad_norm": 0.14236629009246826, + "learning_rate": 2.1424217095794514e-05, + "loss": 0.0301, + "step": 117350 + }, + { + "epoch": 0.2588254494048735, + "grad_norm": 0.08722855895757675, + "learning_rate": 2.1422722906138006e-05, + "loss": 0.0315, + "step": 117360 + }, + { + "epoch": 0.25884750337977164, + "grad_norm": 0.10116281360387802, + "learning_rate": 2.1421228638439355e-05, + "loss": 0.0317, + "step": 117370 + }, + { + "epoch": 0.2588695573546698, + "grad_norm": 0.14002469182014465, + "learning_rate": 2.141973429271671e-05, + "loss": 0.0315, + "step": 117380 + }, + { + "epoch": 0.258891611329568, + "grad_norm": 0.11938308924436569, + "learning_rate": 2.1418239868988235e-05, + "loss": 0.0323, + "step": 117390 + }, + { + "epoch": 0.25891366530446613, + "grad_norm": 0.11875995248556137, + "learning_rate": 2.1416745367272084e-05, + "loss": 0.0314, + "step": 117400 + }, + { + "epoch": 0.2589357192793643, + "grad_norm": 0.18401378393173218, + "learning_rate": 2.141525078758642e-05, + "loss": 0.0305, + "step": 117410 + }, + { + "epoch": 0.2589577732542625, + "grad_norm": 0.1004924401640892, + "learning_rate": 2.14137561299494e-05, + "loss": 0.0312, + "step": 117420 + }, + { + "epoch": 0.25897982722916063, + "grad_norm": 0.10974616557359695, + "learning_rate": 2.141226139437919e-05, + "loss": 0.0323, + "step": 117430 + }, + { + "epoch": 0.2590018812040588, + "grad_norm": 0.10904692858457565, + "learning_rate": 2.141076658089395e-05, + "loss": 0.0343, + "step": 117440 + }, + { + "epoch": 0.259023935178957, + "grad_norm": 0.09665434062480927, + "learning_rate": 2.1409271689511842e-05, + "loss": 0.0322, + "step": 117450 + }, + { + "epoch": 0.2590459891538551, + "grad_norm": 0.10201456397771835, + "learning_rate": 2.140777672025103e-05, + "loss": 0.0306, + "step": 117460 + }, + { + "epoch": 0.2590680431287533, + "grad_norm": 0.09949422627687454, + "learning_rate": 2.140628167312968e-05, + "loss": 0.0307, + "step": 117470 + }, + { + "epoch": 0.2590900971036515, + "grad_norm": 0.18448619544506073, + "learning_rate": 2.1404786548165965e-05, + "loss": 0.0311, + "step": 117480 + }, + { + "epoch": 0.2591121510785496, + "grad_norm": 0.09862220287322998, + "learning_rate": 2.140329134537804e-05, + "loss": 0.0327, + "step": 117490 + }, + { + "epoch": 0.2591342050534478, + "grad_norm": 0.13612604141235352, + "learning_rate": 2.1401796064784083e-05, + "loss": 0.0302, + "step": 117500 + }, + { + "epoch": 0.259156259028346, + "grad_norm": 0.10880672186613083, + "learning_rate": 2.1400300706402258e-05, + "loss": 0.0307, + "step": 117510 + }, + { + "epoch": 0.2591783130032441, + "grad_norm": 0.11328207701444626, + "learning_rate": 2.1398805270250737e-05, + "loss": 0.031, + "step": 117520 + }, + { + "epoch": 0.2592003669781423, + "grad_norm": 0.11217831820249557, + "learning_rate": 2.1397309756347693e-05, + "loss": 0.0309, + "step": 117530 + }, + { + "epoch": 0.2592224209530405, + "grad_norm": 0.11791639029979706, + "learning_rate": 2.139581416471129e-05, + "loss": 0.0313, + "step": 117540 + }, + { + "epoch": 0.2592444749279386, + "grad_norm": 0.11358223855495453, + "learning_rate": 2.1394318495359713e-05, + "loss": 0.0316, + "step": 117550 + }, + { + "epoch": 0.2592665289028368, + "grad_norm": 0.1071099042892456, + "learning_rate": 2.139282274831112e-05, + "loss": 0.033, + "step": 117560 + }, + { + "epoch": 0.259288582877735, + "grad_norm": 0.08788897097110748, + "learning_rate": 2.1391326923583703e-05, + "loss": 0.03, + "step": 117570 + }, + { + "epoch": 0.2593106368526331, + "grad_norm": 0.09928248822689056, + "learning_rate": 2.138983102119563e-05, + "loss": 0.031, + "step": 117580 + }, + { + "epoch": 0.2593326908275313, + "grad_norm": 0.12459442019462585, + "learning_rate": 2.1388335041165072e-05, + "loss": 0.0323, + "step": 117590 + }, + { + "epoch": 0.2593547448024295, + "grad_norm": 0.07894205302000046, + "learning_rate": 2.1386838983510217e-05, + "loss": 0.0303, + "step": 117600 + }, + { + "epoch": 0.2593767987773276, + "grad_norm": 0.08527346700429916, + "learning_rate": 2.138534284824923e-05, + "loss": 0.0308, + "step": 117610 + }, + { + "epoch": 0.2593988527522258, + "grad_norm": 0.10906422883272171, + "learning_rate": 2.1383846635400304e-05, + "loss": 0.0315, + "step": 117620 + }, + { + "epoch": 0.259420906727124, + "grad_norm": 0.12320095300674438, + "learning_rate": 2.1382350344981612e-05, + "loss": 0.0304, + "step": 117630 + }, + { + "epoch": 0.2594429607020221, + "grad_norm": 0.10989490151405334, + "learning_rate": 2.138085397701134e-05, + "loss": 0.0295, + "step": 117640 + }, + { + "epoch": 0.2594650146769203, + "grad_norm": 0.09502143412828445, + "learning_rate": 2.1379357531507665e-05, + "loss": 0.0313, + "step": 117650 + }, + { + "epoch": 0.2594870686518185, + "grad_norm": 0.09829282015562057, + "learning_rate": 2.137786100848878e-05, + "loss": 0.0318, + "step": 117660 + }, + { + "epoch": 0.2595091226267166, + "grad_norm": 0.09576959908008575, + "learning_rate": 2.137636440797285e-05, + "loss": 0.0323, + "step": 117670 + }, + { + "epoch": 0.2595311766016148, + "grad_norm": 0.08463118225336075, + "learning_rate": 2.137486772997808e-05, + "loss": 0.03, + "step": 117680 + }, + { + "epoch": 0.25955323057651297, + "grad_norm": 0.12068162113428116, + "learning_rate": 2.1373370974522647e-05, + "loss": 0.0305, + "step": 117690 + }, + { + "epoch": 0.2595752845514111, + "grad_norm": 0.10955314338207245, + "learning_rate": 2.1371874141624735e-05, + "loss": 0.0315, + "step": 117700 + }, + { + "epoch": 0.2595973385263093, + "grad_norm": 0.07993970811367035, + "learning_rate": 2.1370377231302545e-05, + "loss": 0.0328, + "step": 117710 + }, + { + "epoch": 0.25961939250120747, + "grad_norm": 0.10330309718847275, + "learning_rate": 2.1368880243574248e-05, + "loss": 0.031, + "step": 117720 + }, + { + "epoch": 0.2596414464761056, + "grad_norm": 0.10251066088676453, + "learning_rate": 2.136738317845805e-05, + "loss": 0.0305, + "step": 117730 + }, + { + "epoch": 0.2596635004510038, + "grad_norm": 0.12121915817260742, + "learning_rate": 2.1365886035972127e-05, + "loss": 0.0314, + "step": 117740 + }, + { + "epoch": 0.25968555442590197, + "grad_norm": 0.07573214918375015, + "learning_rate": 2.1364388816134685e-05, + "loss": 0.0323, + "step": 117750 + }, + { + "epoch": 0.2597076084008001, + "grad_norm": 0.09187322854995728, + "learning_rate": 2.1362891518963904e-05, + "loss": 0.031, + "step": 117760 + }, + { + "epoch": 0.2597296623756983, + "grad_norm": 0.10746987909078598, + "learning_rate": 2.1361394144477992e-05, + "loss": 0.0302, + "step": 117770 + }, + { + "epoch": 0.25975171635059646, + "grad_norm": 0.08568774163722992, + "learning_rate": 2.135989669269513e-05, + "loss": 0.0313, + "step": 117780 + }, + { + "epoch": 0.2597737703254946, + "grad_norm": 0.07979023456573486, + "learning_rate": 2.1358399163633517e-05, + "loss": 0.0308, + "step": 117790 + }, + { + "epoch": 0.2597958243003928, + "grad_norm": 0.09067258983850479, + "learning_rate": 2.1356901557311353e-05, + "loss": 0.0304, + "step": 117800 + }, + { + "epoch": 0.25981787827529096, + "grad_norm": 0.10815959423780441, + "learning_rate": 2.1355403873746832e-05, + "loss": 0.0317, + "step": 117810 + }, + { + "epoch": 0.2598399322501891, + "grad_norm": 0.09254807233810425, + "learning_rate": 2.135390611295815e-05, + "loss": 0.0317, + "step": 117820 + }, + { + "epoch": 0.25986198622508727, + "grad_norm": 0.10154430568218231, + "learning_rate": 2.1352408274963518e-05, + "loss": 0.0313, + "step": 117830 + }, + { + "epoch": 0.25988404019998546, + "grad_norm": 0.0819500982761383, + "learning_rate": 2.1350910359781124e-05, + "loss": 0.0311, + "step": 117840 + }, + { + "epoch": 0.2599060941748836, + "grad_norm": 0.09770524501800537, + "learning_rate": 2.134941236742917e-05, + "loss": 0.031, + "step": 117850 + }, + { + "epoch": 0.25992814814978177, + "grad_norm": 0.13627642393112183, + "learning_rate": 2.1347914297925863e-05, + "loss": 0.033, + "step": 117860 + }, + { + "epoch": 0.25995020212467995, + "grad_norm": 0.13019010424613953, + "learning_rate": 2.1346416151289405e-05, + "loss": 0.0334, + "step": 117870 + }, + { + "epoch": 0.2599722560995781, + "grad_norm": 0.0982394814491272, + "learning_rate": 2.1344917927537994e-05, + "loss": 0.0321, + "step": 117880 + }, + { + "epoch": 0.25999431007447626, + "grad_norm": 0.1254386156797409, + "learning_rate": 2.1343419626689847e-05, + "loss": 0.0316, + "step": 117890 + }, + { + "epoch": 0.26001636404937445, + "grad_norm": 0.11791684478521347, + "learning_rate": 2.1341921248763155e-05, + "loss": 0.0309, + "step": 117900 + }, + { + "epoch": 0.2600384180242726, + "grad_norm": 0.083319291472435, + "learning_rate": 2.1340422793776137e-05, + "loss": 0.0315, + "step": 117910 + }, + { + "epoch": 0.26006047199917076, + "grad_norm": 0.11557774990797043, + "learning_rate": 2.1338924261746993e-05, + "loss": 0.0282, + "step": 117920 + }, + { + "epoch": 0.26008252597406895, + "grad_norm": 0.13960233330726624, + "learning_rate": 2.133742565269394e-05, + "loss": 0.0306, + "step": 117930 + }, + { + "epoch": 0.26010457994896713, + "grad_norm": 0.1444348841905594, + "learning_rate": 2.1335926966635176e-05, + "loss": 0.031, + "step": 117940 + }, + { + "epoch": 0.26012663392386526, + "grad_norm": 0.1100190207362175, + "learning_rate": 2.133442820358892e-05, + "loss": 0.0301, + "step": 117950 + }, + { + "epoch": 0.26014868789876344, + "grad_norm": 0.10618387907743454, + "learning_rate": 2.1332929363573378e-05, + "loss": 0.0307, + "step": 117960 + }, + { + "epoch": 0.2601707418736616, + "grad_norm": 0.09335938096046448, + "learning_rate": 2.1331430446606764e-05, + "loss": 0.0299, + "step": 117970 + }, + { + "epoch": 0.26019279584855975, + "grad_norm": 0.08019813895225525, + "learning_rate": 2.1329931452707298e-05, + "loss": 0.0289, + "step": 117980 + }, + { + "epoch": 0.26021484982345794, + "grad_norm": 0.14811362326145172, + "learning_rate": 2.132843238189318e-05, + "loss": 0.0312, + "step": 117990 + }, + { + "epoch": 0.2602369037983561, + "grad_norm": 0.09361928701400757, + "learning_rate": 2.1326933234182642e-05, + "loss": 0.0295, + "step": 118000 + }, + { + "epoch": 0.26025895777325425, + "grad_norm": 0.10440263897180557, + "learning_rate": 2.132543400959389e-05, + "loss": 0.0316, + "step": 118010 + }, + { + "epoch": 0.26028101174815244, + "grad_norm": 0.25222447514533997, + "learning_rate": 2.132393470814514e-05, + "loss": 0.0312, + "step": 118020 + }, + { + "epoch": 0.2603030657230506, + "grad_norm": 0.1179346889257431, + "learning_rate": 2.132243532985461e-05, + "loss": 0.0309, + "step": 118030 + }, + { + "epoch": 0.26032511969794875, + "grad_norm": 0.0813424289226532, + "learning_rate": 2.1320935874740523e-05, + "loss": 0.0307, + "step": 118040 + }, + { + "epoch": 0.26034717367284693, + "grad_norm": 0.12871915102005005, + "learning_rate": 2.1319436342821098e-05, + "loss": 0.0305, + "step": 118050 + }, + { + "epoch": 0.2603692276477451, + "grad_norm": 0.09896136075258255, + "learning_rate": 2.131793673411456e-05, + "loss": 0.0307, + "step": 118060 + }, + { + "epoch": 0.26039128162264324, + "grad_norm": 0.13534729182720184, + "learning_rate": 2.1316437048639118e-05, + "loss": 0.03, + "step": 118070 + }, + { + "epoch": 0.26041333559754143, + "grad_norm": 0.11704640090465546, + "learning_rate": 2.1314937286413006e-05, + "loss": 0.0325, + "step": 118080 + }, + { + "epoch": 0.2604353895724396, + "grad_norm": 0.11165902763605118, + "learning_rate": 2.131343744745444e-05, + "loss": 0.032, + "step": 118090 + }, + { + "epoch": 0.26045744354733774, + "grad_norm": 0.11933554708957672, + "learning_rate": 2.1311937531781653e-05, + "loss": 0.0301, + "step": 118100 + }, + { + "epoch": 0.2604794975222359, + "grad_norm": 0.1051635816693306, + "learning_rate": 2.1310437539412864e-05, + "loss": 0.0314, + "step": 118110 + }, + { + "epoch": 0.2605015514971341, + "grad_norm": 0.11140011996030807, + "learning_rate": 2.13089374703663e-05, + "loss": 0.0286, + "step": 118120 + }, + { + "epoch": 0.26052360547203224, + "grad_norm": 0.09452442079782486, + "learning_rate": 2.130743732466019e-05, + "loss": 0.0311, + "step": 118130 + }, + { + "epoch": 0.2605456594469304, + "grad_norm": 0.10485135763883591, + "learning_rate": 2.130593710231276e-05, + "loss": 0.0332, + "step": 118140 + }, + { + "epoch": 0.2605677134218286, + "grad_norm": 0.08459966629743576, + "learning_rate": 2.1304436803342243e-05, + "loss": 0.0303, + "step": 118150 + }, + { + "epoch": 0.26058976739672673, + "grad_norm": 0.13137085735797882, + "learning_rate": 2.1302936427766865e-05, + "loss": 0.0307, + "step": 118160 + }, + { + "epoch": 0.2606118213716249, + "grad_norm": 0.14038613438606262, + "learning_rate": 2.1301435975604856e-05, + "loss": 0.031, + "step": 118170 + }, + { + "epoch": 0.2606338753465231, + "grad_norm": 0.12155608832836151, + "learning_rate": 2.1299935446874456e-05, + "loss": 0.0309, + "step": 118180 + }, + { + "epoch": 0.26065592932142123, + "grad_norm": 0.13073812425136566, + "learning_rate": 2.1298434841593887e-05, + "loss": 0.0321, + "step": 118190 + }, + { + "epoch": 0.2606779832963194, + "grad_norm": 0.10641379654407501, + "learning_rate": 2.129693415978139e-05, + "loss": 0.0317, + "step": 118200 + }, + { + "epoch": 0.2607000372712176, + "grad_norm": 0.11855378746986389, + "learning_rate": 2.12954334014552e-05, + "loss": 0.0317, + "step": 118210 + }, + { + "epoch": 0.26072209124611573, + "grad_norm": 0.09980878978967667, + "learning_rate": 2.1293932566633543e-05, + "loss": 0.0305, + "step": 118220 + }, + { + "epoch": 0.2607441452210139, + "grad_norm": 0.13721849024295807, + "learning_rate": 2.129243165533467e-05, + "loss": 0.0308, + "step": 118230 + }, + { + "epoch": 0.2607661991959121, + "grad_norm": 0.14749056100845337, + "learning_rate": 2.1290930667576805e-05, + "loss": 0.0301, + "step": 118240 + }, + { + "epoch": 0.2607882531708102, + "grad_norm": 0.12008440494537354, + "learning_rate": 2.12894296033782e-05, + "loss": 0.0302, + "step": 118250 + }, + { + "epoch": 0.2608103071457084, + "grad_norm": 0.12475010007619858, + "learning_rate": 2.1287928462757084e-05, + "loss": 0.0313, + "step": 118260 + }, + { + "epoch": 0.2608323611206066, + "grad_norm": 0.10418138653039932, + "learning_rate": 2.1286427245731702e-05, + "loss": 0.0319, + "step": 118270 + }, + { + "epoch": 0.2608544150955047, + "grad_norm": 0.1097245067358017, + "learning_rate": 2.128492595232029e-05, + "loss": 0.0296, + "step": 118280 + }, + { + "epoch": 0.2608764690704029, + "grad_norm": 0.11748132109642029, + "learning_rate": 2.1283424582541095e-05, + "loss": 0.0317, + "step": 118290 + }, + { + "epoch": 0.2608985230453011, + "grad_norm": 0.12092804163694382, + "learning_rate": 2.128192313641236e-05, + "loss": 0.0329, + "step": 118300 + }, + { + "epoch": 0.2609205770201992, + "grad_norm": 0.0831153392791748, + "learning_rate": 2.1280421613952324e-05, + "loss": 0.032, + "step": 118310 + }, + { + "epoch": 0.2609426309950974, + "grad_norm": 0.10481591522693634, + "learning_rate": 2.1278920015179242e-05, + "loss": 0.0296, + "step": 118320 + }, + { + "epoch": 0.2609646849699956, + "grad_norm": 0.09634588658809662, + "learning_rate": 2.127741834011135e-05, + "loss": 0.03, + "step": 118330 + }, + { + "epoch": 0.2609867389448937, + "grad_norm": 0.1400870829820633, + "learning_rate": 2.1275916588766897e-05, + "loss": 0.0304, + "step": 118340 + }, + { + "epoch": 0.2610087929197919, + "grad_norm": 0.14437350630760193, + "learning_rate": 2.1274414761164134e-05, + "loss": 0.033, + "step": 118350 + }, + { + "epoch": 0.2610308468946901, + "grad_norm": 0.09500754624605179, + "learning_rate": 2.1272912857321305e-05, + "loss": 0.03, + "step": 118360 + }, + { + "epoch": 0.2610529008695882, + "grad_norm": 0.14602282643318176, + "learning_rate": 2.1271410877256667e-05, + "loss": 0.0333, + "step": 118370 + }, + { + "epoch": 0.2610749548444864, + "grad_norm": 0.10320993512868881, + "learning_rate": 2.1269908820988463e-05, + "loss": 0.0302, + "step": 118380 + }, + { + "epoch": 0.2610970088193846, + "grad_norm": 0.13225503265857697, + "learning_rate": 2.1268406688534947e-05, + "loss": 0.032, + "step": 118390 + }, + { + "epoch": 0.2611190627942827, + "grad_norm": 0.10209707170724869, + "learning_rate": 2.126690447991437e-05, + "loss": 0.0315, + "step": 118400 + }, + { + "epoch": 0.2611411167691809, + "grad_norm": 0.10548318177461624, + "learning_rate": 2.1265402195144987e-05, + "loss": 0.0327, + "step": 118410 + }, + { + "epoch": 0.2611631707440791, + "grad_norm": 0.08464237302541733, + "learning_rate": 2.1263899834245056e-05, + "loss": 0.0307, + "step": 118420 + }, + { + "epoch": 0.2611852247189772, + "grad_norm": 0.1308908760547638, + "learning_rate": 2.1262397397232824e-05, + "loss": 0.0301, + "step": 118430 + }, + { + "epoch": 0.2612072786938754, + "grad_norm": 0.12073779851198196, + "learning_rate": 2.126089488412655e-05, + "loss": 0.0299, + "step": 118440 + }, + { + "epoch": 0.2612293326687736, + "grad_norm": 0.12770408391952515, + "learning_rate": 2.1259392294944492e-05, + "loss": 0.0299, + "step": 118450 + }, + { + "epoch": 0.2612513866436717, + "grad_norm": 0.10352233797311783, + "learning_rate": 2.1257889629704915e-05, + "loss": 0.0305, + "step": 118460 + }, + { + "epoch": 0.2612734406185699, + "grad_norm": 0.09853996336460114, + "learning_rate": 2.1256386888426062e-05, + "loss": 0.0294, + "step": 118470 + }, + { + "epoch": 0.26129549459346807, + "grad_norm": 0.09874036908149719, + "learning_rate": 2.1254884071126205e-05, + "loss": 0.0304, + "step": 118480 + }, + { + "epoch": 0.2613175485683662, + "grad_norm": 0.1069873794913292, + "learning_rate": 2.1253381177823603e-05, + "loss": 0.0318, + "step": 118490 + }, + { + "epoch": 0.2613396025432644, + "grad_norm": 0.09805095940828323, + "learning_rate": 2.1251878208536512e-05, + "loss": 0.0289, + "step": 118500 + }, + { + "epoch": 0.26136165651816257, + "grad_norm": 0.10225626826286316, + "learning_rate": 2.12503751632832e-05, + "loss": 0.0301, + "step": 118510 + }, + { + "epoch": 0.2613837104930607, + "grad_norm": 0.11398147791624069, + "learning_rate": 2.1248872042081924e-05, + "loss": 0.0307, + "step": 118520 + }, + { + "epoch": 0.2614057644679589, + "grad_norm": 0.11001507192850113, + "learning_rate": 2.124736884495096e-05, + "loss": 0.029, + "step": 118530 + }, + { + "epoch": 0.26142781844285706, + "grad_norm": 0.10470324754714966, + "learning_rate": 2.1245865571908555e-05, + "loss": 0.0306, + "step": 118540 + }, + { + "epoch": 0.2614498724177552, + "grad_norm": 0.10762131959199905, + "learning_rate": 2.1244362222972996e-05, + "loss": 0.0295, + "step": 118550 + }, + { + "epoch": 0.2614719263926534, + "grad_norm": 0.10864081233739853, + "learning_rate": 2.1242858798162536e-05, + "loss": 0.0308, + "step": 118560 + }, + { + "epoch": 0.26149398036755156, + "grad_norm": 0.12175337225198746, + "learning_rate": 2.124135529749545e-05, + "loss": 0.031, + "step": 118570 + }, + { + "epoch": 0.2615160343424497, + "grad_norm": 0.11088195443153381, + "learning_rate": 2.1239851720990005e-05, + "loss": 0.0317, + "step": 118580 + }, + { + "epoch": 0.26153808831734787, + "grad_norm": 0.10236853361129761, + "learning_rate": 2.1238348068664466e-05, + "loss": 0.0329, + "step": 118590 + }, + { + "epoch": 0.26156014229224606, + "grad_norm": 0.08866892009973526, + "learning_rate": 2.1236844340537115e-05, + "loss": 0.0347, + "step": 118600 + }, + { + "epoch": 0.2615821962671442, + "grad_norm": 0.09859216958284378, + "learning_rate": 2.1235340536626207e-05, + "loss": 0.0316, + "step": 118610 + }, + { + "epoch": 0.26160425024204237, + "grad_norm": 0.0964575856924057, + "learning_rate": 2.123383665695003e-05, + "loss": 0.0299, + "step": 118620 + }, + { + "epoch": 0.26162630421694055, + "grad_norm": 0.11235557496547699, + "learning_rate": 2.1232332701526854e-05, + "loss": 0.0329, + "step": 118630 + }, + { + "epoch": 0.2616483581918387, + "grad_norm": 0.13402990996837616, + "learning_rate": 2.1230828670374946e-05, + "loss": 0.0317, + "step": 118640 + }, + { + "epoch": 0.26167041216673687, + "grad_norm": 0.09435800462961197, + "learning_rate": 2.1229324563512594e-05, + "loss": 0.0297, + "step": 118650 + }, + { + "epoch": 0.26169246614163505, + "grad_norm": 0.13591541349887848, + "learning_rate": 2.122782038095806e-05, + "loss": 0.0295, + "step": 118660 + }, + { + "epoch": 0.2617145201165332, + "grad_norm": 0.12525469064712524, + "learning_rate": 2.122631612272963e-05, + "loss": 0.0303, + "step": 118670 + }, + { + "epoch": 0.26173657409143136, + "grad_norm": 0.11849658191204071, + "learning_rate": 2.1224811788845582e-05, + "loss": 0.0302, + "step": 118680 + }, + { + "epoch": 0.26175862806632955, + "grad_norm": 0.13259364664554596, + "learning_rate": 2.1223307379324196e-05, + "loss": 0.0314, + "step": 118690 + }, + { + "epoch": 0.2617806820412277, + "grad_norm": 0.08611629903316498, + "learning_rate": 2.1221802894183744e-05, + "loss": 0.0315, + "step": 118700 + }, + { + "epoch": 0.26180273601612586, + "grad_norm": 0.11814247071743011, + "learning_rate": 2.1220298333442514e-05, + "loss": 0.0308, + "step": 118710 + }, + { + "epoch": 0.26182478999102404, + "grad_norm": 0.08774924278259277, + "learning_rate": 2.1218793697118787e-05, + "loss": 0.0305, + "step": 118720 + }, + { + "epoch": 0.26184684396592217, + "grad_norm": 0.11220364272594452, + "learning_rate": 2.1217288985230846e-05, + "loss": 0.0303, + "step": 118730 + }, + { + "epoch": 0.26186889794082036, + "grad_norm": 0.09840621054172516, + "learning_rate": 2.1215784197796973e-05, + "loss": 0.0302, + "step": 118740 + }, + { + "epoch": 0.26189095191571854, + "grad_norm": 0.11980140954256058, + "learning_rate": 2.1214279334835455e-05, + "loss": 0.0327, + "step": 118750 + }, + { + "epoch": 0.26191300589061667, + "grad_norm": 0.1303609311580658, + "learning_rate": 2.1212774396364572e-05, + "loss": 0.0311, + "step": 118760 + }, + { + "epoch": 0.26193505986551485, + "grad_norm": 0.10937344282865524, + "learning_rate": 2.1211269382402616e-05, + "loss": 0.0309, + "step": 118770 + }, + { + "epoch": 0.26195711384041304, + "grad_norm": 0.0898284986615181, + "learning_rate": 2.1209764292967873e-05, + "loss": 0.0327, + "step": 118780 + }, + { + "epoch": 0.2619791678153112, + "grad_norm": 0.1378730833530426, + "learning_rate": 2.120825912807863e-05, + "loss": 0.0316, + "step": 118790 + }, + { + "epoch": 0.26200122179020935, + "grad_norm": 0.11884403228759766, + "learning_rate": 2.1206753887753182e-05, + "loss": 0.0315, + "step": 118800 + }, + { + "epoch": 0.26202327576510753, + "grad_norm": 0.12917537987232208, + "learning_rate": 2.120524857200981e-05, + "loss": 0.0316, + "step": 118810 + }, + { + "epoch": 0.2620453297400057, + "grad_norm": 0.12599997222423553, + "learning_rate": 2.1203743180866806e-05, + "loss": 0.0327, + "step": 118820 + }, + { + "epoch": 0.26206738371490385, + "grad_norm": 0.0982011929154396, + "learning_rate": 2.120223771434247e-05, + "loss": 0.03, + "step": 118830 + }, + { + "epoch": 0.26208943768980203, + "grad_norm": 0.0999172180891037, + "learning_rate": 2.1200732172455087e-05, + "loss": 0.0302, + "step": 118840 + }, + { + "epoch": 0.2621114916647002, + "grad_norm": 0.09182103723287582, + "learning_rate": 2.1199226555222954e-05, + "loss": 0.0304, + "step": 118850 + }, + { + "epoch": 0.26213354563959834, + "grad_norm": 0.14563347399234772, + "learning_rate": 2.1197720862664364e-05, + "loss": 0.031, + "step": 118860 + }, + { + "epoch": 0.2621555996144965, + "grad_norm": 0.08382372558116913, + "learning_rate": 2.119621509479762e-05, + "loss": 0.0304, + "step": 118870 + }, + { + "epoch": 0.2621776535893947, + "grad_norm": 0.09279953688383102, + "learning_rate": 2.1194709251641007e-05, + "loss": 0.0327, + "step": 118880 + }, + { + "epoch": 0.26219970756429284, + "grad_norm": 0.12559527158737183, + "learning_rate": 2.119320333321283e-05, + "loss": 0.0324, + "step": 118890 + }, + { + "epoch": 0.262221761539191, + "grad_norm": 0.1476268470287323, + "learning_rate": 2.1191697339531382e-05, + "loss": 0.032, + "step": 118900 + }, + { + "epoch": 0.2622438155140892, + "grad_norm": 0.09541784971952438, + "learning_rate": 2.1190191270614973e-05, + "loss": 0.0323, + "step": 118910 + }, + { + "epoch": 0.26226586948898734, + "grad_norm": 0.09621307998895645, + "learning_rate": 2.118868512648189e-05, + "loss": 0.0296, + "step": 118920 + }, + { + "epoch": 0.2622879234638855, + "grad_norm": 0.11721495538949966, + "learning_rate": 2.1187178907150436e-05, + "loss": 0.0301, + "step": 118930 + }, + { + "epoch": 0.2623099774387837, + "grad_norm": 0.0994182825088501, + "learning_rate": 2.1185672612638923e-05, + "loss": 0.032, + "step": 118940 + }, + { + "epoch": 0.26233203141368183, + "grad_norm": 0.1205756813287735, + "learning_rate": 2.118416624296564e-05, + "loss": 0.0299, + "step": 118950 + }, + { + "epoch": 0.26235408538858, + "grad_norm": 0.12202832847833633, + "learning_rate": 2.1182659798148908e-05, + "loss": 0.0311, + "step": 118960 + }, + { + "epoch": 0.2623761393634782, + "grad_norm": 0.1013128012418747, + "learning_rate": 2.1181153278207017e-05, + "loss": 0.0315, + "step": 118970 + }, + { + "epoch": 0.26239819333837633, + "grad_norm": 0.11936724185943604, + "learning_rate": 2.1179646683158278e-05, + "loss": 0.0311, + "step": 118980 + }, + { + "epoch": 0.2624202473132745, + "grad_norm": 0.132342129945755, + "learning_rate": 2.1178140013021e-05, + "loss": 0.0311, + "step": 118990 + }, + { + "epoch": 0.2624423012881727, + "grad_norm": 0.10800536721944809, + "learning_rate": 2.117663326781348e-05, + "loss": 0.0324, + "step": 119000 + }, + { + "epoch": 0.2624643552630708, + "grad_norm": 0.10611645877361298, + "learning_rate": 2.1175126447554043e-05, + "loss": 0.0314, + "step": 119010 + }, + { + "epoch": 0.262486409237969, + "grad_norm": 0.0943336933851242, + "learning_rate": 2.1173619552260984e-05, + "loss": 0.0326, + "step": 119020 + }, + { + "epoch": 0.2625084632128672, + "grad_norm": 0.1052938774228096, + "learning_rate": 2.117211258195262e-05, + "loss": 0.0335, + "step": 119030 + }, + { + "epoch": 0.2625305171877653, + "grad_norm": 0.10048777610063553, + "learning_rate": 2.1170605536647262e-05, + "loss": 0.0297, + "step": 119040 + }, + { + "epoch": 0.2625525711626635, + "grad_norm": 0.12670114636421204, + "learning_rate": 2.116909841636322e-05, + "loss": 0.0307, + "step": 119050 + }, + { + "epoch": 0.2625746251375617, + "grad_norm": 0.08265762776136398, + "learning_rate": 2.116759122111881e-05, + "loss": 0.0316, + "step": 119060 + }, + { + "epoch": 0.2625966791124598, + "grad_norm": 0.1032082736492157, + "learning_rate": 2.116608395093234e-05, + "loss": 0.0298, + "step": 119070 + }, + { + "epoch": 0.262618733087358, + "grad_norm": 0.09388270229101181, + "learning_rate": 2.116457660582213e-05, + "loss": 0.0285, + "step": 119080 + }, + { + "epoch": 0.2626407870622562, + "grad_norm": 0.0864633247256279, + "learning_rate": 2.116306918580649e-05, + "loss": 0.0302, + "step": 119090 + }, + { + "epoch": 0.2626628410371543, + "grad_norm": 0.10762950778007507, + "learning_rate": 2.1161561690903742e-05, + "loss": 0.0304, + "step": 119100 + }, + { + "epoch": 0.2626848950120525, + "grad_norm": 0.09541695564985275, + "learning_rate": 2.1160054121132205e-05, + "loss": 0.0294, + "step": 119110 + }, + { + "epoch": 0.2627069489869507, + "grad_norm": 0.09382601082324982, + "learning_rate": 2.1158546476510192e-05, + "loss": 0.0294, + "step": 119120 + }, + { + "epoch": 0.2627290029618488, + "grad_norm": 0.10153429955244064, + "learning_rate": 2.1157038757056027e-05, + "loss": 0.0313, + "step": 119130 + }, + { + "epoch": 0.262751056936747, + "grad_norm": 0.11825316399335861, + "learning_rate": 2.1155530962788026e-05, + "loss": 0.03, + "step": 119140 + }, + { + "epoch": 0.2627731109116452, + "grad_norm": 0.1056453287601471, + "learning_rate": 2.1154023093724515e-05, + "loss": 0.0293, + "step": 119150 + }, + { + "epoch": 0.2627951648865433, + "grad_norm": 0.08113126456737518, + "learning_rate": 2.1152515149883807e-05, + "loss": 0.0306, + "step": 119160 + }, + { + "epoch": 0.2628172188614415, + "grad_norm": 0.08105114847421646, + "learning_rate": 2.1151007131284236e-05, + "loss": 0.0313, + "step": 119170 + }, + { + "epoch": 0.2628392728363397, + "grad_norm": 0.10054042190313339, + "learning_rate": 2.1149499037944117e-05, + "loss": 0.0296, + "step": 119180 + }, + { + "epoch": 0.2628613268112378, + "grad_norm": 0.11158612370491028, + "learning_rate": 2.114799086988178e-05, + "loss": 0.0321, + "step": 119190 + }, + { + "epoch": 0.262883380786136, + "grad_norm": 0.09671954810619354, + "learning_rate": 2.1146482627115553e-05, + "loss": 0.0309, + "step": 119200 + }, + { + "epoch": 0.2629054347610342, + "grad_norm": 0.13429167866706848, + "learning_rate": 2.1144974309663754e-05, + "loss": 0.029, + "step": 119210 + }, + { + "epoch": 0.2629274887359323, + "grad_norm": 0.12125193327665329, + "learning_rate": 2.1143465917544723e-05, + "loss": 0.0312, + "step": 119220 + }, + { + "epoch": 0.2629495427108305, + "grad_norm": 0.4071250855922699, + "learning_rate": 2.1141957450776773e-05, + "loss": 0.0303, + "step": 119230 + }, + { + "epoch": 0.26297159668572867, + "grad_norm": 0.1164044588804245, + "learning_rate": 2.114044890937824e-05, + "loss": 0.0306, + "step": 119240 + }, + { + "epoch": 0.2629936506606268, + "grad_norm": 0.09032745659351349, + "learning_rate": 2.113894029336746e-05, + "loss": 0.0321, + "step": 119250 + }, + { + "epoch": 0.263015704635525, + "grad_norm": 0.09421208500862122, + "learning_rate": 2.1137431602762757e-05, + "loss": 0.0298, + "step": 119260 + }, + { + "epoch": 0.26303775861042317, + "grad_norm": 0.10817933827638626, + "learning_rate": 2.1135922837582464e-05, + "loss": 0.0314, + "step": 119270 + }, + { + "epoch": 0.2630598125853213, + "grad_norm": 0.1092933863401413, + "learning_rate": 2.1134413997844918e-05, + "loss": 0.031, + "step": 119280 + }, + { + "epoch": 0.2630818665602195, + "grad_norm": 0.08767269551753998, + "learning_rate": 2.113290508356845e-05, + "loss": 0.0295, + "step": 119290 + }, + { + "epoch": 0.26310392053511766, + "grad_norm": 0.12514199316501617, + "learning_rate": 2.1131396094771394e-05, + "loss": 0.0316, + "step": 119300 + }, + { + "epoch": 0.2631259745100158, + "grad_norm": 0.10002446919679642, + "learning_rate": 2.1129887031472085e-05, + "loss": 0.0316, + "step": 119310 + }, + { + "epoch": 0.263148028484914, + "grad_norm": 0.09654571115970612, + "learning_rate": 2.1128377893688863e-05, + "loss": 0.0298, + "step": 119320 + }, + { + "epoch": 0.26317008245981216, + "grad_norm": 0.1264255940914154, + "learning_rate": 2.1126868681440064e-05, + "loss": 0.0321, + "step": 119330 + }, + { + "epoch": 0.2631921364347103, + "grad_norm": 0.11330604553222656, + "learning_rate": 2.1125359394744025e-05, + "loss": 0.0307, + "step": 119340 + }, + { + "epoch": 0.2632141904096085, + "grad_norm": 0.11367066949605942, + "learning_rate": 2.1123850033619086e-05, + "loss": 0.0316, + "step": 119350 + }, + { + "epoch": 0.26323624438450666, + "grad_norm": 0.10900755226612091, + "learning_rate": 2.1122340598083583e-05, + "loss": 0.0341, + "step": 119360 + }, + { + "epoch": 0.2632582983594048, + "grad_norm": 0.0843290314078331, + "learning_rate": 2.1120831088155866e-05, + "loss": 0.0298, + "step": 119370 + }, + { + "epoch": 0.26328035233430297, + "grad_norm": 0.10143706202507019, + "learning_rate": 2.1119321503854276e-05, + "loss": 0.031, + "step": 119380 + }, + { + "epoch": 0.26330240630920115, + "grad_norm": 0.10568298399448395, + "learning_rate": 2.1117811845197145e-05, + "loss": 0.0331, + "step": 119390 + }, + { + "epoch": 0.2633244602840993, + "grad_norm": 0.10984261333942413, + "learning_rate": 2.1116302112202828e-05, + "loss": 0.0305, + "step": 119400 + }, + { + "epoch": 0.26334651425899747, + "grad_norm": 0.137344092130661, + "learning_rate": 2.1114792304889663e-05, + "loss": 0.0318, + "step": 119410 + }, + { + "epoch": 0.26336856823389565, + "grad_norm": 0.13048681616783142, + "learning_rate": 2.1113282423275998e-05, + "loss": 0.0326, + "step": 119420 + }, + { + "epoch": 0.2633906222087938, + "grad_norm": 0.1446499526500702, + "learning_rate": 2.111177246738018e-05, + "loss": 0.0319, + "step": 119430 + }, + { + "epoch": 0.26341267618369196, + "grad_norm": 0.11788050830364227, + "learning_rate": 2.1110262437220558e-05, + "loss": 0.03, + "step": 119440 + }, + { + "epoch": 0.26343473015859015, + "grad_norm": 0.09617333114147186, + "learning_rate": 2.1108752332815476e-05, + "loss": 0.0302, + "step": 119450 + }, + { + "epoch": 0.2634567841334883, + "grad_norm": 0.10420850664377213, + "learning_rate": 2.1107242154183288e-05, + "loss": 0.0313, + "step": 119460 + }, + { + "epoch": 0.26347883810838646, + "grad_norm": 0.10132463276386261, + "learning_rate": 2.110573190134234e-05, + "loss": 0.0314, + "step": 119470 + }, + { + "epoch": 0.26350089208328464, + "grad_norm": 0.12857069075107574, + "learning_rate": 2.1104221574310986e-05, + "loss": 0.0328, + "step": 119480 + }, + { + "epoch": 0.26352294605818277, + "grad_norm": 0.10390503704547882, + "learning_rate": 2.1102711173107575e-05, + "loss": 0.029, + "step": 119490 + }, + { + "epoch": 0.26354500003308096, + "grad_norm": 0.10497821122407913, + "learning_rate": 2.1101200697750462e-05, + "loss": 0.0299, + "step": 119500 + }, + { + "epoch": 0.26356705400797914, + "grad_norm": 0.11765904724597931, + "learning_rate": 2.1099690148258e-05, + "loss": 0.0329, + "step": 119510 + }, + { + "epoch": 0.26358910798287727, + "grad_norm": 0.10548201948404312, + "learning_rate": 2.1098179524648545e-05, + "loss": 0.031, + "step": 119520 + }, + { + "epoch": 0.26361116195777545, + "grad_norm": 0.12001193314790726, + "learning_rate": 2.1096668826940445e-05, + "loss": 0.0315, + "step": 119530 + }, + { + "epoch": 0.26363321593267364, + "grad_norm": 0.11678630113601685, + "learning_rate": 2.1095158055152073e-05, + "loss": 0.0304, + "step": 119540 + }, + { + "epoch": 0.26365526990757177, + "grad_norm": 0.10338198393583298, + "learning_rate": 2.1093647209301766e-05, + "loss": 0.0317, + "step": 119550 + }, + { + "epoch": 0.26367732388246995, + "grad_norm": 0.14823457598686218, + "learning_rate": 2.1092136289407898e-05, + "loss": 0.0303, + "step": 119560 + }, + { + "epoch": 0.26369937785736813, + "grad_norm": 0.10095272213220596, + "learning_rate": 2.109062529548882e-05, + "loss": 0.0303, + "step": 119570 + }, + { + "epoch": 0.26372143183226626, + "grad_norm": 0.10834347456693649, + "learning_rate": 2.1089114227562895e-05, + "loss": 0.0306, + "step": 119580 + }, + { + "epoch": 0.26374348580716445, + "grad_norm": 0.13934598863124847, + "learning_rate": 2.108760308564848e-05, + "loss": 0.0325, + "step": 119590 + }, + { + "epoch": 0.26376553978206263, + "grad_norm": 0.1303417980670929, + "learning_rate": 2.1086091869763947e-05, + "loss": 0.0317, + "step": 119600 + }, + { + "epoch": 0.26378759375696076, + "grad_norm": 0.12790022790431976, + "learning_rate": 2.1084580579927643e-05, + "loss": 0.0293, + "step": 119610 + }, + { + "epoch": 0.26380964773185894, + "grad_norm": 0.09466453641653061, + "learning_rate": 2.1083069216157946e-05, + "loss": 0.0302, + "step": 119620 + }, + { + "epoch": 0.2638317017067571, + "grad_norm": 0.09599921107292175, + "learning_rate": 2.1081557778473213e-05, + "loss": 0.0313, + "step": 119630 + }, + { + "epoch": 0.26385375568165526, + "grad_norm": 0.11194540560245514, + "learning_rate": 2.1080046266891812e-05, + "loss": 0.0322, + "step": 119640 + }, + { + "epoch": 0.26387580965655344, + "grad_norm": 0.10990627110004425, + "learning_rate": 2.1078534681432105e-05, + "loss": 0.0301, + "step": 119650 + }, + { + "epoch": 0.2638978636314516, + "grad_norm": 0.10116122663021088, + "learning_rate": 2.1077023022112467e-05, + "loss": 0.0298, + "step": 119660 + }, + { + "epoch": 0.2639199176063498, + "grad_norm": 0.09701313823461533, + "learning_rate": 2.1075511288951258e-05, + "loss": 0.03, + "step": 119670 + }, + { + "epoch": 0.26394197158124794, + "grad_norm": 0.08776673674583435, + "learning_rate": 2.1073999481966852e-05, + "loss": 0.0323, + "step": 119680 + }, + { + "epoch": 0.2639640255561461, + "grad_norm": 0.09027062356472015, + "learning_rate": 2.107248760117762e-05, + "loss": 0.032, + "step": 119690 + }, + { + "epoch": 0.2639860795310443, + "grad_norm": 0.09795106947422028, + "learning_rate": 2.1070975646601926e-05, + "loss": 0.0304, + "step": 119700 + }, + { + "epoch": 0.26400813350594243, + "grad_norm": 0.08149917423725128, + "learning_rate": 2.1069463618258148e-05, + "loss": 0.0298, + "step": 119710 + }, + { + "epoch": 0.2640301874808406, + "grad_norm": 0.10576417297124863, + "learning_rate": 2.1067951516164658e-05, + "loss": 0.0303, + "step": 119720 + }, + { + "epoch": 0.2640522414557388, + "grad_norm": 0.12984874844551086, + "learning_rate": 2.1066439340339822e-05, + "loss": 0.0318, + "step": 119730 + }, + { + "epoch": 0.26407429543063693, + "grad_norm": 0.10755408555269241, + "learning_rate": 2.1064927090802026e-05, + "loss": 0.0311, + "step": 119740 + }, + { + "epoch": 0.2640963494055351, + "grad_norm": 0.09182904660701752, + "learning_rate": 2.106341476756964e-05, + "loss": 0.0292, + "step": 119750 + }, + { + "epoch": 0.2641184033804333, + "grad_norm": 0.1563674956560135, + "learning_rate": 2.106190237066104e-05, + "loss": 0.0307, + "step": 119760 + }, + { + "epoch": 0.2641404573553314, + "grad_norm": 0.12296722829341888, + "learning_rate": 2.1060389900094597e-05, + "loss": 0.0304, + "step": 119770 + }, + { + "epoch": 0.2641625113302296, + "grad_norm": 0.09097631275653839, + "learning_rate": 2.10588773558887e-05, + "loss": 0.0307, + "step": 119780 + }, + { + "epoch": 0.2641845653051278, + "grad_norm": 0.0949196070432663, + "learning_rate": 2.105736473806172e-05, + "loss": 0.0306, + "step": 119790 + }, + { + "epoch": 0.2642066192800259, + "grad_norm": 0.08432763069868088, + "learning_rate": 2.105585204663204e-05, + "loss": 0.0312, + "step": 119800 + }, + { + "epoch": 0.2642286732549241, + "grad_norm": 0.09826645255088806, + "learning_rate": 2.105433928161804e-05, + "loss": 0.0301, + "step": 119810 + }, + { + "epoch": 0.2642507272298223, + "grad_norm": 0.12151334434747696, + "learning_rate": 2.10528264430381e-05, + "loss": 0.0319, + "step": 119820 + }, + { + "epoch": 0.2642727812047204, + "grad_norm": 0.10794498026371002, + "learning_rate": 2.1051313530910607e-05, + "loss": 0.0327, + "step": 119830 + }, + { + "epoch": 0.2642948351796186, + "grad_norm": 0.09124378859996796, + "learning_rate": 2.1049800545253942e-05, + "loss": 0.031, + "step": 119840 + }, + { + "epoch": 0.2643168891545168, + "grad_norm": 0.10969754308462143, + "learning_rate": 2.1048287486086487e-05, + "loss": 0.0306, + "step": 119850 + }, + { + "epoch": 0.2643389431294149, + "grad_norm": 0.11263324320316315, + "learning_rate": 2.1046774353426627e-05, + "loss": 0.0304, + "step": 119860 + }, + { + "epoch": 0.2643609971043131, + "grad_norm": 0.10756392031908035, + "learning_rate": 2.104526114729275e-05, + "loss": 0.031, + "step": 119870 + }, + { + "epoch": 0.2643830510792113, + "grad_norm": 0.12204356491565704, + "learning_rate": 2.1043747867703243e-05, + "loss": 0.0336, + "step": 119880 + }, + { + "epoch": 0.2644051050541094, + "grad_norm": 0.09367257356643677, + "learning_rate": 2.1042234514676493e-05, + "loss": 0.0309, + "step": 119890 + }, + { + "epoch": 0.2644271590290076, + "grad_norm": 0.09800727665424347, + "learning_rate": 2.104072108823089e-05, + "loss": 0.0298, + "step": 119900 + }, + { + "epoch": 0.2644492130039058, + "grad_norm": 0.09435547888278961, + "learning_rate": 2.103920758838482e-05, + "loss": 0.0289, + "step": 119910 + }, + { + "epoch": 0.2644712669788039, + "grad_norm": 0.25781381130218506, + "learning_rate": 2.103769401515668e-05, + "loss": 0.0326, + "step": 119920 + }, + { + "epoch": 0.2644933209537021, + "grad_norm": 0.09314035624265671, + "learning_rate": 2.103618036856485e-05, + "loss": 0.0316, + "step": 119930 + }, + { + "epoch": 0.2645153749286003, + "grad_norm": 0.11023598909378052, + "learning_rate": 2.1034666648627735e-05, + "loss": 0.031, + "step": 119940 + }, + { + "epoch": 0.2645374289034984, + "grad_norm": 0.1148313656449318, + "learning_rate": 2.1033152855363722e-05, + "loss": 0.0301, + "step": 119950 + }, + { + "epoch": 0.2645594828783966, + "grad_norm": 0.08967508375644684, + "learning_rate": 2.1031638988791207e-05, + "loss": 0.0321, + "step": 119960 + }, + { + "epoch": 0.2645815368532948, + "grad_norm": 0.1349833458662033, + "learning_rate": 2.1030125048928583e-05, + "loss": 0.0305, + "step": 119970 + }, + { + "epoch": 0.2646035908281929, + "grad_norm": 0.1327172964811325, + "learning_rate": 2.1028611035794237e-05, + "loss": 0.031, + "step": 119980 + }, + { + "epoch": 0.2646256448030911, + "grad_norm": 0.09229756891727448, + "learning_rate": 2.1027096949406586e-05, + "loss": 0.0308, + "step": 119990 + }, + { + "epoch": 0.26464769877798927, + "grad_norm": 0.0978289544582367, + "learning_rate": 2.102558278978401e-05, + "loss": 0.0298, + "step": 120000 + } + ], + "logging_steps": 10, + "max_steps": 300000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 20000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +}