{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 19899, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015076134479119555, "grad_norm": 8.466437339782715, "learning_rate": 5.025125628140704e-08, "loss": 0.5183, "step": 10 }, { "epoch": 0.003015226895823911, "grad_norm": 6.377927303314209, "learning_rate": 1.0050251256281409e-07, "loss": 0.4951, "step": 20 }, { "epoch": 0.004522840343735866, "grad_norm": 4.76903772354126, "learning_rate": 1.5075376884422112e-07, "loss": 0.4449, "step": 30 }, { "epoch": 0.006030453791647822, "grad_norm": 4.432534694671631, "learning_rate": 2.0100502512562817e-07, "loss": 0.4111, "step": 40 }, { "epoch": 0.007538067239559777, "grad_norm": 3.7766637802124023, "learning_rate": 2.512562814070352e-07, "loss": 0.5161, "step": 50 }, { "epoch": 0.009045680687471733, "grad_norm": 9.47236156463623, "learning_rate": 3.0150753768844224e-07, "loss": 0.4049, "step": 60 }, { "epoch": 0.010553294135383688, "grad_norm": 4.561679840087891, "learning_rate": 3.5175879396984927e-07, "loss": 0.4068, "step": 70 }, { "epoch": 0.012060907583295644, "grad_norm": 7.666910171508789, "learning_rate": 4.0201005025125634e-07, "loss": 0.4016, "step": 80 }, { "epoch": 0.013568521031207599, "grad_norm": 5.0828166007995605, "learning_rate": 4.5226130653266337e-07, "loss": 0.3999, "step": 90 }, { "epoch": 0.015076134479119555, "grad_norm": 5.3433122634887695, "learning_rate": 5.025125628140704e-07, "loss": 0.4364, "step": 100 }, { "epoch": 0.01658374792703151, "grad_norm": 5.404842376708984, "learning_rate": 5.527638190954775e-07, "loss": 0.3805, "step": 110 }, { "epoch": 0.018091361374943465, "grad_norm": 5.883118629455566, "learning_rate": 6.030150753768845e-07, "loss": 0.4356, "step": 120 }, { "epoch": 0.01959897482285542, "grad_norm": 6.464461326599121, "learning_rate": 6.532663316582916e-07, "loss": 0.3889, "step": 130 }, { "epoch": 0.021106588270767376, "grad_norm": 6.58128023147583, "learning_rate": 7.035175879396985e-07, "loss": 0.4009, "step": 140 }, { "epoch": 0.022614201718679332, "grad_norm": 4.057061672210693, "learning_rate": 7.537688442211055e-07, "loss": 0.3517, "step": 150 }, { "epoch": 0.024121815166591287, "grad_norm": 3.9245855808258057, "learning_rate": 8.040201005025127e-07, "loss": 0.3786, "step": 160 }, { "epoch": 0.025629428614503243, "grad_norm": 4.780637264251709, "learning_rate": 8.542713567839197e-07, "loss": 0.3048, "step": 170 }, { "epoch": 0.027137042062415198, "grad_norm": 4.121255397796631, "learning_rate": 9.045226130653267e-07, "loss": 0.3208, "step": 180 }, { "epoch": 0.028644655510327154, "grad_norm": 4.083913803100586, "learning_rate": 9.547738693467337e-07, "loss": 0.3537, "step": 190 }, { "epoch": 0.03015226895823911, "grad_norm": 4.489317417144775, "learning_rate": 1.0050251256281409e-06, "loss": 0.328, "step": 200 }, { "epoch": 0.031659882406151064, "grad_norm": 4.143144607543945, "learning_rate": 1.0552763819095479e-06, "loss": 0.2938, "step": 210 }, { "epoch": 0.03316749585406302, "grad_norm": 4.063299655914307, "learning_rate": 1.105527638190955e-06, "loss": 0.3108, "step": 220 }, { "epoch": 0.034675109301974975, "grad_norm": 4.924376010894775, "learning_rate": 1.155778894472362e-06, "loss": 0.38, "step": 230 }, { "epoch": 0.03618272274988693, "grad_norm": 4.5875725746154785, "learning_rate": 1.206030150753769e-06, "loss": 0.3369, "step": 240 }, { "epoch": 0.037690336197798886, "grad_norm": 2.794879198074341, "learning_rate": 1.256281407035176e-06, "loss": 0.3755, "step": 250 }, { "epoch": 0.03919794964571084, "grad_norm": 5.770720481872559, "learning_rate": 1.3065326633165831e-06, "loss": 0.3477, "step": 260 }, { "epoch": 0.0407055630936228, "grad_norm": 14.629878044128418, "learning_rate": 1.35678391959799e-06, "loss": 0.3503, "step": 270 }, { "epoch": 0.04221317654153475, "grad_norm": 4.546604633331299, "learning_rate": 1.407035175879397e-06, "loss": 0.3264, "step": 280 }, { "epoch": 0.04372078998944671, "grad_norm": 3.433159589767456, "learning_rate": 1.457286432160804e-06, "loss": 0.389, "step": 290 }, { "epoch": 0.045228403437358664, "grad_norm": 4.80047607421875, "learning_rate": 1.507537688442211e-06, "loss": 0.3306, "step": 300 }, { "epoch": 0.04673601688527062, "grad_norm": 3.4989206790924072, "learning_rate": 1.5577889447236184e-06, "loss": 0.3689, "step": 310 }, { "epoch": 0.048243630333182574, "grad_norm": 5.795087814331055, "learning_rate": 1.6080402010050254e-06, "loss": 0.3609, "step": 320 }, { "epoch": 0.04975124378109453, "grad_norm": 2.6186094284057617, "learning_rate": 1.6582914572864323e-06, "loss": 0.298, "step": 330 }, { "epoch": 0.051258857229006485, "grad_norm": 3.2044849395751953, "learning_rate": 1.7085427135678393e-06, "loss": 0.2863, "step": 340 }, { "epoch": 0.05276647067691844, "grad_norm": 5.251688480377197, "learning_rate": 1.7587939698492465e-06, "loss": 0.3941, "step": 350 }, { "epoch": 0.054274084124830396, "grad_norm": 6.320252895355225, "learning_rate": 1.8090452261306535e-06, "loss": 0.2963, "step": 360 }, { "epoch": 0.05578169757274235, "grad_norm": 3.1780920028686523, "learning_rate": 1.8592964824120604e-06, "loss": 0.3253, "step": 370 }, { "epoch": 0.05728931102065431, "grad_norm": 4.07274866104126, "learning_rate": 1.9095477386934674e-06, "loss": 0.3388, "step": 380 }, { "epoch": 0.05879692446856626, "grad_norm": 3.737522840499878, "learning_rate": 1.9597989949748746e-06, "loss": 0.3154, "step": 390 }, { "epoch": 0.06030453791647822, "grad_norm": 3.700758218765259, "learning_rate": 2.0100502512562818e-06, "loss": 0.2646, "step": 400 }, { "epoch": 0.061812151364390173, "grad_norm": 3.022284984588623, "learning_rate": 2.0603015075376885e-06, "loss": 0.3061, "step": 410 }, { "epoch": 0.06331976481230213, "grad_norm": 3.63232159614563, "learning_rate": 2.1105527638190957e-06, "loss": 0.3233, "step": 420 }, { "epoch": 0.06482737826021408, "grad_norm": 3.972503900527954, "learning_rate": 2.1608040201005025e-06, "loss": 0.3035, "step": 430 }, { "epoch": 0.06633499170812604, "grad_norm": 2.6685028076171875, "learning_rate": 2.21105527638191e-06, "loss": 0.2917, "step": 440 }, { "epoch": 0.06784260515603799, "grad_norm": 4.319540977478027, "learning_rate": 2.261306532663317e-06, "loss": 0.3208, "step": 450 }, { "epoch": 0.06935021860394995, "grad_norm": 5.625812530517578, "learning_rate": 2.311557788944724e-06, "loss": 0.2697, "step": 460 }, { "epoch": 0.0708578320518619, "grad_norm": 4.213413715362549, "learning_rate": 2.3618090452261308e-06, "loss": 0.3167, "step": 470 }, { "epoch": 0.07236544549977386, "grad_norm": 3.17311954498291, "learning_rate": 2.412060301507538e-06, "loss": 0.2806, "step": 480 }, { "epoch": 0.07387305894768581, "grad_norm": 3.4852893352508545, "learning_rate": 2.462311557788945e-06, "loss": 0.3274, "step": 490 }, { "epoch": 0.07538067239559777, "grad_norm": 3.9858596324920654, "learning_rate": 2.512562814070352e-06, "loss": 0.2663, "step": 500 }, { "epoch": 0.07688828584350972, "grad_norm": 4.0728583335876465, "learning_rate": 2.562814070351759e-06, "loss": 0.3425, "step": 510 }, { "epoch": 0.07839589929142168, "grad_norm": 2.3040196895599365, "learning_rate": 2.6130653266331663e-06, "loss": 0.2931, "step": 520 }, { "epoch": 0.07990351273933363, "grad_norm": 6.347341537475586, "learning_rate": 2.663316582914573e-06, "loss": 0.3964, "step": 530 }, { "epoch": 0.0814111261872456, "grad_norm": 4.884843349456787, "learning_rate": 2.71356783919598e-06, "loss": 0.3244, "step": 540 }, { "epoch": 0.08291873963515754, "grad_norm": 2.759336233139038, "learning_rate": 2.763819095477387e-06, "loss": 0.263, "step": 550 }, { "epoch": 0.0844263530830695, "grad_norm": 3.1676714420318604, "learning_rate": 2.814070351758794e-06, "loss": 0.2837, "step": 560 }, { "epoch": 0.08593396653098145, "grad_norm": 4.213423728942871, "learning_rate": 2.8643216080402013e-06, "loss": 0.3544, "step": 570 }, { "epoch": 0.08744157997889342, "grad_norm": 4.730088233947754, "learning_rate": 2.914572864321608e-06, "loss": 0.2776, "step": 580 }, { "epoch": 0.08894919342680536, "grad_norm": 4.058215141296387, "learning_rate": 2.9648241206030153e-06, "loss": 0.276, "step": 590 }, { "epoch": 0.09045680687471733, "grad_norm": 4.410142421722412, "learning_rate": 3.015075376884422e-06, "loss": 0.328, "step": 600 }, { "epoch": 0.09196442032262928, "grad_norm": 4.103255748748779, "learning_rate": 3.065326633165829e-06, "loss": 0.3113, "step": 610 }, { "epoch": 0.09347203377054124, "grad_norm": 3.681431770324707, "learning_rate": 3.115577889447237e-06, "loss": 0.2949, "step": 620 }, { "epoch": 0.09497964721845319, "grad_norm": 3.7948412895202637, "learning_rate": 3.165829145728643e-06, "loss": 0.2848, "step": 630 }, { "epoch": 0.09648726066636515, "grad_norm": 3.1312339305877686, "learning_rate": 3.2160804020100507e-06, "loss": 0.3172, "step": 640 }, { "epoch": 0.0979948741142771, "grad_norm": 3.5624351501464844, "learning_rate": 3.266331658291458e-06, "loss": 0.33, "step": 650 }, { "epoch": 0.09950248756218906, "grad_norm": 6.255454063415527, "learning_rate": 3.3165829145728647e-06, "loss": 0.3307, "step": 660 }, { "epoch": 0.10101010101010101, "grad_norm": 3.932722806930542, "learning_rate": 3.366834170854272e-06, "loss": 0.2545, "step": 670 }, { "epoch": 0.10251771445801297, "grad_norm": 2.9681460857391357, "learning_rate": 3.4170854271356786e-06, "loss": 0.3089, "step": 680 }, { "epoch": 0.10402532790592492, "grad_norm": 4.814634799957275, "learning_rate": 3.467336683417086e-06, "loss": 0.2808, "step": 690 }, { "epoch": 0.10553294135383688, "grad_norm": 3.2699813842773438, "learning_rate": 3.517587939698493e-06, "loss": 0.3309, "step": 700 }, { "epoch": 0.10704055480174883, "grad_norm": 3.65628981590271, "learning_rate": 3.5678391959798997e-06, "loss": 0.3141, "step": 710 }, { "epoch": 0.10854816824966079, "grad_norm": 2.4147298336029053, "learning_rate": 3.618090452261307e-06, "loss": 0.2638, "step": 720 }, { "epoch": 0.11005578169757274, "grad_norm": 2.8677515983581543, "learning_rate": 3.6683417085427137e-06, "loss": 0.3159, "step": 730 }, { "epoch": 0.1115633951454847, "grad_norm": 3.7638463973999023, "learning_rate": 3.718592964824121e-06, "loss": 0.2703, "step": 740 }, { "epoch": 0.11307100859339665, "grad_norm": 3.0089945793151855, "learning_rate": 3.768844221105528e-06, "loss": 0.2938, "step": 750 }, { "epoch": 0.11457862204130861, "grad_norm": 5.12350606918335, "learning_rate": 3.819095477386935e-06, "loss": 0.3343, "step": 760 }, { "epoch": 0.11608623548922056, "grad_norm": 4.8278489112854, "learning_rate": 3.869346733668342e-06, "loss": 0.3057, "step": 770 }, { "epoch": 0.11759384893713253, "grad_norm": 2.5069169998168945, "learning_rate": 3.919597989949749e-06, "loss": 0.2722, "step": 780 }, { "epoch": 0.11910146238504447, "grad_norm": 5.142627716064453, "learning_rate": 3.969849246231156e-06, "loss": 0.2429, "step": 790 }, { "epoch": 0.12060907583295644, "grad_norm": 4.437289237976074, "learning_rate": 4.0201005025125635e-06, "loss": 0.2952, "step": 800 }, { "epoch": 0.12211668928086838, "grad_norm": 5.262016773223877, "learning_rate": 4.07035175879397e-06, "loss": 0.3145, "step": 810 }, { "epoch": 0.12362430272878035, "grad_norm": 2.919447183609009, "learning_rate": 4.120603015075377e-06, "loss": 0.3359, "step": 820 }, { "epoch": 0.1251319161766923, "grad_norm": 3.793212413787842, "learning_rate": 4.170854271356784e-06, "loss": 0.3662, "step": 830 }, { "epoch": 0.12663952962460426, "grad_norm": 2.2590548992156982, "learning_rate": 4.221105527638191e-06, "loss": 0.3356, "step": 840 }, { "epoch": 0.1281471430725162, "grad_norm": 4.258463382720947, "learning_rate": 4.271356783919598e-06, "loss": 0.3013, "step": 850 }, { "epoch": 0.12965475652042815, "grad_norm": 4.656329154968262, "learning_rate": 4.321608040201005e-06, "loss": 0.2928, "step": 860 }, { "epoch": 0.13116236996834013, "grad_norm": 3.040794610977173, "learning_rate": 4.3718592964824125e-06, "loss": 0.2837, "step": 870 }, { "epoch": 0.13266998341625208, "grad_norm": 2.699613571166992, "learning_rate": 4.42211055276382e-06, "loss": 0.2844, "step": 880 }, { "epoch": 0.13417759686416403, "grad_norm": 5.510797023773193, "learning_rate": 4.472361809045226e-06, "loss": 0.3074, "step": 890 }, { "epoch": 0.13568521031207598, "grad_norm": 3.7837042808532715, "learning_rate": 4.522613065326634e-06, "loss": 0.3111, "step": 900 }, { "epoch": 0.13719282375998793, "grad_norm": 2.6484439373016357, "learning_rate": 4.57286432160804e-06, "loss": 0.2994, "step": 910 }, { "epoch": 0.1387004372078999, "grad_norm": 5.390711307525635, "learning_rate": 4.623115577889448e-06, "loss": 0.3434, "step": 920 }, { "epoch": 0.14020805065581185, "grad_norm": 4.301868438720703, "learning_rate": 4.673366834170855e-06, "loss": 0.2703, "step": 930 }, { "epoch": 0.1417156641037238, "grad_norm": 3.3278768062591553, "learning_rate": 4.7236180904522615e-06, "loss": 0.33, "step": 940 }, { "epoch": 0.14322327755163575, "grad_norm": 4.29428243637085, "learning_rate": 4.773869346733669e-06, "loss": 0.3554, "step": 950 }, { "epoch": 0.14473089099954772, "grad_norm": 3.4742884635925293, "learning_rate": 4.824120603015076e-06, "loss": 0.2477, "step": 960 }, { "epoch": 0.14623850444745967, "grad_norm": 4.287181377410889, "learning_rate": 4.874371859296483e-06, "loss": 0.3007, "step": 970 }, { "epoch": 0.14774611789537162, "grad_norm": 3.2245290279388428, "learning_rate": 4.92462311557789e-06, "loss": 0.3068, "step": 980 }, { "epoch": 0.14925373134328357, "grad_norm": 2.9806454181671143, "learning_rate": 4.974874371859297e-06, "loss": 0.3114, "step": 990 }, { "epoch": 0.15076134479119555, "grad_norm": 5.1998491287231445, "learning_rate": 5.025125628140704e-06, "loss": 0.3145, "step": 1000 }, { "epoch": 0.1522689582391075, "grad_norm": 1.8972229957580566, "learning_rate": 5.0753768844221105e-06, "loss": 0.2911, "step": 1010 }, { "epoch": 0.15377657168701944, "grad_norm": 4.076289653778076, "learning_rate": 5.125628140703518e-06, "loss": 0.2732, "step": 1020 }, { "epoch": 0.1552841851349314, "grad_norm": 4.409232139587402, "learning_rate": 5.175879396984925e-06, "loss": 0.2304, "step": 1030 }, { "epoch": 0.15679179858284337, "grad_norm": 3.438533067703247, "learning_rate": 5.2261306532663325e-06, "loss": 0.2693, "step": 1040 }, { "epoch": 0.15829941203075532, "grad_norm": 3.3373851776123047, "learning_rate": 5.2763819095477384e-06, "loss": 0.2846, "step": 1050 }, { "epoch": 0.15980702547866726, "grad_norm": 4.502352714538574, "learning_rate": 5.326633165829146e-06, "loss": 0.3138, "step": 1060 }, { "epoch": 0.1613146389265792, "grad_norm": 4.1754374504089355, "learning_rate": 5.376884422110553e-06, "loss": 0.3063, "step": 1070 }, { "epoch": 0.1628222523744912, "grad_norm": 3.561445474624634, "learning_rate": 5.42713567839196e-06, "loss": 0.3021, "step": 1080 }, { "epoch": 0.16432986582240314, "grad_norm": 4.156134605407715, "learning_rate": 5.477386934673368e-06, "loss": 0.2603, "step": 1090 }, { "epoch": 0.16583747927031509, "grad_norm": 2.967522144317627, "learning_rate": 5.527638190954774e-06, "loss": 0.281, "step": 1100 }, { "epoch": 0.16734509271822703, "grad_norm": 4.860574245452881, "learning_rate": 5.577889447236181e-06, "loss": 0.2451, "step": 1110 }, { "epoch": 0.168852706166139, "grad_norm": 3.299337863922119, "learning_rate": 5.628140703517588e-06, "loss": 0.3059, "step": 1120 }, { "epoch": 0.17036031961405096, "grad_norm": 4.375476360321045, "learning_rate": 5.678391959798996e-06, "loss": 0.3123, "step": 1130 }, { "epoch": 0.1718679330619629, "grad_norm": 3.3299477100372314, "learning_rate": 5.728643216080403e-06, "loss": 0.2002, "step": 1140 }, { "epoch": 0.17337554650987486, "grad_norm": 2.254943609237671, "learning_rate": 5.778894472361809e-06, "loss": 0.2722, "step": 1150 }, { "epoch": 0.17488315995778683, "grad_norm": 2.612957000732422, "learning_rate": 5.829145728643216e-06, "loss": 0.312, "step": 1160 }, { "epoch": 0.17639077340569878, "grad_norm": 4.293173789978027, "learning_rate": 5.879396984924624e-06, "loss": 0.2694, "step": 1170 }, { "epoch": 0.17789838685361073, "grad_norm": 4.8992767333984375, "learning_rate": 5.9296482412060305e-06, "loss": 0.3296, "step": 1180 }, { "epoch": 0.17940600030152268, "grad_norm": 5.453653812408447, "learning_rate": 5.979899497487438e-06, "loss": 0.3205, "step": 1190 }, { "epoch": 0.18091361374943465, "grad_norm": 2.4829823970794678, "learning_rate": 6.030150753768844e-06, "loss": 0.4016, "step": 1200 }, { "epoch": 0.1824212271973466, "grad_norm": 3.1099557876586914, "learning_rate": 6.080402010050252e-06, "loss": 0.2832, "step": 1210 }, { "epoch": 0.18392884064525855, "grad_norm": 2.410034656524658, "learning_rate": 6.130653266331658e-06, "loss": 0.3354, "step": 1220 }, { "epoch": 0.1854364540931705, "grad_norm": 6.909908294677734, "learning_rate": 6.180904522613066e-06, "loss": 0.2765, "step": 1230 }, { "epoch": 0.18694406754108248, "grad_norm": 3.1648104190826416, "learning_rate": 6.231155778894474e-06, "loss": 0.2981, "step": 1240 }, { "epoch": 0.18845168098899442, "grad_norm": 4.3531880378723145, "learning_rate": 6.28140703517588e-06, "loss": 0.2591, "step": 1250 }, { "epoch": 0.18995929443690637, "grad_norm": 2.9781363010406494, "learning_rate": 6.331658291457286e-06, "loss": 0.2762, "step": 1260 }, { "epoch": 0.19146690788481832, "grad_norm": 2.6343753337860107, "learning_rate": 6.381909547738694e-06, "loss": 0.2653, "step": 1270 }, { "epoch": 0.1929745213327303, "grad_norm": 3.5649783611297607, "learning_rate": 6.4321608040201015e-06, "loss": 0.3203, "step": 1280 }, { "epoch": 0.19448213478064225, "grad_norm": 2.6057844161987305, "learning_rate": 6.482412060301508e-06, "loss": 0.2168, "step": 1290 }, { "epoch": 0.1959897482285542, "grad_norm": 2.0241429805755615, "learning_rate": 6.532663316582916e-06, "loss": 0.2744, "step": 1300 }, { "epoch": 0.19749736167646614, "grad_norm": 3.8744161128997803, "learning_rate": 6.582914572864322e-06, "loss": 0.3115, "step": 1310 }, { "epoch": 0.19900497512437812, "grad_norm": 3.4862043857574463, "learning_rate": 6.633165829145729e-06, "loss": 0.2986, "step": 1320 }, { "epoch": 0.20051258857229007, "grad_norm": 3.0941386222839355, "learning_rate": 6.683417085427136e-06, "loss": 0.2833, "step": 1330 }, { "epoch": 0.20202020202020202, "grad_norm": 2.1051924228668213, "learning_rate": 6.733668341708544e-06, "loss": 0.2826, "step": 1340 }, { "epoch": 0.20352781546811397, "grad_norm": 3.438100814819336, "learning_rate": 6.7839195979899505e-06, "loss": 0.2833, "step": 1350 }, { "epoch": 0.20503542891602594, "grad_norm": 2.937682867050171, "learning_rate": 6.834170854271357e-06, "loss": 0.2607, "step": 1360 }, { "epoch": 0.2065430423639379, "grad_norm": 2.570866107940674, "learning_rate": 6.884422110552764e-06, "loss": 0.2757, "step": 1370 }, { "epoch": 0.20805065581184984, "grad_norm": 2.3812942504882812, "learning_rate": 6.934673366834172e-06, "loss": 0.2785, "step": 1380 }, { "epoch": 0.2095582692597618, "grad_norm": 2.45267915725708, "learning_rate": 6.984924623115578e-06, "loss": 0.3287, "step": 1390 }, { "epoch": 0.21106588270767376, "grad_norm": 3.2904679775238037, "learning_rate": 7.035175879396986e-06, "loss": 0.2975, "step": 1400 }, { "epoch": 0.2125734961555857, "grad_norm": 2.8387913703918457, "learning_rate": 7.085427135678392e-06, "loss": 0.3006, "step": 1410 }, { "epoch": 0.21408110960349766, "grad_norm": 3.260937213897705, "learning_rate": 7.1356783919597995e-06, "loss": 0.3085, "step": 1420 }, { "epoch": 0.2155887230514096, "grad_norm": 2.9271090030670166, "learning_rate": 7.185929648241206e-06, "loss": 0.2426, "step": 1430 }, { "epoch": 0.21709633649932158, "grad_norm": 3.202342987060547, "learning_rate": 7.236180904522614e-06, "loss": 0.2703, "step": 1440 }, { "epoch": 0.21860394994723353, "grad_norm": 3.110767364501953, "learning_rate": 7.2864321608040215e-06, "loss": 0.284, "step": 1450 }, { "epoch": 0.22011156339514548, "grad_norm": 2.21466326713562, "learning_rate": 7.336683417085427e-06, "loss": 0.3194, "step": 1460 }, { "epoch": 0.22161917684305743, "grad_norm": 3.501913547515869, "learning_rate": 7.386934673366835e-06, "loss": 0.3467, "step": 1470 }, { "epoch": 0.2231267902909694, "grad_norm": 2.955921173095703, "learning_rate": 7.437185929648242e-06, "loss": 0.2904, "step": 1480 }, { "epoch": 0.22463440373888136, "grad_norm": 4.946940898895264, "learning_rate": 7.487437185929649e-06, "loss": 0.244, "step": 1490 }, { "epoch": 0.2261420171867933, "grad_norm": 4.053277015686035, "learning_rate": 7.537688442211056e-06, "loss": 0.2999, "step": 1500 }, { "epoch": 0.22764963063470525, "grad_norm": 3.2244439125061035, "learning_rate": 7.587939698492463e-06, "loss": 0.2667, "step": 1510 }, { "epoch": 0.22915724408261723, "grad_norm": 3.1936469078063965, "learning_rate": 7.63819095477387e-06, "loss": 0.3156, "step": 1520 }, { "epoch": 0.23066485753052918, "grad_norm": 5.090414047241211, "learning_rate": 7.688442211055276e-06, "loss": 0.2701, "step": 1530 }, { "epoch": 0.23217247097844113, "grad_norm": 4.028707504272461, "learning_rate": 7.738693467336685e-06, "loss": 0.2973, "step": 1540 }, { "epoch": 0.23368008442635307, "grad_norm": 2.9595274925231934, "learning_rate": 7.788944723618092e-06, "loss": 0.2772, "step": 1550 }, { "epoch": 0.23518769787426505, "grad_norm": 3.4496970176696777, "learning_rate": 7.839195979899498e-06, "loss": 0.2826, "step": 1560 }, { "epoch": 0.236695311322177, "grad_norm": 3.4038941860198975, "learning_rate": 7.889447236180905e-06, "loss": 0.2808, "step": 1570 }, { "epoch": 0.23820292477008895, "grad_norm": 5.341665744781494, "learning_rate": 7.939698492462312e-06, "loss": 0.3565, "step": 1580 }, { "epoch": 0.2397105382180009, "grad_norm": 4.107424736022949, "learning_rate": 7.989949748743719e-06, "loss": 0.2809, "step": 1590 }, { "epoch": 0.24121815166591287, "grad_norm": 5.134035110473633, "learning_rate": 8.040201005025127e-06, "loss": 0.2845, "step": 1600 }, { "epoch": 0.24272576511382482, "grad_norm": 4.4280686378479, "learning_rate": 8.090452261306532e-06, "loss": 0.3108, "step": 1610 }, { "epoch": 0.24423337856173677, "grad_norm": 2.977642774581909, "learning_rate": 8.14070351758794e-06, "loss": 0.2467, "step": 1620 }, { "epoch": 0.24574099200964872, "grad_norm": 3.0399117469787598, "learning_rate": 8.190954773869347e-06, "loss": 0.2703, "step": 1630 }, { "epoch": 0.2472486054575607, "grad_norm": 2.620563507080078, "learning_rate": 8.241206030150754e-06, "loss": 0.2042, "step": 1640 }, { "epoch": 0.24875621890547264, "grad_norm": 4.688863277435303, "learning_rate": 8.291457286432163e-06, "loss": 0.2735, "step": 1650 }, { "epoch": 0.2502638323533846, "grad_norm": 3.1117756366729736, "learning_rate": 8.341708542713568e-06, "loss": 0.3049, "step": 1660 }, { "epoch": 0.25177144580129657, "grad_norm": 3.42677903175354, "learning_rate": 8.391959798994976e-06, "loss": 0.2713, "step": 1670 }, { "epoch": 0.2532790592492085, "grad_norm": 3.6287248134613037, "learning_rate": 8.442211055276383e-06, "loss": 0.2694, "step": 1680 }, { "epoch": 0.25478667269712046, "grad_norm": 3.126880645751953, "learning_rate": 8.49246231155779e-06, "loss": 0.2488, "step": 1690 }, { "epoch": 0.2562942861450324, "grad_norm": 2.623779058456421, "learning_rate": 8.542713567839196e-06, "loss": 0.3267, "step": 1700 }, { "epoch": 0.25780189959294436, "grad_norm": 4.3131422996521, "learning_rate": 8.592964824120603e-06, "loss": 0.2916, "step": 1710 }, { "epoch": 0.2593095130408563, "grad_norm": 3.576908588409424, "learning_rate": 8.64321608040201e-06, "loss": 0.2829, "step": 1720 }, { "epoch": 0.26081712648876826, "grad_norm": 3.762002944946289, "learning_rate": 8.693467336683418e-06, "loss": 0.2426, "step": 1730 }, { "epoch": 0.26232473993668026, "grad_norm": 3.748302936553955, "learning_rate": 8.743718592964825e-06, "loss": 0.2604, "step": 1740 }, { "epoch": 0.2638323533845922, "grad_norm": 2.903917074203491, "learning_rate": 8.793969849246232e-06, "loss": 0.2739, "step": 1750 }, { "epoch": 0.26533996683250416, "grad_norm": 5.961026668548584, "learning_rate": 8.84422110552764e-06, "loss": 0.2903, "step": 1760 }, { "epoch": 0.2668475802804161, "grad_norm": 3.922060012817383, "learning_rate": 8.894472361809045e-06, "loss": 0.2301, "step": 1770 }, { "epoch": 0.26835519372832806, "grad_norm": 3.852799415588379, "learning_rate": 8.944723618090452e-06, "loss": 0.2933, "step": 1780 }, { "epoch": 0.26986280717624, "grad_norm": 3.003422975540161, "learning_rate": 8.99497487437186e-06, "loss": 0.2901, "step": 1790 }, { "epoch": 0.27137042062415195, "grad_norm": 2.498257637023926, "learning_rate": 9.045226130653267e-06, "loss": 0.2404, "step": 1800 }, { "epoch": 0.2728780340720639, "grad_norm": 5.169360160827637, "learning_rate": 9.095477386934674e-06, "loss": 0.3194, "step": 1810 }, { "epoch": 0.27438564751997585, "grad_norm": 3.0723836421966553, "learning_rate": 9.14572864321608e-06, "loss": 0.2683, "step": 1820 }, { "epoch": 0.27589326096788785, "grad_norm": 2.544919013977051, "learning_rate": 9.195979899497488e-06, "loss": 0.2621, "step": 1830 }, { "epoch": 0.2774008744157998, "grad_norm": 4.051214218139648, "learning_rate": 9.246231155778896e-06, "loss": 0.2722, "step": 1840 }, { "epoch": 0.27890848786371175, "grad_norm": 4.335169315338135, "learning_rate": 9.296482412060303e-06, "loss": 0.2981, "step": 1850 }, { "epoch": 0.2804161013116237, "grad_norm": 2.8749923706054688, "learning_rate": 9.34673366834171e-06, "loss": 0.25, "step": 1860 }, { "epoch": 0.28192371475953565, "grad_norm": 2.6422088146209717, "learning_rate": 9.396984924623116e-06, "loss": 0.289, "step": 1870 }, { "epoch": 0.2834313282074476, "grad_norm": 2.6056771278381348, "learning_rate": 9.447236180904523e-06, "loss": 0.234, "step": 1880 }, { "epoch": 0.28493894165535955, "grad_norm": 2.3233642578125, "learning_rate": 9.49748743718593e-06, "loss": 0.264, "step": 1890 }, { "epoch": 0.2864465551032715, "grad_norm": 4.396934986114502, "learning_rate": 9.547738693467338e-06, "loss": 0.3025, "step": 1900 }, { "epoch": 0.2879541685511835, "grad_norm": 1.7335578203201294, "learning_rate": 9.597989949748745e-06, "loss": 0.2802, "step": 1910 }, { "epoch": 0.28946178199909545, "grad_norm": 2.7161102294921875, "learning_rate": 9.648241206030152e-06, "loss": 0.2603, "step": 1920 }, { "epoch": 0.2909693954470074, "grad_norm": 4.456732749938965, "learning_rate": 9.698492462311559e-06, "loss": 0.2228, "step": 1930 }, { "epoch": 0.29247700889491934, "grad_norm": 2.818270444869995, "learning_rate": 9.748743718592965e-06, "loss": 0.2755, "step": 1940 }, { "epoch": 0.2939846223428313, "grad_norm": 3.1351141929626465, "learning_rate": 9.798994974874372e-06, "loss": 0.3078, "step": 1950 }, { "epoch": 0.29549223579074324, "grad_norm": 3.064115047454834, "learning_rate": 9.84924623115578e-06, "loss": 0.2866, "step": 1960 }, { "epoch": 0.2969998492386552, "grad_norm": 2.546238660812378, "learning_rate": 9.899497487437186e-06, "loss": 0.2606, "step": 1970 }, { "epoch": 0.29850746268656714, "grad_norm": 3.1788430213928223, "learning_rate": 9.949748743718594e-06, "loss": 0.2415, "step": 1980 }, { "epoch": 0.30001507613447914, "grad_norm": 3.4894962310791016, "learning_rate": 1e-05, "loss": 0.2494, "step": 1990 }, { "epoch": 0.3015226895823911, "grad_norm": 4.1228227615356445, "learning_rate": 9.999992306978083e-06, "loss": 0.3096, "step": 2000 }, { "epoch": 0.30303030303030304, "grad_norm": 3.651365280151367, "learning_rate": 9.999969227936001e-06, "loss": 0.2663, "step": 2010 }, { "epoch": 0.304537916478215, "grad_norm": 3.3638622760772705, "learning_rate": 9.999930762944776e-06, "loss": 0.2766, "step": 2020 }, { "epoch": 0.30604552992612694, "grad_norm": 2.9725019931793213, "learning_rate": 9.999876912122772e-06, "loss": 0.1915, "step": 2030 }, { "epoch": 0.3075531433740389, "grad_norm": 3.9681382179260254, "learning_rate": 9.9998076756357e-06, "loss": 0.2368, "step": 2040 }, { "epoch": 0.30906075682195083, "grad_norm": 4.290899276733398, "learning_rate": 9.999723053696613e-06, "loss": 0.2439, "step": 2050 }, { "epoch": 0.3105683702698628, "grad_norm": 3.2242908477783203, "learning_rate": 9.999623046565912e-06, "loss": 0.2852, "step": 2060 }, { "epoch": 0.3120759837177748, "grad_norm": 4.231281757354736, "learning_rate": 9.99950765455134e-06, "loss": 0.3192, "step": 2070 }, { "epoch": 0.31358359716568673, "grad_norm": 3.6881601810455322, "learning_rate": 9.99937687800798e-06, "loss": 0.2527, "step": 2080 }, { "epoch": 0.3150912106135987, "grad_norm": 3.546578884124756, "learning_rate": 9.99923071733826e-06, "loss": 0.2872, "step": 2090 }, { "epoch": 0.31659882406151063, "grad_norm": 2.7275216579437256, "learning_rate": 9.999069172991949e-06, "loss": 0.2018, "step": 2100 }, { "epoch": 0.3181064375094226, "grad_norm": 3.716137409210205, "learning_rate": 9.998892245466152e-06, "loss": 0.3604, "step": 2110 }, { "epoch": 0.31961405095733453, "grad_norm": 3.0390613079071045, "learning_rate": 9.998699935305309e-06, "loss": 0.2958, "step": 2120 }, { "epoch": 0.3211216644052465, "grad_norm": 3.1779959201812744, "learning_rate": 9.998492243101201e-06, "loss": 0.2515, "step": 2130 }, { "epoch": 0.3226292778531584, "grad_norm": 4.9841718673706055, "learning_rate": 9.99826916949294e-06, "loss": 0.2894, "step": 2140 }, { "epoch": 0.32413689130107043, "grad_norm": 2.9918882846832275, "learning_rate": 9.998030715166968e-06, "loss": 0.2849, "step": 2150 }, { "epoch": 0.3256445047489824, "grad_norm": 4.957390785217285, "learning_rate": 9.997776880857064e-06, "loss": 0.3547, "step": 2160 }, { "epoch": 0.3271521181968943, "grad_norm": 3.7274563312530518, "learning_rate": 9.997507667344323e-06, "loss": 0.2604, "step": 2170 }, { "epoch": 0.3286597316448063, "grad_norm": 3.310037851333618, "learning_rate": 9.997223075457174e-06, "loss": 0.267, "step": 2180 }, { "epoch": 0.3301673450927182, "grad_norm": 3.518956184387207, "learning_rate": 9.996923106071366e-06, "loss": 0.2855, "step": 2190 }, { "epoch": 0.33167495854063017, "grad_norm": 3.827298164367676, "learning_rate": 9.996607760109968e-06, "loss": 0.248, "step": 2200 }, { "epoch": 0.3331825719885421, "grad_norm": 2.4609646797180176, "learning_rate": 9.996277038543363e-06, "loss": 0.265, "step": 2210 }, { "epoch": 0.33469018543645407, "grad_norm": 2.364471673965454, "learning_rate": 9.99593094238925e-06, "loss": 0.2514, "step": 2220 }, { "epoch": 0.3361977988843661, "grad_norm": 2.191396474838257, "learning_rate": 9.995569472712645e-06, "loss": 0.3036, "step": 2230 }, { "epoch": 0.337705412332278, "grad_norm": 2.903813362121582, "learning_rate": 9.995192630625858e-06, "loss": 0.2966, "step": 2240 }, { "epoch": 0.33921302578018997, "grad_norm": 2.8035690784454346, "learning_rate": 9.994800417288518e-06, "loss": 0.2469, "step": 2250 }, { "epoch": 0.3407206392281019, "grad_norm": 2.415008544921875, "learning_rate": 9.994392833907538e-06, "loss": 0.2643, "step": 2260 }, { "epoch": 0.34222825267601387, "grad_norm": 3.9544615745544434, "learning_rate": 9.993969881737147e-06, "loss": 0.2299, "step": 2270 }, { "epoch": 0.3437358661239258, "grad_norm": 4.803537845611572, "learning_rate": 9.993531562078851e-06, "loss": 0.2603, "step": 2280 }, { "epoch": 0.34524347957183776, "grad_norm": 4.2921857833862305, "learning_rate": 9.993077876281454e-06, "loss": 0.2637, "step": 2290 }, { "epoch": 0.3467510930197497, "grad_norm": 3.0904462337493896, "learning_rate": 9.99260882574104e-06, "loss": 0.2846, "step": 2300 }, { "epoch": 0.3482587064676617, "grad_norm": 2.9947805404663086, "learning_rate": 9.992124411900975e-06, "loss": 0.2252, "step": 2310 }, { "epoch": 0.34976631991557366, "grad_norm": 3.7718050479888916, "learning_rate": 9.991624636251904e-06, "loss": 0.3023, "step": 2320 }, { "epoch": 0.3512739333634856, "grad_norm": 1.7917969226837158, "learning_rate": 9.99110950033174e-06, "loss": 0.2219, "step": 2330 }, { "epoch": 0.35278154681139756, "grad_norm": 2.7174954414367676, "learning_rate": 9.990579005725663e-06, "loss": 0.29, "step": 2340 }, { "epoch": 0.3542891602593095, "grad_norm": 3.5024642944335938, "learning_rate": 9.990033154066116e-06, "loss": 0.2432, "step": 2350 }, { "epoch": 0.35579677370722146, "grad_norm": 4.281993389129639, "learning_rate": 9.989471947032799e-06, "loss": 0.2416, "step": 2360 }, { "epoch": 0.3573043871551334, "grad_norm": 4.072799205780029, "learning_rate": 9.988895386352662e-06, "loss": 0.2552, "step": 2370 }, { "epoch": 0.35881200060304536, "grad_norm": 4.950239658355713, "learning_rate": 9.988303473799904e-06, "loss": 0.309, "step": 2380 }, { "epoch": 0.36031961405095736, "grad_norm": 3.8145763874053955, "learning_rate": 9.987696211195963e-06, "loss": 0.2687, "step": 2390 }, { "epoch": 0.3618272274988693, "grad_norm": 4.250570774078369, "learning_rate": 9.987073600409514e-06, "loss": 0.2892, "step": 2400 }, { "epoch": 0.36333484094678126, "grad_norm": 2.2496259212493896, "learning_rate": 9.986435643356457e-06, "loss": 0.3367, "step": 2410 }, { "epoch": 0.3648424543946932, "grad_norm": 3.1615355014801025, "learning_rate": 9.985782341999923e-06, "loss": 0.2888, "step": 2420 }, { "epoch": 0.36635006784260515, "grad_norm": 2.605109930038452, "learning_rate": 9.985113698350253e-06, "loss": 0.2403, "step": 2430 }, { "epoch": 0.3678576812905171, "grad_norm": 2.5473101139068604, "learning_rate": 9.984429714465006e-06, "loss": 0.2377, "step": 2440 }, { "epoch": 0.36936529473842905, "grad_norm": 2.998323440551758, "learning_rate": 9.983730392448943e-06, "loss": 0.24, "step": 2450 }, { "epoch": 0.370872908186341, "grad_norm": 3.5022575855255127, "learning_rate": 9.983015734454022e-06, "loss": 0.3087, "step": 2460 }, { "epoch": 0.372380521634253, "grad_norm": 2.598681688308716, "learning_rate": 9.982285742679394e-06, "loss": 0.2549, "step": 2470 }, { "epoch": 0.37388813508216495, "grad_norm": 3.1629135608673096, "learning_rate": 9.981540419371401e-06, "loss": 0.242, "step": 2480 }, { "epoch": 0.3753957485300769, "grad_norm": 4.050775527954102, "learning_rate": 9.980779766823554e-06, "loss": 0.2641, "step": 2490 }, { "epoch": 0.37690336197798885, "grad_norm": 3.2176005840301514, "learning_rate": 9.980003787376541e-06, "loss": 0.1972, "step": 2500 }, { "epoch": 0.3784109754259008, "grad_norm": 4.093595027923584, "learning_rate": 9.979212483418212e-06, "loss": 0.2634, "step": 2510 }, { "epoch": 0.37991858887381275, "grad_norm": 5.769745349884033, "learning_rate": 9.978405857383578e-06, "loss": 0.2758, "step": 2520 }, { "epoch": 0.3814262023217247, "grad_norm": 3.6237857341766357, "learning_rate": 9.977583911754788e-06, "loss": 0.2804, "step": 2530 }, { "epoch": 0.38293381576963664, "grad_norm": 2.7088418006896973, "learning_rate": 9.97674664906115e-06, "loss": 0.2962, "step": 2540 }, { "epoch": 0.38444142921754865, "grad_norm": 2.4936790466308594, "learning_rate": 9.97589407187909e-06, "loss": 0.1959, "step": 2550 }, { "epoch": 0.3859490426654606, "grad_norm": 2.6771116256713867, "learning_rate": 9.975026182832166e-06, "loss": 0.2623, "step": 2560 }, { "epoch": 0.38745665611337254, "grad_norm": 2.7719857692718506, "learning_rate": 9.974142984591057e-06, "loss": 0.2374, "step": 2570 }, { "epoch": 0.3889642695612845, "grad_norm": 2.6187658309936523, "learning_rate": 9.973244479873545e-06, "loss": 0.2313, "step": 2580 }, { "epoch": 0.39047188300919644, "grad_norm": 2.404445171356201, "learning_rate": 9.97233067144452e-06, "loss": 0.2504, "step": 2590 }, { "epoch": 0.3919794964571084, "grad_norm": 4.157626152038574, "learning_rate": 9.971401562115958e-06, "loss": 0.2694, "step": 2600 }, { "epoch": 0.39348710990502034, "grad_norm": 2.868192195892334, "learning_rate": 9.970457154746924e-06, "loss": 0.2564, "step": 2610 }, { "epoch": 0.3949947233529323, "grad_norm": 3.110403299331665, "learning_rate": 9.969497452243556e-06, "loss": 0.305, "step": 2620 }, { "epoch": 0.3965023368008443, "grad_norm": 3.422271490097046, "learning_rate": 9.96852245755906e-06, "loss": 0.2306, "step": 2630 }, { "epoch": 0.39800995024875624, "grad_norm": 3.280121326446533, "learning_rate": 9.967532173693698e-06, "loss": 0.2779, "step": 2640 }, { "epoch": 0.3995175636966682, "grad_norm": 3.5290169715881348, "learning_rate": 9.966526603694777e-06, "loss": 0.2287, "step": 2650 }, { "epoch": 0.40102517714458014, "grad_norm": 3.1920506954193115, "learning_rate": 9.96550575065665e-06, "loss": 0.2464, "step": 2660 }, { "epoch": 0.4025327905924921, "grad_norm": 5.141790390014648, "learning_rate": 9.964469617720694e-06, "loss": 0.2971, "step": 2670 }, { "epoch": 0.40404040404040403, "grad_norm": 3.2400362491607666, "learning_rate": 9.963418208075306e-06, "loss": 0.2468, "step": 2680 }, { "epoch": 0.405548017488316, "grad_norm": 2.6806387901306152, "learning_rate": 9.962351524955893e-06, "loss": 0.2189, "step": 2690 }, { "epoch": 0.40705563093622793, "grad_norm": 3.1995062828063965, "learning_rate": 9.961269571644861e-06, "loss": 0.2077, "step": 2700 }, { "epoch": 0.40856324438413993, "grad_norm": 3.2025880813598633, "learning_rate": 9.960172351471607e-06, "loss": 0.2401, "step": 2710 }, { "epoch": 0.4100708578320519, "grad_norm": 2.8332571983337402, "learning_rate": 9.959059867812506e-06, "loss": 0.2933, "step": 2720 }, { "epoch": 0.41157847127996383, "grad_norm": 3.1052703857421875, "learning_rate": 9.957932124090902e-06, "loss": 0.325, "step": 2730 }, { "epoch": 0.4130860847278758, "grad_norm": 3.242845296859741, "learning_rate": 9.9567891237771e-06, "loss": 0.232, "step": 2740 }, { "epoch": 0.41459369817578773, "grad_norm": 2.4695944786071777, "learning_rate": 9.955630870388348e-06, "loss": 0.3547, "step": 2750 }, { "epoch": 0.4161013116236997, "grad_norm": 3.9928171634674072, "learning_rate": 9.954457367488834e-06, "loss": 0.3111, "step": 2760 }, { "epoch": 0.4176089250716116, "grad_norm": 3.4647347927093506, "learning_rate": 9.953268618689672e-06, "loss": 0.2857, "step": 2770 }, { "epoch": 0.4191165385195236, "grad_norm": 3.9632694721221924, "learning_rate": 9.952064627648892e-06, "loss": 0.271, "step": 2780 }, { "epoch": 0.4206241519674355, "grad_norm": 3.1939327716827393, "learning_rate": 9.950845398071424e-06, "loss": 0.2761, "step": 2790 }, { "epoch": 0.4221317654153475, "grad_norm": 4.024521350860596, "learning_rate": 9.949610933709091e-06, "loss": 0.274, "step": 2800 }, { "epoch": 0.4236393788632595, "grad_norm": 2.3612654209136963, "learning_rate": 9.948361238360599e-06, "loss": 0.2077, "step": 2810 }, { "epoch": 0.4251469923111714, "grad_norm": 4.477809429168701, "learning_rate": 9.947096315871521e-06, "loss": 0.2467, "step": 2820 }, { "epoch": 0.42665460575908337, "grad_norm": 4.456467151641846, "learning_rate": 9.945816170134287e-06, "loss": 0.2948, "step": 2830 }, { "epoch": 0.4281622192069953, "grad_norm": 3.4860048294067383, "learning_rate": 9.944520805088173e-06, "loss": 0.2991, "step": 2840 }, { "epoch": 0.42966983265490727, "grad_norm": 3.3609325885772705, "learning_rate": 9.943210224719288e-06, "loss": 0.2471, "step": 2850 }, { "epoch": 0.4311774461028192, "grad_norm": 3.3220772743225098, "learning_rate": 9.941884433060563e-06, "loss": 0.2195, "step": 2860 }, { "epoch": 0.43268505955073117, "grad_norm": 3.537875175476074, "learning_rate": 9.940543434191733e-06, "loss": 0.257, "step": 2870 }, { "epoch": 0.43419267299864317, "grad_norm": 2.795330286026001, "learning_rate": 9.939187232239332e-06, "loss": 0.2334, "step": 2880 }, { "epoch": 0.4357002864465551, "grad_norm": 3.498202085494995, "learning_rate": 9.937815831376678e-06, "loss": 0.2291, "step": 2890 }, { "epoch": 0.43720789989446707, "grad_norm": 2.4325809478759766, "learning_rate": 9.936429235823857e-06, "loss": 0.2695, "step": 2900 }, { "epoch": 0.438715513342379, "grad_norm": 3.0280048847198486, "learning_rate": 9.935027449847712e-06, "loss": 0.2584, "step": 2910 }, { "epoch": 0.44022312679029096, "grad_norm": 3.9383790493011475, "learning_rate": 9.933610477761832e-06, "loss": 0.244, "step": 2920 }, { "epoch": 0.4417307402382029, "grad_norm": 2.742079973220825, "learning_rate": 9.932178323926535e-06, "loss": 0.2818, "step": 2930 }, { "epoch": 0.44323835368611486, "grad_norm": 1.8991209268569946, "learning_rate": 9.93073099274886e-06, "loss": 0.2314, "step": 2940 }, { "epoch": 0.4447459671340268, "grad_norm": 4.322510719299316, "learning_rate": 9.929268488682546e-06, "loss": 0.2321, "step": 2950 }, { "epoch": 0.4462535805819388, "grad_norm": 2.7267844676971436, "learning_rate": 9.92779081622802e-06, "loss": 0.2831, "step": 2960 }, { "epoch": 0.44776119402985076, "grad_norm": 2.417921781539917, "learning_rate": 9.926297979932393e-06, "loss": 0.3022, "step": 2970 }, { "epoch": 0.4492688074777627, "grad_norm": 3.7888383865356445, "learning_rate": 9.924789984389433e-06, "loss": 0.2505, "step": 2980 }, { "epoch": 0.45077642092567466, "grad_norm": 4.135186195373535, "learning_rate": 9.923266834239555e-06, "loss": 0.2771, "step": 2990 }, { "epoch": 0.4522840343735866, "grad_norm": 3.0177056789398193, "learning_rate": 9.921728534169812e-06, "loss": 0.2546, "step": 3000 }, { "epoch": 0.45379164782149856, "grad_norm": 4.112053871154785, "learning_rate": 9.920175088913874e-06, "loss": 0.3002, "step": 3010 }, { "epoch": 0.4552992612694105, "grad_norm": 2.6870369911193848, "learning_rate": 9.918606503252015e-06, "loss": 0.2931, "step": 3020 }, { "epoch": 0.45680687471732245, "grad_norm": 3.7786431312561035, "learning_rate": 9.917022782011104e-06, "loss": 0.2405, "step": 3030 }, { "epoch": 0.45831448816523446, "grad_norm": 2.9957709312438965, "learning_rate": 9.915423930064578e-06, "loss": 0.319, "step": 3040 }, { "epoch": 0.4598221016131464, "grad_norm": 3.186917304992676, "learning_rate": 9.91380995233244e-06, "loss": 0.2515, "step": 3050 }, { "epoch": 0.46132971506105835, "grad_norm": 2.492964506149292, "learning_rate": 9.912180853781237e-06, "loss": 0.2858, "step": 3060 }, { "epoch": 0.4628373285089703, "grad_norm": 2.885406494140625, "learning_rate": 9.910536639424045e-06, "loss": 0.2571, "step": 3070 }, { "epoch": 0.46434494195688225, "grad_norm": 4.852735996246338, "learning_rate": 9.908877314320454e-06, "loss": 0.235, "step": 3080 }, { "epoch": 0.4658525554047942, "grad_norm": 2.7892768383026123, "learning_rate": 9.907202883576552e-06, "loss": 0.197, "step": 3090 }, { "epoch": 0.46736016885270615, "grad_norm": 2.438904047012329, "learning_rate": 9.905513352344917e-06, "loss": 0.2362, "step": 3100 }, { "epoch": 0.4688677823006181, "grad_norm": 3.075490951538086, "learning_rate": 9.903808725824586e-06, "loss": 0.2424, "step": 3110 }, { "epoch": 0.4703753957485301, "grad_norm": 2.5461974143981934, "learning_rate": 9.90208900926105e-06, "loss": 0.1908, "step": 3120 }, { "epoch": 0.47188300919644205, "grad_norm": 3.060307025909424, "learning_rate": 9.900354207946238e-06, "loss": 0.2246, "step": 3130 }, { "epoch": 0.473390622644354, "grad_norm": 2.8629283905029297, "learning_rate": 9.898604327218495e-06, "loss": 0.2785, "step": 3140 }, { "epoch": 0.47489823609226595, "grad_norm": 3.9353957176208496, "learning_rate": 9.896839372462569e-06, "loss": 0.275, "step": 3150 }, { "epoch": 0.4764058495401779, "grad_norm": 2.966660261154175, "learning_rate": 9.895059349109595e-06, "loss": 0.2538, "step": 3160 }, { "epoch": 0.47791346298808984, "grad_norm": 2.869274139404297, "learning_rate": 9.893264262637074e-06, "loss": 0.2804, "step": 3170 }, { "epoch": 0.4794210764360018, "grad_norm": 2.872514486312866, "learning_rate": 9.891454118568866e-06, "loss": 0.2352, "step": 3180 }, { "epoch": 0.48092868988391374, "grad_norm": 2.442415237426758, "learning_rate": 9.889628922475159e-06, "loss": 0.2508, "step": 3190 }, { "epoch": 0.48243630333182574, "grad_norm": 3.562753677368164, "learning_rate": 9.887788679972464e-06, "loss": 0.2776, "step": 3200 }, { "epoch": 0.4839439167797377, "grad_norm": 4.020259380340576, "learning_rate": 9.88593339672359e-06, "loss": 0.2377, "step": 3210 }, { "epoch": 0.48545153022764964, "grad_norm": 2.6090028285980225, "learning_rate": 9.884063078437632e-06, "loss": 0.2433, "step": 3220 }, { "epoch": 0.4869591436755616, "grad_norm": 3.072434186935425, "learning_rate": 9.88217773086995e-06, "loss": 0.2062, "step": 3230 }, { "epoch": 0.48846675712347354, "grad_norm": 3.5558524131774902, "learning_rate": 9.88027735982215e-06, "loss": 0.2391, "step": 3240 }, { "epoch": 0.4899743705713855, "grad_norm": 2.883007287979126, "learning_rate": 9.878361971142073e-06, "loss": 0.2742, "step": 3250 }, { "epoch": 0.49148198401929744, "grad_norm": 3.307785987854004, "learning_rate": 9.876431570723768e-06, "loss": 0.2462, "step": 3260 }, { "epoch": 0.4929895974672094, "grad_norm": 2.33144211769104, "learning_rate": 9.874486164507481e-06, "loss": 0.2411, "step": 3270 }, { "epoch": 0.4944972109151214, "grad_norm": 2.7391421794891357, "learning_rate": 9.872525758479634e-06, "loss": 0.2349, "step": 3280 }, { "epoch": 0.49600482436303334, "grad_norm": 2.8810653686523438, "learning_rate": 9.870550358672806e-06, "loss": 0.2562, "step": 3290 }, { "epoch": 0.4975124378109453, "grad_norm": 2.8843533992767334, "learning_rate": 9.86855997116571e-06, "loss": 0.2934, "step": 3300 }, { "epoch": 0.49902005125885723, "grad_norm": 3.136643886566162, "learning_rate": 9.86655460208319e-06, "loss": 0.2701, "step": 3310 }, { "epoch": 0.5005276647067692, "grad_norm": 3.207012891769409, "learning_rate": 9.86453425759618e-06, "loss": 0.2231, "step": 3320 }, { "epoch": 0.5020352781546812, "grad_norm": 3.9083306789398193, "learning_rate": 9.862498943921704e-06, "loss": 0.2344, "step": 3330 }, { "epoch": 0.5035428916025931, "grad_norm": 2.8889682292938232, "learning_rate": 9.860448667322848e-06, "loss": 0.2935, "step": 3340 }, { "epoch": 0.5050505050505051, "grad_norm": 2.486769676208496, "learning_rate": 9.858383434108741e-06, "loss": 0.2484, "step": 3350 }, { "epoch": 0.506558118498417, "grad_norm": 2.2321736812591553, "learning_rate": 9.856303250634536e-06, "loss": 0.2862, "step": 3360 }, { "epoch": 0.508065731946329, "grad_norm": 2.492518663406372, "learning_rate": 9.854208123301392e-06, "loss": 0.2334, "step": 3370 }, { "epoch": 0.5095733453942409, "grad_norm": 2.828885555267334, "learning_rate": 9.852098058556451e-06, "loss": 0.1871, "step": 3380 }, { "epoch": 0.5110809588421529, "grad_norm": 2.234206438064575, "learning_rate": 9.849973062892828e-06, "loss": 0.2629, "step": 3390 }, { "epoch": 0.5125885722900648, "grad_norm": 2.7285687923431396, "learning_rate": 9.847833142849575e-06, "loss": 0.3007, "step": 3400 }, { "epoch": 0.5140961857379768, "grad_norm": 3.2594521045684814, "learning_rate": 9.84567830501167e-06, "loss": 0.2694, "step": 3410 }, { "epoch": 0.5156037991858887, "grad_norm": 2.9900317192077637, "learning_rate": 9.843508556010004e-06, "loss": 0.1906, "step": 3420 }, { "epoch": 0.5171114126338007, "grad_norm": 4.776864528656006, "learning_rate": 9.841323902521345e-06, "loss": 0.2523, "step": 3430 }, { "epoch": 0.5186190260817126, "grad_norm": 2.9263594150543213, "learning_rate": 9.839124351268328e-06, "loss": 0.2978, "step": 3440 }, { "epoch": 0.5201266395296246, "grad_norm": 2.220384120941162, "learning_rate": 9.836909909019432e-06, "loss": 0.2892, "step": 3450 }, { "epoch": 0.5216342529775365, "grad_norm": 2.452528953552246, "learning_rate": 9.834680582588957e-06, "loss": 0.2377, "step": 3460 }, { "epoch": 0.5231418664254485, "grad_norm": 3.9738259315490723, "learning_rate": 9.832436378837006e-06, "loss": 0.248, "step": 3470 }, { "epoch": 0.5246494798733605, "grad_norm": 2.201779842376709, "learning_rate": 9.830177304669465e-06, "loss": 0.238, "step": 3480 }, { "epoch": 0.5261570933212725, "grad_norm": 3.181610345840454, "learning_rate": 9.827903367037975e-06, "loss": 0.182, "step": 3490 }, { "epoch": 0.5276647067691844, "grad_norm": 2.4478986263275146, "learning_rate": 9.825614572939917e-06, "loss": 0.2214, "step": 3500 }, { "epoch": 0.5291723202170964, "grad_norm": 3.1642370223999023, "learning_rate": 9.823310929418386e-06, "loss": 0.2542, "step": 3510 }, { "epoch": 0.5306799336650083, "grad_norm": 2.1762421131134033, "learning_rate": 9.820992443562177e-06, "loss": 0.2396, "step": 3520 }, { "epoch": 0.5321875471129203, "grad_norm": 2.365755558013916, "learning_rate": 9.818659122505754e-06, "loss": 0.2463, "step": 3530 }, { "epoch": 0.5336951605608322, "grad_norm": 3.018524408340454, "learning_rate": 9.816310973429233e-06, "loss": 0.2229, "step": 3540 }, { "epoch": 0.5352027740087442, "grad_norm": 2.4103195667266846, "learning_rate": 9.81394800355836e-06, "loss": 0.2097, "step": 3550 }, { "epoch": 0.5367103874566561, "grad_norm": 2.5197532176971436, "learning_rate": 9.811570220164484e-06, "loss": 0.2691, "step": 3560 }, { "epoch": 0.5382180009045681, "grad_norm": 3.330446720123291, "learning_rate": 9.809177630564544e-06, "loss": 0.2484, "step": 3570 }, { "epoch": 0.53972561435248, "grad_norm": 3.5988144874572754, "learning_rate": 9.806770242121036e-06, "loss": 0.277, "step": 3580 }, { "epoch": 0.541233227800392, "grad_norm": 2.60546875, "learning_rate": 9.804348062241995e-06, "loss": 0.3456, "step": 3590 }, { "epoch": 0.5427408412483039, "grad_norm": 1.6180322170257568, "learning_rate": 9.801911098380978e-06, "loss": 0.2364, "step": 3600 }, { "epoch": 0.5442484546962159, "grad_norm": 3.871858596801758, "learning_rate": 9.799459358037028e-06, "loss": 0.2844, "step": 3610 }, { "epoch": 0.5457560681441278, "grad_norm": 3.808302879333496, "learning_rate": 9.796992848754666e-06, "loss": 0.2796, "step": 3620 }, { "epoch": 0.5472636815920398, "grad_norm": 3.1992087364196777, "learning_rate": 9.794511578123853e-06, "loss": 0.2822, "step": 3630 }, { "epoch": 0.5487712950399517, "grad_norm": 3.509726047515869, "learning_rate": 9.792015553779976e-06, "loss": 0.2849, "step": 3640 }, { "epoch": 0.5502789084878638, "grad_norm": 2.728024482727051, "learning_rate": 9.789504783403825e-06, "loss": 0.2938, "step": 3650 }, { "epoch": 0.5517865219357757, "grad_norm": 3.6429426670074463, "learning_rate": 9.786979274721564e-06, "loss": 0.2668, "step": 3660 }, { "epoch": 0.5532941353836877, "grad_norm": 4.020580768585205, "learning_rate": 9.784439035504711e-06, "loss": 0.2997, "step": 3670 }, { "epoch": 0.5548017488315996, "grad_norm": 3.093716621398926, "learning_rate": 9.781884073570112e-06, "loss": 0.2752, "step": 3680 }, { "epoch": 0.5563093622795116, "grad_norm": 1.7954763174057007, "learning_rate": 9.779314396779917e-06, "loss": 0.252, "step": 3690 }, { "epoch": 0.5578169757274235, "grad_norm": 2.5687835216522217, "learning_rate": 9.776730013041559e-06, "loss": 0.221, "step": 3700 }, { "epoch": 0.5593245891753355, "grad_norm": 3.3336739540100098, "learning_rate": 9.774130930307727e-06, "loss": 0.2802, "step": 3710 }, { "epoch": 0.5608322026232474, "grad_norm": 4.337209701538086, "learning_rate": 9.77151715657634e-06, "loss": 0.2693, "step": 3720 }, { "epoch": 0.5623398160711593, "grad_norm": 3.4135901927948, "learning_rate": 9.768888699890527e-06, "loss": 0.2403, "step": 3730 }, { "epoch": 0.5638474295190713, "grad_norm": 2.9365642070770264, "learning_rate": 9.766245568338595e-06, "loss": 0.199, "step": 3740 }, { "epoch": 0.5653550429669832, "grad_norm": 2.916250228881836, "learning_rate": 9.763587770054014e-06, "loss": 0.2126, "step": 3750 }, { "epoch": 0.5668626564148952, "grad_norm": 2.2557594776153564, "learning_rate": 9.760915313215385e-06, "loss": 0.2633, "step": 3760 }, { "epoch": 0.5683702698628071, "grad_norm": 2.2766077518463135, "learning_rate": 9.758228206046412e-06, "loss": 0.2833, "step": 3770 }, { "epoch": 0.5698778833107191, "grad_norm": 3.1747584342956543, "learning_rate": 9.75552645681589e-06, "loss": 0.2297, "step": 3780 }, { "epoch": 0.571385496758631, "grad_norm": 2.1428303718566895, "learning_rate": 9.75281007383766e-06, "loss": 0.237, "step": 3790 }, { "epoch": 0.572893110206543, "grad_norm": 3.0258967876434326, "learning_rate": 9.750079065470601e-06, "loss": 0.1994, "step": 3800 }, { "epoch": 0.574400723654455, "grad_norm": 3.0928382873535156, "learning_rate": 9.7473334401186e-06, "loss": 0.2566, "step": 3810 }, { "epoch": 0.575908337102367, "grad_norm": 3.9277491569519043, "learning_rate": 9.744573206230514e-06, "loss": 0.2812, "step": 3820 }, { "epoch": 0.577415950550279, "grad_norm": 3.0802135467529297, "learning_rate": 9.741798372300162e-06, "loss": 0.2583, "step": 3830 }, { "epoch": 0.5789235639981909, "grad_norm": 3.56453013420105, "learning_rate": 9.739008946866286e-06, "loss": 0.3022, "step": 3840 }, { "epoch": 0.5804311774461028, "grad_norm": 3.556722640991211, "learning_rate": 9.736204938512532e-06, "loss": 0.2387, "step": 3850 }, { "epoch": 0.5819387908940148, "grad_norm": 2.4037909507751465, "learning_rate": 9.733386355867416e-06, "loss": 0.2172, "step": 3860 }, { "epoch": 0.5834464043419267, "grad_norm": 1.809337854385376, "learning_rate": 9.730553207604307e-06, "loss": 0.2557, "step": 3870 }, { "epoch": 0.5849540177898387, "grad_norm": 2.6946959495544434, "learning_rate": 9.727705502441396e-06, "loss": 0.275, "step": 3880 }, { "epoch": 0.5864616312377506, "grad_norm": 2.236497163772583, "learning_rate": 9.724843249141663e-06, "loss": 0.2749, "step": 3890 }, { "epoch": 0.5879692446856626, "grad_norm": 2.764671802520752, "learning_rate": 9.72196645651286e-06, "loss": 0.2436, "step": 3900 }, { "epoch": 0.5894768581335745, "grad_norm": 3.119699001312256, "learning_rate": 9.719075133407479e-06, "loss": 0.1994, "step": 3910 }, { "epoch": 0.5909844715814865, "grad_norm": 3.1113548278808594, "learning_rate": 9.716169288722724e-06, "loss": 0.1974, "step": 3920 }, { "epoch": 0.5924920850293984, "grad_norm": 2.815882682800293, "learning_rate": 9.713248931400487e-06, "loss": 0.2261, "step": 3930 }, { "epoch": 0.5939996984773104, "grad_norm": 3.3475730419158936, "learning_rate": 9.710314070427316e-06, "loss": 0.2074, "step": 3940 }, { "epoch": 0.5955073119252223, "grad_norm": 2.1607978343963623, "learning_rate": 9.70736471483439e-06, "loss": 0.2314, "step": 3950 }, { "epoch": 0.5970149253731343, "grad_norm": 3.6977317333221436, "learning_rate": 9.704400873697493e-06, "loss": 0.3072, "step": 3960 }, { "epoch": 0.5985225388210463, "grad_norm": 3.218519449234009, "learning_rate": 9.701422556136985e-06, "loss": 0.2827, "step": 3970 }, { "epoch": 0.6000301522689583, "grad_norm": 5.845335006713867, "learning_rate": 9.698429771317768e-06, "loss": 0.299, "step": 3980 }, { "epoch": 0.6015377657168702, "grad_norm": 3.364065408706665, "learning_rate": 9.695422528449265e-06, "loss": 0.2069, "step": 3990 }, { "epoch": 0.6030453791647822, "grad_norm": 2.452024221420288, "learning_rate": 9.692400836785394e-06, "loss": 0.2513, "step": 4000 }, { "epoch": 0.6045529926126941, "grad_norm": 4.545656681060791, "learning_rate": 9.68936470562453e-06, "loss": 0.2589, "step": 4010 }, { "epoch": 0.6060606060606061, "grad_norm": 1.9340651035308838, "learning_rate": 9.686314144309477e-06, "loss": 0.214, "step": 4020 }, { "epoch": 0.607568219508518, "grad_norm": 3.230806589126587, "learning_rate": 9.683249162227455e-06, "loss": 0.2594, "step": 4030 }, { "epoch": 0.60907583295643, "grad_norm": 3.4083361625671387, "learning_rate": 9.680169768810052e-06, "loss": 0.1967, "step": 4040 }, { "epoch": 0.6105834464043419, "grad_norm": 2.5320820808410645, "learning_rate": 9.677075973533206e-06, "loss": 0.2737, "step": 4050 }, { "epoch": 0.6120910598522539, "grad_norm": 7.365027904510498, "learning_rate": 9.673967785917168e-06, "loss": 0.2758, "step": 4060 }, { "epoch": 0.6135986733001658, "grad_norm": 4.4401469230651855, "learning_rate": 9.67084521552648e-06, "loss": 0.2231, "step": 4070 }, { "epoch": 0.6151062867480778, "grad_norm": 2.5268232822418213, "learning_rate": 9.667708271969947e-06, "loss": 0.2483, "step": 4080 }, { "epoch": 0.6166139001959897, "grad_norm": 2.4290549755096436, "learning_rate": 9.664556964900598e-06, "loss": 0.2581, "step": 4090 }, { "epoch": 0.6181215136439017, "grad_norm": 3.8180243968963623, "learning_rate": 9.66139130401566e-06, "loss": 0.3164, "step": 4100 }, { "epoch": 0.6196291270918136, "grad_norm": 3.1298816204071045, "learning_rate": 9.658211299056532e-06, "loss": 0.2067, "step": 4110 }, { "epoch": 0.6211367405397256, "grad_norm": 3.8558623790740967, "learning_rate": 9.655016959808756e-06, "loss": 0.201, "step": 4120 }, { "epoch": 0.6226443539876376, "grad_norm": 2.6686689853668213, "learning_rate": 9.651808296101981e-06, "loss": 0.2719, "step": 4130 }, { "epoch": 0.6241519674355496, "grad_norm": 3.417377233505249, "learning_rate": 9.648585317809933e-06, "loss": 0.2406, "step": 4140 }, { "epoch": 0.6256595808834615, "grad_norm": 2.8243823051452637, "learning_rate": 9.64534803485039e-06, "loss": 0.2523, "step": 4150 }, { "epoch": 0.6271671943313735, "grad_norm": 3.189911365509033, "learning_rate": 9.642096457185147e-06, "loss": 0.2367, "step": 4160 }, { "epoch": 0.6286748077792854, "grad_norm": 2.631085157394409, "learning_rate": 9.638830594819989e-06, "loss": 0.3244, "step": 4170 }, { "epoch": 0.6301824212271974, "grad_norm": 2.5704314708709717, "learning_rate": 9.635550457804655e-06, "loss": 0.2387, "step": 4180 }, { "epoch": 0.6316900346751093, "grad_norm": 2.325026273727417, "learning_rate": 9.632256056232809e-06, "loss": 0.2482, "step": 4190 }, { "epoch": 0.6331976481230213, "grad_norm": 1.841233730316162, "learning_rate": 9.628947400242016e-06, "loss": 0.3191, "step": 4200 }, { "epoch": 0.6347052615709332, "grad_norm": 4.137796878814697, "learning_rate": 9.625624500013702e-06, "loss": 0.2467, "step": 4210 }, { "epoch": 0.6362128750188452, "grad_norm": 1.8487067222595215, "learning_rate": 9.622287365773122e-06, "loss": 0.214, "step": 4220 }, { "epoch": 0.6377204884667571, "grad_norm": 3.056353807449341, "learning_rate": 9.618936007789334e-06, "loss": 0.225, "step": 4230 }, { "epoch": 0.6392281019146691, "grad_norm": 3.2448298931121826, "learning_rate": 9.615570436375168e-06, "loss": 0.2265, "step": 4240 }, { "epoch": 0.640735715362581, "grad_norm": 2.8674564361572266, "learning_rate": 9.612190661887192e-06, "loss": 0.2227, "step": 4250 }, { "epoch": 0.642243328810493, "grad_norm": 2.5992038249969482, "learning_rate": 9.608796694725672e-06, "loss": 0.2433, "step": 4260 }, { "epoch": 0.6437509422584049, "grad_norm": 3.8672006130218506, "learning_rate": 9.605388545334557e-06, "loss": 0.3032, "step": 4270 }, { "epoch": 0.6452585557063168, "grad_norm": 3.381237506866455, "learning_rate": 9.601966224201436e-06, "loss": 0.2861, "step": 4280 }, { "epoch": 0.6467661691542289, "grad_norm": 2.9332003593444824, "learning_rate": 9.598529741857502e-06, "loss": 0.2868, "step": 4290 }, { "epoch": 0.6482737826021409, "grad_norm": 3.6170814037323, "learning_rate": 9.59507910887753e-06, "loss": 0.2239, "step": 4300 }, { "epoch": 0.6497813960500528, "grad_norm": 3.169149398803711, "learning_rate": 9.59161433587984e-06, "loss": 0.2699, "step": 4310 }, { "epoch": 0.6512890094979648, "grad_norm": 2.846353769302368, "learning_rate": 9.588135433526258e-06, "loss": 0.2698, "step": 4320 }, { "epoch": 0.6527966229458767, "grad_norm": 2.552703857421875, "learning_rate": 9.584642412522096e-06, "loss": 0.2309, "step": 4330 }, { "epoch": 0.6543042363937887, "grad_norm": 2.225543737411499, "learning_rate": 9.581135283616107e-06, "loss": 0.2193, "step": 4340 }, { "epoch": 0.6558118498417006, "grad_norm": 2.8051180839538574, "learning_rate": 9.57761405760046e-06, "loss": 0.2198, "step": 4350 }, { "epoch": 0.6573194632896125, "grad_norm": 2.849327802658081, "learning_rate": 9.574078745310703e-06, "loss": 0.2295, "step": 4360 }, { "epoch": 0.6588270767375245, "grad_norm": 2.5996482372283936, "learning_rate": 9.570529357625727e-06, "loss": 0.2433, "step": 4370 }, { "epoch": 0.6603346901854364, "grad_norm": 4.449861526489258, "learning_rate": 9.566965905467742e-06, "loss": 0.2743, "step": 4380 }, { "epoch": 0.6618423036333484, "grad_norm": 3.4761452674865723, "learning_rate": 9.563388399802232e-06, "loss": 0.2661, "step": 4390 }, { "epoch": 0.6633499170812603, "grad_norm": 3.1688876152038574, "learning_rate": 9.559796851637932e-06, "loss": 0.2722, "step": 4400 }, { "epoch": 0.6648575305291723, "grad_norm": 5.761277675628662, "learning_rate": 9.556191272026783e-06, "loss": 0.2922, "step": 4410 }, { "epoch": 0.6663651439770842, "grad_norm": 3.3400580883026123, "learning_rate": 9.552571672063906e-06, "loss": 0.2376, "step": 4420 }, { "epoch": 0.6678727574249962, "grad_norm": 2.58154559135437, "learning_rate": 9.548938062887565e-06, "loss": 0.2685, "step": 4430 }, { "epoch": 0.6693803708729081, "grad_norm": 3.304809331893921, "learning_rate": 9.545290455679137e-06, "loss": 0.2671, "step": 4440 }, { "epoch": 0.6708879843208202, "grad_norm": 3.996570110321045, "learning_rate": 9.54162886166307e-06, "loss": 0.2427, "step": 4450 }, { "epoch": 0.6723955977687321, "grad_norm": 1.6975947618484497, "learning_rate": 9.537953292106851e-06, "loss": 0.2238, "step": 4460 }, { "epoch": 0.6739032112166441, "grad_norm": 4.104971885681152, "learning_rate": 9.534263758320978e-06, "loss": 0.2746, "step": 4470 }, { "epoch": 0.675410824664556, "grad_norm": 2.856086015701294, "learning_rate": 9.530560271658913e-06, "loss": 0.3071, "step": 4480 }, { "epoch": 0.676918438112468, "grad_norm": 3.29301381111145, "learning_rate": 9.52684284351706e-06, "loss": 0.2047, "step": 4490 }, { "epoch": 0.6784260515603799, "grad_norm": 2.817046642303467, "learning_rate": 9.523111485334724e-06, "loss": 0.2646, "step": 4500 }, { "epoch": 0.6799336650082919, "grad_norm": 2.672760486602783, "learning_rate": 9.519366208594067e-06, "loss": 0.2521, "step": 4510 }, { "epoch": 0.6814412784562038, "grad_norm": 3.315795660018921, "learning_rate": 9.515607024820093e-06, "loss": 0.1977, "step": 4520 }, { "epoch": 0.6829488919041158, "grad_norm": 3.4324212074279785, "learning_rate": 9.511833945580592e-06, "loss": 0.2485, "step": 4530 }, { "epoch": 0.6844565053520277, "grad_norm": 4.157164573669434, "learning_rate": 9.508046982486116e-06, "loss": 0.2107, "step": 4540 }, { "epoch": 0.6859641187999397, "grad_norm": 2.6974451541900635, "learning_rate": 9.504246147189946e-06, "loss": 0.2449, "step": 4550 }, { "epoch": 0.6874717322478516, "grad_norm": 2.6978440284729004, "learning_rate": 9.50043145138804e-06, "loss": 0.2218, "step": 4560 }, { "epoch": 0.6889793456957636, "grad_norm": 3.2333078384399414, "learning_rate": 9.496602906819015e-06, "loss": 0.2478, "step": 4570 }, { "epoch": 0.6904869591436755, "grad_norm": 4.802003860473633, "learning_rate": 9.492760525264105e-06, "loss": 0.2405, "step": 4580 }, { "epoch": 0.6919945725915875, "grad_norm": 1.8715565204620361, "learning_rate": 9.488904318547114e-06, "loss": 0.225, "step": 4590 }, { "epoch": 0.6935021860394994, "grad_norm": 2.335191488265991, "learning_rate": 9.485034298534402e-06, "loss": 0.214, "step": 4600 }, { "epoch": 0.6950097994874114, "grad_norm": 2.9438278675079346, "learning_rate": 9.481150477134825e-06, "loss": 0.2351, "step": 4610 }, { "epoch": 0.6965174129353234, "grad_norm": 3.3307900428771973, "learning_rate": 9.477252866299711e-06, "loss": 0.3387, "step": 4620 }, { "epoch": 0.6980250263832354, "grad_norm": 3.0406653881073, "learning_rate": 9.473341478022824e-06, "loss": 0.3788, "step": 4630 }, { "epoch": 0.6995326398311473, "grad_norm": 3.2175819873809814, "learning_rate": 9.469416324340323e-06, "loss": 0.2807, "step": 4640 }, { "epoch": 0.7010402532790593, "grad_norm": 2.0659897327423096, "learning_rate": 9.465477417330724e-06, "loss": 0.2949, "step": 4650 }, { "epoch": 0.7025478667269712, "grad_norm": 2.9035227298736572, "learning_rate": 9.461524769114867e-06, "loss": 0.2477, "step": 4660 }, { "epoch": 0.7040554801748832, "grad_norm": 1.884451150894165, "learning_rate": 9.457558391855874e-06, "loss": 0.2394, "step": 4670 }, { "epoch": 0.7055630936227951, "grad_norm": 2.050718069076538, "learning_rate": 9.45357829775912e-06, "loss": 0.2342, "step": 4680 }, { "epoch": 0.7070707070707071, "grad_norm": 4.5528998374938965, "learning_rate": 9.449584499072182e-06, "loss": 0.271, "step": 4690 }, { "epoch": 0.708578320518619, "grad_norm": 3.4352259635925293, "learning_rate": 9.445577008084811e-06, "loss": 0.2364, "step": 4700 }, { "epoch": 0.710085933966531, "grad_norm": 2.127429723739624, "learning_rate": 9.441555837128897e-06, "loss": 0.2086, "step": 4710 }, { "epoch": 0.7115935474144429, "grad_norm": 2.515162944793701, "learning_rate": 9.43752099857842e-06, "loss": 0.2897, "step": 4720 }, { "epoch": 0.7131011608623549, "grad_norm": 2.887550115585327, "learning_rate": 9.433472504849423e-06, "loss": 0.2211, "step": 4730 }, { "epoch": 0.7146087743102668, "grad_norm": 3.452927827835083, "learning_rate": 9.429410368399964e-06, "loss": 0.2623, "step": 4740 }, { "epoch": 0.7161163877581788, "grad_norm": 3.454716205596924, "learning_rate": 9.425334601730085e-06, "loss": 0.2435, "step": 4750 }, { "epoch": 0.7176240012060907, "grad_norm": 2.720240592956543, "learning_rate": 9.421245217381774e-06, "loss": 0.2695, "step": 4760 }, { "epoch": 0.7191316146540027, "grad_norm": 3.3350539207458496, "learning_rate": 9.417142227938916e-06, "loss": 0.2582, "step": 4770 }, { "epoch": 0.7206392281019147, "grad_norm": 2.624946117401123, "learning_rate": 9.41302564602727e-06, "loss": 0.2612, "step": 4780 }, { "epoch": 0.7221468415498267, "grad_norm": 4.783843994140625, "learning_rate": 9.408895484314414e-06, "loss": 0.2292, "step": 4790 }, { "epoch": 0.7236544549977386, "grad_norm": 1.6848976612091064, "learning_rate": 9.40475175550972e-06, "loss": 0.2549, "step": 4800 }, { "epoch": 0.7251620684456506, "grad_norm": 3.300201654434204, "learning_rate": 9.400594472364308e-06, "loss": 0.2342, "step": 4810 }, { "epoch": 0.7266696818935625, "grad_norm": 3.1572763919830322, "learning_rate": 9.396423647671005e-06, "loss": 0.2416, "step": 4820 }, { "epoch": 0.7281772953414745, "grad_norm": 2.272669553756714, "learning_rate": 9.392239294264309e-06, "loss": 0.3377, "step": 4830 }, { "epoch": 0.7296849087893864, "grad_norm": 2.5629944801330566, "learning_rate": 9.38804142502035e-06, "loss": 0.2269, "step": 4840 }, { "epoch": 0.7311925222372984, "grad_norm": 2.6120753288269043, "learning_rate": 9.383830052856843e-06, "loss": 0.2432, "step": 4850 }, { "epoch": 0.7327001356852103, "grad_norm": 2.3984546661376953, "learning_rate": 9.379605190733066e-06, "loss": 0.2613, "step": 4860 }, { "epoch": 0.7342077491331223, "grad_norm": 3.810535192489624, "learning_rate": 9.375366851649799e-06, "loss": 0.2902, "step": 4870 }, { "epoch": 0.7357153625810342, "grad_norm": 2.6155178546905518, "learning_rate": 9.371115048649297e-06, "loss": 0.2599, "step": 4880 }, { "epoch": 0.7372229760289462, "grad_norm": 2.397418975830078, "learning_rate": 9.366849794815245e-06, "loss": 0.2592, "step": 4890 }, { "epoch": 0.7387305894768581, "grad_norm": 2.985116958618164, "learning_rate": 9.362571103272718e-06, "loss": 0.2488, "step": 4900 }, { "epoch": 0.74023820292477, "grad_norm": 3.1448967456817627, "learning_rate": 9.358278987188145e-06, "loss": 0.2799, "step": 4910 }, { "epoch": 0.741745816372682, "grad_norm": 3.313995122909546, "learning_rate": 9.353973459769264e-06, "loss": 0.2057, "step": 4920 }, { "epoch": 0.743253429820594, "grad_norm": 2.8209805488586426, "learning_rate": 9.349654534265078e-06, "loss": 0.2112, "step": 4930 }, { "epoch": 0.744761043268506, "grad_norm": 2.983332633972168, "learning_rate": 9.345322223965827e-06, "loss": 0.2039, "step": 4940 }, { "epoch": 0.746268656716418, "grad_norm": 3.894395351409912, "learning_rate": 9.34097654220293e-06, "loss": 0.1941, "step": 4950 }, { "epoch": 0.7477762701643299, "grad_norm": 1.9024150371551514, "learning_rate": 9.336617502348962e-06, "loss": 0.2233, "step": 4960 }, { "epoch": 0.7492838836122419, "grad_norm": 2.283095359802246, "learning_rate": 9.332245117817594e-06, "loss": 0.3099, "step": 4970 }, { "epoch": 0.7507914970601538, "grad_norm": 2.5077128410339355, "learning_rate": 9.327859402063566e-06, "loss": 0.2366, "step": 4980 }, { "epoch": 0.7522991105080657, "grad_norm": 2.0630249977111816, "learning_rate": 9.323460368582645e-06, "loss": 0.2395, "step": 4990 }, { "epoch": 0.7538067239559777, "grad_norm": 3.460604667663574, "learning_rate": 9.319048030911573e-06, "loss": 0.2438, "step": 5000 }, { "epoch": 0.7553143374038896, "grad_norm": 3.0792925357818604, "learning_rate": 9.314622402628034e-06, "loss": 0.2833, "step": 5010 }, { "epoch": 0.7568219508518016, "grad_norm": 2.323240041732788, "learning_rate": 9.31018349735061e-06, "loss": 0.2779, "step": 5020 }, { "epoch": 0.7583295642997135, "grad_norm": 2.353419065475464, "learning_rate": 9.305731328738741e-06, "loss": 0.2505, "step": 5030 }, { "epoch": 0.7598371777476255, "grad_norm": 2.9187779426574707, "learning_rate": 9.301265910492675e-06, "loss": 0.198, "step": 5040 }, { "epoch": 0.7613447911955374, "grad_norm": 5.569884300231934, "learning_rate": 9.296787256353444e-06, "loss": 0.3162, "step": 5050 }, { "epoch": 0.7628524046434494, "grad_norm": 2.981611490249634, "learning_rate": 9.292295380102793e-06, "loss": 0.2332, "step": 5060 }, { "epoch": 0.7643600180913613, "grad_norm": 4.700687885284424, "learning_rate": 9.287790295563169e-06, "loss": 0.2795, "step": 5070 }, { "epoch": 0.7658676315392733, "grad_norm": 2.7378945350646973, "learning_rate": 9.283272016597654e-06, "loss": 0.2255, "step": 5080 }, { "epoch": 0.7673752449871852, "grad_norm": 2.1525986194610596, "learning_rate": 9.278740557109938e-06, "loss": 0.276, "step": 5090 }, { "epoch": 0.7688828584350973, "grad_norm": 2.9229061603546143, "learning_rate": 9.274195931044268e-06, "loss": 0.1872, "step": 5100 }, { "epoch": 0.7703904718830092, "grad_norm": 2.6095480918884277, "learning_rate": 9.269638152385406e-06, "loss": 0.2216, "step": 5110 }, { "epoch": 0.7718980853309212, "grad_norm": 3.259446382522583, "learning_rate": 9.265067235158586e-06, "loss": 0.3022, "step": 5120 }, { "epoch": 0.7734056987788331, "grad_norm": 3.5533852577209473, "learning_rate": 9.26048319342948e-06, "loss": 0.2738, "step": 5130 }, { "epoch": 0.7749133122267451, "grad_norm": 2.4258086681365967, "learning_rate": 9.255886041304138e-06, "loss": 0.2518, "step": 5140 }, { "epoch": 0.776420925674657, "grad_norm": 3.017108201980591, "learning_rate": 9.251275792928957e-06, "loss": 0.2149, "step": 5150 }, { "epoch": 0.777928539122569, "grad_norm": 4.170431137084961, "learning_rate": 9.246652462490634e-06, "loss": 0.2841, "step": 5160 }, { "epoch": 0.7794361525704809, "grad_norm": 2.978444814682007, "learning_rate": 9.242016064216123e-06, "loss": 0.2295, "step": 5170 }, { "epoch": 0.7809437660183929, "grad_norm": 2.989793539047241, "learning_rate": 9.237366612372587e-06, "loss": 0.28, "step": 5180 }, { "epoch": 0.7824513794663048, "grad_norm": 3.9537975788116455, "learning_rate": 9.232704121267364e-06, "loss": 0.2407, "step": 5190 }, { "epoch": 0.7839589929142168, "grad_norm": 2.3232221603393555, "learning_rate": 9.228028605247908e-06, "loss": 0.2111, "step": 5200 }, { "epoch": 0.7854666063621287, "grad_norm": 5.361274242401123, "learning_rate": 9.223340078701761e-06, "loss": 0.206, "step": 5210 }, { "epoch": 0.7869742198100407, "grad_norm": 3.090134382247925, "learning_rate": 9.218638556056497e-06, "loss": 0.2188, "step": 5220 }, { "epoch": 0.7884818332579526, "grad_norm": 3.895893096923828, "learning_rate": 9.213924051779682e-06, "loss": 0.2689, "step": 5230 }, { "epoch": 0.7899894467058646, "grad_norm": 3.0961127281188965, "learning_rate": 9.209196580378831e-06, "loss": 0.1999, "step": 5240 }, { "epoch": 0.7914970601537765, "grad_norm": 2.58487868309021, "learning_rate": 9.20445615640136e-06, "loss": 0.2263, "step": 5250 }, { "epoch": 0.7930046736016886, "grad_norm": 2.7709848880767822, "learning_rate": 9.199702794434542e-06, "loss": 0.3087, "step": 5260 }, { "epoch": 0.7945122870496005, "grad_norm": 3.456580638885498, "learning_rate": 9.194936509105467e-06, "loss": 0.2723, "step": 5270 }, { "epoch": 0.7960199004975125, "grad_norm": 3.3948299884796143, "learning_rate": 9.190157315080987e-06, "loss": 0.2494, "step": 5280 }, { "epoch": 0.7975275139454244, "grad_norm": 2.280669689178467, "learning_rate": 9.185365227067683e-06, "loss": 0.2333, "step": 5290 }, { "epoch": 0.7990351273933364, "grad_norm": 3.0251879692077637, "learning_rate": 9.180560259811807e-06, "loss": 0.2586, "step": 5300 }, { "epoch": 0.8005427408412483, "grad_norm": 2.5153489112854004, "learning_rate": 9.175742428099248e-06, "loss": 0.2721, "step": 5310 }, { "epoch": 0.8020503542891603, "grad_norm": 2.6558239459991455, "learning_rate": 9.170911746755479e-06, "loss": 0.234, "step": 5320 }, { "epoch": 0.8035579677370722, "grad_norm": 2.716525077819824, "learning_rate": 9.166068230645516e-06, "loss": 0.2124, "step": 5330 }, { "epoch": 0.8050655811849842, "grad_norm": 3.249938726425171, "learning_rate": 9.16121189467387e-06, "loss": 0.219, "step": 5340 }, { "epoch": 0.8065731946328961, "grad_norm": 3.9071543216705322, "learning_rate": 9.156342753784497e-06, "loss": 0.2438, "step": 5350 }, { "epoch": 0.8080808080808081, "grad_norm": 3.0705063343048096, "learning_rate": 9.151460822960765e-06, "loss": 0.2829, "step": 5360 }, { "epoch": 0.80958842152872, "grad_norm": 2.603959321975708, "learning_rate": 9.14656611722539e-06, "loss": 0.2411, "step": 5370 }, { "epoch": 0.811096034976632, "grad_norm": 2.619256019592285, "learning_rate": 9.141658651640406e-06, "loss": 0.2218, "step": 5380 }, { "epoch": 0.8126036484245439, "grad_norm": 2.596096992492676, "learning_rate": 9.136738441307109e-06, "loss": 0.2419, "step": 5390 }, { "epoch": 0.8141112618724559, "grad_norm": 2.995586395263672, "learning_rate": 9.131805501366013e-06, "loss": 0.2293, "step": 5400 }, { "epoch": 0.8156188753203678, "grad_norm": 2.5093657970428467, "learning_rate": 9.126859846996803e-06, "loss": 0.2687, "step": 5410 }, { "epoch": 0.8171264887682799, "grad_norm": 2.225210666656494, "learning_rate": 9.12190149341829e-06, "loss": 0.2395, "step": 5420 }, { "epoch": 0.8186341022161918, "grad_norm": 2.922473430633545, "learning_rate": 9.116930455888365e-06, "loss": 0.2617, "step": 5430 }, { "epoch": 0.8201417156641038, "grad_norm": 2.1815104484558105, "learning_rate": 9.111946749703948e-06, "loss": 0.3076, "step": 5440 }, { "epoch": 0.8216493291120157, "grad_norm": 2.1248698234558105, "learning_rate": 9.10695039020094e-06, "loss": 0.2336, "step": 5450 }, { "epoch": 0.8231569425599277, "grad_norm": 2.2220993041992188, "learning_rate": 9.101941392754187e-06, "loss": 0.2448, "step": 5460 }, { "epoch": 0.8246645560078396, "grad_norm": 2.8120875358581543, "learning_rate": 9.096919772777417e-06, "loss": 0.2109, "step": 5470 }, { "epoch": 0.8261721694557516, "grad_norm": 3.668710470199585, "learning_rate": 9.091885545723205e-06, "loss": 0.2455, "step": 5480 }, { "epoch": 0.8276797829036635, "grad_norm": 2.8986730575561523, "learning_rate": 9.086838727082914e-06, "loss": 0.2699, "step": 5490 }, { "epoch": 0.8291873963515755, "grad_norm": 1.940796136856079, "learning_rate": 9.081779332386665e-06, "loss": 0.199, "step": 5500 }, { "epoch": 0.8306950097994874, "grad_norm": 5.511064052581787, "learning_rate": 9.076707377203267e-06, "loss": 0.2643, "step": 5510 }, { "epoch": 0.8322026232473994, "grad_norm": 3.5281729698181152, "learning_rate": 9.071622877140188e-06, "loss": 0.2475, "step": 5520 }, { "epoch": 0.8337102366953113, "grad_norm": 2.7086427211761475, "learning_rate": 9.066525847843492e-06, "loss": 0.2928, "step": 5530 }, { "epoch": 0.8352178501432233, "grad_norm": 2.352342128753662, "learning_rate": 9.061416304997807e-06, "loss": 0.2765, "step": 5540 }, { "epoch": 0.8367254635911352, "grad_norm": 2.3874220848083496, "learning_rate": 9.05629426432626e-06, "loss": 0.2745, "step": 5550 }, { "epoch": 0.8382330770390471, "grad_norm": 1.943677544593811, "learning_rate": 9.051159741590442e-06, "loss": 0.2719, "step": 5560 }, { "epoch": 0.8397406904869591, "grad_norm": 3.4037392139434814, "learning_rate": 9.04601275259035e-06, "loss": 0.2631, "step": 5570 }, { "epoch": 0.841248303934871, "grad_norm": 2.904156446456909, "learning_rate": 9.040853313164342e-06, "loss": 0.23, "step": 5580 }, { "epoch": 0.8427559173827831, "grad_norm": 3.6543831825256348, "learning_rate": 9.03568143918909e-06, "loss": 0.2996, "step": 5590 }, { "epoch": 0.844263530830695, "grad_norm": 2.9863405227661133, "learning_rate": 9.030497146579535e-06, "loss": 0.1904, "step": 5600 }, { "epoch": 0.845771144278607, "grad_norm": 3.3716237545013428, "learning_rate": 9.025300451288824e-06, "loss": 0.2225, "step": 5610 }, { "epoch": 0.847278757726519, "grad_norm": 3.27250075340271, "learning_rate": 9.020091369308273e-06, "loss": 0.2298, "step": 5620 }, { "epoch": 0.8487863711744309, "grad_norm": 1.9541891813278198, "learning_rate": 9.014869916667315e-06, "loss": 0.2981, "step": 5630 }, { "epoch": 0.8502939846223428, "grad_norm": 2.819779634475708, "learning_rate": 9.009636109433451e-06, "loss": 0.2281, "step": 5640 }, { "epoch": 0.8518015980702548, "grad_norm": 3.4168295860290527, "learning_rate": 9.004389963712196e-06, "loss": 0.2718, "step": 5650 }, { "epoch": 0.8533092115181667, "grad_norm": 3.69096040725708, "learning_rate": 8.99913149564704e-06, "loss": 0.2875, "step": 5660 }, { "epoch": 0.8548168249660787, "grad_norm": 2.4907710552215576, "learning_rate": 8.993860721419383e-06, "loss": 0.2135, "step": 5670 }, { "epoch": 0.8563244384139906, "grad_norm": 2.9878644943237305, "learning_rate": 8.988577657248498e-06, "loss": 0.2321, "step": 5680 }, { "epoch": 0.8578320518619026, "grad_norm": 4.044392108917236, "learning_rate": 8.983282319391479e-06, "loss": 0.3104, "step": 5690 }, { "epoch": 0.8593396653098145, "grad_norm": 2.584432363510132, "learning_rate": 8.977974724143183e-06, "loss": 0.2441, "step": 5700 }, { "epoch": 0.8608472787577265, "grad_norm": 3.622022867202759, "learning_rate": 8.972654887836192e-06, "loss": 0.2105, "step": 5710 }, { "epoch": 0.8623548922056384, "grad_norm": 3.3017630577087402, "learning_rate": 8.96732282684075e-06, "loss": 0.252, "step": 5720 }, { "epoch": 0.8638625056535504, "grad_norm": 5.205654144287109, "learning_rate": 8.961978557564723e-06, "loss": 0.2336, "step": 5730 }, { "epoch": 0.8653701191014623, "grad_norm": 1.993915319442749, "learning_rate": 8.956622096453544e-06, "loss": 0.2201, "step": 5740 }, { "epoch": 0.8668777325493744, "grad_norm": 2.4250314235687256, "learning_rate": 8.951253459990161e-06, "loss": 0.3218, "step": 5750 }, { "epoch": 0.8683853459972863, "grad_norm": 2.8070802688598633, "learning_rate": 8.945872664694989e-06, "loss": 0.1898, "step": 5760 }, { "epoch": 0.8698929594451983, "grad_norm": 3.8658294677734375, "learning_rate": 8.94047972712586e-06, "loss": 0.2579, "step": 5770 }, { "epoch": 0.8714005728931102, "grad_norm": 2.795346975326538, "learning_rate": 8.935074663877967e-06, "loss": 0.2452, "step": 5780 }, { "epoch": 0.8729081863410222, "grad_norm": 2.4461915493011475, "learning_rate": 8.929657491583817e-06, "loss": 0.2059, "step": 5790 }, { "epoch": 0.8744157997889341, "grad_norm": 3.3603625297546387, "learning_rate": 8.924228226913184e-06, "loss": 0.25, "step": 5800 }, { "epoch": 0.8759234132368461, "grad_norm": 3.7504005432128906, "learning_rate": 8.918786886573045e-06, "loss": 0.2499, "step": 5810 }, { "epoch": 0.877431026684758, "grad_norm": 4.306051731109619, "learning_rate": 8.913333487307543e-06, "loss": 0.2483, "step": 5820 }, { "epoch": 0.87893864013267, "grad_norm": 4.687183380126953, "learning_rate": 8.907868045897924e-06, "loss": 0.2208, "step": 5830 }, { "epoch": 0.8804462535805819, "grad_norm": 2.7215170860290527, "learning_rate": 8.902390579162493e-06, "loss": 0.2441, "step": 5840 }, { "epoch": 0.8819538670284939, "grad_norm": 2.37863826751709, "learning_rate": 8.896901103956561e-06, "loss": 0.2426, "step": 5850 }, { "epoch": 0.8834614804764058, "grad_norm": 2.955584764480591, "learning_rate": 8.891399637172386e-06, "loss": 0.2347, "step": 5860 }, { "epoch": 0.8849690939243178, "grad_norm": 2.8029117584228516, "learning_rate": 8.88588619573913e-06, "loss": 0.2157, "step": 5870 }, { "epoch": 0.8864767073722297, "grad_norm": 2.7195956707000732, "learning_rate": 8.880360796622804e-06, "loss": 0.2429, "step": 5880 }, { "epoch": 0.8879843208201417, "grad_norm": 2.2344369888305664, "learning_rate": 8.874823456826215e-06, "loss": 0.1999, "step": 5890 }, { "epoch": 0.8894919342680536, "grad_norm": 2.4235787391662598, "learning_rate": 8.869274193388913e-06, "loss": 0.2285, "step": 5900 }, { "epoch": 0.8909995477159657, "grad_norm": 2.3134164810180664, "learning_rate": 8.863713023387142e-06, "loss": 0.2104, "step": 5910 }, { "epoch": 0.8925071611638776, "grad_norm": 3.61189341545105, "learning_rate": 8.85813996393378e-06, "loss": 0.1932, "step": 5920 }, { "epoch": 0.8940147746117896, "grad_norm": 2.4070184230804443, "learning_rate": 8.852555032178296e-06, "loss": 0.2359, "step": 5930 }, { "epoch": 0.8955223880597015, "grad_norm": 2.1681087017059326, "learning_rate": 8.84695824530669e-06, "loss": 0.2473, "step": 5940 }, { "epoch": 0.8970300015076135, "grad_norm": 3.404526948928833, "learning_rate": 8.841349620541447e-06, "loss": 0.2248, "step": 5950 }, { "epoch": 0.8985376149555254, "grad_norm": 3.881133794784546, "learning_rate": 8.83572917514147e-06, "loss": 0.2412, "step": 5960 }, { "epoch": 0.9000452284034374, "grad_norm": 2.827536106109619, "learning_rate": 8.83009692640205e-06, "loss": 0.2078, "step": 5970 }, { "epoch": 0.9015528418513493, "grad_norm": 1.9669289588928223, "learning_rate": 8.824452891654787e-06, "loss": 0.2771, "step": 5980 }, { "epoch": 0.9030604552992613, "grad_norm": 3.531554937362671, "learning_rate": 8.818797088267556e-06, "loss": 0.2598, "step": 5990 }, { "epoch": 0.9045680687471732, "grad_norm": 2.953885078430176, "learning_rate": 8.813129533644446e-06, "loss": 0.2222, "step": 6000 }, { "epoch": 0.9060756821950852, "grad_norm": 2.0051162242889404, "learning_rate": 8.807450245225704e-06, "loss": 0.2371, "step": 6010 }, { "epoch": 0.9075832956429971, "grad_norm": 1.7887074947357178, "learning_rate": 8.801759240487687e-06, "loss": 0.192, "step": 6020 }, { "epoch": 0.9090909090909091, "grad_norm": 3.2498350143432617, "learning_rate": 8.796056536942805e-06, "loss": 0.2925, "step": 6030 }, { "epoch": 0.910598522538821, "grad_norm": 2.7137231826782227, "learning_rate": 8.790342152139465e-06, "loss": 0.252, "step": 6040 }, { "epoch": 0.912106135986733, "grad_norm": 2.830386161804199, "learning_rate": 8.784616103662024e-06, "loss": 0.2224, "step": 6050 }, { "epoch": 0.9136137494346449, "grad_norm": 2.349468469619751, "learning_rate": 8.778878409130728e-06, "loss": 0.2241, "step": 6060 }, { "epoch": 0.915121362882557, "grad_norm": 1.9477012157440186, "learning_rate": 8.773129086201664e-06, "loss": 0.2648, "step": 6070 }, { "epoch": 0.9166289763304689, "grad_norm": 3.2217156887054443, "learning_rate": 8.767368152566693e-06, "loss": 0.2493, "step": 6080 }, { "epoch": 0.9181365897783809, "grad_norm": 2.414722442626953, "learning_rate": 8.761595625953417e-06, "loss": 0.2658, "step": 6090 }, { "epoch": 0.9196442032262928, "grad_norm": 4.434691429138184, "learning_rate": 8.755811524125099e-06, "loss": 0.2568, "step": 6100 }, { "epoch": 0.9211518166742048, "grad_norm": 3.3716375827789307, "learning_rate": 8.750015864880631e-06, "loss": 0.2873, "step": 6110 }, { "epoch": 0.9226594301221167, "grad_norm": 3.3805150985717773, "learning_rate": 8.744208666054468e-06, "loss": 0.2438, "step": 6120 }, { "epoch": 0.9241670435700287, "grad_norm": 4.044703006744385, "learning_rate": 8.738389945516571e-06, "loss": 0.3011, "step": 6130 }, { "epoch": 0.9256746570179406, "grad_norm": 4.547442436218262, "learning_rate": 8.732559721172358e-06, "loss": 0.2598, "step": 6140 }, { "epoch": 0.9271822704658526, "grad_norm": 2.5407986640930176, "learning_rate": 8.726718010962648e-06, "loss": 0.2254, "step": 6150 }, { "epoch": 0.9286898839137645, "grad_norm": 2.5933055877685547, "learning_rate": 8.7208648328636e-06, "loss": 0.2463, "step": 6160 }, { "epoch": 0.9301974973616765, "grad_norm": 3.7310166358947754, "learning_rate": 8.715000204886669e-06, "loss": 0.2746, "step": 6170 }, { "epoch": 0.9317051108095884, "grad_norm": 3.7371816635131836, "learning_rate": 8.709124145078536e-06, "loss": 0.2416, "step": 6180 }, { "epoch": 0.9332127242575003, "grad_norm": 3.375037908554077, "learning_rate": 8.703236671521065e-06, "loss": 0.2281, "step": 6190 }, { "epoch": 0.9347203377054123, "grad_norm": 2.4336585998535156, "learning_rate": 8.697337802331243e-06, "loss": 0.2176, "step": 6200 }, { "epoch": 0.9362279511533242, "grad_norm": 2.401395797729492, "learning_rate": 8.691427555661118e-06, "loss": 0.2208, "step": 6210 }, { "epoch": 0.9377355646012362, "grad_norm": 2.67345929145813, "learning_rate": 8.685505949697757e-06, "loss": 0.1957, "step": 6220 }, { "epoch": 0.9392431780491483, "grad_norm": 3.82041335105896, "learning_rate": 8.679573002663175e-06, "loss": 0.318, "step": 6230 }, { "epoch": 0.9407507914970602, "grad_norm": 2.6415598392486572, "learning_rate": 8.673628732814289e-06, "loss": 0.2314, "step": 6240 }, { "epoch": 0.9422584049449721, "grad_norm": 2.2734360694885254, "learning_rate": 8.66767315844286e-06, "loss": 0.1856, "step": 6250 }, { "epoch": 0.9437660183928841, "grad_norm": 2.489954948425293, "learning_rate": 8.661706297875433e-06, "loss": 0.2477, "step": 6260 }, { "epoch": 0.945273631840796, "grad_norm": 2.5175702571868896, "learning_rate": 8.655728169473284e-06, "loss": 0.2544, "step": 6270 }, { "epoch": 0.946781245288708, "grad_norm": 3.511953353881836, "learning_rate": 8.649738791632358e-06, "loss": 0.1965, "step": 6280 }, { "epoch": 0.9482888587366199, "grad_norm": 4.435739040374756, "learning_rate": 8.643738182783227e-06, "loss": 0.2645, "step": 6290 }, { "epoch": 0.9497964721845319, "grad_norm": 1.985802173614502, "learning_rate": 8.637726361391015e-06, "loss": 0.2251, "step": 6300 }, { "epoch": 0.9513040856324438, "grad_norm": 3.072113513946533, "learning_rate": 8.63170334595535e-06, "loss": 0.2272, "step": 6310 }, { "epoch": 0.9528116990803558, "grad_norm": 2.162796974182129, "learning_rate": 8.625669155010307e-06, "loss": 0.2119, "step": 6320 }, { "epoch": 0.9543193125282677, "grad_norm": 2.672593355178833, "learning_rate": 8.619623807124356e-06, "loss": 0.2761, "step": 6330 }, { "epoch": 0.9558269259761797, "grad_norm": 3.140590190887451, "learning_rate": 8.613567320900292e-06, "loss": 0.2284, "step": 6340 }, { "epoch": 0.9573345394240916, "grad_norm": 1.7530702352523804, "learning_rate": 8.607499714975185e-06, "loss": 0.2124, "step": 6350 }, { "epoch": 0.9588421528720036, "grad_norm": 2.5740535259246826, "learning_rate": 8.601421008020329e-06, "loss": 0.2356, "step": 6360 }, { "epoch": 0.9603497663199155, "grad_norm": 2.024742841720581, "learning_rate": 8.595331218741172e-06, "loss": 0.2147, "step": 6370 }, { "epoch": 0.9618573797678275, "grad_norm": 2.188408374786377, "learning_rate": 8.589230365877268e-06, "loss": 0.2262, "step": 6380 }, { "epoch": 0.9633649932157394, "grad_norm": 2.3507211208343506, "learning_rate": 8.583118468202214e-06, "loss": 0.2556, "step": 6390 }, { "epoch": 0.9648726066636515, "grad_norm": 2.750495433807373, "learning_rate": 8.576995544523596e-06, "loss": 0.2364, "step": 6400 }, { "epoch": 0.9663802201115634, "grad_norm": 3.479285478591919, "learning_rate": 8.570861613682929e-06, "loss": 0.2117, "step": 6410 }, { "epoch": 0.9678878335594754, "grad_norm": 2.6610944271087646, "learning_rate": 8.564716694555597e-06, "loss": 0.2363, "step": 6420 }, { "epoch": 0.9693954470073873, "grad_norm": 3.0084638595581055, "learning_rate": 8.558560806050802e-06, "loss": 0.2274, "step": 6430 }, { "epoch": 0.9709030604552993, "grad_norm": 2.527921676635742, "learning_rate": 8.552393967111494e-06, "loss": 0.2167, "step": 6440 }, { "epoch": 0.9724106739032112, "grad_norm": 3.2821383476257324, "learning_rate": 8.546216196714325e-06, "loss": 0.2632, "step": 6450 }, { "epoch": 0.9739182873511232, "grad_norm": 2.2432119846343994, "learning_rate": 8.540027513869587e-06, "loss": 0.2121, "step": 6460 }, { "epoch": 0.9754259007990351, "grad_norm": 3.3564796447753906, "learning_rate": 8.533827937621146e-06, "loss": 0.1923, "step": 6470 }, { "epoch": 0.9769335142469471, "grad_norm": 2.6124722957611084, "learning_rate": 8.527617487046394e-06, "loss": 0.2715, "step": 6480 }, { "epoch": 0.978441127694859, "grad_norm": 1.9076937437057495, "learning_rate": 8.521396181256182e-06, "loss": 0.2454, "step": 6490 }, { "epoch": 0.979948741142771, "grad_norm": 2.338266372680664, "learning_rate": 8.51516403939477e-06, "loss": 0.2385, "step": 6500 }, { "epoch": 0.9814563545906829, "grad_norm": 2.5707411766052246, "learning_rate": 8.508921080639757e-06, "loss": 0.2381, "step": 6510 }, { "epoch": 0.9829639680385949, "grad_norm": 2.406142473220825, "learning_rate": 8.502667324202032e-06, "loss": 0.2668, "step": 6520 }, { "epoch": 0.9844715814865068, "grad_norm": 2.7965104579925537, "learning_rate": 8.496402789325708e-06, "loss": 0.2263, "step": 6530 }, { "epoch": 0.9859791949344188, "grad_norm": 6.217304229736328, "learning_rate": 8.490127495288068e-06, "loss": 0.242, "step": 6540 }, { "epoch": 0.9874868083823307, "grad_norm": 2.4762349128723145, "learning_rate": 8.483841461399501e-06, "loss": 0.2351, "step": 6550 }, { "epoch": 0.9889944218302428, "grad_norm": 1.9303960800170898, "learning_rate": 8.477544707003443e-06, "loss": 0.2353, "step": 6560 }, { "epoch": 0.9905020352781547, "grad_norm": 3.2324724197387695, "learning_rate": 8.471237251476326e-06, "loss": 0.2689, "step": 6570 }, { "epoch": 0.9920096487260667, "grad_norm": 3.507396697998047, "learning_rate": 8.464919114227505e-06, "loss": 0.2554, "step": 6580 }, { "epoch": 0.9935172621739786, "grad_norm": 4.214005470275879, "learning_rate": 8.458590314699208e-06, "loss": 0.2466, "step": 6590 }, { "epoch": 0.9950248756218906, "grad_norm": 2.7016959190368652, "learning_rate": 8.452250872366473e-06, "loss": 0.2243, "step": 6600 }, { "epoch": 0.9965324890698025, "grad_norm": 3.3990437984466553, "learning_rate": 8.445900806737088e-06, "loss": 0.1685, "step": 6610 }, { "epoch": 0.9980401025177145, "grad_norm": 2.950164794921875, "learning_rate": 8.439540137351528e-06, "loss": 0.1806, "step": 6620 }, { "epoch": 0.9995477159656264, "grad_norm": 2.1688122749328613, "learning_rate": 8.433168883782901e-06, "loss": 0.2301, "step": 6630 }, { "epoch": 1.0010553294135385, "grad_norm": 2.0140140056610107, "learning_rate": 8.426787065636888e-06, "loss": 0.256, "step": 6640 }, { "epoch": 1.0025629428614504, "grad_norm": 3.8153982162475586, "learning_rate": 8.420394702551671e-06, "loss": 0.2156, "step": 6650 }, { "epoch": 1.0040705563093624, "grad_norm": 2.3902337551116943, "learning_rate": 8.413991814197891e-06, "loss": 0.1949, "step": 6660 }, { "epoch": 1.0055781697572743, "grad_norm": 2.238041877746582, "learning_rate": 8.407578420278567e-06, "loss": 0.2117, "step": 6670 }, { "epoch": 1.0070857832051863, "grad_norm": 2.890782117843628, "learning_rate": 8.401154540529053e-06, "loss": 0.1899, "step": 6680 }, { "epoch": 1.0085933966530982, "grad_norm": 2.4909114837646484, "learning_rate": 8.39472019471697e-06, "loss": 0.2382, "step": 6690 }, { "epoch": 1.0101010101010102, "grad_norm": 3.4136745929718018, "learning_rate": 8.388275402642142e-06, "loss": 0.2046, "step": 6700 }, { "epoch": 1.0116086235489221, "grad_norm": 3.1697499752044678, "learning_rate": 8.381820184136536e-06, "loss": 0.1648, "step": 6710 }, { "epoch": 1.013116236996834, "grad_norm": 2.2768239974975586, "learning_rate": 8.375354559064212e-06, "loss": 0.2344, "step": 6720 }, { "epoch": 1.014623850444746, "grad_norm": 3.1577417850494385, "learning_rate": 8.368878547321246e-06, "loss": 0.2008, "step": 6730 }, { "epoch": 1.016131463892658, "grad_norm": 4.09757137298584, "learning_rate": 8.36239216883568e-06, "loss": 0.2612, "step": 6740 }, { "epoch": 1.01763907734057, "grad_norm": 2.1461269855499268, "learning_rate": 8.355895443567452e-06, "loss": 0.202, "step": 6750 }, { "epoch": 1.0191466907884819, "grad_norm": 2.205613136291504, "learning_rate": 8.349388391508344e-06, "loss": 0.2246, "step": 6760 }, { "epoch": 1.0206543042363938, "grad_norm": 3.3272602558135986, "learning_rate": 8.342871032681912e-06, "loss": 0.2552, "step": 6770 }, { "epoch": 1.0221619176843058, "grad_norm": 2.6921675205230713, "learning_rate": 8.336343387143432e-06, "loss": 0.2399, "step": 6780 }, { "epoch": 1.0236695311322177, "grad_norm": 3.674072265625, "learning_rate": 8.32980547497983e-06, "loss": 0.2044, "step": 6790 }, { "epoch": 1.0251771445801297, "grad_norm": 2.566761016845703, "learning_rate": 8.323257316309627e-06, "loss": 0.2206, "step": 6800 }, { "epoch": 1.0266847580280416, "grad_norm": 2.6536505222320557, "learning_rate": 8.316698931282873e-06, "loss": 0.2545, "step": 6810 }, { "epoch": 1.0281923714759535, "grad_norm": 1.9551825523376465, "learning_rate": 8.310130340081093e-06, "loss": 0.2251, "step": 6820 }, { "epoch": 1.0296999849238655, "grad_norm": 2.5148026943206787, "learning_rate": 8.303551562917208e-06, "loss": 0.2256, "step": 6830 }, { "epoch": 1.0312075983717774, "grad_norm": 2.4629952907562256, "learning_rate": 8.29696262003549e-06, "loss": 0.2162, "step": 6840 }, { "epoch": 1.0327152118196894, "grad_norm": 1.8050205707550049, "learning_rate": 8.290363531711494e-06, "loss": 0.2311, "step": 6850 }, { "epoch": 1.0342228252676013, "grad_norm": 5.78726053237915, "learning_rate": 8.28375431825199e-06, "loss": 0.1993, "step": 6860 }, { "epoch": 1.0357304387155133, "grad_norm": 3.3627989292144775, "learning_rate": 8.27713499999491e-06, "loss": 0.2404, "step": 6870 }, { "epoch": 1.0372380521634252, "grad_norm": 3.8735406398773193, "learning_rate": 8.270505597309273e-06, "loss": 0.2206, "step": 6880 }, { "epoch": 1.0387456656113372, "grad_norm": 2.7618348598480225, "learning_rate": 8.263866130595141e-06, "loss": 0.1896, "step": 6890 }, { "epoch": 1.0402532790592491, "grad_norm": 2.8327956199645996, "learning_rate": 8.257216620283538e-06, "loss": 0.2246, "step": 6900 }, { "epoch": 1.041760892507161, "grad_norm": 4.053680896759033, "learning_rate": 8.250557086836392e-06, "loss": 0.2071, "step": 6910 }, { "epoch": 1.043268505955073, "grad_norm": 2.1786561012268066, "learning_rate": 8.243887550746484e-06, "loss": 0.2032, "step": 6920 }, { "epoch": 1.044776119402985, "grad_norm": 2.564005136489868, "learning_rate": 8.237208032537363e-06, "loss": 0.2469, "step": 6930 }, { "epoch": 1.046283732850897, "grad_norm": 3.5162343978881836, "learning_rate": 8.230518552763301e-06, "loss": 0.1976, "step": 6940 }, { "epoch": 1.0477913462988089, "grad_norm": 2.6153948307037354, "learning_rate": 8.223819132009227e-06, "loss": 0.2861, "step": 6950 }, { "epoch": 1.049298959746721, "grad_norm": 3.826033353805542, "learning_rate": 8.217109790890657e-06, "loss": 0.2143, "step": 6960 }, { "epoch": 1.050806573194633, "grad_norm": 2.8082435131073, "learning_rate": 8.210390550053632e-06, "loss": 0.1784, "step": 6970 }, { "epoch": 1.052314186642545, "grad_norm": 3.2394065856933594, "learning_rate": 8.203661430174663e-06, "loss": 0.2436, "step": 6980 }, { "epoch": 1.053821800090457, "grad_norm": 2.6734447479248047, "learning_rate": 8.196922451960651e-06, "loss": 0.1862, "step": 6990 }, { "epoch": 1.0553294135383688, "grad_norm": 2.8466451168060303, "learning_rate": 8.190173636148844e-06, "loss": 0.2473, "step": 7000 }, { "epoch": 1.0568370269862808, "grad_norm": 3.1291816234588623, "learning_rate": 8.183415003506753e-06, "loss": 0.2138, "step": 7010 }, { "epoch": 1.0583446404341927, "grad_norm": 3.2858996391296387, "learning_rate": 8.176646574832104e-06, "loss": 0.2872, "step": 7020 }, { "epoch": 1.0598522538821047, "grad_norm": 6.466736316680908, "learning_rate": 8.169868370952765e-06, "loss": 0.2087, "step": 7030 }, { "epoch": 1.0613598673300166, "grad_norm": 2.6094579696655273, "learning_rate": 8.163080412726685e-06, "loss": 0.2252, "step": 7040 }, { "epoch": 1.0628674807779286, "grad_norm": 2.4909939765930176, "learning_rate": 8.156282721041825e-06, "loss": 0.2301, "step": 7050 }, { "epoch": 1.0643750942258405, "grad_norm": 2.2814037799835205, "learning_rate": 8.149475316816106e-06, "loss": 0.1988, "step": 7060 }, { "epoch": 1.0658827076737525, "grad_norm": 2.281097173690796, "learning_rate": 8.142658220997331e-06, "loss": 0.1762, "step": 7070 }, { "epoch": 1.0673903211216644, "grad_norm": 2.963184356689453, "learning_rate": 8.135831454563124e-06, "loss": 0.1801, "step": 7080 }, { "epoch": 1.0688979345695764, "grad_norm": 2.1050667762756348, "learning_rate": 8.128995038520872e-06, "loss": 0.2251, "step": 7090 }, { "epoch": 1.0704055480174883, "grad_norm": 2.4854631423950195, "learning_rate": 8.122148993907657e-06, "loss": 0.18, "step": 7100 }, { "epoch": 1.0719131614654003, "grad_norm": 3.083214044570923, "learning_rate": 8.115293341790186e-06, "loss": 0.2094, "step": 7110 }, { "epoch": 1.0734207749133122, "grad_norm": 2.807391881942749, "learning_rate": 8.108428103264728e-06, "loss": 0.2479, "step": 7120 }, { "epoch": 1.0749283883612242, "grad_norm": 2.1845507621765137, "learning_rate": 8.10155329945706e-06, "loss": 0.253, "step": 7130 }, { "epoch": 1.0764360018091361, "grad_norm": 4.2442803382873535, "learning_rate": 8.094668951522388e-06, "loss": 0.2145, "step": 7140 }, { "epoch": 1.077943615257048, "grad_norm": 2.8470234870910645, "learning_rate": 8.087775080645286e-06, "loss": 0.2482, "step": 7150 }, { "epoch": 1.07945122870496, "grad_norm": 2.8226003646850586, "learning_rate": 8.080871708039633e-06, "loss": 0.2524, "step": 7160 }, { "epoch": 1.080958842152872, "grad_norm": 1.9936553239822388, "learning_rate": 8.07395885494855e-06, "loss": 0.1731, "step": 7170 }, { "epoch": 1.082466455600784, "grad_norm": 3.5770115852355957, "learning_rate": 8.06703654264433e-06, "loss": 0.2273, "step": 7180 }, { "epoch": 1.0839740690486959, "grad_norm": 2.4086077213287354, "learning_rate": 8.060104792428372e-06, "loss": 0.2289, "step": 7190 }, { "epoch": 1.0854816824966078, "grad_norm": 3.434783458709717, "learning_rate": 8.053163625631118e-06, "loss": 0.1799, "step": 7200 }, { "epoch": 1.0869892959445198, "grad_norm": 3.257915735244751, "learning_rate": 8.046213063611988e-06, "loss": 0.2232, "step": 7210 }, { "epoch": 1.0884969093924317, "grad_norm": 2.1257095336914062, "learning_rate": 8.03925312775931e-06, "loss": 0.1869, "step": 7220 }, { "epoch": 1.0900045228403437, "grad_norm": 3.5441856384277344, "learning_rate": 8.03228383949026e-06, "loss": 0.2176, "step": 7230 }, { "epoch": 1.0915121362882556, "grad_norm": 3.0019657611846924, "learning_rate": 8.025305220250799e-06, "loss": 0.1822, "step": 7240 }, { "epoch": 1.0930197497361676, "grad_norm": 5.074293613433838, "learning_rate": 8.018317291515587e-06, "loss": 0.2183, "step": 7250 }, { "epoch": 1.0945273631840795, "grad_norm": 3.3142778873443604, "learning_rate": 8.011320074787944e-06, "loss": 0.2512, "step": 7260 }, { "epoch": 1.0960349766319915, "grad_norm": 2.9208028316497803, "learning_rate": 8.004313591599768e-06, "loss": 0.2201, "step": 7270 }, { "epoch": 1.0975425900799034, "grad_norm": 3.930175542831421, "learning_rate": 7.997297863511467e-06, "loss": 0.1955, "step": 7280 }, { "epoch": 1.0990502035278156, "grad_norm": 2.882797956466675, "learning_rate": 7.990272912111902e-06, "loss": 0.2677, "step": 7290 }, { "epoch": 1.1005578169757275, "grad_norm": 1.9190322160720825, "learning_rate": 7.983238759018317e-06, "loss": 0.2188, "step": 7300 }, { "epoch": 1.1020654304236395, "grad_norm": 2.7930893898010254, "learning_rate": 7.976195425876268e-06, "loss": 0.2298, "step": 7310 }, { "epoch": 1.1035730438715514, "grad_norm": 2.493242025375366, "learning_rate": 7.96914293435956e-06, "loss": 0.2249, "step": 7320 }, { "epoch": 1.1050806573194634, "grad_norm": 2.2709383964538574, "learning_rate": 7.962081306170185e-06, "loss": 0.1829, "step": 7330 }, { "epoch": 1.1065882707673753, "grad_norm": 2.298851490020752, "learning_rate": 7.955010563038245e-06, "loss": 0.2139, "step": 7340 }, { "epoch": 1.1080958842152873, "grad_norm": 4.105879783630371, "learning_rate": 7.947930726721893e-06, "loss": 0.2281, "step": 7350 }, { "epoch": 1.1096034976631992, "grad_norm": 1.8366310596466064, "learning_rate": 7.940841819007264e-06, "loss": 0.2143, "step": 7360 }, { "epoch": 1.1111111111111112, "grad_norm": 3.1858928203582764, "learning_rate": 7.933743861708406e-06, "loss": 0.2431, "step": 7370 }, { "epoch": 1.112618724559023, "grad_norm": 2.661524772644043, "learning_rate": 7.926636876667215e-06, "loss": 0.2282, "step": 7380 }, { "epoch": 1.114126338006935, "grad_norm": 3.544917106628418, "learning_rate": 7.919520885753372e-06, "loss": 0.1954, "step": 7390 }, { "epoch": 1.115633951454847, "grad_norm": 2.1512629985809326, "learning_rate": 7.91239591086426e-06, "loss": 0.2097, "step": 7400 }, { "epoch": 1.117141564902759, "grad_norm": 4.436117649078369, "learning_rate": 7.905261973924919e-06, "loss": 0.2306, "step": 7410 }, { "epoch": 1.118649178350671, "grad_norm": 2.36808180809021, "learning_rate": 7.89811909688796e-06, "loss": 0.2136, "step": 7420 }, { "epoch": 1.1201567917985829, "grad_norm": 2.6645290851593018, "learning_rate": 7.890967301733508e-06, "loss": 0.1903, "step": 7430 }, { "epoch": 1.1216644052464948, "grad_norm": 5.91689395904541, "learning_rate": 7.883806610469129e-06, "loss": 0.2071, "step": 7440 }, { "epoch": 1.1231720186944067, "grad_norm": 2.4235317707061768, "learning_rate": 7.876637045129768e-06, "loss": 0.2492, "step": 7450 }, { "epoch": 1.1246796321423187, "grad_norm": 2.096876859664917, "learning_rate": 7.86945862777767e-06, "loss": 0.2312, "step": 7460 }, { "epoch": 1.1261872455902306, "grad_norm": 2.0976669788360596, "learning_rate": 7.862271380502324e-06, "loss": 0.194, "step": 7470 }, { "epoch": 1.1276948590381426, "grad_norm": 4.018562316894531, "learning_rate": 7.855075325420392e-06, "loss": 0.2414, "step": 7480 }, { "epoch": 1.1292024724860545, "grad_norm": 2.597379446029663, "learning_rate": 7.84787048467564e-06, "loss": 0.2534, "step": 7490 }, { "epoch": 1.1307100859339665, "grad_norm": 4.7941975593566895, "learning_rate": 7.840656880438863e-06, "loss": 0.2207, "step": 7500 }, { "epoch": 1.1322176993818784, "grad_norm": 2.7498698234558105, "learning_rate": 7.833434534907828e-06, "loss": 0.2136, "step": 7510 }, { "epoch": 1.1337253128297904, "grad_norm": 3.2771618366241455, "learning_rate": 7.826203470307202e-06, "loss": 0.1827, "step": 7520 }, { "epoch": 1.1352329262777023, "grad_norm": 2.4990131855010986, "learning_rate": 7.81896370888848e-06, "loss": 0.2414, "step": 7530 }, { "epoch": 1.1367405397256143, "grad_norm": 2.424741268157959, "learning_rate": 7.811715272929917e-06, "loss": 0.1925, "step": 7540 }, { "epoch": 1.1382481531735262, "grad_norm": 4.885912895202637, "learning_rate": 7.804458184736466e-06, "loss": 0.2289, "step": 7550 }, { "epoch": 1.1397557666214382, "grad_norm": 2.8225977420806885, "learning_rate": 7.797192466639702e-06, "loss": 0.2145, "step": 7560 }, { "epoch": 1.1412633800693501, "grad_norm": 4.637665748596191, "learning_rate": 7.789918140997757e-06, "loss": 0.1812, "step": 7570 }, { "epoch": 1.142770993517262, "grad_norm": 2.9110684394836426, "learning_rate": 7.782635230195245e-06, "loss": 0.2552, "step": 7580 }, { "epoch": 1.144278606965174, "grad_norm": 2.162874937057495, "learning_rate": 7.775343756643211e-06, "loss": 0.2354, "step": 7590 }, { "epoch": 1.1457862204130862, "grad_norm": 2.841996431350708, "learning_rate": 7.768043742779034e-06, "loss": 0.2452, "step": 7600 }, { "epoch": 1.147293833860998, "grad_norm": 2.0952706336975098, "learning_rate": 7.760735211066386e-06, "loss": 0.2801, "step": 7610 }, { "epoch": 1.14880144730891, "grad_norm": 2.4509336948394775, "learning_rate": 7.753418183995143e-06, "loss": 0.2561, "step": 7620 }, { "epoch": 1.150309060756822, "grad_norm": 3.0848522186279297, "learning_rate": 7.746092684081321e-06, "loss": 0.2135, "step": 7630 }, { "epoch": 1.151816674204734, "grad_norm": 3.4702353477478027, "learning_rate": 7.738758733867017e-06, "loss": 0.2209, "step": 7640 }, { "epoch": 1.153324287652646, "grad_norm": 2.609631061553955, "learning_rate": 7.73141635592033e-06, "loss": 0.2285, "step": 7650 }, { "epoch": 1.154831901100558, "grad_norm": 2.203744888305664, "learning_rate": 7.724065572835282e-06, "loss": 0.2334, "step": 7660 }, { "epoch": 1.1563395145484698, "grad_norm": 3.7753446102142334, "learning_rate": 7.71670640723177e-06, "loss": 0.2506, "step": 7670 }, { "epoch": 1.1578471279963818, "grad_norm": 2.7950692176818848, "learning_rate": 7.709338881755486e-06, "loss": 0.3055, "step": 7680 }, { "epoch": 1.1593547414442937, "grad_norm": 2.0808823108673096, "learning_rate": 7.70196301907784e-06, "loss": 0.1657, "step": 7690 }, { "epoch": 1.1608623548922057, "grad_norm": 2.2093474864959717, "learning_rate": 7.694578841895905e-06, "loss": 0.2888, "step": 7700 }, { "epoch": 1.1623699683401176, "grad_norm": 2.7525131702423096, "learning_rate": 7.687186372932333e-06, "loss": 0.1544, "step": 7710 }, { "epoch": 1.1638775817880296, "grad_norm": 2.022024154663086, "learning_rate": 7.679785634935295e-06, "loss": 0.2221, "step": 7720 }, { "epoch": 1.1653851952359415, "grad_norm": 1.7302602529525757, "learning_rate": 7.672376650678407e-06, "loss": 0.2332, "step": 7730 }, { "epoch": 1.1668928086838535, "grad_norm": 2.9526822566986084, "learning_rate": 7.66495944296066e-06, "loss": 0.2582, "step": 7740 }, { "epoch": 1.1684004221317654, "grad_norm": 2.514312744140625, "learning_rate": 7.657534034606351e-06, "loss": 0.2158, "step": 7750 }, { "epoch": 1.1699080355796774, "grad_norm": 2.4483351707458496, "learning_rate": 7.65010044846501e-06, "loss": 0.2465, "step": 7760 }, { "epoch": 1.1714156490275893, "grad_norm": 4.1274027824401855, "learning_rate": 7.642658707411337e-06, "loss": 0.1929, "step": 7770 }, { "epoch": 1.1729232624755013, "grad_norm": 3.3393261432647705, "learning_rate": 7.63520883434512e-06, "loss": 0.1848, "step": 7780 }, { "epoch": 1.1744308759234132, "grad_norm": 4.692409515380859, "learning_rate": 7.6277508521911746e-06, "loss": 0.2249, "step": 7790 }, { "epoch": 1.1759384893713252, "grad_norm": 3.0122721195220947, "learning_rate": 7.6202847838992696e-06, "loss": 0.2321, "step": 7800 }, { "epoch": 1.1774461028192371, "grad_norm": 3.2805733680725098, "learning_rate": 7.612810652444055e-06, "loss": 0.1902, "step": 7810 }, { "epoch": 1.178953716267149, "grad_norm": 2.8621909618377686, "learning_rate": 7.605328480824993e-06, "loss": 0.1901, "step": 7820 }, { "epoch": 1.180461329715061, "grad_norm": 3.1451427936553955, "learning_rate": 7.597838292066289e-06, "loss": 0.2345, "step": 7830 }, { "epoch": 1.181968943162973, "grad_norm": 3.4608941078186035, "learning_rate": 7.590340109216816e-06, "loss": 0.2275, "step": 7840 }, { "epoch": 1.183476556610885, "grad_norm": 2.9734973907470703, "learning_rate": 7.58283395535005e-06, "loss": 0.2298, "step": 7850 }, { "epoch": 1.1849841700587969, "grad_norm": 2.664593458175659, "learning_rate": 7.575319853563992e-06, "loss": 0.2105, "step": 7860 }, { "epoch": 1.1864917835067088, "grad_norm": 2.428028106689453, "learning_rate": 7.567797826981101e-06, "loss": 0.2226, "step": 7870 }, { "epoch": 1.1879993969546208, "grad_norm": 4.391481876373291, "learning_rate": 7.560267898748226e-06, "loss": 0.1974, "step": 7880 }, { "epoch": 1.1895070104025327, "grad_norm": 2.6691741943359375, "learning_rate": 7.552730092036524e-06, "loss": 0.2628, "step": 7890 }, { "epoch": 1.1910146238504447, "grad_norm": 2.82861590385437, "learning_rate": 7.545184430041405e-06, "loss": 0.202, "step": 7900 }, { "epoch": 1.1925222372983568, "grad_norm": 2.821493148803711, "learning_rate": 7.537630935982443e-06, "loss": 0.2248, "step": 7910 }, { "epoch": 1.1940298507462686, "grad_norm": 3.690410614013672, "learning_rate": 7.530069633103317e-06, "loss": 0.225, "step": 7920 }, { "epoch": 1.1955374641941807, "grad_norm": 3.357448101043701, "learning_rate": 7.522500544671733e-06, "loss": 0.2506, "step": 7930 }, { "epoch": 1.1970450776420924, "grad_norm": 3.9041812419891357, "learning_rate": 7.514923693979359e-06, "loss": 0.1929, "step": 7940 }, { "epoch": 1.1985526910900046, "grad_norm": 2.8400979042053223, "learning_rate": 7.507339104341746e-06, "loss": 0.2258, "step": 7950 }, { "epoch": 1.2000603045379166, "grad_norm": 3.219043493270874, "learning_rate": 7.499746799098258e-06, "loss": 0.1919, "step": 7960 }, { "epoch": 1.2015679179858285, "grad_norm": 2.0987255573272705, "learning_rate": 7.492146801612004e-06, "loss": 0.2064, "step": 7970 }, { "epoch": 1.2030755314337405, "grad_norm": 4.356513977050781, "learning_rate": 7.4845391352697625e-06, "loss": 0.1947, "step": 7980 }, { "epoch": 1.2045831448816524, "grad_norm": 3.298701524734497, "learning_rate": 7.476923823481912e-06, "loss": 0.316, "step": 7990 }, { "epoch": 1.2060907583295644, "grad_norm": 2.9225351810455322, "learning_rate": 7.469300889682355e-06, "loss": 0.1905, "step": 8000 }, { "epoch": 1.2075983717774763, "grad_norm": 2.967430591583252, "learning_rate": 7.461670357328452e-06, "loss": 0.2031, "step": 8010 }, { "epoch": 1.2091059852253883, "grad_norm": 2.873541831970215, "learning_rate": 7.454032249900945e-06, "loss": 0.226, "step": 8020 }, { "epoch": 1.2106135986733002, "grad_norm": 2.933043956756592, "learning_rate": 7.446386590903881e-06, "loss": 0.2291, "step": 8030 }, { "epoch": 1.2121212121212122, "grad_norm": 3.1239023208618164, "learning_rate": 7.438733403864553e-06, "loss": 0.2046, "step": 8040 }, { "epoch": 1.213628825569124, "grad_norm": 1.9577778577804565, "learning_rate": 7.431072712333415e-06, "loss": 0.2007, "step": 8050 }, { "epoch": 1.215136439017036, "grad_norm": 3.1991846561431885, "learning_rate": 7.423404539884011e-06, "loss": 0.2231, "step": 8060 }, { "epoch": 1.216644052464948, "grad_norm": 3.1039211750030518, "learning_rate": 7.41572891011291e-06, "loss": 0.2237, "step": 8070 }, { "epoch": 1.21815166591286, "grad_norm": 2.3894362449645996, "learning_rate": 7.408045846639629e-06, "loss": 0.2142, "step": 8080 }, { "epoch": 1.219659279360772, "grad_norm": 2.1897590160369873, "learning_rate": 7.400355373106556e-06, "loss": 0.2187, "step": 8090 }, { "epoch": 1.2211668928086838, "grad_norm": 2.123955726623535, "learning_rate": 7.392657513178885e-06, "loss": 0.2303, "step": 8100 }, { "epoch": 1.2226745062565958, "grad_norm": 3.3568975925445557, "learning_rate": 7.384952290544537e-06, "loss": 0.212, "step": 8110 }, { "epoch": 1.2241821197045077, "grad_norm": 5.481867790222168, "learning_rate": 7.377239728914092e-06, "loss": 0.2053, "step": 8120 }, { "epoch": 1.2256897331524197, "grad_norm": 2.1676690578460693, "learning_rate": 7.369519852020712e-06, "loss": 0.2137, "step": 8130 }, { "epoch": 1.2271973466003316, "grad_norm": 5.490156650543213, "learning_rate": 7.361792683620067e-06, "loss": 0.1898, "step": 8140 }, { "epoch": 1.2287049600482436, "grad_norm": 4.136428356170654, "learning_rate": 7.3540582474902714e-06, "loss": 0.2285, "step": 8150 }, { "epoch": 1.2302125734961555, "grad_norm": 4.89162540435791, "learning_rate": 7.346316567431797e-06, "loss": 0.2311, "step": 8160 }, { "epoch": 1.2317201869440675, "grad_norm": 2.855414390563965, "learning_rate": 7.338567667267411e-06, "loss": 0.1979, "step": 8170 }, { "epoch": 1.2332278003919794, "grad_norm": 1.8300881385803223, "learning_rate": 7.330811570842098e-06, "loss": 0.2482, "step": 8180 }, { "epoch": 1.2347354138398914, "grad_norm": 2.941772699356079, "learning_rate": 7.3230483020229835e-06, "loss": 0.2007, "step": 8190 }, { "epoch": 1.2362430272878033, "grad_norm": 2.8991847038269043, "learning_rate": 7.315277884699267e-06, "loss": 0.2243, "step": 8200 }, { "epoch": 1.2377506407357153, "grad_norm": 3.066657066345215, "learning_rate": 7.307500342782146e-06, "loss": 0.2167, "step": 8210 }, { "epoch": 1.2392582541836272, "grad_norm": 2.737740993499756, "learning_rate": 7.299715700204739e-06, "loss": 0.2025, "step": 8220 }, { "epoch": 1.2407658676315392, "grad_norm": 2.4988322257995605, "learning_rate": 7.291923980922018e-06, "loss": 0.2782, "step": 8230 }, { "epoch": 1.2422734810794513, "grad_norm": 3.019148588180542, "learning_rate": 7.284125208910728e-06, "loss": 0.2117, "step": 8240 }, { "epoch": 1.243781094527363, "grad_norm": 5.731674671173096, "learning_rate": 7.27631940816932e-06, "loss": 0.2761, "step": 8250 }, { "epoch": 1.2452887079752752, "grad_norm": 2.0630712509155273, "learning_rate": 7.268506602717874e-06, "loss": 0.1889, "step": 8260 }, { "epoch": 1.246796321423187, "grad_norm": 4.0968217849731445, "learning_rate": 7.260686816598019e-06, "loss": 0.2475, "step": 8270 }, { "epoch": 1.2483039348710991, "grad_norm": 3.395163059234619, "learning_rate": 7.252860073872874e-06, "loss": 0.2659, "step": 8280 }, { "epoch": 1.249811548319011, "grad_norm": 3.276055335998535, "learning_rate": 7.245026398626959e-06, "loss": 0.2432, "step": 8290 }, { "epoch": 1.251319161766923, "grad_norm": 3.1735739707946777, "learning_rate": 7.237185814966125e-06, "loss": 0.2283, "step": 8300 }, { "epoch": 1.252826775214835, "grad_norm": 2.786041021347046, "learning_rate": 7.229338347017489e-06, "loss": 0.2629, "step": 8310 }, { "epoch": 1.254334388662747, "grad_norm": 2.5890464782714844, "learning_rate": 7.221484018929346e-06, "loss": 0.198, "step": 8320 }, { "epoch": 1.2558420021106589, "grad_norm": 2.7281830310821533, "learning_rate": 7.213622854871104e-06, "loss": 0.216, "step": 8330 }, { "epoch": 1.2573496155585708, "grad_norm": 3.4709675312042236, "learning_rate": 7.205754879033205e-06, "loss": 0.2894, "step": 8340 }, { "epoch": 1.2588572290064828, "grad_norm": 2.7679717540740967, "learning_rate": 7.197880115627056e-06, "loss": 0.1964, "step": 8350 }, { "epoch": 1.2603648424543947, "grad_norm": 7.3572306632995605, "learning_rate": 7.189998588884943e-06, "loss": 0.3104, "step": 8360 }, { "epoch": 1.2618724559023067, "grad_norm": 2.1700196266174316, "learning_rate": 7.182110323059974e-06, "loss": 0.2292, "step": 8370 }, { "epoch": 1.2633800693502186, "grad_norm": 2.858429431915283, "learning_rate": 7.174215342425989e-06, "loss": 0.2456, "step": 8380 }, { "epoch": 1.2648876827981306, "grad_norm": 2.4983880519866943, "learning_rate": 7.166313671277488e-06, "loss": 0.2081, "step": 8390 }, { "epoch": 1.2663952962460425, "grad_norm": 2.5511984825134277, "learning_rate": 7.158405333929566e-06, "loss": 0.3031, "step": 8400 }, { "epoch": 1.2679029096939545, "grad_norm": 2.9711878299713135, "learning_rate": 7.150490354717827e-06, "loss": 0.1959, "step": 8410 }, { "epoch": 1.2694105231418664, "grad_norm": 3.398319959640503, "learning_rate": 7.142568757998316e-06, "loss": 0.2224, "step": 8420 }, { "epoch": 1.2709181365897784, "grad_norm": 2.8604140281677246, "learning_rate": 7.134640568147437e-06, "loss": 0.2116, "step": 8430 }, { "epoch": 1.2724257500376903, "grad_norm": 3.26908540725708, "learning_rate": 7.126705809561888e-06, "loss": 0.2501, "step": 8440 }, { "epoch": 1.2739333634856023, "grad_norm": 3.9943041801452637, "learning_rate": 7.118764506658575e-06, "loss": 0.2524, "step": 8450 }, { "epoch": 1.2754409769335142, "grad_norm": 2.96848201751709, "learning_rate": 7.110816683874548e-06, "loss": 0.2062, "step": 8460 }, { "epoch": 1.2769485903814262, "grad_norm": 2.540069103240967, "learning_rate": 7.102862365666916e-06, "loss": 0.2059, "step": 8470 }, { "epoch": 1.2784562038293381, "grad_norm": 2.9394333362579346, "learning_rate": 7.0949015765127745e-06, "loss": 0.206, "step": 8480 }, { "epoch": 1.27996381727725, "grad_norm": 3.5569851398468018, "learning_rate": 7.086934340909135e-06, "loss": 0.1911, "step": 8490 }, { "epoch": 1.281471430725162, "grad_norm": 2.69736647605896, "learning_rate": 7.078960683372847e-06, "loss": 0.2441, "step": 8500 }, { "epoch": 1.282979044173074, "grad_norm": 2.3172683715820312, "learning_rate": 7.070980628440517e-06, "loss": 0.206, "step": 8510 }, { "epoch": 1.284486657620986, "grad_norm": 3.1582236289978027, "learning_rate": 7.0629942006684395e-06, "loss": 0.2163, "step": 8520 }, { "epoch": 1.2859942710688979, "grad_norm": 3.9475295543670654, "learning_rate": 7.055001424632521e-06, "loss": 0.2033, "step": 8530 }, { "epoch": 1.2875018845168098, "grad_norm": 2.5940451622009277, "learning_rate": 7.047002324928202e-06, "loss": 0.2115, "step": 8540 }, { "epoch": 1.289009497964722, "grad_norm": 2.57807993888855, "learning_rate": 7.038996926170383e-06, "loss": 0.2002, "step": 8550 }, { "epoch": 1.2905171114126337, "grad_norm": 2.735013961791992, "learning_rate": 7.030985252993347e-06, "loss": 0.2016, "step": 8560 }, { "epoch": 1.2920247248605459, "grad_norm": 2.3615503311157227, "learning_rate": 7.022967330050683e-06, "loss": 0.2276, "step": 8570 }, { "epoch": 1.2935323383084576, "grad_norm": 2.6498122215270996, "learning_rate": 7.014943182015216e-06, "loss": 0.2472, "step": 8580 }, { "epoch": 1.2950399517563698, "grad_norm": 2.4924991130828857, "learning_rate": 7.006912833578925e-06, "loss": 0.2302, "step": 8590 }, { "epoch": 1.2965475652042815, "grad_norm": 2.772217035293579, "learning_rate": 6.998876309452866e-06, "loss": 0.2078, "step": 8600 }, { "epoch": 1.2980551786521937, "grad_norm": 3.531331777572632, "learning_rate": 6.990833634367102e-06, "loss": 0.2391, "step": 8610 }, { "epoch": 1.2995627921001056, "grad_norm": 4.218138217926025, "learning_rate": 6.982784833070626e-06, "loss": 0.257, "step": 8620 }, { "epoch": 1.3010704055480176, "grad_norm": 1.879697322845459, "learning_rate": 6.974729930331277e-06, "loss": 0.1746, "step": 8630 }, { "epoch": 1.3025780189959295, "grad_norm": 2.4804651737213135, "learning_rate": 6.966668950935676e-06, "loss": 0.2009, "step": 8640 }, { "epoch": 1.3040856324438415, "grad_norm": 4.29323673248291, "learning_rate": 6.958601919689134e-06, "loss": 0.1914, "step": 8650 }, { "epoch": 1.3055932458917534, "grad_norm": 2.097311496734619, "learning_rate": 6.950528861415594e-06, "loss": 0.2207, "step": 8660 }, { "epoch": 1.3071008593396654, "grad_norm": 2.8585970401763916, "learning_rate": 6.942449800957543e-06, "loss": 0.2513, "step": 8670 }, { "epoch": 1.3086084727875773, "grad_norm": 2.6370959281921387, "learning_rate": 6.9343647631759315e-06, "loss": 0.2176, "step": 8680 }, { "epoch": 1.3101160862354893, "grad_norm": 2.4373223781585693, "learning_rate": 6.926273772950114e-06, "loss": 0.2409, "step": 8690 }, { "epoch": 1.3116236996834012, "grad_norm": 3.004948854446411, "learning_rate": 6.918176855177754e-06, "loss": 0.2259, "step": 8700 }, { "epoch": 1.3131313131313131, "grad_norm": 2.9051156044006348, "learning_rate": 6.910074034774757e-06, "loss": 0.2066, "step": 8710 }, { "epoch": 1.314638926579225, "grad_norm": 3.8995919227600098, "learning_rate": 6.901965336675195e-06, "loss": 0.2756, "step": 8720 }, { "epoch": 1.316146540027137, "grad_norm": 2.220339298248291, "learning_rate": 6.8938507858312255e-06, "loss": 0.2008, "step": 8730 }, { "epoch": 1.317654153475049, "grad_norm": 1.9448142051696777, "learning_rate": 6.885730407213012e-06, "loss": 0.1949, "step": 8740 }, { "epoch": 1.319161766922961, "grad_norm": 3.6238343715667725, "learning_rate": 6.877604225808658e-06, "loss": 0.1779, "step": 8750 }, { "epoch": 1.320669380370873, "grad_norm": 3.377504825592041, "learning_rate": 6.869472266624118e-06, "loss": 0.2325, "step": 8760 }, { "epoch": 1.3221769938187848, "grad_norm": 2.088999032974243, "learning_rate": 6.86133455468313e-06, "loss": 0.2184, "step": 8770 }, { "epoch": 1.3236846072666968, "grad_norm": 2.0817081928253174, "learning_rate": 6.85319111502713e-06, "loss": 0.2328, "step": 8780 }, { "epoch": 1.3251922207146087, "grad_norm": 2.22271990776062, "learning_rate": 6.845041972715186e-06, "loss": 0.1727, "step": 8790 }, { "epoch": 1.3266998341625207, "grad_norm": 2.9864983558654785, "learning_rate": 6.836887152823905e-06, "loss": 0.1845, "step": 8800 }, { "epoch": 1.3282074476104326, "grad_norm": 2.2376010417938232, "learning_rate": 6.8287266804473754e-06, "loss": 0.188, "step": 8810 }, { "epoch": 1.3297150610583446, "grad_norm": 3.306100606918335, "learning_rate": 6.820560580697071e-06, "loss": 0.2099, "step": 8820 }, { "epoch": 1.3312226745062565, "grad_norm": 3.6137495040893555, "learning_rate": 6.812388878701787e-06, "loss": 0.214, "step": 8830 }, { "epoch": 1.3327302879541685, "grad_norm": 2.941082000732422, "learning_rate": 6.804211599607554e-06, "loss": 0.2052, "step": 8840 }, { "epoch": 1.3342379014020804, "grad_norm": 1.865797519683838, "learning_rate": 6.7960287685775714e-06, "loss": 0.249, "step": 8850 }, { "epoch": 1.3357455148499926, "grad_norm": 2.461007833480835, "learning_rate": 6.787840410792115e-06, "loss": 0.265, "step": 8860 }, { "epoch": 1.3372531282979043, "grad_norm": 3.158944845199585, "learning_rate": 6.779646551448471e-06, "loss": 0.2156, "step": 8870 }, { "epoch": 1.3387607417458165, "grad_norm": 3.4301722049713135, "learning_rate": 6.771447215760858e-06, "loss": 0.1921, "step": 8880 }, { "epoch": 1.3402683551937282, "grad_norm": 2.937448501586914, "learning_rate": 6.763242428960341e-06, "loss": 0.2199, "step": 8890 }, { "epoch": 1.3417759686416404, "grad_norm": 3.4282402992248535, "learning_rate": 6.755032216294762e-06, "loss": 0.2286, "step": 8900 }, { "epoch": 1.3432835820895521, "grad_norm": 2.408128499984741, "learning_rate": 6.746816603028662e-06, "loss": 0.2259, "step": 8910 }, { "epoch": 1.3447911955374643, "grad_norm": 2.1112146377563477, "learning_rate": 6.738595614443195e-06, "loss": 0.182, "step": 8920 }, { "epoch": 1.346298808985376, "grad_norm": 3.07350754737854, "learning_rate": 6.730369275836062e-06, "loss": 0.2337, "step": 8930 }, { "epoch": 1.3478064224332882, "grad_norm": 2.5887691974639893, "learning_rate": 6.722137612521423e-06, "loss": 0.2422, "step": 8940 }, { "epoch": 1.3493140358812001, "grad_norm": 3.7940030097961426, "learning_rate": 6.713900649829823e-06, "loss": 0.2016, "step": 8950 }, { "epoch": 1.350821649329112, "grad_norm": 2.0362460613250732, "learning_rate": 6.7056584131081185e-06, "loss": 0.2658, "step": 8960 }, { "epoch": 1.352329262777024, "grad_norm": 3.2314605712890625, "learning_rate": 6.697410927719391e-06, "loss": 0.1855, "step": 8970 }, { "epoch": 1.353836876224936, "grad_norm": 2.6266448497772217, "learning_rate": 6.689158219042875e-06, "loss": 0.2405, "step": 8980 }, { "epoch": 1.355344489672848, "grad_norm": 3.6971042156219482, "learning_rate": 6.680900312473877e-06, "loss": 0.1721, "step": 8990 }, { "epoch": 1.3568521031207599, "grad_norm": 2.2129275798797607, "learning_rate": 6.672637233423703e-06, "loss": 0.2264, "step": 9000 }, { "epoch": 1.3583597165686718, "grad_norm": 3.0593392848968506, "learning_rate": 6.664369007319569e-06, "loss": 0.1839, "step": 9010 }, { "epoch": 1.3598673300165838, "grad_norm": 3.6365883350372314, "learning_rate": 6.656095659604533e-06, "loss": 0.2919, "step": 9020 }, { "epoch": 1.3613749434644957, "grad_norm": 2.0104966163635254, "learning_rate": 6.6478172157374154e-06, "loss": 0.2426, "step": 9030 }, { "epoch": 1.3628825569124077, "grad_norm": 1.9104632139205933, "learning_rate": 6.6395337011927134e-06, "loss": 0.193, "step": 9040 }, { "epoch": 1.3643901703603196, "grad_norm": 2.984358310699463, "learning_rate": 6.631245141460531e-06, "loss": 0.191, "step": 9050 }, { "epoch": 1.3658977838082316, "grad_norm": 2.6253223419189453, "learning_rate": 6.6229515620465e-06, "loss": 0.2157, "step": 9060 }, { "epoch": 1.3674053972561435, "grad_norm": 3.8765869140625, "learning_rate": 6.614652988471694e-06, "loss": 0.2173, "step": 9070 }, { "epoch": 1.3689130107040555, "grad_norm": 3.161318778991699, "learning_rate": 6.606349446272553e-06, "loss": 0.2796, "step": 9080 }, { "epoch": 1.3704206241519674, "grad_norm": 2.2424557209014893, "learning_rate": 6.598040961000815e-06, "loss": 0.1952, "step": 9090 }, { "epoch": 1.3719282375998794, "grad_norm": 2.900841474533081, "learning_rate": 6.589727558223421e-06, "loss": 0.1961, "step": 9100 }, { "epoch": 1.3734358510477913, "grad_norm": 2.4227395057678223, "learning_rate": 6.581409263522447e-06, "loss": 0.1958, "step": 9110 }, { "epoch": 1.3749434644957033, "grad_norm": 2.821117877960205, "learning_rate": 6.573086102495023e-06, "loss": 0.2411, "step": 9120 }, { "epoch": 1.3764510779436152, "grad_norm": 2.152346611022949, "learning_rate": 6.564758100753253e-06, "loss": 0.2822, "step": 9130 }, { "epoch": 1.3779586913915272, "grad_norm": 1.7358295917510986, "learning_rate": 6.5564252839241375e-06, "loss": 0.2256, "step": 9140 }, { "epoch": 1.379466304839439, "grad_norm": 2.944204568862915, "learning_rate": 6.548087677649491e-06, "loss": 0.2433, "step": 9150 }, { "epoch": 1.380973918287351, "grad_norm": 2.7156269550323486, "learning_rate": 6.539745307585872e-06, "loss": 0.2333, "step": 9160 }, { "epoch": 1.382481531735263, "grad_norm": 2.596701145172119, "learning_rate": 6.531398199404493e-06, "loss": 0.1947, "step": 9170 }, { "epoch": 1.383989145183175, "grad_norm": 2.706632614135742, "learning_rate": 6.523046378791147e-06, "loss": 0.2498, "step": 9180 }, { "epoch": 1.3854967586310871, "grad_norm": 3.3823888301849365, "learning_rate": 6.514689871446133e-06, "loss": 0.1681, "step": 9190 }, { "epoch": 1.3870043720789988, "grad_norm": 2.7610626220703125, "learning_rate": 6.5063287030841674e-06, "loss": 0.2189, "step": 9200 }, { "epoch": 1.388511985526911, "grad_norm": 2.0381906032562256, "learning_rate": 6.497962899434309e-06, "loss": 0.2036, "step": 9210 }, { "epoch": 1.3900195989748227, "grad_norm": 2.066706657409668, "learning_rate": 6.489592486239884e-06, "loss": 0.2177, "step": 9220 }, { "epoch": 1.391527212422735, "grad_norm": 2.356865167617798, "learning_rate": 6.481217489258402e-06, "loss": 0.2655, "step": 9230 }, { "epoch": 1.3930348258706466, "grad_norm": 2.583953380584717, "learning_rate": 6.472837934261475e-06, "loss": 0.28, "step": 9240 }, { "epoch": 1.3945424393185588, "grad_norm": 3.329251527786255, "learning_rate": 6.464453847034743e-06, "loss": 0.1503, "step": 9250 }, { "epoch": 1.3960500527664705, "grad_norm": 2.956700086593628, "learning_rate": 6.4560652533777944e-06, "loss": 0.2579, "step": 9260 }, { "epoch": 1.3975576662143827, "grad_norm": 2.4820573329925537, "learning_rate": 6.447672179104083e-06, "loss": 0.2429, "step": 9270 }, { "epoch": 1.3990652796622947, "grad_norm": 1.5425758361816406, "learning_rate": 6.4392746500408495e-06, "loss": 0.1695, "step": 9280 }, { "epoch": 1.4005728931102066, "grad_norm": 1.9509296417236328, "learning_rate": 6.430872692029046e-06, "loss": 0.2564, "step": 9290 }, { "epoch": 1.4020805065581186, "grad_norm": 2.492788314819336, "learning_rate": 6.422466330923247e-06, "loss": 0.2514, "step": 9300 }, { "epoch": 1.4035881200060305, "grad_norm": 2.5514938831329346, "learning_rate": 6.414055592591585e-06, "loss": 0.2177, "step": 9310 }, { "epoch": 1.4050957334539425, "grad_norm": 3.2965357303619385, "learning_rate": 6.405640502915658e-06, "loss": 0.1806, "step": 9320 }, { "epoch": 1.4066033469018544, "grad_norm": 2.9107489585876465, "learning_rate": 6.397221087790448e-06, "loss": 0.2051, "step": 9330 }, { "epoch": 1.4081109603497663, "grad_norm": 4.705292224884033, "learning_rate": 6.388797373124259e-06, "loss": 0.2122, "step": 9340 }, { "epoch": 1.4096185737976783, "grad_norm": 2.1875317096710205, "learning_rate": 6.380369384838618e-06, "loss": 0.203, "step": 9350 }, { "epoch": 1.4111261872455902, "grad_norm": 1.9471478462219238, "learning_rate": 6.371937148868203e-06, "loss": 0.2076, "step": 9360 }, { "epoch": 1.4126338006935022, "grad_norm": 3.7997817993164062, "learning_rate": 6.363500691160768e-06, "loss": 0.2149, "step": 9370 }, { "epoch": 1.4141414141414141, "grad_norm": 3.8854000568389893, "learning_rate": 6.355060037677049e-06, "loss": 0.2297, "step": 9380 }, { "epoch": 1.415649027589326, "grad_norm": 3.1625874042510986, "learning_rate": 6.346615214390703e-06, "loss": 0.2027, "step": 9390 }, { "epoch": 1.417156641037238, "grad_norm": 3.26188588142395, "learning_rate": 6.338166247288213e-06, "loss": 0.213, "step": 9400 }, { "epoch": 1.41866425448515, "grad_norm": 2.380507469177246, "learning_rate": 6.329713162368816e-06, "loss": 0.2559, "step": 9410 }, { "epoch": 1.420171867933062, "grad_norm": 2.3469531536102295, "learning_rate": 6.3212559856444166e-06, "loss": 0.1919, "step": 9420 }, { "epoch": 1.4216794813809739, "grad_norm": 2.297518014907837, "learning_rate": 6.312794743139516e-06, "loss": 0.2778, "step": 9430 }, { "epoch": 1.4231870948288858, "grad_norm": 3.7566170692443848, "learning_rate": 6.30432946089112e-06, "loss": 0.2122, "step": 9440 }, { "epoch": 1.4246947082767978, "grad_norm": 3.086963653564453, "learning_rate": 6.295860164948673e-06, "loss": 0.1983, "step": 9450 }, { "epoch": 1.4262023217247097, "grad_norm": 2.8482837677001953, "learning_rate": 6.2873868813739645e-06, "loss": 0.2022, "step": 9460 }, { "epoch": 1.4277099351726217, "grad_norm": 2.4557838439941406, "learning_rate": 6.278909636241059e-06, "loss": 0.2181, "step": 9470 }, { "epoch": 1.4292175486205336, "grad_norm": 2.2392547130584717, "learning_rate": 6.270428455636207e-06, "loss": 0.1576, "step": 9480 }, { "epoch": 1.4307251620684456, "grad_norm": 2.3875930309295654, "learning_rate": 6.261943365657774e-06, "loss": 0.1913, "step": 9490 }, { "epoch": 1.4322327755163575, "grad_norm": 2.372875213623047, "learning_rate": 6.2534543924161524e-06, "loss": 0.2758, "step": 9500 }, { "epoch": 1.4337403889642695, "grad_norm": 2.9019930362701416, "learning_rate": 6.244961562033685e-06, "loss": 0.2189, "step": 9510 }, { "epoch": 1.4352480024121816, "grad_norm": 3.1032378673553467, "learning_rate": 6.236464900644586e-06, "loss": 0.285, "step": 9520 }, { "epoch": 1.4367556158600934, "grad_norm": 1.9693683385849, "learning_rate": 6.227964434394852e-06, "loss": 0.182, "step": 9530 }, { "epoch": 1.4382632293080055, "grad_norm": 6.724595069885254, "learning_rate": 6.219460189442195e-06, "loss": 0.2178, "step": 9540 }, { "epoch": 1.4397708427559173, "grad_norm": 2.745851993560791, "learning_rate": 6.210952191955953e-06, "loss": 0.2135, "step": 9550 }, { "epoch": 1.4412784562038294, "grad_norm": 3.5638883113861084, "learning_rate": 6.202440468117008e-06, "loss": 0.2263, "step": 9560 }, { "epoch": 1.4427860696517412, "grad_norm": 2.4637179374694824, "learning_rate": 6.193925044117712e-06, "loss": 0.2113, "step": 9570 }, { "epoch": 1.4442936830996533, "grad_norm": 2.313713312149048, "learning_rate": 6.185405946161806e-06, "loss": 0.2325, "step": 9580 }, { "epoch": 1.4458012965475653, "grad_norm": 3.367506980895996, "learning_rate": 6.176883200464327e-06, "loss": 0.1933, "step": 9590 }, { "epoch": 1.4473089099954772, "grad_norm": 2.251965045928955, "learning_rate": 6.168356833251546e-06, "loss": 0.1633, "step": 9600 }, { "epoch": 1.4488165234433892, "grad_norm": 2.7979564666748047, "learning_rate": 6.159826870760875e-06, "loss": 0.2039, "step": 9610 }, { "epoch": 1.4503241368913011, "grad_norm": 5.096176624298096, "learning_rate": 6.151293339240788e-06, "loss": 0.2198, "step": 9620 }, { "epoch": 1.451831750339213, "grad_norm": 2.8948352336883545, "learning_rate": 6.142756264950744e-06, "loss": 0.2214, "step": 9630 }, { "epoch": 1.453339363787125, "grad_norm": 2.3739264011383057, "learning_rate": 6.1342156741611035e-06, "loss": 0.2463, "step": 9640 }, { "epoch": 1.454846977235037, "grad_norm": 2.689268112182617, "learning_rate": 6.125671593153046e-06, "loss": 0.1881, "step": 9650 }, { "epoch": 1.456354590682949, "grad_norm": 2.929445266723633, "learning_rate": 6.117124048218492e-06, "loss": 0.2117, "step": 9660 }, { "epoch": 1.4578622041308609, "grad_norm": 3.631105422973633, "learning_rate": 6.108573065660025e-06, "loss": 0.1628, "step": 9670 }, { "epoch": 1.4593698175787728, "grad_norm": 2.8038947582244873, "learning_rate": 6.100018671790801e-06, "loss": 0.2256, "step": 9680 }, { "epoch": 1.4608774310266848, "grad_norm": 3.9210457801818848, "learning_rate": 6.091460892934474e-06, "loss": 0.2165, "step": 9690 }, { "epoch": 1.4623850444745967, "grad_norm": 2.053070068359375, "learning_rate": 6.08289975542512e-06, "loss": 0.1695, "step": 9700 }, { "epoch": 1.4638926579225087, "grad_norm": 3.8655450344085693, "learning_rate": 6.074335285607144e-06, "loss": 0.2731, "step": 9710 }, { "epoch": 1.4654002713704206, "grad_norm": 3.4523849487304688, "learning_rate": 6.065767509835209e-06, "loss": 0.2202, "step": 9720 }, { "epoch": 1.4669078848183326, "grad_norm": 2.0000720024108887, "learning_rate": 6.057196454474148e-06, "loss": 0.2105, "step": 9730 }, { "epoch": 1.4684154982662445, "grad_norm": 2.607348918914795, "learning_rate": 6.0486221458988885e-06, "loss": 0.1832, "step": 9740 }, { "epoch": 1.4699231117141565, "grad_norm": 2.058584690093994, "learning_rate": 6.040044610494369e-06, "loss": 0.2241, "step": 9750 }, { "epoch": 1.4714307251620684, "grad_norm": 2.495776414871216, "learning_rate": 6.031463874655455e-06, "loss": 0.1754, "step": 9760 }, { "epoch": 1.4729383386099804, "grad_norm": 1.8608089685440063, "learning_rate": 6.022879964786863e-06, "loss": 0.2123, "step": 9770 }, { "epoch": 1.4744459520578923, "grad_norm": 3.2169888019561768, "learning_rate": 6.014292907303077e-06, "loss": 0.2414, "step": 9780 }, { "epoch": 1.4759535655058043, "grad_norm": 4.854318618774414, "learning_rate": 6.005702728628262e-06, "loss": 0.1977, "step": 9790 }, { "epoch": 1.4774611789537162, "grad_norm": 2.5517358779907227, "learning_rate": 5.997109455196194e-06, "loss": 0.1952, "step": 9800 }, { "epoch": 1.4789687924016282, "grad_norm": 2.8208670616149902, "learning_rate": 5.988513113450169e-06, "loss": 0.221, "step": 9810 }, { "epoch": 1.48047640584954, "grad_norm": 1.955098271369934, "learning_rate": 5.979913729842924e-06, "loss": 0.1986, "step": 9820 }, { "epoch": 1.4819840192974523, "grad_norm": 4.356140613555908, "learning_rate": 5.97131133083656e-06, "loss": 0.2015, "step": 9830 }, { "epoch": 1.483491632745364, "grad_norm": 2.6684792041778564, "learning_rate": 5.962705942902453e-06, "loss": 0.2274, "step": 9840 }, { "epoch": 1.4849992461932762, "grad_norm": 2.4632065296173096, "learning_rate": 5.954097592521176e-06, "loss": 0.1917, "step": 9850 }, { "epoch": 1.486506859641188, "grad_norm": 2.4304165840148926, "learning_rate": 5.945486306182424e-06, "loss": 0.2441, "step": 9860 }, { "epoch": 1.4880144730891, "grad_norm": 2.5911800861358643, "learning_rate": 5.936872110384921e-06, "loss": 0.2081, "step": 9870 }, { "epoch": 1.4895220865370118, "grad_norm": 3.9193522930145264, "learning_rate": 5.928255031636347e-06, "loss": 0.2285, "step": 9880 }, { "epoch": 1.491029699984924, "grad_norm": 4.857326507568359, "learning_rate": 5.91963509645325e-06, "loss": 0.2405, "step": 9890 }, { "epoch": 1.4925373134328357, "grad_norm": 2.1035478115081787, "learning_rate": 5.911012331360974e-06, "loss": 0.2046, "step": 9900 }, { "epoch": 1.4940449268807479, "grad_norm": 2.5527045726776123, "learning_rate": 5.902386762893562e-06, "loss": 0.2006, "step": 9910 }, { "epoch": 1.4955525403286598, "grad_norm": 2.6492323875427246, "learning_rate": 5.893758417593693e-06, "loss": 0.1838, "step": 9920 }, { "epoch": 1.4970601537765718, "grad_norm": 2.4303665161132812, "learning_rate": 5.885127322012586e-06, "loss": 0.2674, "step": 9930 }, { "epoch": 1.4985677672244837, "grad_norm": 2.340404510498047, "learning_rate": 5.876493502709924e-06, "loss": 0.2592, "step": 9940 }, { "epoch": 1.5000753806723957, "grad_norm": 3.8629953861236572, "learning_rate": 5.867856986253769e-06, "loss": 0.1975, "step": 9950 }, { "epoch": 1.5015829941203076, "grad_norm": 2.635892629623413, "learning_rate": 5.859217799220489e-06, "loss": 0.2168, "step": 9960 }, { "epoch": 1.5030906075682195, "grad_norm": 3.7080371379852295, "learning_rate": 5.850575968194664e-06, "loss": 0.2654, "step": 9970 }, { "epoch": 1.5045982210161315, "grad_norm": 2.83652925491333, "learning_rate": 5.8419315197690115e-06, "loss": 0.2518, "step": 9980 }, { "epoch": 1.5061058344640434, "grad_norm": 3.308972120285034, "learning_rate": 5.833284480544307e-06, "loss": 0.2036, "step": 9990 }, { "epoch": 1.5076134479119554, "grad_norm": 3.187816619873047, "learning_rate": 5.824634877129291e-06, "loss": 0.2405, "step": 10000 }, { "epoch": 1.5091210613598673, "grad_norm": 2.5466272830963135, "learning_rate": 5.815982736140603e-06, "loss": 0.1845, "step": 10010 }, { "epoch": 1.5106286748077793, "grad_norm": 2.0828938484191895, "learning_rate": 5.807328084202686e-06, "loss": 0.1648, "step": 10020 }, { "epoch": 1.5121362882556912, "grad_norm": 2.521336317062378, "learning_rate": 5.7986709479477086e-06, "loss": 0.2018, "step": 10030 }, { "epoch": 1.5136439017036032, "grad_norm": 3.714402675628662, "learning_rate": 5.790011354015489e-06, "loss": 0.2483, "step": 10040 }, { "epoch": 1.5151515151515151, "grad_norm": 2.7911055088043213, "learning_rate": 5.781349329053405e-06, "loss": 0.2042, "step": 10050 }, { "epoch": 1.516659128599427, "grad_norm": 2.7285470962524414, "learning_rate": 5.772684899716314e-06, "loss": 0.2284, "step": 10060 }, { "epoch": 1.518166742047339, "grad_norm": 2.7233762741088867, "learning_rate": 5.764018092666477e-06, "loss": 0.2581, "step": 10070 }, { "epoch": 1.519674355495251, "grad_norm": 3.263014316558838, "learning_rate": 5.7553489345734655e-06, "loss": 0.2032, "step": 10080 }, { "epoch": 1.521181968943163, "grad_norm": 2.5660572052001953, "learning_rate": 5.7466774521140914e-06, "loss": 0.2343, "step": 10090 }, { "epoch": 1.5226895823910749, "grad_norm": 2.905480146408081, "learning_rate": 5.7380036719723144e-06, "loss": 0.2215, "step": 10100 }, { "epoch": 1.5241971958389868, "grad_norm": 3.6878092288970947, "learning_rate": 5.729327620839169e-06, "loss": 0.2084, "step": 10110 }, { "epoch": 1.525704809286899, "grad_norm": 3.8649849891662598, "learning_rate": 5.720649325412673e-06, "loss": 0.2259, "step": 10120 }, { "epoch": 1.5272124227348107, "grad_norm": 2.1345512866973877, "learning_rate": 5.711968812397755e-06, "loss": 0.2288, "step": 10130 }, { "epoch": 1.528720036182723, "grad_norm": 3.8009471893310547, "learning_rate": 5.7032861085061675e-06, "loss": 0.1972, "step": 10140 }, { "epoch": 1.5302276496306346, "grad_norm": 3.5313949584960938, "learning_rate": 5.694601240456399e-06, "loss": 0.1815, "step": 10150 }, { "epoch": 1.5317352630785468, "grad_norm": 2.2386393547058105, "learning_rate": 5.685914234973604e-06, "loss": 0.2031, "step": 10160 }, { "epoch": 1.5332428765264585, "grad_norm": 3.850325345993042, "learning_rate": 5.677225118789511e-06, "loss": 0.2141, "step": 10170 }, { "epoch": 1.5347504899743707, "grad_norm": 3.5715975761413574, "learning_rate": 5.668533918642347e-06, "loss": 0.2197, "step": 10180 }, { "epoch": 1.5362581034222824, "grad_norm": 3.1304280757904053, "learning_rate": 5.6598406612767464e-06, "loss": 0.2529, "step": 10190 }, { "epoch": 1.5377657168701946, "grad_norm": 4.868100166320801, "learning_rate": 5.651145373443677e-06, "loss": 0.2706, "step": 10200 }, { "epoch": 1.5392733303181063, "grad_norm": 1.8785432577133179, "learning_rate": 5.642448081900358e-06, "loss": 0.1984, "step": 10210 }, { "epoch": 1.5407809437660185, "grad_norm": 3.8038504123687744, "learning_rate": 5.633748813410167e-06, "loss": 0.1743, "step": 10220 }, { "epoch": 1.5422885572139302, "grad_norm": 2.944627285003662, "learning_rate": 5.625047594742571e-06, "loss": 0.2355, "step": 10230 }, { "epoch": 1.5437961706618424, "grad_norm": 3.0645830631256104, "learning_rate": 5.616344452673038e-06, "loss": 0.2019, "step": 10240 }, { "epoch": 1.545303784109754, "grad_norm": 2.658796548843384, "learning_rate": 5.6076394139829504e-06, "loss": 0.2662, "step": 10250 }, { "epoch": 1.5468113975576663, "grad_norm": 2.1324121952056885, "learning_rate": 5.598932505459531e-06, "loss": 0.1873, "step": 10260 }, { "epoch": 1.548319011005578, "grad_norm": 3.1280572414398193, "learning_rate": 5.590223753895755e-06, "loss": 0.2574, "step": 10270 }, { "epoch": 1.5498266244534902, "grad_norm": 2.9483163356781006, "learning_rate": 5.581513186090269e-06, "loss": 0.2103, "step": 10280 }, { "epoch": 1.5513342379014021, "grad_norm": 2.1132242679595947, "learning_rate": 5.572800828847308e-06, "loss": 0.2055, "step": 10290 }, { "epoch": 1.552841851349314, "grad_norm": 5.813694953918457, "learning_rate": 5.564086708976615e-06, "loss": 0.2394, "step": 10300 }, { "epoch": 1.554349464797226, "grad_norm": 6.263071537017822, "learning_rate": 5.555370853293357e-06, "loss": 0.1992, "step": 10310 }, { "epoch": 1.555857078245138, "grad_norm": 3.6203510761260986, "learning_rate": 5.546653288618038e-06, "loss": 0.2507, "step": 10320 }, { "epoch": 1.55736469169305, "grad_norm": 3.0470738410949707, "learning_rate": 5.537934041776427e-06, "loss": 0.216, "step": 10330 }, { "epoch": 1.5588723051409619, "grad_norm": 2.021218776702881, "learning_rate": 5.529213139599469e-06, "loss": 0.2021, "step": 10340 }, { "epoch": 1.5603799185888738, "grad_norm": 4.537146091461182, "learning_rate": 5.5204906089231945e-06, "loss": 0.2368, "step": 10350 }, { "epoch": 1.5618875320367858, "grad_norm": 2.9721572399139404, "learning_rate": 5.511766476588657e-06, "loss": 0.2313, "step": 10360 }, { "epoch": 1.5633951454846977, "grad_norm": 3.4381823539733887, "learning_rate": 5.503040769441831e-06, "loss": 0.2664, "step": 10370 }, { "epoch": 1.5649027589326097, "grad_norm": 3.1475419998168945, "learning_rate": 5.494313514333538e-06, "loss": 0.219, "step": 10380 }, { "epoch": 1.5664103723805216, "grad_norm": 1.6786974668502808, "learning_rate": 5.485584738119366e-06, "loss": 0.2322, "step": 10390 }, { "epoch": 1.5679179858284336, "grad_norm": 2.4191033840179443, "learning_rate": 5.4768544676595805e-06, "loss": 0.1998, "step": 10400 }, { "epoch": 1.5694255992763455, "grad_norm": 2.989999294281006, "learning_rate": 5.468122729819046e-06, "loss": 0.1872, "step": 10410 }, { "epoch": 1.5709332127242575, "grad_norm": 3.1854565143585205, "learning_rate": 5.4593895514671426e-06, "loss": 0.1541, "step": 10420 }, { "epoch": 1.5724408261721694, "grad_norm": 3.5412790775299072, "learning_rate": 5.450654959477686e-06, "loss": 0.2788, "step": 10430 }, { "epoch": 1.5739484396200814, "grad_norm": 3.106663465499878, "learning_rate": 5.441918980728836e-06, "loss": 0.2696, "step": 10440 }, { "epoch": 1.5754560530679935, "grad_norm": 3.668318033218384, "learning_rate": 5.433181642103023e-06, "loss": 0.2053, "step": 10450 }, { "epoch": 1.5769636665159052, "grad_norm": 3.3185365200042725, "learning_rate": 5.424442970486864e-06, "loss": 0.3065, "step": 10460 }, { "epoch": 1.5784712799638174, "grad_norm": 3.680145263671875, "learning_rate": 5.415702992771075e-06, "loss": 0.2328, "step": 10470 }, { "epoch": 1.5799788934117291, "grad_norm": 2.0849153995513916, "learning_rate": 5.406961735850392e-06, "loss": 0.2182, "step": 10480 }, { "epoch": 1.5814865068596413, "grad_norm": 2.69311261177063, "learning_rate": 5.398219226623488e-06, "loss": 0.2045, "step": 10490 }, { "epoch": 1.582994120307553, "grad_norm": 2.597031831741333, "learning_rate": 5.389475491992887e-06, "loss": 0.2287, "step": 10500 }, { "epoch": 1.5845017337554652, "grad_norm": 2.6055359840393066, "learning_rate": 5.380730558864889e-06, "loss": 0.1872, "step": 10510 }, { "epoch": 1.586009347203377, "grad_norm": 3.179610252380371, "learning_rate": 5.371984454149477e-06, "loss": 0.2205, "step": 10520 }, { "epoch": 1.587516960651289, "grad_norm": 3.249697685241699, "learning_rate": 5.363237204760241e-06, "loss": 0.1823, "step": 10530 }, { "epoch": 1.5890245740992008, "grad_norm": 3.7330448627471924, "learning_rate": 5.354488837614296e-06, "loss": 0.1968, "step": 10540 }, { "epoch": 1.590532187547113, "grad_norm": 3.068127155303955, "learning_rate": 5.345739379632189e-06, "loss": 0.2126, "step": 10550 }, { "epoch": 1.5920398009950247, "grad_norm": 4.486553192138672, "learning_rate": 5.336988857737832e-06, "loss": 0.2388, "step": 10560 }, { "epoch": 1.593547414442937, "grad_norm": 2.761319398880005, "learning_rate": 5.328237298858411e-06, "loss": 0.2458, "step": 10570 }, { "epoch": 1.5950550278908486, "grad_norm": 2.6513454914093018, "learning_rate": 5.319484729924294e-06, "loss": 0.2459, "step": 10580 }, { "epoch": 1.5965626413387608, "grad_norm": 3.307061195373535, "learning_rate": 5.310731177868964e-06, "loss": 0.1677, "step": 10590 }, { "epoch": 1.5980702547866728, "grad_norm": 2.6724188327789307, "learning_rate": 5.301976669628931e-06, "loss": 0.1848, "step": 10600 }, { "epoch": 1.5995778682345847, "grad_norm": 3.0480902194976807, "learning_rate": 5.293221232143642e-06, "loss": 0.1978, "step": 10610 }, { "epoch": 1.6010854816824966, "grad_norm": 2.032073497772217, "learning_rate": 5.284464892355406e-06, "loss": 0.204, "step": 10620 }, { "epoch": 1.6025930951304086, "grad_norm": 2.7751457691192627, "learning_rate": 5.275707677209309e-06, "loss": 0.2273, "step": 10630 }, { "epoch": 1.6041007085783205, "grad_norm": 2.6655921936035156, "learning_rate": 5.266949613653132e-06, "loss": 0.1849, "step": 10640 }, { "epoch": 1.6056083220262325, "grad_norm": 2.7997829914093018, "learning_rate": 5.258190728637263e-06, "loss": 0.228, "step": 10650 }, { "epoch": 1.6071159354741444, "grad_norm": 3.252248525619507, "learning_rate": 5.249431049114621e-06, "loss": 0.2101, "step": 10660 }, { "epoch": 1.6086235489220564, "grad_norm": 2.057248115539551, "learning_rate": 5.2406706020405665e-06, "loss": 0.2089, "step": 10670 }, { "epoch": 1.6101311623699683, "grad_norm": 4.077506065368652, "learning_rate": 5.231909414372824e-06, "loss": 0.1614, "step": 10680 }, { "epoch": 1.6116387758178803, "grad_norm": 2.4184350967407227, "learning_rate": 5.223147513071401e-06, "loss": 0.2669, "step": 10690 }, { "epoch": 1.6131463892657922, "grad_norm": 2.716625213623047, "learning_rate": 5.2143849250984945e-06, "loss": 0.208, "step": 10700 }, { "epoch": 1.6146540027137042, "grad_norm": 2.4333291053771973, "learning_rate": 5.205621677418415e-06, "loss": 0.1947, "step": 10710 }, { "epoch": 1.6161616161616161, "grad_norm": 1.919173002243042, "learning_rate": 5.196857796997508e-06, "loss": 0.2147, "step": 10720 }, { "epoch": 1.617669229609528, "grad_norm": 2.699201822280884, "learning_rate": 5.188093310804063e-06, "loss": 0.1824, "step": 10730 }, { "epoch": 1.61917684305744, "grad_norm": 3.646385431289673, "learning_rate": 5.179328245808232e-06, "loss": 0.2088, "step": 10740 }, { "epoch": 1.620684456505352, "grad_norm": 2.1557772159576416, "learning_rate": 5.170562628981952e-06, "loss": 0.2008, "step": 10750 }, { "epoch": 1.6221920699532641, "grad_norm": 3.70930814743042, "learning_rate": 5.1617964872988535e-06, "loss": 0.2472, "step": 10760 }, { "epoch": 1.6236996834011759, "grad_norm": 3.2618329524993896, "learning_rate": 5.153029847734185e-06, "loss": 0.2251, "step": 10770 }, { "epoch": 1.625207296849088, "grad_norm": 2.9697513580322266, "learning_rate": 5.144262737264729e-06, "loss": 0.2694, "step": 10780 }, { "epoch": 1.6267149102969998, "grad_norm": 1.7489781379699707, "learning_rate": 5.135495182868713e-06, "loss": 0.182, "step": 10790 }, { "epoch": 1.628222523744912, "grad_norm": 2.5825796127319336, "learning_rate": 5.1267272115257314e-06, "loss": 0.2411, "step": 10800 }, { "epoch": 1.6297301371928237, "grad_norm": 3.798006057739258, "learning_rate": 5.117958850216665e-06, "loss": 0.1667, "step": 10810 }, { "epoch": 1.6312377506407358, "grad_norm": 2.5724143981933594, "learning_rate": 5.109190125923588e-06, "loss": 0.2539, "step": 10820 }, { "epoch": 1.6327453640886476, "grad_norm": 4.944222450256348, "learning_rate": 5.1004210656297e-06, "loss": 0.2489, "step": 10830 }, { "epoch": 1.6342529775365597, "grad_norm": 2.4763920307159424, "learning_rate": 5.091651696319229e-06, "loss": 0.1972, "step": 10840 }, { "epoch": 1.6357605909844715, "grad_norm": 2.2794830799102783, "learning_rate": 5.082882044977353e-06, "loss": 0.2187, "step": 10850 }, { "epoch": 1.6372682044323836, "grad_norm": 2.4988720417022705, "learning_rate": 5.074112138590123e-06, "loss": 0.2287, "step": 10860 }, { "epoch": 1.6387758178802954, "grad_norm": 4.125413417816162, "learning_rate": 5.065342004144369e-06, "loss": 0.2286, "step": 10870 }, { "epoch": 1.6402834313282075, "grad_norm": 2.198838233947754, "learning_rate": 5.056571668627628e-06, "loss": 0.2314, "step": 10880 }, { "epoch": 1.6417910447761193, "grad_norm": 3.841594934463501, "learning_rate": 5.047801159028052e-06, "loss": 0.2399, "step": 10890 }, { "epoch": 1.6432986582240314, "grad_norm": 3.7544615268707275, "learning_rate": 5.03903050233433e-06, "loss": 0.2466, "step": 10900 }, { "epoch": 1.6448062716719432, "grad_norm": 2.7486722469329834, "learning_rate": 5.030259725535605e-06, "loss": 0.2008, "step": 10910 }, { "epoch": 1.6463138851198553, "grad_norm": 2.146705389022827, "learning_rate": 5.021488855621387e-06, "loss": 0.1899, "step": 10920 }, { "epoch": 1.6478214985677673, "grad_norm": 2.388690948486328, "learning_rate": 5.012717919581473e-06, "loss": 0.2157, "step": 10930 }, { "epoch": 1.6493291120156792, "grad_norm": 3.351613998413086, "learning_rate": 5.003946944405866e-06, "loss": 0.1977, "step": 10940 }, { "epoch": 1.6508367254635912, "grad_norm": 3.285557270050049, "learning_rate": 4.995175957084686e-06, "loss": 0.1922, "step": 10950 }, { "epoch": 1.6523443389115031, "grad_norm": 1.9676432609558105, "learning_rate": 4.986404984608094e-06, "loss": 0.2203, "step": 10960 }, { "epoch": 1.653851952359415, "grad_norm": 2.598944664001465, "learning_rate": 4.977634053966205e-06, "loss": 0.2433, "step": 10970 }, { "epoch": 1.655359565807327, "grad_norm": 2.1047685146331787, "learning_rate": 4.9688631921489985e-06, "loss": 0.2165, "step": 10980 }, { "epoch": 1.656867179255239, "grad_norm": 2.6813762187957764, "learning_rate": 4.96009242614625e-06, "loss": 0.2242, "step": 10990 }, { "epoch": 1.658374792703151, "grad_norm": 3.520195484161377, "learning_rate": 4.9513217829474385e-06, "loss": 0.2046, "step": 11000 }, { "epoch": 1.6598824061510629, "grad_norm": 2.29897141456604, "learning_rate": 4.942551289541663e-06, "loss": 0.2051, "step": 11010 }, { "epoch": 1.6613900195989748, "grad_norm": 3.928990125656128, "learning_rate": 4.933780972917562e-06, "loss": 0.192, "step": 11020 }, { "epoch": 1.6628976330468868, "grad_norm": 3.0884320735931396, "learning_rate": 4.9250108600632355e-06, "loss": 0.1874, "step": 11030 }, { "epoch": 1.6644052464947987, "grad_norm": 2.8028669357299805, "learning_rate": 4.916240977966144e-06, "loss": 0.2194, "step": 11040 }, { "epoch": 1.6659128599427107, "grad_norm": 3.1463639736175537, "learning_rate": 4.90747135361305e-06, "loss": 0.24, "step": 11050 }, { "epoch": 1.6674204733906226, "grad_norm": 2.513676166534424, "learning_rate": 4.898702013989918e-06, "loss": 0.2041, "step": 11060 }, { "epoch": 1.6689280868385346, "grad_norm": 2.891702175140381, "learning_rate": 4.889932986081837e-06, "loss": 0.1819, "step": 11070 }, { "epoch": 1.6704357002864465, "grad_norm": 3.228700876235962, "learning_rate": 4.881164296872937e-06, "loss": 0.2223, "step": 11080 }, { "epoch": 1.6719433137343587, "grad_norm": 2.4058382511138916, "learning_rate": 4.872395973346305e-06, "loss": 0.2142, "step": 11090 }, { "epoch": 1.6734509271822704, "grad_norm": 3.616018295288086, "learning_rate": 4.863628042483901e-06, "loss": 0.1945, "step": 11100 }, { "epoch": 1.6749585406301826, "grad_norm": 2.917780876159668, "learning_rate": 4.85486053126648e-06, "loss": 0.2311, "step": 11110 }, { "epoch": 1.6764661540780943, "grad_norm": 17.181398391723633, "learning_rate": 4.8460934666735046e-06, "loss": 0.2431, "step": 11120 }, { "epoch": 1.6779737675260065, "grad_norm": 1.6472820043563843, "learning_rate": 4.837326875683064e-06, "loss": 0.2449, "step": 11130 }, { "epoch": 1.6794813809739182, "grad_norm": 3.1147937774658203, "learning_rate": 4.828560785271788e-06, "loss": 0.2305, "step": 11140 }, { "epoch": 1.6809889944218304, "grad_norm": 2.320223331451416, "learning_rate": 4.819795222414766e-06, "loss": 0.1934, "step": 11150 }, { "epoch": 1.682496607869742, "grad_norm": 3.3983521461486816, "learning_rate": 4.8110302140854655e-06, "loss": 0.1614, "step": 11160 }, { "epoch": 1.6840042213176543, "grad_norm": 1.9729855060577393, "learning_rate": 4.802265787255646e-06, "loss": 0.2815, "step": 11170 }, { "epoch": 1.685511834765566, "grad_norm": 1.980739951133728, "learning_rate": 4.79350196889528e-06, "loss": 0.2066, "step": 11180 }, { "epoch": 1.6870194482134782, "grad_norm": 2.2750532627105713, "learning_rate": 4.7847387859724655e-06, "loss": 0.1958, "step": 11190 }, { "epoch": 1.6885270616613899, "grad_norm": 3.5897281169891357, "learning_rate": 4.775976265453348e-06, "loss": 0.2245, "step": 11200 }, { "epoch": 1.690034675109302, "grad_norm": 2.714763641357422, "learning_rate": 4.7672144343020286e-06, "loss": 0.1935, "step": 11210 }, { "epoch": 1.6915422885572138, "grad_norm": 2.673192024230957, "learning_rate": 4.758453319480495e-06, "loss": 0.2547, "step": 11220 }, { "epoch": 1.693049902005126, "grad_norm": 3.909123420715332, "learning_rate": 4.7496929479485214e-06, "loss": 0.2112, "step": 11230 }, { "epoch": 1.6945575154530377, "grad_norm": 3.4704341888427734, "learning_rate": 4.740933346663604e-06, "loss": 0.2009, "step": 11240 }, { "epoch": 1.6960651289009498, "grad_norm": 2.9476680755615234, "learning_rate": 4.732174542580863e-06, "loss": 0.1805, "step": 11250 }, { "epoch": 1.6975727423488618, "grad_norm": 3.4118812084198, "learning_rate": 4.72341656265297e-06, "loss": 0.2857, "step": 11260 }, { "epoch": 1.6990803557967737, "grad_norm": 3.797183036804199, "learning_rate": 4.714659433830053e-06, "loss": 0.2124, "step": 11270 }, { "epoch": 1.7005879692446857, "grad_norm": 2.4681220054626465, "learning_rate": 4.705903183059628e-06, "loss": 0.158, "step": 11280 }, { "epoch": 1.7020955826925976, "grad_norm": 3.1627724170684814, "learning_rate": 4.697147837286508e-06, "loss": 0.2729, "step": 11290 }, { "epoch": 1.7036031961405096, "grad_norm": 3.2062103748321533, "learning_rate": 4.6883934234527165e-06, "loss": 0.1877, "step": 11300 }, { "epoch": 1.7051108095884215, "grad_norm": 2.573796033859253, "learning_rate": 4.679639968497415e-06, "loss": 0.2031, "step": 11310 }, { "epoch": 1.7066184230363335, "grad_norm": 3.02756404876709, "learning_rate": 4.670887499356812e-06, "loss": 0.1963, "step": 11320 }, { "epoch": 1.7081260364842454, "grad_norm": 2.544440269470215, "learning_rate": 4.662136042964081e-06, "loss": 0.2627, "step": 11330 }, { "epoch": 1.7096336499321574, "grad_norm": 4.346510410308838, "learning_rate": 4.65338562624928e-06, "loss": 0.172, "step": 11340 }, { "epoch": 1.7111412633800693, "grad_norm": 2.6229193210601807, "learning_rate": 4.644636276139269e-06, "loss": 0.2009, "step": 11350 }, { "epoch": 1.7126488768279813, "grad_norm": 3.2552661895751953, "learning_rate": 4.6358880195576245e-06, "loss": 0.2095, "step": 11360 }, { "epoch": 1.7141564902758932, "grad_norm": 2.1566054821014404, "learning_rate": 4.627140883424558e-06, "loss": 0.1997, "step": 11370 }, { "epoch": 1.7156641037238052, "grad_norm": 2.198666572570801, "learning_rate": 4.618394894656835e-06, "loss": 0.1982, "step": 11380 }, { "epoch": 1.7171717171717171, "grad_norm": 3.776926040649414, "learning_rate": 4.609650080167687e-06, "loss": 0.2522, "step": 11390 }, { "epoch": 1.718679330619629, "grad_norm": 3.5220277309417725, "learning_rate": 4.600906466866735e-06, "loss": 0.2712, "step": 11400 }, { "epoch": 1.720186944067541, "grad_norm": 5.6673431396484375, "learning_rate": 4.592164081659902e-06, "loss": 0.2602, "step": 11410 }, { "epoch": 1.7216945575154532, "grad_norm": 3.638904094696045, "learning_rate": 4.583422951449333e-06, "loss": 0.1973, "step": 11420 }, { "epoch": 1.723202170963365, "grad_norm": 2.2550768852233887, "learning_rate": 4.57468310313331e-06, "loss": 0.2648, "step": 11430 }, { "epoch": 1.724709784411277, "grad_norm": 2.851170778274536, "learning_rate": 4.56594456360617e-06, "loss": 0.192, "step": 11440 }, { "epoch": 1.7262173978591888, "grad_norm": 1.6732882261276245, "learning_rate": 4.557207359758224e-06, "loss": 0.1852, "step": 11450 }, { "epoch": 1.727725011307101, "grad_norm": 2.6964340209960938, "learning_rate": 4.548471518475673e-06, "loss": 0.1883, "step": 11460 }, { "epoch": 1.7292326247550127, "grad_norm": 4.8833208084106445, "learning_rate": 4.539737066640524e-06, "loss": 0.2236, "step": 11470 }, { "epoch": 1.7307402382029249, "grad_norm": 3.164313316345215, "learning_rate": 4.531004031130509e-06, "loss": 0.2372, "step": 11480 }, { "epoch": 1.7322478516508366, "grad_norm": 1.9367437362670898, "learning_rate": 4.522272438819003e-06, "loss": 0.181, "step": 11490 }, { "epoch": 1.7337554650987488, "grad_norm": 3.253983736038208, "learning_rate": 4.5135423165749345e-06, "loss": 0.1728, "step": 11500 }, { "epoch": 1.7352630785466605, "grad_norm": 2.3438210487365723, "learning_rate": 4.504813691262714e-06, "loss": 0.2458, "step": 11510 }, { "epoch": 1.7367706919945727, "grad_norm": 4.087237358093262, "learning_rate": 4.496086589742145e-06, "loss": 0.2663, "step": 11520 }, { "epoch": 1.7382783054424844, "grad_norm": 3.381788730621338, "learning_rate": 4.487361038868339e-06, "loss": 0.2413, "step": 11530 }, { "epoch": 1.7397859188903966, "grad_norm": 3.7918875217437744, "learning_rate": 4.47863706549164e-06, "loss": 0.2347, "step": 11540 }, { "epoch": 1.7412935323383083, "grad_norm": 3.293243169784546, "learning_rate": 4.469914696457534e-06, "loss": 0.2226, "step": 11550 }, { "epoch": 1.7428011457862205, "grad_norm": 2.915106773376465, "learning_rate": 4.461193958606571e-06, "loss": 0.2458, "step": 11560 }, { "epoch": 1.7443087592341324, "grad_norm": 3.0779457092285156, "learning_rate": 4.452474878774282e-06, "loss": 0.2277, "step": 11570 }, { "epoch": 1.7458163726820444, "grad_norm": 2.496351718902588, "learning_rate": 4.4437574837910955e-06, "loss": 0.2467, "step": 11580 }, { "epoch": 1.7473239861299563, "grad_norm": 2.4611587524414062, "learning_rate": 4.435041800482257e-06, "loss": 0.2158, "step": 11590 }, { "epoch": 1.7488315995778683, "grad_norm": 3.4138259887695312, "learning_rate": 4.426327855667744e-06, "loss": 0.1854, "step": 11600 }, { "epoch": 1.7503392130257802, "grad_norm": 2.898651123046875, "learning_rate": 4.417615676162184e-06, "loss": 0.1986, "step": 11610 }, { "epoch": 1.7518468264736922, "grad_norm": 2.3005759716033936, "learning_rate": 4.40890528877477e-06, "loss": 0.245, "step": 11620 }, { "epoch": 1.7533544399216041, "grad_norm": 2.7288200855255127, "learning_rate": 4.400196720309184e-06, "loss": 0.2269, "step": 11630 }, { "epoch": 1.754862053369516, "grad_norm": 2.9712252616882324, "learning_rate": 4.3914899975635076e-06, "loss": 0.2198, "step": 11640 }, { "epoch": 1.756369666817428, "grad_norm": 3.5533580780029297, "learning_rate": 4.382785147330146e-06, "loss": 0.2293, "step": 11650 }, { "epoch": 1.75787728026534, "grad_norm": 1.9831428527832031, "learning_rate": 4.37408219639574e-06, "loss": 0.1962, "step": 11660 }, { "epoch": 1.759384893713252, "grad_norm": 3.2647225856781006, "learning_rate": 4.36538117154109e-06, "loss": 0.1726, "step": 11670 }, { "epoch": 1.7608925071611639, "grad_norm": 4.194611072540283, "learning_rate": 4.35668209954106e-06, "loss": 0.2058, "step": 11680 }, { "epoch": 1.7624001206090758, "grad_norm": 2.190162181854248, "learning_rate": 4.347985007164513e-06, "loss": 0.1862, "step": 11690 }, { "epoch": 1.7639077340569878, "grad_norm": 2.9177680015563965, "learning_rate": 4.339289921174218e-06, "loss": 0.2001, "step": 11700 }, { "epoch": 1.7654153475048997, "grad_norm": 1.8672374486923218, "learning_rate": 4.3305968683267715e-06, "loss": 0.2047, "step": 11710 }, { "epoch": 1.7669229609528116, "grad_norm": 2.0886166095733643, "learning_rate": 4.321905875372509e-06, "loss": 0.2079, "step": 11720 }, { "epoch": 1.7684305744007238, "grad_norm": 3.464622974395752, "learning_rate": 4.3132169690554354e-06, "loss": 0.2797, "step": 11730 }, { "epoch": 1.7699381878486355, "grad_norm": 3.1276984214782715, "learning_rate": 4.304530176113123e-06, "loss": 0.2107, "step": 11740 }, { "epoch": 1.7714458012965477, "grad_norm": 3.2031068801879883, "learning_rate": 4.295845523276651e-06, "loss": 0.2213, "step": 11750 }, { "epoch": 1.7729534147444594, "grad_norm": 1.7454053163528442, "learning_rate": 4.287163037270507e-06, "loss": 0.249, "step": 11760 }, { "epoch": 1.7744610281923716, "grad_norm": 2.5999598503112793, "learning_rate": 4.2784827448125145e-06, "loss": 0.2078, "step": 11770 }, { "epoch": 1.7759686416402833, "grad_norm": 2.703531265258789, "learning_rate": 4.269804672613745e-06, "loss": 0.2397, "step": 11780 }, { "epoch": 1.7774762550881955, "grad_norm": 2.9518818855285645, "learning_rate": 4.2611288473784415e-06, "loss": 0.2187, "step": 11790 }, { "epoch": 1.7789838685361072, "grad_norm": 2.8433148860931396, "learning_rate": 4.2524552958039246e-06, "loss": 0.2292, "step": 11800 }, { "epoch": 1.7804914819840194, "grad_norm": 2.891763687133789, "learning_rate": 4.243784044580524e-06, "loss": 0.2108, "step": 11810 }, { "epoch": 1.7819990954319311, "grad_norm": 3.007401466369629, "learning_rate": 4.235115120391493e-06, "loss": 0.2066, "step": 11820 }, { "epoch": 1.7835067088798433, "grad_norm": 3.65185809135437, "learning_rate": 4.226448549912919e-06, "loss": 0.184, "step": 11830 }, { "epoch": 1.785014322327755, "grad_norm": 2.869709014892578, "learning_rate": 4.217784359813651e-06, "loss": 0.1892, "step": 11840 }, { "epoch": 1.7865219357756672, "grad_norm": 1.920365333557129, "learning_rate": 4.209122576755206e-06, "loss": 0.2034, "step": 11850 }, { "epoch": 1.788029549223579, "grad_norm": 4.98642110824585, "learning_rate": 4.200463227391703e-06, "loss": 0.2143, "step": 11860 }, { "epoch": 1.789537162671491, "grad_norm": 3.374800205230713, "learning_rate": 4.191806338369766e-06, "loss": 0.2213, "step": 11870 }, { "epoch": 1.7910447761194028, "grad_norm": 4.124040126800537, "learning_rate": 4.18315193632845e-06, "loss": 0.1668, "step": 11880 }, { "epoch": 1.792552389567315, "grad_norm": 2.776641845703125, "learning_rate": 4.174500047899156e-06, "loss": 0.252, "step": 11890 }, { "epoch": 1.794060003015227, "grad_norm": 3.5871074199676514, "learning_rate": 4.165850699705555e-06, "loss": 0.2245, "step": 11900 }, { "epoch": 1.795567616463139, "grad_norm": 3.288665771484375, "learning_rate": 4.157203918363492e-06, "loss": 0.1653, "step": 11910 }, { "epoch": 1.7970752299110508, "grad_norm": 2.1320581436157227, "learning_rate": 4.14855973048092e-06, "loss": 0.2089, "step": 11920 }, { "epoch": 1.7985828433589628, "grad_norm": 2.998213291168213, "learning_rate": 4.1399181626578104e-06, "loss": 0.2206, "step": 11930 }, { "epoch": 1.8000904568068747, "grad_norm": 3.6605064868927, "learning_rate": 4.131279241486072e-06, "loss": 0.1909, "step": 11940 }, { "epoch": 1.8015980702547867, "grad_norm": 4.4735002517700195, "learning_rate": 4.122642993549466e-06, "loss": 0.2095, "step": 11950 }, { "epoch": 1.8031056837026986, "grad_norm": 2.7816972732543945, "learning_rate": 4.114009445423536e-06, "loss": 0.2056, "step": 11960 }, { "epoch": 1.8046132971506106, "grad_norm": 3.833822011947632, "learning_rate": 4.105378623675505e-06, "loss": 0.2081, "step": 11970 }, { "epoch": 1.8061209105985225, "grad_norm": 2.896423816680908, "learning_rate": 4.096750554864217e-06, "loss": 0.2265, "step": 11980 }, { "epoch": 1.8076285240464345, "grad_norm": 2.9309189319610596, "learning_rate": 4.088125265540041e-06, "loss": 0.1773, "step": 11990 }, { "epoch": 1.8091361374943464, "grad_norm": 2.014049530029297, "learning_rate": 4.079502782244792e-06, "loss": 0.1798, "step": 12000 }, { "epoch": 1.8106437509422584, "grad_norm": 2.828732490539551, "learning_rate": 4.070883131511651e-06, "loss": 0.2158, "step": 12010 }, { "epoch": 1.8121513643901703, "grad_norm": 2.015326738357544, "learning_rate": 4.062266339865087e-06, "loss": 0.2658, "step": 12020 }, { "epoch": 1.8136589778380823, "grad_norm": 2.88570237159729, "learning_rate": 4.05365243382076e-06, "loss": 0.2423, "step": 12030 }, { "epoch": 1.8151665912859942, "grad_norm": 3.742107629776001, "learning_rate": 4.045041439885461e-06, "loss": 0.2276, "step": 12040 }, { "epoch": 1.8166742047339062, "grad_norm": 2.541560411453247, "learning_rate": 4.036433384557016e-06, "loss": 0.2139, "step": 12050 }, { "epoch": 1.8181818181818183, "grad_norm": 2.1083765029907227, "learning_rate": 4.027828294324206e-06, "loss": 0.2216, "step": 12060 }, { "epoch": 1.81968943162973, "grad_norm": 2.0851738452911377, "learning_rate": 4.0192261956666935e-06, "loss": 0.2003, "step": 12070 }, { "epoch": 1.8211970450776422, "grad_norm": 4.776218414306641, "learning_rate": 4.010627115054932e-06, "loss": 0.1651, "step": 12080 }, { "epoch": 1.822704658525554, "grad_norm": 2.916724920272827, "learning_rate": 4.002031078950084e-06, "loss": 0.2839, "step": 12090 }, { "epoch": 1.8242122719734661, "grad_norm": 3.993910551071167, "learning_rate": 3.99343811380395e-06, "loss": 0.2404, "step": 12100 }, { "epoch": 1.8257198854213779, "grad_norm": 3.024043560028076, "learning_rate": 3.984848246058876e-06, "loss": 0.1554, "step": 12110 }, { "epoch": 1.82722749886929, "grad_norm": 2.0465457439422607, "learning_rate": 3.97626150214768e-06, "loss": 0.2283, "step": 12120 }, { "epoch": 1.8287351123172018, "grad_norm": 2.694305181503296, "learning_rate": 3.9676779084935645e-06, "loss": 0.233, "step": 12130 }, { "epoch": 1.830242725765114, "grad_norm": 2.477524757385254, "learning_rate": 3.959097491510041e-06, "loss": 0.2353, "step": 12140 }, { "epoch": 1.8317503392130257, "grad_norm": 3.1026499271392822, "learning_rate": 3.9505202776008414e-06, "loss": 0.2266, "step": 12150 }, { "epoch": 1.8332579526609378, "grad_norm": 2.5856189727783203, "learning_rate": 3.941946293159844e-06, "loss": 0.264, "step": 12160 }, { "epoch": 1.8347655661088496, "grad_norm": 2.351128101348877, "learning_rate": 3.933375564570989e-06, "loss": 0.1779, "step": 12170 }, { "epoch": 1.8362731795567617, "grad_norm": 2.825209856033325, "learning_rate": 3.924808118208199e-06, "loss": 0.2089, "step": 12180 }, { "epoch": 1.8377807930046735, "grad_norm": 3.131795644760132, "learning_rate": 3.916243980435295e-06, "loss": 0.1918, "step": 12190 }, { "epoch": 1.8392884064525856, "grad_norm": 3.01877760887146, "learning_rate": 3.907683177605915e-06, "loss": 0.2614, "step": 12200 }, { "epoch": 1.8407960199004973, "grad_norm": 3.6640915870666504, "learning_rate": 3.8991257360634375e-06, "loss": 0.1962, "step": 12210 }, { "epoch": 1.8423036333484095, "grad_norm": 2.025000810623169, "learning_rate": 3.890571682140896e-06, "loss": 0.1629, "step": 12220 }, { "epoch": 1.8438112467963215, "grad_norm": 3.3687691688537598, "learning_rate": 3.882021042160901e-06, "loss": 0.195, "step": 12230 }, { "epoch": 1.8453188602442334, "grad_norm": 3.9597957134246826, "learning_rate": 3.873473842435557e-06, "loss": 0.2319, "step": 12240 }, { "epoch": 1.8468264736921454, "grad_norm": 4.04033088684082, "learning_rate": 3.864930109266382e-06, "loss": 0.2117, "step": 12250 }, { "epoch": 1.8483340871400573, "grad_norm": 3.3645472526550293, "learning_rate": 3.856389868944225e-06, "loss": 0.1823, "step": 12260 }, { "epoch": 1.8498417005879693, "grad_norm": 2.3581202030181885, "learning_rate": 3.847853147749191e-06, "loss": 0.2192, "step": 12270 }, { "epoch": 1.8513493140358812, "grad_norm": 3.0115413665771484, "learning_rate": 3.83931997195055e-06, "loss": 0.245, "step": 12280 }, { "epoch": 1.8528569274837932, "grad_norm": 3.3404693603515625, "learning_rate": 3.8307903678066686e-06, "loss": 0.2056, "step": 12290 }, { "epoch": 1.854364540931705, "grad_norm": 2.7438411712646484, "learning_rate": 3.822264361564917e-06, "loss": 0.2603, "step": 12300 }, { "epoch": 1.855872154379617, "grad_norm": 2.801582098007202, "learning_rate": 3.8137419794615986e-06, "loss": 0.2236, "step": 12310 }, { "epoch": 1.857379767827529, "grad_norm": 2.4423065185546875, "learning_rate": 3.8052232477218603e-06, "loss": 0.1419, "step": 12320 }, { "epoch": 1.858887381275441, "grad_norm": 2.744873285293579, "learning_rate": 3.7967081925596195e-06, "loss": 0.2438, "step": 12330 }, { "epoch": 1.860394994723353, "grad_norm": 2.5213463306427, "learning_rate": 3.7881968401774784e-06, "loss": 0.1724, "step": 12340 }, { "epoch": 1.8619026081712649, "grad_norm": 3.226952075958252, "learning_rate": 3.779689216766644e-06, "loss": 0.1874, "step": 12350 }, { "epoch": 1.8634102216191768, "grad_norm": 3.1130213737487793, "learning_rate": 3.7711853485068507e-06, "loss": 0.2267, "step": 12360 }, { "epoch": 1.8649178350670887, "grad_norm": 2.418051242828369, "learning_rate": 3.762685261566277e-06, "loss": 0.2075, "step": 12370 }, { "epoch": 1.8664254485150007, "grad_norm": 2.9016170501708984, "learning_rate": 3.754188982101463e-06, "loss": 0.2164, "step": 12380 }, { "epoch": 1.8679330619629129, "grad_norm": 3.092372417449951, "learning_rate": 3.7456965362572356e-06, "loss": 0.2083, "step": 12390 }, { "epoch": 1.8694406754108246, "grad_norm": 5.767421722412109, "learning_rate": 3.7372079501666247e-06, "loss": 0.2039, "step": 12400 }, { "epoch": 1.8709482888587368, "grad_norm": 1.904005765914917, "learning_rate": 3.7287232499507796e-06, "loss": 0.2563, "step": 12410 }, { "epoch": 1.8724559023066485, "grad_norm": 2.0788073539733887, "learning_rate": 3.720242461718896e-06, "loss": 0.215, "step": 12420 }, { "epoch": 1.8739635157545607, "grad_norm": 4.604334354400635, "learning_rate": 3.7117656115681296e-06, "loss": 0.1672, "step": 12430 }, { "epoch": 1.8754711292024724, "grad_norm": 3.309016227722168, "learning_rate": 3.703292725583516e-06, "loss": 0.2458, "step": 12440 }, { "epoch": 1.8769787426503846, "grad_norm": 2.8750290870666504, "learning_rate": 3.6948238298378965e-06, "loss": 0.2089, "step": 12450 }, { "epoch": 1.8784863560982963, "grad_norm": 3.2614612579345703, "learning_rate": 3.68635895039183e-06, "loss": 0.2315, "step": 12460 }, { "epoch": 1.8799939695462085, "grad_norm": 2.5713553428649902, "learning_rate": 3.67789811329352e-06, "loss": 0.2363, "step": 12470 }, { "epoch": 1.8815015829941202, "grad_norm": 2.667750835418701, "learning_rate": 3.669441344578725e-06, "loss": 0.1889, "step": 12480 }, { "epoch": 1.8830091964420324, "grad_norm": 3.6025168895721436, "learning_rate": 3.6609886702706914e-06, "loss": 0.1848, "step": 12490 }, { "epoch": 1.884516809889944, "grad_norm": 5.186005592346191, "learning_rate": 3.65254011638006e-06, "loss": 0.2276, "step": 12500 }, { "epoch": 1.8860244233378562, "grad_norm": 2.3484935760498047, "learning_rate": 3.6440957089047956e-06, "loss": 0.2181, "step": 12510 }, { "epoch": 1.887532036785768, "grad_norm": 5.662171840667725, "learning_rate": 3.6356554738301032e-06, "loss": 0.2028, "step": 12520 }, { "epoch": 1.8890396502336801, "grad_norm": 3.2923097610473633, "learning_rate": 3.6272194371283486e-06, "loss": 0.2152, "step": 12530 }, { "epoch": 1.890547263681592, "grad_norm": 2.1895859241485596, "learning_rate": 3.6187876247589783e-06, "loss": 0.2296, "step": 12540 }, { "epoch": 1.892054877129504, "grad_norm": 3.6705384254455566, "learning_rate": 3.6103600626684385e-06, "loss": 0.2307, "step": 12550 }, { "epoch": 1.893562490577416, "grad_norm": 3.1269428730010986, "learning_rate": 3.6019367767900964e-06, "loss": 0.2201, "step": 12560 }, { "epoch": 1.895070104025328, "grad_norm": 1.8871471881866455, "learning_rate": 3.593517793044161e-06, "loss": 0.2218, "step": 12570 }, { "epoch": 1.8965777174732399, "grad_norm": 2.3896496295928955, "learning_rate": 3.585103137337605e-06, "loss": 0.2506, "step": 12580 }, { "epoch": 1.8980853309211518, "grad_norm": 2.4082393646240234, "learning_rate": 3.5766928355640786e-06, "loss": 0.2127, "step": 12590 }, { "epoch": 1.8995929443690638, "grad_norm": 2.666170358657837, "learning_rate": 3.5682869136038397e-06, "loss": 0.2025, "step": 12600 }, { "epoch": 1.9011005578169757, "grad_norm": 3.1439576148986816, "learning_rate": 3.5598853973236596e-06, "loss": 0.2687, "step": 12610 }, { "epoch": 1.9026081712648877, "grad_norm": 3.237046718597412, "learning_rate": 3.5514883125767606e-06, "loss": 0.2054, "step": 12620 }, { "epoch": 1.9041157847127996, "grad_norm": 5.535610675811768, "learning_rate": 3.5430956852027267e-06, "loss": 0.1929, "step": 12630 }, { "epoch": 1.9056233981607116, "grad_norm": 2.7378811836242676, "learning_rate": 3.5347075410274222e-06, "loss": 0.2444, "step": 12640 }, { "epoch": 1.9071310116086235, "grad_norm": 2.6082146167755127, "learning_rate": 3.5263239058629197e-06, "loss": 0.1679, "step": 12650 }, { "epoch": 1.9086386250565355, "grad_norm": 2.8664164543151855, "learning_rate": 3.517944805507415e-06, "loss": 0.193, "step": 12660 }, { "epoch": 1.9101462385044474, "grad_norm": 2.733973264694214, "learning_rate": 3.509570265745147e-06, "loss": 0.2348, "step": 12670 }, { "epoch": 1.9116538519523594, "grad_norm": 2.747835874557495, "learning_rate": 3.5012003123463246e-06, "loss": 0.2683, "step": 12680 }, { "epoch": 1.9131614654002713, "grad_norm": 2.7970662117004395, "learning_rate": 3.4928349710670407e-06, "loss": 0.1943, "step": 12690 }, { "epoch": 1.9146690788481835, "grad_norm": 3.4264767169952393, "learning_rate": 3.484474267649197e-06, "loss": 0.1706, "step": 12700 }, { "epoch": 1.9161766922960952, "grad_norm": 3.248516321182251, "learning_rate": 3.476118227820424e-06, "loss": 0.2142, "step": 12710 }, { "epoch": 1.9176843057440074, "grad_norm": 2.9080593585968018, "learning_rate": 3.467766877294003e-06, "loss": 0.1894, "step": 12720 }, { "epoch": 1.9191919191919191, "grad_norm": 2.5220963954925537, "learning_rate": 3.4594202417687793e-06, "loss": 0.2609, "step": 12730 }, { "epoch": 1.9206995326398313, "grad_norm": 2.8439948558807373, "learning_rate": 3.451078346929093e-06, "loss": 0.2382, "step": 12740 }, { "epoch": 1.922207146087743, "grad_norm": 4.194735527038574, "learning_rate": 3.4427412184446977e-06, "loss": 0.2039, "step": 12750 }, { "epoch": 1.9237147595356552, "grad_norm": 2.1273186206817627, "learning_rate": 3.4344088819706778e-06, "loss": 0.1667, "step": 12760 }, { "epoch": 1.925222372983567, "grad_norm": 2.1236917972564697, "learning_rate": 3.4260813631473723e-06, "loss": 0.1989, "step": 12770 }, { "epoch": 1.926729986431479, "grad_norm": 3.5355234146118164, "learning_rate": 3.4177586876002968e-06, "loss": 0.2322, "step": 12780 }, { "epoch": 1.9282375998793908, "grad_norm": 2.3498196601867676, "learning_rate": 3.4094408809400575e-06, "loss": 0.2156, "step": 12790 }, { "epoch": 1.929745213327303, "grad_norm": 2.400656223297119, "learning_rate": 3.4011279687622845e-06, "loss": 0.1958, "step": 12800 }, { "epoch": 1.9312528267752147, "grad_norm": 4.361973762512207, "learning_rate": 3.3928199766475435e-06, "loss": 0.21, "step": 12810 }, { "epoch": 1.9327604402231269, "grad_norm": 2.5233652591705322, "learning_rate": 3.38451693016126e-06, "loss": 0.2493, "step": 12820 }, { "epoch": 1.9342680536710386, "grad_norm": 2.1211163997650146, "learning_rate": 3.3762188548536436e-06, "loss": 0.2076, "step": 12830 }, { "epoch": 1.9357756671189508, "grad_norm": 3.162459373474121, "learning_rate": 3.3679257762596045e-06, "loss": 0.2096, "step": 12840 }, { "epoch": 1.9372832805668625, "grad_norm": 3.734987497329712, "learning_rate": 3.3596377198986733e-06, "loss": 0.2208, "step": 12850 }, { "epoch": 1.9387908940147747, "grad_norm": 3.3292925357818604, "learning_rate": 3.3513547112749324e-06, "loss": 0.2084, "step": 12860 }, { "epoch": 1.9402985074626866, "grad_norm": 3.664447069168091, "learning_rate": 3.3430767758769272e-06, "loss": 0.2362, "step": 12870 }, { "epoch": 1.9418061209105986, "grad_norm": 2.683830976486206, "learning_rate": 3.3348039391775942e-06, "loss": 0.1949, "step": 12880 }, { "epoch": 1.9433137343585105, "grad_norm": 2.370433807373047, "learning_rate": 3.326536226634179e-06, "loss": 0.2425, "step": 12890 }, { "epoch": 1.9448213478064225, "grad_norm": 2.548781633377075, "learning_rate": 3.31827366368816e-06, "loss": 0.1865, "step": 12900 }, { "epoch": 1.9463289612543344, "grad_norm": 2.0896973609924316, "learning_rate": 3.310016275765166e-06, "loss": 0.2255, "step": 12910 }, { "epoch": 1.9478365747022464, "grad_norm": 3.324918746948242, "learning_rate": 3.301764088274904e-06, "loss": 0.2099, "step": 12920 }, { "epoch": 1.9493441881501583, "grad_norm": 3.6539573669433594, "learning_rate": 3.2935171266110788e-06, "loss": 0.214, "step": 12930 }, { "epoch": 1.9508518015980703, "grad_norm": 3.344766139984131, "learning_rate": 3.285275416151312e-06, "loss": 0.1952, "step": 12940 }, { "epoch": 1.9523594150459822, "grad_norm": 2.625490427017212, "learning_rate": 3.277038982257071e-06, "loss": 0.2299, "step": 12950 }, { "epoch": 1.9538670284938942, "grad_norm": 4.696839809417725, "learning_rate": 3.2688078502735766e-06, "loss": 0.2336, "step": 12960 }, { "epoch": 1.955374641941806, "grad_norm": 2.697025775909424, "learning_rate": 3.260582045529743e-06, "loss": 0.252, "step": 12970 }, { "epoch": 1.956882255389718, "grad_norm": 3.431110143661499, "learning_rate": 3.2523615933380893e-06, "loss": 0.1533, "step": 12980 }, { "epoch": 1.95838986883763, "grad_norm": 2.518953323364258, "learning_rate": 3.244146518994662e-06, "loss": 0.274, "step": 12990 }, { "epoch": 1.959897482285542, "grad_norm": 2.7656660079956055, "learning_rate": 3.235936847778961e-06, "loss": 0.2058, "step": 13000 }, { "epoch": 1.961405095733454, "grad_norm": 3.8093807697296143, "learning_rate": 3.227732604953859e-06, "loss": 0.1999, "step": 13010 }, { "epoch": 1.9629127091813658, "grad_norm": 3.4675326347351074, "learning_rate": 3.2195338157655215e-06, "loss": 0.2286, "step": 13020 }, { "epoch": 1.964420322629278, "grad_norm": 3.0316765308380127, "learning_rate": 3.2113405054433357e-06, "loss": 0.1997, "step": 13030 }, { "epoch": 1.9659279360771897, "grad_norm": 3.3988842964172363, "learning_rate": 3.203152699199828e-06, "loss": 0.1911, "step": 13040 }, { "epoch": 1.967435549525102, "grad_norm": 2.018101453781128, "learning_rate": 3.1949704222305877e-06, "loss": 0.2217, "step": 13050 }, { "epoch": 1.9689431629730136, "grad_norm": 3.1518211364746094, "learning_rate": 3.186793699714189e-06, "loss": 0.1719, "step": 13060 }, { "epoch": 1.9704507764209258, "grad_norm": 4.616468906402588, "learning_rate": 3.1786225568121166e-06, "loss": 0.233, "step": 13070 }, { "epoch": 1.9719583898688375, "grad_norm": 1.7885780334472656, "learning_rate": 3.170457018668679e-06, "loss": 0.2117, "step": 13080 }, { "epoch": 1.9734660033167497, "grad_norm": 3.3399362564086914, "learning_rate": 3.1622971104109424e-06, "loss": 0.2124, "step": 13090 }, { "epoch": 1.9749736167646614, "grad_norm": 4.4371466636657715, "learning_rate": 3.15414285714865e-06, "loss": 0.1993, "step": 13100 }, { "epoch": 1.9764812302125736, "grad_norm": 2.315176486968994, "learning_rate": 3.145994283974141e-06, "loss": 0.2826, "step": 13110 }, { "epoch": 1.9779888436604853, "grad_norm": 2.9133994579315186, "learning_rate": 3.1378514159622753e-06, "loss": 0.2097, "step": 13120 }, { "epoch": 1.9794964571083975, "grad_norm": 2.984955310821533, "learning_rate": 3.1297142781703597e-06, "loss": 0.1726, "step": 13130 }, { "epoch": 1.9810040705563092, "grad_norm": 3.03562068939209, "learning_rate": 3.1215828956380634e-06, "loss": 0.1962, "step": 13140 }, { "epoch": 1.9825116840042214, "grad_norm": 4.074100971221924, "learning_rate": 3.1134572933873485e-06, "loss": 0.2046, "step": 13150 }, { "epoch": 1.9840192974521331, "grad_norm": 3.0852999687194824, "learning_rate": 3.1053374964223906e-06, "loss": 0.2328, "step": 13160 }, { "epoch": 1.9855269109000453, "grad_norm": 2.042654037475586, "learning_rate": 3.0972235297294994e-06, "loss": 0.2287, "step": 13170 }, { "epoch": 1.987034524347957, "grad_norm": 3.0236499309539795, "learning_rate": 3.0891154182770446e-06, "loss": 0.2006, "step": 13180 }, { "epoch": 1.9885421377958692, "grad_norm": 2.880749225616455, "learning_rate": 3.081013187015379e-06, "loss": 0.2464, "step": 13190 }, { "epoch": 1.9900497512437811, "grad_norm": 2.978698492050171, "learning_rate": 3.0729168608767575e-06, "loss": 0.2428, "step": 13200 }, { "epoch": 1.991557364691693, "grad_norm": 3.12019419670105, "learning_rate": 3.064826464775267e-06, "loss": 0.2612, "step": 13210 }, { "epoch": 1.993064978139605, "grad_norm": 3.283493757247925, "learning_rate": 3.056742023606744e-06, "loss": 0.2621, "step": 13220 }, { "epoch": 1.994572591587517, "grad_norm": 1.8703701496124268, "learning_rate": 3.0486635622487036e-06, "loss": 0.2287, "step": 13230 }, { "epoch": 1.996080205035429, "grad_norm": 3.2255666255950928, "learning_rate": 3.0405911055602572e-06, "loss": 0.2547, "step": 13240 }, { "epoch": 1.9975878184833409, "grad_norm": 2.833178758621216, "learning_rate": 3.0325246783820412e-06, "loss": 0.2348, "step": 13250 }, { "epoch": 1.9990954319312528, "grad_norm": 2.0889644622802734, "learning_rate": 3.024464305536132e-06, "loss": 0.1764, "step": 13260 }, { "epoch": 2.000603045379165, "grad_norm": 2.7422523498535156, "learning_rate": 3.016410011825982e-06, "loss": 0.1886, "step": 13270 }, { "epoch": 2.002110658827077, "grad_norm": 2.3081746101379395, "learning_rate": 3.0083618220363344e-06, "loss": 0.1661, "step": 13280 }, { "epoch": 2.0036182722749887, "grad_norm": 2.7018778324127197, "learning_rate": 3.0003197609331487e-06, "loss": 0.1825, "step": 13290 }, { "epoch": 2.005125885722901, "grad_norm": 2.0416979789733887, "learning_rate": 2.9922838532635262e-06, "loss": 0.135, "step": 13300 }, { "epoch": 2.0066334991708126, "grad_norm": 1.9465279579162598, "learning_rate": 2.984254123755633e-06, "loss": 0.1758, "step": 13310 }, { "epoch": 2.0081411126187247, "grad_norm": 3.1792728900909424, "learning_rate": 2.976230597118624e-06, "loss": 0.213, "step": 13320 }, { "epoch": 2.0096487260666365, "grad_norm": 2.439166307449341, "learning_rate": 2.9682132980425625e-06, "loss": 0.1709, "step": 13330 }, { "epoch": 2.0111563395145486, "grad_norm": 2.0587236881256104, "learning_rate": 2.960202251198354e-06, "loss": 0.2507, "step": 13340 }, { "epoch": 2.0126639529624604, "grad_norm": 3.566237688064575, "learning_rate": 2.952197481237661e-06, "loss": 0.2212, "step": 13350 }, { "epoch": 2.0141715664103725, "grad_norm": 2.6652166843414307, "learning_rate": 2.9441990127928325e-06, "loss": 0.1872, "step": 13360 }, { "epoch": 2.0156791798582843, "grad_norm": 2.4022216796875, "learning_rate": 2.9362068704768254e-06, "loss": 0.1965, "step": 13370 }, { "epoch": 2.0171867933061964, "grad_norm": 3.7958662509918213, "learning_rate": 2.928221078883131e-06, "loss": 0.2103, "step": 13380 }, { "epoch": 2.018694406754108, "grad_norm": 3.9771008491516113, "learning_rate": 2.920241662585695e-06, "loss": 0.2091, "step": 13390 }, { "epoch": 2.0202020202020203, "grad_norm": 2.544232130050659, "learning_rate": 2.912268646138849e-06, "loss": 0.1648, "step": 13400 }, { "epoch": 2.021709633649932, "grad_norm": 2.8252806663513184, "learning_rate": 2.904302054077228e-06, "loss": 0.2089, "step": 13410 }, { "epoch": 2.0232172470978442, "grad_norm": 2.4302892684936523, "learning_rate": 2.8963419109157026e-06, "loss": 0.2184, "step": 13420 }, { "epoch": 2.024724860545756, "grad_norm": 3.0514228343963623, "learning_rate": 2.8883882411492893e-06, "loss": 0.2158, "step": 13430 }, { "epoch": 2.026232473993668, "grad_norm": 2.723155975341797, "learning_rate": 2.8804410692530936e-06, "loss": 0.2098, "step": 13440 }, { "epoch": 2.02774008744158, "grad_norm": 2.3905491828918457, "learning_rate": 2.8725004196822227e-06, "loss": 0.1884, "step": 13450 }, { "epoch": 2.029247700889492, "grad_norm": 2.2242300510406494, "learning_rate": 2.864566316871712e-06, "loss": 0.1779, "step": 13460 }, { "epoch": 2.0307553143374037, "grad_norm": 3.1515984535217285, "learning_rate": 2.856638785236453e-06, "loss": 0.2113, "step": 13470 }, { "epoch": 2.032262927785316, "grad_norm": 3.7826383113861084, "learning_rate": 2.8487178491711165e-06, "loss": 0.2134, "step": 13480 }, { "epoch": 2.0337705412332276, "grad_norm": 2.9099128246307373, "learning_rate": 2.8408035330500744e-06, "loss": 0.2003, "step": 13490 }, { "epoch": 2.03527815468114, "grad_norm": 2.392629623413086, "learning_rate": 2.83289586122733e-06, "loss": 0.239, "step": 13500 }, { "epoch": 2.0367857681290515, "grad_norm": 3.0483973026275635, "learning_rate": 2.8249948580364413e-06, "loss": 0.1989, "step": 13510 }, { "epoch": 2.0382933815769637, "grad_norm": 2.6781299114227295, "learning_rate": 2.8171005477904436e-06, "loss": 0.1761, "step": 13520 }, { "epoch": 2.0398009950248754, "grad_norm": 2.412611246109009, "learning_rate": 2.8092129547817783e-06, "loss": 0.2025, "step": 13530 }, { "epoch": 2.0413086084727876, "grad_norm": 2.881101131439209, "learning_rate": 2.8013321032822177e-06, "loss": 0.1864, "step": 13540 }, { "epoch": 2.0428162219206993, "grad_norm": 7.387816429138184, "learning_rate": 2.7934580175427835e-06, "loss": 0.2017, "step": 13550 }, { "epoch": 2.0443238353686115, "grad_norm": 2.835645914077759, "learning_rate": 2.7855907217936823e-06, "loss": 0.1872, "step": 13560 }, { "epoch": 2.0458314488165232, "grad_norm": 3.0997979640960693, "learning_rate": 2.7777302402442264e-06, "loss": 0.1754, "step": 13570 }, { "epoch": 2.0473390622644354, "grad_norm": 3.844759225845337, "learning_rate": 2.7698765970827577e-06, "loss": 0.2187, "step": 13580 }, { "epoch": 2.0488466757123476, "grad_norm": 2.8539113998413086, "learning_rate": 2.7620298164765773e-06, "loss": 0.2011, "step": 13590 }, { "epoch": 2.0503542891602593, "grad_norm": 2.1837692260742188, "learning_rate": 2.7541899225718683e-06, "loss": 0.1653, "step": 13600 }, { "epoch": 2.0518619026081715, "grad_norm": 1.9793051481246948, "learning_rate": 2.7463569394936168e-06, "loss": 0.2359, "step": 13610 }, { "epoch": 2.053369516056083, "grad_norm": 3.2559502124786377, "learning_rate": 2.738530891345549e-06, "loss": 0.256, "step": 13620 }, { "epoch": 2.0548771295039954, "grad_norm": 2.535839557647705, "learning_rate": 2.73071180221005e-06, "loss": 0.2022, "step": 13630 }, { "epoch": 2.056384742951907, "grad_norm": 4.17906379699707, "learning_rate": 2.7228996961480876e-06, "loss": 0.1752, "step": 13640 }, { "epoch": 2.0578923563998193, "grad_norm": 2.614445924758911, "learning_rate": 2.7150945971991445e-06, "loss": 0.208, "step": 13650 }, { "epoch": 2.059399969847731, "grad_norm": 2.8570632934570312, "learning_rate": 2.7072965293811406e-06, "loss": 0.2399, "step": 13660 }, { "epoch": 2.060907583295643, "grad_norm": 3.3064613342285156, "learning_rate": 2.6995055166903554e-06, "loss": 0.2175, "step": 13670 }, { "epoch": 2.062415196743555, "grad_norm": 1.9073411226272583, "learning_rate": 2.691721583101363e-06, "loss": 0.1668, "step": 13680 }, { "epoch": 2.063922810191467, "grad_norm": 2.0336592197418213, "learning_rate": 2.6839447525669526e-06, "loss": 0.207, "step": 13690 }, { "epoch": 2.065430423639379, "grad_norm": 3.391634941101074, "learning_rate": 2.676175049018055e-06, "loss": 0.2381, "step": 13700 }, { "epoch": 2.066938037087291, "grad_norm": 2.220540761947632, "learning_rate": 2.6684124963636716e-06, "loss": 0.1952, "step": 13710 }, { "epoch": 2.0684456505352027, "grad_norm": 3.149351119995117, "learning_rate": 2.660657118490794e-06, "loss": 0.1703, "step": 13720 }, { "epoch": 2.069953263983115, "grad_norm": 5.175192832946777, "learning_rate": 2.6529089392643416e-06, "loss": 0.2004, "step": 13730 }, { "epoch": 2.0714608774310266, "grad_norm": 3.0444443225860596, "learning_rate": 2.6451679825270775e-06, "loss": 0.2039, "step": 13740 }, { "epoch": 2.0729684908789388, "grad_norm": 2.6953611373901367, "learning_rate": 2.637434272099544e-06, "loss": 0.2112, "step": 13750 }, { "epoch": 2.0744761043268505, "grad_norm": 2.68148136138916, "learning_rate": 2.6297078317799807e-06, "loss": 0.1801, "step": 13760 }, { "epoch": 2.0759837177747626, "grad_norm": 4.698742866516113, "learning_rate": 2.62198868534426e-06, "loss": 0.1837, "step": 13770 }, { "epoch": 2.0774913312226744, "grad_norm": 3.1671454906463623, "learning_rate": 2.6142768565458043e-06, "loss": 0.1693, "step": 13780 }, { "epoch": 2.0789989446705865, "grad_norm": 3.484947443008423, "learning_rate": 2.606572369115521e-06, "loss": 0.1324, "step": 13790 }, { "epoch": 2.0805065581184983, "grad_norm": 2.4008893966674805, "learning_rate": 2.5988752467617274e-06, "loss": 0.1803, "step": 13800 }, { "epoch": 2.0820141715664104, "grad_norm": 2.527247190475464, "learning_rate": 2.591185513170076e-06, "loss": 0.2317, "step": 13810 }, { "epoch": 2.083521785014322, "grad_norm": 2.161057949066162, "learning_rate": 2.5835031920034814e-06, "loss": 0.1863, "step": 13820 }, { "epoch": 2.0850293984622343, "grad_norm": 2.7872486114501953, "learning_rate": 2.5758283069020502e-06, "loss": 0.1551, "step": 13830 }, { "epoch": 2.086537011910146, "grad_norm": 3.0810861587524414, "learning_rate": 2.5681608814830084e-06, "loss": 0.1776, "step": 13840 }, { "epoch": 2.0880446253580582, "grad_norm": 2.805619955062866, "learning_rate": 2.560500939340621e-06, "loss": 0.2106, "step": 13850 }, { "epoch": 2.08955223880597, "grad_norm": 4.003946304321289, "learning_rate": 2.5528485040461304e-06, "loss": 0.2247, "step": 13860 }, { "epoch": 2.091059852253882, "grad_norm": 3.1921539306640625, "learning_rate": 2.5452035991476775e-06, "loss": 0.203, "step": 13870 }, { "epoch": 2.092567465701794, "grad_norm": 2.4134411811828613, "learning_rate": 2.537566248170231e-06, "loss": 0.2238, "step": 13880 }, { "epoch": 2.094075079149706, "grad_norm": 2.1407511234283447, "learning_rate": 2.5299364746155144e-06, "loss": 0.2364, "step": 13890 }, { "epoch": 2.0955826925976178, "grad_norm": 2.8444745540618896, "learning_rate": 2.522314301961934e-06, "loss": 0.195, "step": 13900 }, { "epoch": 2.09709030604553, "grad_norm": 3.7683000564575195, "learning_rate": 2.514699753664508e-06, "loss": 0.2431, "step": 13910 }, { "epoch": 2.098597919493442, "grad_norm": 2.4107770919799805, "learning_rate": 2.5070928531547867e-06, "loss": 0.1552, "step": 13920 }, { "epoch": 2.100105532941354, "grad_norm": 2.5231709480285645, "learning_rate": 2.4994936238407935e-06, "loss": 0.2029, "step": 13930 }, { "epoch": 2.101613146389266, "grad_norm": 2.2843058109283447, "learning_rate": 2.491902089106944e-06, "loss": 0.2259, "step": 13940 }, { "epoch": 2.1031207598371777, "grad_norm": 2.768021821975708, "learning_rate": 2.4843182723139742e-06, "loss": 0.2435, "step": 13950 }, { "epoch": 2.10462837328509, "grad_norm": 4.868978977203369, "learning_rate": 2.476742196798873e-06, "loss": 0.2268, "step": 13960 }, { "epoch": 2.1061359867330016, "grad_norm": 2.6995389461517334, "learning_rate": 2.469173885874807e-06, "loss": 0.1622, "step": 13970 }, { "epoch": 2.107643600180914, "grad_norm": 3.075286865234375, "learning_rate": 2.4616133628310463e-06, "loss": 0.1867, "step": 13980 }, { "epoch": 2.1091512136288255, "grad_norm": 3.4826931953430176, "learning_rate": 2.4540606509329e-06, "loss": 0.1896, "step": 13990 }, { "epoch": 2.1106588270767377, "grad_norm": 3.0355093479156494, "learning_rate": 2.446515773421639e-06, "loss": 0.197, "step": 14000 }, { "epoch": 2.1121664405246494, "grad_norm": 3.257383108139038, "learning_rate": 2.438978753514428e-06, "loss": 0.1986, "step": 14010 }, { "epoch": 2.1136740539725616, "grad_norm": 4.074547290802002, "learning_rate": 2.431449614404248e-06, "loss": 0.1851, "step": 14020 }, { "epoch": 2.1151816674204733, "grad_norm": 4.327542781829834, "learning_rate": 2.423928379259835e-06, "loss": 0.1783, "step": 14030 }, { "epoch": 2.1166892808683855, "grad_norm": 3.1360273361206055, "learning_rate": 2.4164150712255996e-06, "loss": 0.2145, "step": 14040 }, { "epoch": 2.118196894316297, "grad_norm": 2.0761067867279053, "learning_rate": 2.408909713421556e-06, "loss": 0.213, "step": 14050 }, { "epoch": 2.1197045077642094, "grad_norm": 2.7790839672088623, "learning_rate": 2.401412328943259e-06, "loss": 0.1921, "step": 14060 }, { "epoch": 2.121212121212121, "grad_norm": 3.4575977325439453, "learning_rate": 2.3939229408617254e-06, "loss": 0.1789, "step": 14070 }, { "epoch": 2.1227197346600333, "grad_norm": 3.0412304401397705, "learning_rate": 2.3864415722233666e-06, "loss": 0.2381, "step": 14080 }, { "epoch": 2.124227348107945, "grad_norm": 3.299269914627075, "learning_rate": 2.3789682460499146e-06, "loss": 0.1576, "step": 14090 }, { "epoch": 2.125734961555857, "grad_norm": 3.055762529373169, "learning_rate": 2.371502985338356e-06, "loss": 0.2246, "step": 14100 }, { "epoch": 2.127242575003769, "grad_norm": 3.1013965606689453, "learning_rate": 2.364045813060857e-06, "loss": 0.1726, "step": 14110 }, { "epoch": 2.128750188451681, "grad_norm": 2.3734488487243652, "learning_rate": 2.35659675216469e-06, "loss": 0.1981, "step": 14120 }, { "epoch": 2.130257801899593, "grad_norm": 2.855130672454834, "learning_rate": 2.3491558255721725e-06, "loss": 0.2089, "step": 14130 }, { "epoch": 2.131765415347505, "grad_norm": 2.8625171184539795, "learning_rate": 2.341723056180589e-06, "loss": 0.2682, "step": 14140 }, { "epoch": 2.1332730287954167, "grad_norm": 3.3017189502716064, "learning_rate": 2.334298466862123e-06, "loss": 0.1842, "step": 14150 }, { "epoch": 2.134780642243329, "grad_norm": 4.073229789733887, "learning_rate": 2.3268820804637848e-06, "loss": 0.1392, "step": 14160 }, { "epoch": 2.1362882556912406, "grad_norm": 2.8767499923706055, "learning_rate": 2.319473919807347e-06, "loss": 0.1752, "step": 14170 }, { "epoch": 2.1377958691391528, "grad_norm": 2.8563263416290283, "learning_rate": 2.3120740076892613e-06, "loss": 0.2066, "step": 14180 }, { "epoch": 2.1393034825870645, "grad_norm": 2.519503593444824, "learning_rate": 2.304682366880605e-06, "loss": 0.2102, "step": 14190 }, { "epoch": 2.1408110960349767, "grad_norm": 2.1089539527893066, "learning_rate": 2.297299020127e-06, "loss": 0.2051, "step": 14200 }, { "epoch": 2.1423187094828884, "grad_norm": 3.057185411453247, "learning_rate": 2.2899239901485453e-06, "loss": 0.202, "step": 14210 }, { "epoch": 2.1438263229308006, "grad_norm": 2.8150668144226074, "learning_rate": 2.2825572996397476e-06, "loss": 0.2467, "step": 14220 }, { "epoch": 2.1453339363787123, "grad_norm": 2.8598833084106445, "learning_rate": 2.275198971269452e-06, "loss": 0.1636, "step": 14230 }, { "epoch": 2.1468415498266245, "grad_norm": 5.480029106140137, "learning_rate": 2.2678490276807737e-06, "loss": 0.1756, "step": 14240 }, { "epoch": 2.1483491632745366, "grad_norm": 2.0345752239227295, "learning_rate": 2.2605074914910173e-06, "loss": 0.191, "step": 14250 }, { "epoch": 2.1498567767224483, "grad_norm": 2.610340118408203, "learning_rate": 2.253174385291627e-06, "loss": 0.2229, "step": 14260 }, { "epoch": 2.1513643901703605, "grad_norm": 1.945361614227295, "learning_rate": 2.2458497316481e-06, "loss": 0.1674, "step": 14270 }, { "epoch": 2.1528720036182722, "grad_norm": 1.876328468322754, "learning_rate": 2.238533553099924e-06, "loss": 0.1725, "step": 14280 }, { "epoch": 2.1543796170661844, "grad_norm": 2.879284143447876, "learning_rate": 2.23122587216051e-06, "loss": 0.1691, "step": 14290 }, { "epoch": 2.155887230514096, "grad_norm": 3.360105037689209, "learning_rate": 2.2239267113171177e-06, "loss": 0.1632, "step": 14300 }, { "epoch": 2.1573948439620083, "grad_norm": 2.241434097290039, "learning_rate": 2.2166360930307863e-06, "loss": 0.1886, "step": 14310 }, { "epoch": 2.15890245740992, "grad_norm": 3.1058220863342285, "learning_rate": 2.2093540397362718e-06, "loss": 0.1813, "step": 14320 }, { "epoch": 2.160410070857832, "grad_norm": 2.3794758319854736, "learning_rate": 2.202080573841972e-06, "loss": 0.18, "step": 14330 }, { "epoch": 2.161917684305744, "grad_norm": 3.1874425411224365, "learning_rate": 2.1948157177298602e-06, "loss": 0.1878, "step": 14340 }, { "epoch": 2.163425297753656, "grad_norm": 5.301786422729492, "learning_rate": 2.1875594937554157e-06, "loss": 0.2684, "step": 14350 }, { "epoch": 2.164932911201568, "grad_norm": 2.7101266384124756, "learning_rate": 2.180311924247556e-06, "loss": 0.1805, "step": 14360 }, { "epoch": 2.16644052464948, "grad_norm": 2.4568824768066406, "learning_rate": 2.1730730315085625e-06, "loss": 0.2013, "step": 14370 }, { "epoch": 2.1679481380973917, "grad_norm": 3.2412123680114746, "learning_rate": 2.16584283781402e-06, "loss": 0.1553, "step": 14380 }, { "epoch": 2.169455751545304, "grad_norm": 2.761200428009033, "learning_rate": 2.1586213654127446e-06, "loss": 0.2366, "step": 14390 }, { "epoch": 2.1709633649932156, "grad_norm": 3.1093828678131104, "learning_rate": 2.151408636526715e-06, "loss": 0.231, "step": 14400 }, { "epoch": 2.172470978441128, "grad_norm": 2.657222270965576, "learning_rate": 2.144204673351003e-06, "loss": 0.204, "step": 14410 }, { "epoch": 2.1739785918890395, "grad_norm": 3.4127113819122314, "learning_rate": 2.137009498053709e-06, "loss": 0.1526, "step": 14420 }, { "epoch": 2.1754862053369517, "grad_norm": 2.9945461750030518, "learning_rate": 2.129823132775887e-06, "loss": 0.1723, "step": 14430 }, { "epoch": 2.1769938187848634, "grad_norm": 3.8545806407928467, "learning_rate": 2.122645599631484e-06, "loss": 0.1925, "step": 14440 }, { "epoch": 2.1785014322327756, "grad_norm": 2.5218615531921387, "learning_rate": 2.1154769207072675e-06, "loss": 0.2118, "step": 14450 }, { "epoch": 2.1800090456806873, "grad_norm": 3.1691579818725586, "learning_rate": 2.1083171180627603e-06, "loss": 0.2199, "step": 14460 }, { "epoch": 2.1815166591285995, "grad_norm": 3.506556272506714, "learning_rate": 2.101166213730171e-06, "loss": 0.2697, "step": 14470 }, { "epoch": 2.183024272576511, "grad_norm": 3.476484537124634, "learning_rate": 2.0940242297143223e-06, "loss": 0.2163, "step": 14480 }, { "epoch": 2.1845318860244234, "grad_norm": 2.2501580715179443, "learning_rate": 2.0868911879925903e-06, "loss": 0.2466, "step": 14490 }, { "epoch": 2.186039499472335, "grad_norm": 3.4930546283721924, "learning_rate": 2.079767110514835e-06, "loss": 0.205, "step": 14500 }, { "epoch": 2.1875471129202473, "grad_norm": 3.0154542922973633, "learning_rate": 2.0726520192033288e-06, "loss": 0.1871, "step": 14510 }, { "epoch": 2.189054726368159, "grad_norm": 1.9023457765579224, "learning_rate": 2.0655459359526946e-06, "loss": 0.2413, "step": 14520 }, { "epoch": 2.190562339816071, "grad_norm": 2.928551435470581, "learning_rate": 2.0584488826298348e-06, "loss": 0.217, "step": 14530 }, { "epoch": 2.192069953263983, "grad_norm": 2.691746473312378, "learning_rate": 2.0513608810738607e-06, "loss": 0.1511, "step": 14540 }, { "epoch": 2.193577566711895, "grad_norm": 4.4917402267456055, "learning_rate": 2.0442819530960346e-06, "loss": 0.2268, "step": 14550 }, { "epoch": 2.195085180159807, "grad_norm": 3.4771831035614014, "learning_rate": 2.0372121204796956e-06, "loss": 0.2204, "step": 14560 }, { "epoch": 2.196592793607719, "grad_norm": 1.9381318092346191, "learning_rate": 2.030151404980195e-06, "loss": 0.1525, "step": 14570 }, { "epoch": 2.198100407055631, "grad_norm": 4.554322242736816, "learning_rate": 2.0230998283248286e-06, "loss": 0.1391, "step": 14580 }, { "epoch": 2.199608020503543, "grad_norm": 3.143808364868164, "learning_rate": 2.0160574122127714e-06, "loss": 0.257, "step": 14590 }, { "epoch": 2.201115633951455, "grad_norm": 2.2741968631744385, "learning_rate": 2.009024178315004e-06, "loss": 0.178, "step": 14600 }, { "epoch": 2.2026232473993668, "grad_norm": 2.0042724609375, "learning_rate": 2.002000148274258e-06, "loss": 0.1583, "step": 14610 }, { "epoch": 2.204130860847279, "grad_norm": 2.273075580596924, "learning_rate": 1.994985343704939e-06, "loss": 0.219, "step": 14620 }, { "epoch": 2.2056384742951907, "grad_norm": 3.0473814010620117, "learning_rate": 1.9879797861930668e-06, "loss": 0.2075, "step": 14630 }, { "epoch": 2.207146087743103, "grad_norm": 3.727538585662842, "learning_rate": 1.980983497296203e-06, "loss": 0.1816, "step": 14640 }, { "epoch": 2.2086537011910146, "grad_norm": 3.1143639087677, "learning_rate": 1.973996498543392e-06, "loss": 0.1481, "step": 14650 }, { "epoch": 2.2101613146389267, "grad_norm": 2.9430434703826904, "learning_rate": 1.967018811435083e-06, "loss": 0.1729, "step": 14660 }, { "epoch": 2.2116689280868385, "grad_norm": 2.8585591316223145, "learning_rate": 1.9600504574430777e-06, "loss": 0.2014, "step": 14670 }, { "epoch": 2.2131765415347506, "grad_norm": 5.699854373931885, "learning_rate": 1.9530914580104576e-06, "loss": 0.1777, "step": 14680 }, { "epoch": 2.2146841549826624, "grad_norm": 3.5599498748779297, "learning_rate": 1.9461418345515152e-06, "loss": 0.2251, "step": 14690 }, { "epoch": 2.2161917684305745, "grad_norm": 2.680243492126465, "learning_rate": 1.9392016084516934e-06, "loss": 0.2109, "step": 14700 }, { "epoch": 2.2176993818784863, "grad_norm": 3.1329753398895264, "learning_rate": 1.932270801067519e-06, "loss": 0.217, "step": 14710 }, { "epoch": 2.2192069953263984, "grad_norm": 2.932131290435791, "learning_rate": 1.925349433726529e-06, "loss": 0.1492, "step": 14720 }, { "epoch": 2.22071460877431, "grad_norm": 2.9370596408843994, "learning_rate": 1.918437527727217e-06, "loss": 0.195, "step": 14730 }, { "epoch": 2.2222222222222223, "grad_norm": 2.766899585723877, "learning_rate": 1.911535104338962e-06, "loss": 0.1636, "step": 14740 }, { "epoch": 2.223729835670134, "grad_norm": 2.5094223022460938, "learning_rate": 1.904642184801961e-06, "loss": 0.2154, "step": 14750 }, { "epoch": 2.225237449118046, "grad_norm": 2.1278772354125977, "learning_rate": 1.8977587903271666e-06, "loss": 0.2503, "step": 14760 }, { "epoch": 2.226745062565958, "grad_norm": 4.967206954956055, "learning_rate": 1.8908849420962223e-06, "loss": 0.1699, "step": 14770 }, { "epoch": 2.22825267601387, "grad_norm": 2.68778133392334, "learning_rate": 1.8840206612613903e-06, "loss": 0.2034, "step": 14780 }, { "epoch": 2.229760289461782, "grad_norm": 1.894081950187683, "learning_rate": 1.877165968945498e-06, "loss": 0.2106, "step": 14790 }, { "epoch": 2.231267902909694, "grad_norm": 2.9423828125, "learning_rate": 1.8703208862418648e-06, "loss": 0.2127, "step": 14800 }, { "epoch": 2.2327755163576057, "grad_norm": 2.6027657985687256, "learning_rate": 1.8634854342142395e-06, "loss": 0.1513, "step": 14810 }, { "epoch": 2.234283129805518, "grad_norm": 4.208071708679199, "learning_rate": 1.8566596338967353e-06, "loss": 0.2182, "step": 14820 }, { "epoch": 2.2357907432534296, "grad_norm": 2.5634965896606445, "learning_rate": 1.8498435062937626e-06, "loss": 0.16, "step": 14830 }, { "epoch": 2.237298356701342, "grad_norm": 3.174802780151367, "learning_rate": 1.8430370723799707e-06, "loss": 0.1696, "step": 14840 }, { "epoch": 2.2388059701492535, "grad_norm": 3.9306371212005615, "learning_rate": 1.8362403531001772e-06, "loss": 0.1941, "step": 14850 }, { "epoch": 2.2403135835971657, "grad_norm": 1.920767903327942, "learning_rate": 1.829453369369306e-06, "loss": 0.1622, "step": 14860 }, { "epoch": 2.241821197045078, "grad_norm": 2.751326560974121, "learning_rate": 1.8226761420723243e-06, "loss": 0.2349, "step": 14870 }, { "epoch": 2.2433288104929896, "grad_norm": 2.1145246028900146, "learning_rate": 1.8159086920641755e-06, "loss": 0.1762, "step": 14880 }, { "epoch": 2.2448364239409013, "grad_norm": 2.433253049850464, "learning_rate": 1.8091510401697137e-06, "loss": 0.2573, "step": 14890 }, { "epoch": 2.2463440373888135, "grad_norm": 3.0133955478668213, "learning_rate": 1.8024032071836456e-06, "loss": 0.1839, "step": 14900 }, { "epoch": 2.2478516508367257, "grad_norm": 4.035202980041504, "learning_rate": 1.7956652138704629e-06, "loss": 0.1727, "step": 14910 }, { "epoch": 2.2493592642846374, "grad_norm": 3.1524760723114014, "learning_rate": 1.7889370809643764e-06, "loss": 0.2251, "step": 14920 }, { "epoch": 2.2508668777325496, "grad_norm": 3.831266403198242, "learning_rate": 1.7822188291692572e-06, "loss": 0.1776, "step": 14930 }, { "epoch": 2.2523744911804613, "grad_norm": 2.844193696975708, "learning_rate": 1.7755104791585686e-06, "loss": 0.2809, "step": 14940 }, { "epoch": 2.2538821046283735, "grad_norm": 2.97367525100708, "learning_rate": 1.7688120515753021e-06, "loss": 0.1942, "step": 14950 }, { "epoch": 2.255389718076285, "grad_norm": 2.5909624099731445, "learning_rate": 1.7621235670319192e-06, "loss": 0.2339, "step": 14960 }, { "epoch": 2.2568973315241974, "grad_norm": 3.2207963466644287, "learning_rate": 1.755445046110283e-06, "loss": 0.1934, "step": 14970 }, { "epoch": 2.258404944972109, "grad_norm": 2.899423360824585, "learning_rate": 1.748776509361596e-06, "loss": 0.1731, "step": 14980 }, { "epoch": 2.2599125584200213, "grad_norm": 2.14694881439209, "learning_rate": 1.742117977306339e-06, "loss": 0.1872, "step": 14990 }, { "epoch": 2.261420171867933, "grad_norm": 4.132833003997803, "learning_rate": 1.7354694704342062e-06, "loss": 0.1984, "step": 15000 }, { "epoch": 2.262927785315845, "grad_norm": 3.723544120788574, "learning_rate": 1.7288310092040378e-06, "loss": 0.2114, "step": 15010 }, { "epoch": 2.264435398763757, "grad_norm": 2.4816057682037354, "learning_rate": 1.7222026140437676e-06, "loss": 0.1728, "step": 15020 }, { "epoch": 2.265943012211669, "grad_norm": 3.4777331352233887, "learning_rate": 1.71558430535035e-06, "loss": 0.1775, "step": 15030 }, { "epoch": 2.2674506256595808, "grad_norm": 3.021200180053711, "learning_rate": 1.7089761034897035e-06, "loss": 0.1853, "step": 15040 }, { "epoch": 2.268958239107493, "grad_norm": 2.9010074138641357, "learning_rate": 1.7023780287966441e-06, "loss": 0.1588, "step": 15050 }, { "epoch": 2.2704658525554047, "grad_norm": 3.6851589679718018, "learning_rate": 1.6957901015748274e-06, "loss": 0.2069, "step": 15060 }, { "epoch": 2.271973466003317, "grad_norm": 2.920166254043579, "learning_rate": 1.6892123420966771e-06, "loss": 0.2187, "step": 15070 }, { "epoch": 2.2734810794512286, "grad_norm": 4.041018009185791, "learning_rate": 1.682644770603334e-06, "loss": 0.2008, "step": 15080 }, { "epoch": 2.2749886928991407, "grad_norm": 3.2905325889587402, "learning_rate": 1.6760874073045864e-06, "loss": 0.1958, "step": 15090 }, { "epoch": 2.2764963063470525, "grad_norm": 3.1345436573028564, "learning_rate": 1.6695402723788108e-06, "loss": 0.1624, "step": 15100 }, { "epoch": 2.2780039197949646, "grad_norm": 2.308281898498535, "learning_rate": 1.6630033859729079e-06, "loss": 0.1765, "step": 15110 }, { "epoch": 2.2795115332428764, "grad_norm": 3.72396183013916, "learning_rate": 1.656476768202243e-06, "loss": 0.2254, "step": 15120 }, { "epoch": 2.2810191466907885, "grad_norm": 2.2301063537597656, "learning_rate": 1.6499604391505796e-06, "loss": 0.1799, "step": 15130 }, { "epoch": 2.2825267601387003, "grad_norm": 3.410407543182373, "learning_rate": 1.6434544188700225e-06, "loss": 0.1798, "step": 15140 }, { "epoch": 2.2840343735866124, "grad_norm": 2.4204392433166504, "learning_rate": 1.6369587273809557e-06, "loss": 0.1761, "step": 15150 }, { "epoch": 2.285541987034524, "grad_norm": 2.0324671268463135, "learning_rate": 1.6304733846719772e-06, "loss": 0.1443, "step": 15160 }, { "epoch": 2.2870496004824363, "grad_norm": 3.4651341438293457, "learning_rate": 1.6239984106998408e-06, "loss": 0.1835, "step": 15170 }, { "epoch": 2.288557213930348, "grad_norm": 3.403531551361084, "learning_rate": 1.6175338253893946e-06, "loss": 0.2077, "step": 15180 }, { "epoch": 2.2900648273782602, "grad_norm": 3.3289942741394043, "learning_rate": 1.611079648633514e-06, "loss": 0.1941, "step": 15190 }, { "epoch": 2.2915724408261724, "grad_norm": 1.7043676376342773, "learning_rate": 1.60463590029305e-06, "loss": 0.1729, "step": 15200 }, { "epoch": 2.293080054274084, "grad_norm": 4.037632465362549, "learning_rate": 1.5982026001967616e-06, "loss": 0.2264, "step": 15210 }, { "epoch": 2.294587667721996, "grad_norm": 3.3121495246887207, "learning_rate": 1.5917797681412561e-06, "loss": 0.2005, "step": 15220 }, { "epoch": 2.296095281169908, "grad_norm": 2.945084810256958, "learning_rate": 1.5853674238909306e-06, "loss": 0.1847, "step": 15230 }, { "epoch": 2.29760289461782, "grad_norm": 2.6231887340545654, "learning_rate": 1.578965587177903e-06, "loss": 0.268, "step": 15240 }, { "epoch": 2.299110508065732, "grad_norm": 3.3439857959747314, "learning_rate": 1.5725742777019637e-06, "loss": 0.1622, "step": 15250 }, { "epoch": 2.300618121513644, "grad_norm": 2.4877359867095947, "learning_rate": 1.5661935151305058e-06, "loss": 0.1838, "step": 15260 }, { "epoch": 2.302125734961556, "grad_norm": 2.9858293533325195, "learning_rate": 1.5598233190984679e-06, "loss": 0.2068, "step": 15270 }, { "epoch": 2.303633348409468, "grad_norm": 3.8211748600006104, "learning_rate": 1.5534637092082733e-06, "loss": 0.2026, "step": 15280 }, { "epoch": 2.3051409618573797, "grad_norm": 3.261504650115967, "learning_rate": 1.5471147050297714e-06, "loss": 0.2236, "step": 15290 }, { "epoch": 2.306648575305292, "grad_norm": 5.558926105499268, "learning_rate": 1.540776326100169e-06, "loss": 0.2776, "step": 15300 }, { "epoch": 2.3081561887532036, "grad_norm": 2.4713661670684814, "learning_rate": 1.5344485919239838e-06, "loss": 0.1865, "step": 15310 }, { "epoch": 2.309663802201116, "grad_norm": 1.8835035562515259, "learning_rate": 1.5281315219729748e-06, "loss": 0.2186, "step": 15320 }, { "epoch": 2.3111714156490275, "grad_norm": 3.412154197692871, "learning_rate": 1.5218251356860852e-06, "loss": 0.2005, "step": 15330 }, { "epoch": 2.3126790290969397, "grad_norm": 2.4885752201080322, "learning_rate": 1.5155294524693814e-06, "loss": 0.1957, "step": 15340 }, { "epoch": 2.3141866425448514, "grad_norm": 2.6715471744537354, "learning_rate": 1.5092444916959975e-06, "loss": 0.2015, "step": 15350 }, { "epoch": 2.3156942559927636, "grad_norm": 3.0562663078308105, "learning_rate": 1.5029702727060663e-06, "loss": 0.1537, "step": 15360 }, { "epoch": 2.3172018694406753, "grad_norm": 3.302978992462158, "learning_rate": 1.496706814806671e-06, "loss": 0.1608, "step": 15370 }, { "epoch": 2.3187094828885875, "grad_norm": 2.7377941608428955, "learning_rate": 1.4904541372717796e-06, "loss": 0.1947, "step": 15380 }, { "epoch": 2.320217096336499, "grad_norm": 3.515874147415161, "learning_rate": 1.4842122593421854e-06, "loss": 0.179, "step": 15390 }, { "epoch": 2.3217247097844114, "grad_norm": 3.5612196922302246, "learning_rate": 1.4779812002254506e-06, "loss": 0.2295, "step": 15400 }, { "epoch": 2.323232323232323, "grad_norm": 1.9921789169311523, "learning_rate": 1.4717609790958465e-06, "loss": 0.2594, "step": 15410 }, { "epoch": 2.3247399366802353, "grad_norm": 2.7102062702178955, "learning_rate": 1.4655516150942882e-06, "loss": 0.2297, "step": 15420 }, { "epoch": 2.326247550128147, "grad_norm": 3.493344783782959, "learning_rate": 1.459353127328288e-06, "loss": 0.1986, "step": 15430 }, { "epoch": 2.327755163576059, "grad_norm": 3.4780454635620117, "learning_rate": 1.4531655348718854e-06, "loss": 0.2154, "step": 15440 }, { "epoch": 2.329262777023971, "grad_norm": 2.996227979660034, "learning_rate": 1.4469888567655949e-06, "loss": 0.182, "step": 15450 }, { "epoch": 2.330770390471883, "grad_norm": 3.671422243118286, "learning_rate": 1.4408231120163442e-06, "loss": 0.2114, "step": 15460 }, { "epoch": 2.332278003919795, "grad_norm": 1.9382660388946533, "learning_rate": 1.4346683195974186e-06, "loss": 0.178, "step": 15470 }, { "epoch": 2.333785617367707, "grad_norm": 2.939931631088257, "learning_rate": 1.4285244984483965e-06, "loss": 0.1834, "step": 15480 }, { "epoch": 2.3352932308156187, "grad_norm": 2.358708381652832, "learning_rate": 1.4223916674750998e-06, "loss": 0.1844, "step": 15490 }, { "epoch": 2.336800844263531, "grad_norm": 3.649674654006958, "learning_rate": 1.4162698455495294e-06, "loss": 0.1997, "step": 15500 }, { "epoch": 2.3383084577114426, "grad_norm": 2.6348838806152344, "learning_rate": 1.4101590515098095e-06, "loss": 0.1667, "step": 15510 }, { "epoch": 2.3398160711593547, "grad_norm": 4.104774475097656, "learning_rate": 1.4040593041601297e-06, "loss": 0.2003, "step": 15520 }, { "epoch": 2.341323684607267, "grad_norm": 2.317133903503418, "learning_rate": 1.397970622270687e-06, "loss": 0.2219, "step": 15530 }, { "epoch": 2.3428312980551786, "grad_norm": 2.354557752609253, "learning_rate": 1.391893024577623e-06, "loss": 0.1563, "step": 15540 }, { "epoch": 2.3443389115030904, "grad_norm": 3.1752395629882812, "learning_rate": 1.385826529782977e-06, "loss": 0.2027, "step": 15550 }, { "epoch": 2.3458465249510025, "grad_norm": 2.943479537963867, "learning_rate": 1.3797711565546195e-06, "loss": 0.1475, "step": 15560 }, { "epoch": 2.3473541383989147, "grad_norm": 3.0597429275512695, "learning_rate": 1.3737269235261991e-06, "loss": 0.1709, "step": 15570 }, { "epoch": 2.3488617518468264, "grad_norm": 7.190746307373047, "learning_rate": 1.3676938492970827e-06, "loss": 0.2335, "step": 15580 }, { "epoch": 2.3503693652947386, "grad_norm": 3.301384449005127, "learning_rate": 1.3616719524322974e-06, "loss": 0.2333, "step": 15590 }, { "epoch": 2.3518769787426503, "grad_norm": 3.7778255939483643, "learning_rate": 1.3556612514624778e-06, "loss": 0.172, "step": 15600 }, { "epoch": 2.3533845921905625, "grad_norm": 3.225708484649658, "learning_rate": 1.3496617648838056e-06, "loss": 0.1561, "step": 15610 }, { "epoch": 2.3548922056384742, "grad_norm": 4.238531112670898, "learning_rate": 1.3436735111579542e-06, "loss": 0.2095, "step": 15620 }, { "epoch": 2.3563998190863864, "grad_norm": 2.560643434524536, "learning_rate": 1.3376965087120296e-06, "loss": 0.1506, "step": 15630 }, { "epoch": 2.357907432534298, "grad_norm": 2.4668755531311035, "learning_rate": 1.3317307759385185e-06, "loss": 0.2376, "step": 15640 }, { "epoch": 2.3594150459822103, "grad_norm": 3.1670868396759033, "learning_rate": 1.3257763311952225e-06, "loss": 0.1805, "step": 15650 }, { "epoch": 2.360922659430122, "grad_norm": 5.149980068206787, "learning_rate": 1.3198331928052128e-06, "loss": 0.2356, "step": 15660 }, { "epoch": 2.362430272878034, "grad_norm": 3.4360740184783936, "learning_rate": 1.3139013790567667e-06, "loss": 0.1937, "step": 15670 }, { "epoch": 2.363937886325946, "grad_norm": 3.152125120162964, "learning_rate": 1.307980908203314e-06, "loss": 0.2276, "step": 15680 }, { "epoch": 2.365445499773858, "grad_norm": 2.3850903511047363, "learning_rate": 1.3020717984633795e-06, "loss": 0.1689, "step": 15690 }, { "epoch": 2.36695311322177, "grad_norm": 3.3965256214141846, "learning_rate": 1.2961740680205286e-06, "loss": 0.1963, "step": 15700 }, { "epoch": 2.368460726669682, "grad_norm": 2.730005979537964, "learning_rate": 1.2902877350233061e-06, "loss": 0.2173, "step": 15710 }, { "epoch": 2.3699683401175937, "grad_norm": 2.1236581802368164, "learning_rate": 1.2844128175851894e-06, "loss": 0.2409, "step": 15720 }, { "epoch": 2.371475953565506, "grad_norm": 2.288994312286377, "learning_rate": 1.2785493337845256e-06, "loss": 0.1686, "step": 15730 }, { "epoch": 2.3729835670134176, "grad_norm": 2.3982362747192383, "learning_rate": 1.2726973016644785e-06, "loss": 0.2302, "step": 15740 }, { "epoch": 2.37449118046133, "grad_norm": 3.291775703430176, "learning_rate": 1.2668567392329728e-06, "loss": 0.1668, "step": 15750 }, { "epoch": 2.3759987939092415, "grad_norm": 2.3893113136291504, "learning_rate": 1.2610276644626401e-06, "loss": 0.1729, "step": 15760 }, { "epoch": 2.3775064073571537, "grad_norm": 3.3729090690612793, "learning_rate": 1.255210095290757e-06, "loss": 0.2355, "step": 15770 }, { "epoch": 2.3790140208050654, "grad_norm": 2.8152525424957275, "learning_rate": 1.2494040496192e-06, "loss": 0.1914, "step": 15780 }, { "epoch": 2.3805216342529776, "grad_norm": 4.163690567016602, "learning_rate": 1.2436095453143837e-06, "loss": 0.189, "step": 15790 }, { "epoch": 2.3820292477008893, "grad_norm": 3.549351930618286, "learning_rate": 1.2378266002072075e-06, "loss": 0.1955, "step": 15800 }, { "epoch": 2.3835368611488015, "grad_norm": 3.5292084217071533, "learning_rate": 1.2320552320930013e-06, "loss": 0.2, "step": 15810 }, { "epoch": 2.3850444745967136, "grad_norm": 3.3973467350006104, "learning_rate": 1.2262954587314702e-06, "loss": 0.1931, "step": 15820 }, { "epoch": 2.3865520880446254, "grad_norm": 3.481182813644409, "learning_rate": 1.2205472978466371e-06, "loss": 0.1935, "step": 15830 }, { "epoch": 2.388059701492537, "grad_norm": 2.094430685043335, "learning_rate": 1.2148107671267945e-06, "loss": 0.1782, "step": 15840 }, { "epoch": 2.3895673149404493, "grad_norm": 3.4120049476623535, "learning_rate": 1.2090858842244447e-06, "loss": 0.2007, "step": 15850 }, { "epoch": 2.3910749283883614, "grad_norm": 2.9854838848114014, "learning_rate": 1.2033726667562473e-06, "loss": 0.1939, "step": 15860 }, { "epoch": 2.392582541836273, "grad_norm": 2.0431244373321533, "learning_rate": 1.197671132302966e-06, "loss": 0.2322, "step": 15870 }, { "epoch": 2.394090155284185, "grad_norm": 3.0406241416931152, "learning_rate": 1.1919812984094137e-06, "loss": 0.1992, "step": 15880 }, { "epoch": 2.395597768732097, "grad_norm": 3.455282211303711, "learning_rate": 1.186303182584394e-06, "loss": 0.1839, "step": 15890 }, { "epoch": 2.3971053821800092, "grad_norm": 1.7336950302124023, "learning_rate": 1.1806368023006565e-06, "loss": 0.1858, "step": 15900 }, { "epoch": 2.398612995627921, "grad_norm": 3.3505141735076904, "learning_rate": 1.1749821749948354e-06, "loss": 0.2046, "step": 15910 }, { "epoch": 2.400120609075833, "grad_norm": 3.884289264678955, "learning_rate": 1.1693393180674006e-06, "loss": 0.1602, "step": 15920 }, { "epoch": 2.401628222523745, "grad_norm": 3.23075008392334, "learning_rate": 1.1637082488826007e-06, "loss": 0.1832, "step": 15930 }, { "epoch": 2.403135835971657, "grad_norm": 3.2060842514038086, "learning_rate": 1.1580889847684118e-06, "loss": 0.1983, "step": 15940 }, { "epoch": 2.4046434494195688, "grad_norm": 3.157905101776123, "learning_rate": 1.152481543016481e-06, "loss": 0.1665, "step": 15950 }, { "epoch": 2.406151062867481, "grad_norm": 2.756596088409424, "learning_rate": 1.1468859408820777e-06, "loss": 0.1952, "step": 15960 }, { "epoch": 2.4076586763153927, "grad_norm": 2.544299364089966, "learning_rate": 1.1413021955840375e-06, "loss": 0.1693, "step": 15970 }, { "epoch": 2.409166289763305, "grad_norm": 2.962543249130249, "learning_rate": 1.135730324304712e-06, "loss": 0.1959, "step": 15980 }, { "epoch": 2.4106739032112166, "grad_norm": 2.7918922901153564, "learning_rate": 1.130170344189911e-06, "loss": 0.195, "step": 15990 }, { "epoch": 2.4121815166591287, "grad_norm": 3.431638717651367, "learning_rate": 1.124622272348856e-06, "loss": 0.2022, "step": 16000 }, { "epoch": 2.4136891301070404, "grad_norm": 2.9055063724517822, "learning_rate": 1.1190861258541192e-06, "loss": 0.2534, "step": 16010 }, { "epoch": 2.4151967435549526, "grad_norm": 2.6904940605163574, "learning_rate": 1.1135619217415806e-06, "loss": 0.1929, "step": 16020 }, { "epoch": 2.4167043570028643, "grad_norm": 3.671067953109741, "learning_rate": 1.1080496770103693e-06, "loss": 0.2298, "step": 16030 }, { "epoch": 2.4182119704507765, "grad_norm": 3.0274817943573, "learning_rate": 1.102549408622814e-06, "loss": 0.1847, "step": 16040 }, { "epoch": 2.4197195838986882, "grad_norm": 2.6310391426086426, "learning_rate": 1.0970611335043884e-06, "loss": 0.187, "step": 16050 }, { "epoch": 2.4212271973466004, "grad_norm": 4.190754413604736, "learning_rate": 1.0915848685436603e-06, "loss": 0.1567, "step": 16060 }, { "epoch": 2.422734810794512, "grad_norm": 3.104261636734009, "learning_rate": 1.0861206305922418e-06, "loss": 0.2429, "step": 16070 }, { "epoch": 2.4242424242424243, "grad_norm": 2.996833086013794, "learning_rate": 1.0806684364647313e-06, "loss": 0.175, "step": 16080 }, { "epoch": 2.425750037690336, "grad_norm": 3.076505422592163, "learning_rate": 1.0752283029386701e-06, "loss": 0.1743, "step": 16090 }, { "epoch": 2.427257651138248, "grad_norm": 2.1828794479370117, "learning_rate": 1.0698002467544837e-06, "loss": 0.1692, "step": 16100 }, { "epoch": 2.42876526458616, "grad_norm": 3.260761260986328, "learning_rate": 1.0643842846154356e-06, "loss": 0.2157, "step": 16110 }, { "epoch": 2.430272878034072, "grad_norm": 3.727475643157959, "learning_rate": 1.0589804331875708e-06, "loss": 0.1919, "step": 16120 }, { "epoch": 2.431780491481984, "grad_norm": 3.2262232303619385, "learning_rate": 1.05358870909967e-06, "loss": 0.2571, "step": 16130 }, { "epoch": 2.433288104929896, "grad_norm": 3.560070753097534, "learning_rate": 1.0482091289431918e-06, "loss": 0.166, "step": 16140 }, { "epoch": 2.434795718377808, "grad_norm": 3.336533308029175, "learning_rate": 1.0428417092722277e-06, "loss": 0.1972, "step": 16150 }, { "epoch": 2.43630333182572, "grad_norm": 3.3564276695251465, "learning_rate": 1.037486466603449e-06, "loss": 0.2007, "step": 16160 }, { "epoch": 2.4378109452736316, "grad_norm": 2.6915061473846436, "learning_rate": 1.0321434174160555e-06, "loss": 0.195, "step": 16170 }, { "epoch": 2.439318558721544, "grad_norm": 5.729076862335205, "learning_rate": 1.0268125781517247e-06, "loss": 0.1844, "step": 16180 }, { "epoch": 2.440826172169456, "grad_norm": 3.3835582733154297, "learning_rate": 1.0214939652145622e-06, "loss": 0.19, "step": 16190 }, { "epoch": 2.4423337856173677, "grad_norm": 3.5197067260742188, "learning_rate": 1.016187594971052e-06, "loss": 0.1959, "step": 16200 }, { "epoch": 2.4438413990652794, "grad_norm": 2.4764926433563232, "learning_rate": 1.0108934837500001e-06, "loss": 0.2159, "step": 16210 }, { "epoch": 2.4453490125131916, "grad_norm": 2.882168769836426, "learning_rate": 1.0056116478424932e-06, "loss": 0.1773, "step": 16220 }, { "epoch": 2.4468566259611038, "grad_norm": 3.8938753604888916, "learning_rate": 1.000342103501843e-06, "loss": 0.1716, "step": 16230 }, { "epoch": 2.4483642394090155, "grad_norm": 4.689419746398926, "learning_rate": 9.950848669435381e-07, "loss": 0.2435, "step": 16240 }, { "epoch": 2.4498718528569277, "grad_norm": 3.0633020401000977, "learning_rate": 9.898399543451919e-07, "loss": 0.1846, "step": 16250 }, { "epoch": 2.4513794663048394, "grad_norm": 4.190016269683838, "learning_rate": 9.846073818464968e-07, "loss": 0.1752, "step": 16260 }, { "epoch": 2.4528870797527516, "grad_norm": 2.9920053482055664, "learning_rate": 9.79387165549171e-07, "loss": 0.1803, "step": 16270 }, { "epoch": 2.4543946932006633, "grad_norm": 3.16174054145813, "learning_rate": 9.741793215169076e-07, "loss": 0.1904, "step": 16280 }, { "epoch": 2.4559023066485755, "grad_norm": 4.421197891235352, "learning_rate": 9.689838657753314e-07, "loss": 0.1883, "step": 16290 }, { "epoch": 2.457409920096487, "grad_norm": 3.0015816688537598, "learning_rate": 9.638008143119442e-07, "loss": 0.1758, "step": 16300 }, { "epoch": 2.4589175335443993, "grad_norm": 2.9260945320129395, "learning_rate": 9.586301830760775e-07, "loss": 0.2046, "step": 16310 }, { "epoch": 2.460425146992311, "grad_norm": 3.4380767345428467, "learning_rate": 9.534719879788424e-07, "loss": 0.1755, "step": 16320 }, { "epoch": 2.4619327604402232, "grad_norm": 3.182044267654419, "learning_rate": 9.483262448930841e-07, "loss": 0.1914, "step": 16330 }, { "epoch": 2.463440373888135, "grad_norm": 2.1702449321746826, "learning_rate": 9.43192969653326e-07, "loss": 0.1799, "step": 16340 }, { "epoch": 2.464947987336047, "grad_norm": 2.454672336578369, "learning_rate": 9.380721780557284e-07, "loss": 0.1568, "step": 16350 }, { "epoch": 2.466455600783959, "grad_norm": 3.5807056427001953, "learning_rate": 9.329638858580359e-07, "loss": 0.1757, "step": 16360 }, { "epoch": 2.467963214231871, "grad_norm": 4.162808418273926, "learning_rate": 9.278681087795305e-07, "loss": 0.1978, "step": 16370 }, { "epoch": 2.4694708276797828, "grad_norm": 1.9279392957687378, "learning_rate": 9.227848625009822e-07, "loss": 0.1794, "step": 16380 }, { "epoch": 2.470978441127695, "grad_norm": 4.072895526885986, "learning_rate": 9.177141626646002e-07, "loss": 0.1608, "step": 16390 }, { "epoch": 2.4724860545756067, "grad_norm": 2.194889783859253, "learning_rate": 9.12656024873989e-07, "loss": 0.1954, "step": 16400 }, { "epoch": 2.473993668023519, "grad_norm": 2.841707944869995, "learning_rate": 9.076104646940915e-07, "loss": 0.1962, "step": 16410 }, { "epoch": 2.4755012814714306, "grad_norm": 3.3716492652893066, "learning_rate": 9.025774976511503e-07, "loss": 0.2729, "step": 16420 }, { "epoch": 2.4770088949193427, "grad_norm": 3.926175832748413, "learning_rate": 8.975571392326565e-07, "loss": 0.2128, "step": 16430 }, { "epoch": 2.4785165083672545, "grad_norm": 2.3835458755493164, "learning_rate": 8.925494048873007e-07, "loss": 0.2128, "step": 16440 }, { "epoch": 2.4800241218151666, "grad_norm": 2.5094921588897705, "learning_rate": 8.875543100249273e-07, "loss": 0.1572, "step": 16450 }, { "epoch": 2.4815317352630784, "grad_norm": 2.408616065979004, "learning_rate": 8.825718700164865e-07, "loss": 0.1834, "step": 16460 }, { "epoch": 2.4830393487109905, "grad_norm": 1.8410701751708984, "learning_rate": 8.776021001939849e-07, "loss": 0.2377, "step": 16470 }, { "epoch": 2.4845469621589027, "grad_norm": 3.5871660709381104, "learning_rate": 8.726450158504424e-07, "loss": 0.2134, "step": 16480 }, { "epoch": 2.4860545756068144, "grad_norm": 2.9731717109680176, "learning_rate": 8.677006322398424e-07, "loss": 0.2084, "step": 16490 }, { "epoch": 2.487562189054726, "grad_norm": 3.2143025398254395, "learning_rate": 8.627689645770853e-07, "loss": 0.2049, "step": 16500 }, { "epoch": 2.4890698025026383, "grad_norm": 3.17525315284729, "learning_rate": 8.578500280379426e-07, "loss": 0.2411, "step": 16510 }, { "epoch": 2.4905774159505505, "grad_norm": 3.7180213928222656, "learning_rate": 8.529438377590099e-07, "loss": 0.1707, "step": 16520 }, { "epoch": 2.492085029398462, "grad_norm": 2.140578269958496, "learning_rate": 8.480504088376563e-07, "loss": 0.1737, "step": 16530 }, { "epoch": 2.493592642846374, "grad_norm": 4.5150861740112305, "learning_rate": 8.431697563319863e-07, "loss": 0.2136, "step": 16540 }, { "epoch": 2.495100256294286, "grad_norm": 4.474433898925781, "learning_rate": 8.383018952607852e-07, "loss": 0.1758, "step": 16550 }, { "epoch": 2.4966078697421983, "grad_norm": 2.7521488666534424, "learning_rate": 8.334468406034785e-07, "loss": 0.1833, "step": 16560 }, { "epoch": 2.49811548319011, "grad_norm": 2.866765022277832, "learning_rate": 8.286046073000831e-07, "loss": 0.2003, "step": 16570 }, { "epoch": 2.499623096638022, "grad_norm": 4.549286842346191, "learning_rate": 8.237752102511632e-07, "loss": 0.1724, "step": 16580 }, { "epoch": 2.501130710085934, "grad_norm": 2.998640775680542, "learning_rate": 8.189586643177783e-07, "loss": 0.2445, "step": 16590 }, { "epoch": 2.502638323533846, "grad_norm": 2.847081422805786, "learning_rate": 8.14154984321448e-07, "loss": 0.1531, "step": 16600 }, { "epoch": 2.504145936981758, "grad_norm": 3.6362550258636475, "learning_rate": 8.09364185044098e-07, "loss": 0.236, "step": 16610 }, { "epoch": 2.50565355042967, "grad_norm": 2.4879753589630127, "learning_rate": 8.045862812280181e-07, "loss": 0.206, "step": 16620 }, { "epoch": 2.5071611638775817, "grad_norm": 2.800388813018799, "learning_rate": 7.998212875758149e-07, "loss": 0.2262, "step": 16630 }, { "epoch": 2.508668777325494, "grad_norm": 2.5295913219451904, "learning_rate": 7.950692187503717e-07, "loss": 0.2245, "step": 16640 }, { "epoch": 2.5101763907734056, "grad_norm": 3.5858139991760254, "learning_rate": 7.903300893747922e-07, "loss": 0.1744, "step": 16650 }, { "epoch": 2.5116840042213178, "grad_norm": 3.4839518070220947, "learning_rate": 7.856039140323685e-07, "loss": 0.2413, "step": 16660 }, { "epoch": 2.5131916176692295, "grad_norm": 2.684366464614868, "learning_rate": 7.808907072665295e-07, "loss": 0.1666, "step": 16670 }, { "epoch": 2.5146992311171417, "grad_norm": 2.633725881576538, "learning_rate": 7.761904835807954e-07, "loss": 0.1773, "step": 16680 }, { "epoch": 2.5162068445650534, "grad_norm": 2.4121763706207275, "learning_rate": 7.715032574387366e-07, "loss": 0.267, "step": 16690 }, { "epoch": 2.5177144580129656, "grad_norm": 2.6491031646728516, "learning_rate": 7.668290432639274e-07, "loss": 0.2324, "step": 16700 }, { "epoch": 2.5192220714608773, "grad_norm": 2.9321720600128174, "learning_rate": 7.621678554398981e-07, "loss": 0.2188, "step": 16710 }, { "epoch": 2.5207296849087895, "grad_norm": 2.7587833404541016, "learning_rate": 7.575197083100977e-07, "loss": 0.1906, "step": 16720 }, { "epoch": 2.522237298356701, "grad_norm": 1.9733242988586426, "learning_rate": 7.528846161778458e-07, "loss": 0.165, "step": 16730 }, { "epoch": 2.5237449118046134, "grad_norm": 1.9545135498046875, "learning_rate": 7.482625933062881e-07, "loss": 0.169, "step": 16740 }, { "epoch": 2.525252525252525, "grad_norm": 1.9173047542572021, "learning_rate": 7.436536539183559e-07, "loss": 0.1913, "step": 16750 }, { "epoch": 2.5267601387004373, "grad_norm": 2.7444396018981934, "learning_rate": 7.390578121967146e-07, "loss": 0.1864, "step": 16760 }, { "epoch": 2.5282677521483494, "grad_norm": 3.399019956588745, "learning_rate": 7.344750822837298e-07, "loss": 0.2116, "step": 16770 }, { "epoch": 2.529775365596261, "grad_norm": 2.2537262439727783, "learning_rate": 7.299054782814186e-07, "loss": 0.1511, "step": 16780 }, { "epoch": 2.531282979044173, "grad_norm": 3.1940770149230957, "learning_rate": 7.253490142514069e-07, "loss": 0.2088, "step": 16790 }, { "epoch": 2.532790592492085, "grad_norm": 2.137446165084839, "learning_rate": 7.208057042148847e-07, "loss": 0.2058, "step": 16800 }, { "epoch": 2.534298205939997, "grad_norm": 4.086893558502197, "learning_rate": 7.16275562152568e-07, "loss": 0.1952, "step": 16810 }, { "epoch": 2.535805819387909, "grad_norm": 3.555161237716675, "learning_rate": 7.117586020046457e-07, "loss": 0.2222, "step": 16820 }, { "epoch": 2.5373134328358207, "grad_norm": 2.4928367137908936, "learning_rate": 7.0725483767075e-07, "loss": 0.2103, "step": 16830 }, { "epoch": 2.538821046283733, "grad_norm": 3.169013738632202, "learning_rate": 7.027642830099025e-07, "loss": 0.2247, "step": 16840 }, { "epoch": 2.540328659731645, "grad_norm": 3.64982533454895, "learning_rate": 6.982869518404783e-07, "loss": 0.1804, "step": 16850 }, { "epoch": 2.5418362731795567, "grad_norm": 2.793654680252075, "learning_rate": 6.938228579401601e-07, "loss": 0.2409, "step": 16860 }, { "epoch": 2.5433438866274685, "grad_norm": 2.118222236633301, "learning_rate": 6.893720150458977e-07, "loss": 0.1858, "step": 16870 }, { "epoch": 2.5448515000753806, "grad_norm": 2.9152028560638428, "learning_rate": 6.849344368538613e-07, "loss": 0.158, "step": 16880 }, { "epoch": 2.546359113523293, "grad_norm": 6.132944583892822, "learning_rate": 6.805101370194072e-07, "loss": 0.2551, "step": 16890 }, { "epoch": 2.5478667269712045, "grad_norm": 2.0241217613220215, "learning_rate": 6.76099129157029e-07, "loss": 0.1818, "step": 16900 }, { "epoch": 2.5493743404191167, "grad_norm": 4.067783355712891, "learning_rate": 6.717014268403193e-07, "loss": 0.1767, "step": 16910 }, { "epoch": 2.5508819538670284, "grad_norm": 2.9850211143493652, "learning_rate": 6.673170436019261e-07, "loss": 0.2262, "step": 16920 }, { "epoch": 2.5523895673149406, "grad_norm": 3.031498670578003, "learning_rate": 6.62945992933513e-07, "loss": 0.225, "step": 16930 }, { "epoch": 2.5538971807628523, "grad_norm": 5.537667274475098, "learning_rate": 6.585882882857131e-07, "loss": 0.2099, "step": 16940 }, { "epoch": 2.5554047942107645, "grad_norm": 3.1840193271636963, "learning_rate": 6.542439430680941e-07, "loss": 0.2493, "step": 16950 }, { "epoch": 2.5569124076586762, "grad_norm": 2.338442087173462, "learning_rate": 6.499129706491142e-07, "loss": 0.14, "step": 16960 }, { "epoch": 2.5584200211065884, "grad_norm": 3.587310552597046, "learning_rate": 6.455953843560786e-07, "loss": 0.2499, "step": 16970 }, { "epoch": 2.5599276345545, "grad_norm": 2.6882870197296143, "learning_rate": 6.412911974751029e-07, "loss": 0.1795, "step": 16980 }, { "epoch": 2.5614352480024123, "grad_norm": 3.9107022285461426, "learning_rate": 6.370004232510685e-07, "loss": 0.2018, "step": 16990 }, { "epoch": 2.562942861450324, "grad_norm": 3.055806875228882, "learning_rate": 6.327230748875824e-07, "loss": 0.215, "step": 17000 }, { "epoch": 2.564450474898236, "grad_norm": 3.0541207790374756, "learning_rate": 6.284591655469385e-07, "loss": 0.1746, "step": 17010 }, { "epoch": 2.565958088346148, "grad_norm": 4.431246280670166, "learning_rate": 6.242087083500769e-07, "loss": 0.2508, "step": 17020 }, { "epoch": 2.56746570179406, "grad_norm": 2.1575329303741455, "learning_rate": 6.199717163765412e-07, "loss": 0.1611, "step": 17030 }, { "epoch": 2.568973315241972, "grad_norm": 3.138880729675293, "learning_rate": 6.157482026644407e-07, "loss": 0.2005, "step": 17040 }, { "epoch": 2.570480928689884, "grad_norm": 2.9477691650390625, "learning_rate": 6.115381802104098e-07, "loss": 0.2033, "step": 17050 }, { "epoch": 2.5719885421377957, "grad_norm": 3.697857618331909, "learning_rate": 6.073416619695638e-07, "loss": 0.1743, "step": 17060 }, { "epoch": 2.573496155585708, "grad_norm": 5.843478679656982, "learning_rate": 6.031586608554673e-07, "loss": 0.1829, "step": 17070 }, { "epoch": 2.5750037690336196, "grad_norm": 4.003309726715088, "learning_rate": 5.989891897400874e-07, "loss": 0.2265, "step": 17080 }, { "epoch": 2.5765113824815318, "grad_norm": 2.2403392791748047, "learning_rate": 5.948332614537572e-07, "loss": 0.1826, "step": 17090 }, { "epoch": 2.578018995929444, "grad_norm": 3.054004430770874, "learning_rate": 5.906908887851376e-07, "loss": 0.251, "step": 17100 }, { "epoch": 2.5795266093773557, "grad_norm": 2.1359493732452393, "learning_rate": 5.865620844811703e-07, "loss": 0.2218, "step": 17110 }, { "epoch": 2.5810342228252674, "grad_norm": 3.832974672317505, "learning_rate": 5.824468612470508e-07, "loss": 0.2268, "step": 17120 }, { "epoch": 2.5825418362731796, "grad_norm": 2.1287362575531006, "learning_rate": 5.783452317461796e-07, "loss": 0.2239, "step": 17130 }, { "epoch": 2.5840494497210917, "grad_norm": 2.432328939437866, "learning_rate": 5.742572086001264e-07, "loss": 0.2199, "step": 17140 }, { "epoch": 2.5855570631690035, "grad_norm": 2.363590955734253, "learning_rate": 5.701828043885926e-07, "loss": 0.1838, "step": 17150 }, { "epoch": 2.587064676616915, "grad_norm": 3.697007894515991, "learning_rate": 5.66122031649371e-07, "loss": 0.2243, "step": 17160 }, { "epoch": 2.5885722900648274, "grad_norm": 2.7513370513916016, "learning_rate": 5.620749028783057e-07, "loss": 0.182, "step": 17170 }, { "epoch": 2.5900799035127395, "grad_norm": 2.2406985759735107, "learning_rate": 5.580414305292569e-07, "loss": 0.1532, "step": 17180 }, { "epoch": 2.5915875169606513, "grad_norm": 2.7594995498657227, "learning_rate": 5.540216270140619e-07, "loss": 0.1599, "step": 17190 }, { "epoch": 2.593095130408563, "grad_norm": 2.712376356124878, "learning_rate": 5.500155047024952e-07, "loss": 0.2165, "step": 17200 }, { "epoch": 2.594602743856475, "grad_norm": 3.816192388534546, "learning_rate": 5.460230759222313e-07, "loss": 0.1895, "step": 17210 }, { "epoch": 2.5961103573043873, "grad_norm": 3.641583204269409, "learning_rate": 5.420443529588082e-07, "loss": 0.2263, "step": 17220 }, { "epoch": 2.597617970752299, "grad_norm": 3.764211654663086, "learning_rate": 5.380793480555846e-07, "loss": 0.1682, "step": 17230 }, { "epoch": 2.5991255842002112, "grad_norm": 2.7292368412017822, "learning_rate": 5.3412807341371e-07, "loss": 0.1823, "step": 17240 }, { "epoch": 2.600633197648123, "grad_norm": 2.699617385864258, "learning_rate": 5.30190541192081e-07, "loss": 0.1808, "step": 17250 }, { "epoch": 2.602140811096035, "grad_norm": 4.844589710235596, "learning_rate": 5.262667635073065e-07, "loss": 0.1519, "step": 17260 }, { "epoch": 2.603648424543947, "grad_norm": 2.656907320022583, "learning_rate": 5.223567524336692e-07, "loss": 0.2174, "step": 17270 }, { "epoch": 2.605156037991859, "grad_norm": 2.0844805240631104, "learning_rate": 5.184605200030912e-07, "loss": 0.1969, "step": 17280 }, { "epoch": 2.6066636514397707, "grad_norm": 3.1713502407073975, "learning_rate": 5.145780782050902e-07, "loss": 0.1993, "step": 17290 }, { "epoch": 2.608171264887683, "grad_norm": 3.211313009262085, "learning_rate": 5.107094389867517e-07, "loss": 0.2187, "step": 17300 }, { "epoch": 2.6096788783355946, "grad_norm": 2.1587347984313965, "learning_rate": 5.068546142526864e-07, "loss": 0.1983, "step": 17310 }, { "epoch": 2.611186491783507, "grad_norm": 3.6126112937927246, "learning_rate": 5.03013615864994e-07, "loss": 0.2011, "step": 17320 }, { "epoch": 2.6126941052314185, "grad_norm": 2.9415533542633057, "learning_rate": 4.991864556432291e-07, "loss": 0.1943, "step": 17330 }, { "epoch": 2.6142017186793307, "grad_norm": 3.5860438346862793, "learning_rate": 4.953731453643629e-07, "loss": 0.1867, "step": 17340 }, { "epoch": 2.6157093321272424, "grad_norm": 4.004639625549316, "learning_rate": 4.915736967627466e-07, "loss": 0.1782, "step": 17350 }, { "epoch": 2.6172169455751546, "grad_norm": 3.0191142559051514, "learning_rate": 4.877881215300762e-07, "loss": 0.1902, "step": 17360 }, { "epoch": 2.6187245590230663, "grad_norm": 3.5242838859558105, "learning_rate": 4.840164313153583e-07, "loss": 0.1785, "step": 17370 }, { "epoch": 2.6202321724709785, "grad_norm": 3.884747266769409, "learning_rate": 4.802586377248702e-07, "loss": 0.179, "step": 17380 }, { "epoch": 2.6217397859188902, "grad_norm": 3.0600240230560303, "learning_rate": 4.7651475232212753e-07, "loss": 0.192, "step": 17390 }, { "epoch": 2.6232473993668024, "grad_norm": 3.525468587875366, "learning_rate": 4.7278478662784843e-07, "loss": 0.1922, "step": 17400 }, { "epoch": 2.624755012814714, "grad_norm": 2.718449354171753, "learning_rate": 4.6906875211991385e-07, "loss": 0.1996, "step": 17410 }, { "epoch": 2.6262626262626263, "grad_norm": 4.856651782989502, "learning_rate": 4.65366660233339e-07, "loss": 0.1974, "step": 17420 }, { "epoch": 2.6277702397105385, "grad_norm": 3.328500747680664, "learning_rate": 4.6167852236023324e-07, "loss": 0.2169, "step": 17430 }, { "epoch": 2.62927785315845, "grad_norm": 2.742598295211792, "learning_rate": 4.5800434984976717e-07, "loss": 0.1908, "step": 17440 }, { "epoch": 2.630785466606362, "grad_norm": 1.6607505083084106, "learning_rate": 4.543441540081367e-07, "loss": 0.1927, "step": 17450 }, { "epoch": 2.632293080054274, "grad_norm": 2.489805221557617, "learning_rate": 4.506979460985278e-07, "loss": 0.147, "step": 17460 }, { "epoch": 2.6338006935021863, "grad_norm": 2.024975299835205, "learning_rate": 4.470657373410836e-07, "loss": 0.212, "step": 17470 }, { "epoch": 2.635308306950098, "grad_norm": 2.2821600437164307, "learning_rate": 4.4344753891286893e-07, "loss": 0.1885, "step": 17480 }, { "epoch": 2.6368159203980097, "grad_norm": 2.9861598014831543, "learning_rate": 4.398433619478359e-07, "loss": 0.2011, "step": 17490 }, { "epoch": 2.638323533845922, "grad_norm": 2.449739933013916, "learning_rate": 4.3625321753678895e-07, "loss": 0.2183, "step": 17500 }, { "epoch": 2.639831147293834, "grad_norm": 2.7962839603424072, "learning_rate": 4.326771167273536e-07, "loss": 0.1816, "step": 17510 }, { "epoch": 2.641338760741746, "grad_norm": 2.5714094638824463, "learning_rate": 4.2911507052393607e-07, "loss": 0.1685, "step": 17520 }, { "epoch": 2.6428463741896575, "grad_norm": 2.2739298343658447, "learning_rate": 4.255670898876979e-07, "loss": 0.2009, "step": 17530 }, { "epoch": 2.6443539876375697, "grad_norm": 2.3927371501922607, "learning_rate": 4.220331857365151e-07, "loss": 0.1761, "step": 17540 }, { "epoch": 2.645861601085482, "grad_norm": 2.594801902770996, "learning_rate": 4.185133689449494e-07, "loss": 0.2234, "step": 17550 }, { "epoch": 2.6473692145333936, "grad_norm": 3.226229667663574, "learning_rate": 4.1500765034421186e-07, "loss": 0.1989, "step": 17560 }, { "epoch": 2.6488768279813057, "grad_norm": 2.45801043510437, "learning_rate": 4.115160407221308e-07, "loss": 0.2135, "step": 17570 }, { "epoch": 2.6503844414292175, "grad_norm": 2.1664884090423584, "learning_rate": 4.0803855082311663e-07, "loss": 0.2013, "step": 17580 }, { "epoch": 2.6518920548771296, "grad_norm": 3.1232314109802246, "learning_rate": 4.045751913481327e-07, "loss": 0.2194, "step": 17590 }, { "epoch": 2.6533996683250414, "grad_norm": 2.2744221687316895, "learning_rate": 4.011259729546585e-07, "loss": 0.1912, "step": 17600 }, { "epoch": 2.6549072817729535, "grad_norm": 2.7228891849517822, "learning_rate": 3.976909062566603e-07, "loss": 0.2113, "step": 17610 }, { "epoch": 2.6564148952208653, "grad_norm": 3.617417812347412, "learning_rate": 3.9427000182455433e-07, "loss": 0.1881, "step": 17620 }, { "epoch": 2.6579225086687774, "grad_norm": 3.132847785949707, "learning_rate": 3.9086327018517887e-07, "loss": 0.1765, "step": 17630 }, { "epoch": 2.659430122116689, "grad_norm": 2.629220724105835, "learning_rate": 3.874707218217566e-07, "loss": 0.2485, "step": 17640 }, { "epoch": 2.6609377355646013, "grad_norm": 2.603776454925537, "learning_rate": 3.8409236717386887e-07, "loss": 0.1442, "step": 17650 }, { "epoch": 2.662445349012513, "grad_norm": 2.8508617877960205, "learning_rate": 3.807282166374171e-07, "loss": 0.2742, "step": 17660 }, { "epoch": 2.6639529624604252, "grad_norm": 3.1542749404907227, "learning_rate": 3.773782805645948e-07, "loss": 0.2157, "step": 17670 }, { "epoch": 2.665460575908337, "grad_norm": 3.551274299621582, "learning_rate": 3.740425692638555e-07, "loss": 0.2215, "step": 17680 }, { "epoch": 2.666968189356249, "grad_norm": 2.6299383640289307, "learning_rate": 3.707210929998795e-07, "loss": 0.1671, "step": 17690 }, { "epoch": 2.668475802804161, "grad_norm": 3.0749213695526123, "learning_rate": 3.674138619935408e-07, "loss": 0.1748, "step": 17700 }, { "epoch": 2.669983416252073, "grad_norm": 4.371676445007324, "learning_rate": 3.641208864218815e-07, "loss": 0.1945, "step": 17710 }, { "epoch": 2.671491029699985, "grad_norm": 3.5497512817382812, "learning_rate": 3.608421764180736e-07, "loss": 0.1966, "step": 17720 }, { "epoch": 2.672998643147897, "grad_norm": 3.1935269832611084, "learning_rate": 3.575777420713933e-07, "loss": 0.1773, "step": 17730 }, { "epoch": 2.6745062565958087, "grad_norm": 3.2933461666107178, "learning_rate": 3.543275934271856e-07, "loss": 0.1652, "step": 17740 }, { "epoch": 2.676013870043721, "grad_norm": 3.2031972408294678, "learning_rate": 3.51091740486838e-07, "loss": 0.2093, "step": 17750 }, { "epoch": 2.677521483491633, "grad_norm": 1.9364057779312134, "learning_rate": 3.4787019320774385e-07, "loss": 0.1943, "step": 17760 }, { "epoch": 2.6790290969395447, "grad_norm": 2.944190740585327, "learning_rate": 3.446629615032776e-07, "loss": 0.1461, "step": 17770 }, { "epoch": 2.6805367103874564, "grad_norm": 4.0722246170043945, "learning_rate": 3.414700552427602e-07, "loss": 0.1767, "step": 17780 }, { "epoch": 2.6820443238353686, "grad_norm": 2.2713727951049805, "learning_rate": 3.382914842514312e-07, "loss": 0.2091, "step": 17790 }, { "epoch": 2.683551937283281, "grad_norm": 2.03262996673584, "learning_rate": 3.3512725831041647e-07, "loss": 0.2007, "step": 17800 }, { "epoch": 2.6850595507311925, "grad_norm": 2.563105583190918, "learning_rate": 3.319773871567017e-07, "loss": 0.2337, "step": 17810 }, { "epoch": 2.6865671641791042, "grad_norm": 3.046205997467041, "learning_rate": 3.288418804830956e-07, "loss": 0.1808, "step": 17820 }, { "epoch": 2.6880747776270164, "grad_norm": 2.6014111042022705, "learning_rate": 3.257207479382085e-07, "loss": 0.1782, "step": 17830 }, { "epoch": 2.6895823910749286, "grad_norm": 2.992081642150879, "learning_rate": 3.2261399912641543e-07, "loss": 0.2082, "step": 17840 }, { "epoch": 2.6910900045228403, "grad_norm": 2.0261595249176025, "learning_rate": 3.1952164360783245e-07, "loss": 0.1868, "step": 17850 }, { "epoch": 2.692597617970752, "grad_norm": 3.115894317626953, "learning_rate": 3.1644369089828266e-07, "loss": 0.1606, "step": 17860 }, { "epoch": 2.694105231418664, "grad_norm": 4.041103363037109, "learning_rate": 3.133801504692685e-07, "loss": 0.2448, "step": 17870 }, { "epoch": 2.6956128448665764, "grad_norm": 3.294428825378418, "learning_rate": 3.103310317479441e-07, "loss": 0.1858, "step": 17880 }, { "epoch": 2.697120458314488, "grad_norm": 4.151124000549316, "learning_rate": 3.0729634411708344e-07, "loss": 0.1966, "step": 17890 }, { "epoch": 2.6986280717624003, "grad_norm": 2.90480637550354, "learning_rate": 3.042760969150549e-07, "loss": 0.1582, "step": 17900 }, { "epoch": 2.700135685210312, "grad_norm": 4.7059550285339355, "learning_rate": 3.012702994357897e-07, "loss": 0.2185, "step": 17910 }, { "epoch": 2.701643298658224, "grad_norm": 3.6957387924194336, "learning_rate": 2.98278960928754e-07, "loss": 0.1731, "step": 17920 }, { "epoch": 2.703150912106136, "grad_norm": 2.236680269241333, "learning_rate": 2.953020905989201e-07, "loss": 0.1919, "step": 17930 }, { "epoch": 2.704658525554048, "grad_norm": 5.218798637390137, "learning_rate": 2.9233969760673997e-07, "loss": 0.1822, "step": 17940 }, { "epoch": 2.70616613900196, "grad_norm": 4.163264274597168, "learning_rate": 2.8939179106811544e-07, "loss": 0.2074, "step": 17950 }, { "epoch": 2.707673752449872, "grad_norm": 2.579991340637207, "learning_rate": 2.8645838005437077e-07, "loss": 0.1781, "step": 17960 }, { "epoch": 2.7091813658977837, "grad_norm": 2.4549643993377686, "learning_rate": 2.835394735922231e-07, "loss": 0.2683, "step": 17970 }, { "epoch": 2.710688979345696, "grad_norm": 3.312816619873047, "learning_rate": 2.806350806637581e-07, "loss": 0.1724, "step": 17980 }, { "epoch": 2.7121965927936076, "grad_norm": 3.275773525238037, "learning_rate": 2.777452102063982e-07, "loss": 0.2187, "step": 17990 }, { "epoch": 2.7137042062415198, "grad_norm": 3.174048662185669, "learning_rate": 2.7486987111287776e-07, "loss": 0.1895, "step": 18000 }, { "epoch": 2.7152118196894315, "grad_norm": 3.5831849575042725, "learning_rate": 2.7200907223121684e-07, "loss": 0.2389, "step": 18010 }, { "epoch": 2.7167194331373437, "grad_norm": 3.1662704944610596, "learning_rate": 2.691628223646897e-07, "loss": 0.2113, "step": 18020 }, { "epoch": 2.7182270465852554, "grad_norm": 2.5291049480438232, "learning_rate": 2.663311302718019e-07, "loss": 0.1528, "step": 18030 }, { "epoch": 2.7197346600331676, "grad_norm": 2.72239351272583, "learning_rate": 2.6351400466626153e-07, "loss": 0.2465, "step": 18040 }, { "epoch": 2.7212422734810797, "grad_norm": 3.438528060913086, "learning_rate": 2.6071145421695144e-07, "loss": 0.1811, "step": 18050 }, { "epoch": 2.7227498869289914, "grad_norm": 3.148258686065674, "learning_rate": 2.5792348754790476e-07, "loss": 0.1865, "step": 18060 }, { "epoch": 2.724257500376903, "grad_norm": 3.502934217453003, "learning_rate": 2.5515011323827666e-07, "loss": 0.2104, "step": 18070 }, { "epoch": 2.7257651138248153, "grad_norm": 2.6545886993408203, "learning_rate": 2.523913398223193e-07, "loss": 0.2159, "step": 18080 }, { "epoch": 2.7272727272727275, "grad_norm": 2.2055623531341553, "learning_rate": 2.496471757893543e-07, "loss": 0.1652, "step": 18090 }, { "epoch": 2.7287803407206392, "grad_norm": 3.0607848167419434, "learning_rate": 2.4691762958374776e-07, "loss": 0.2033, "step": 18100 }, { "epoch": 2.730287954168551, "grad_norm": 3.1366970539093018, "learning_rate": 2.442027096048821e-07, "loss": 0.1946, "step": 18110 }, { "epoch": 2.731795567616463, "grad_norm": 2.8571975231170654, "learning_rate": 2.41502424207134e-07, "loss": 0.1894, "step": 18120 }, { "epoch": 2.7333031810643753, "grad_norm": 3.0656049251556396, "learning_rate": 2.3881678169984434e-07, "loss": 0.1747, "step": 18130 }, { "epoch": 2.734810794512287, "grad_norm": 2.359220266342163, "learning_rate": 2.3614579034729635e-07, "loss": 0.1476, "step": 18140 }, { "epoch": 2.7363184079601988, "grad_norm": 3.5892319679260254, "learning_rate": 2.3348945836868808e-07, "loss": 0.2028, "step": 18150 }, { "epoch": 2.737826021408111, "grad_norm": 2.628737688064575, "learning_rate": 2.308477939381082e-07, "loss": 0.2055, "step": 18160 }, { "epoch": 2.739333634856023, "grad_norm": 2.406731605529785, "learning_rate": 2.2822080518450807e-07, "loss": 0.2236, "step": 18170 }, { "epoch": 2.740841248303935, "grad_norm": 2.84006667137146, "learning_rate": 2.2560850019168134e-07, "loss": 0.2026, "step": 18180 }, { "epoch": 2.7423488617518466, "grad_norm": 3.680265188217163, "learning_rate": 2.2301088699823613e-07, "loss": 0.1922, "step": 18190 }, { "epoch": 2.7438564751997587, "grad_norm": 3.0745105743408203, "learning_rate": 2.2042797359757007e-07, "loss": 0.1866, "step": 18200 }, { "epoch": 2.745364088647671, "grad_norm": 3.2915241718292236, "learning_rate": 2.1785976793784747e-07, "loss": 0.163, "step": 18210 }, { "epoch": 2.7468717020955826, "grad_norm": 3.807180643081665, "learning_rate": 2.153062779219728e-07, "loss": 0.2155, "step": 18220 }, { "epoch": 2.748379315543495, "grad_norm": 2.3895137310028076, "learning_rate": 2.127675114075689e-07, "loss": 0.1723, "step": 18230 }, { "epoch": 2.7498869289914065, "grad_norm": 3.493196487426758, "learning_rate": 2.1024347620694885e-07, "loss": 0.1901, "step": 18240 }, { "epoch": 2.7513945424393187, "grad_norm": 3.1480934619903564, "learning_rate": 2.0773418008709634e-07, "loss": 0.2156, "step": 18250 }, { "epoch": 2.7529021558872304, "grad_norm": 3.687420129776001, "learning_rate": 2.0523963076963972e-07, "loss": 0.2098, "step": 18260 }, { "epoch": 2.7544097693351426, "grad_norm": 3.2478551864624023, "learning_rate": 2.0275983593082804e-07, "loss": 0.2111, "step": 18270 }, { "epoch": 2.7559173827830543, "grad_norm": 3.473320722579956, "learning_rate": 2.002948032015073e-07, "loss": 0.2617, "step": 18280 }, { "epoch": 2.7574249962309665, "grad_norm": 2.1258654594421387, "learning_rate": 1.978445401670992e-07, "loss": 0.2143, "step": 18290 }, { "epoch": 2.758932609678878, "grad_norm": 3.7735559940338135, "learning_rate": 1.9540905436757295e-07, "loss": 0.1717, "step": 18300 }, { "epoch": 2.7604402231267904, "grad_norm": 3.2371037006378174, "learning_rate": 1.9298835329742693e-07, "loss": 0.2489, "step": 18310 }, { "epoch": 2.761947836574702, "grad_norm": 2.176734447479248, "learning_rate": 1.9058244440566475e-07, "loss": 0.1895, "step": 18320 }, { "epoch": 2.7634554500226143, "grad_norm": 2.400078773498535, "learning_rate": 1.8819133509576927e-07, "loss": 0.1879, "step": 18330 }, { "epoch": 2.764963063470526, "grad_norm": 2.883479356765747, "learning_rate": 1.8581503272568413e-07, "loss": 0.213, "step": 18340 }, { "epoch": 2.766470676918438, "grad_norm": 2.5610833168029785, "learning_rate": 1.8345354460778675e-07, "loss": 0.1879, "step": 18350 }, { "epoch": 2.76797829036635, "grad_norm": 2.3394038677215576, "learning_rate": 1.8110687800887038e-07, "loss": 0.2838, "step": 18360 }, { "epoch": 2.769485903814262, "grad_norm": 2.5337727069854736, "learning_rate": 1.787750401501165e-07, "loss": 0.1882, "step": 18370 }, { "epoch": 2.7709935172621742, "grad_norm": 2.9988887310028076, "learning_rate": 1.7645803820707797e-07, "loss": 0.2179, "step": 18380 }, { "epoch": 2.772501130710086, "grad_norm": 3.4300122261047363, "learning_rate": 1.7415587930965317e-07, "loss": 0.1816, "step": 18390 }, { "epoch": 2.7740087441579977, "grad_norm": 3.572110891342163, "learning_rate": 1.7186857054206585e-07, "loss": 0.2484, "step": 18400 }, { "epoch": 2.77551635760591, "grad_norm": 2.229320526123047, "learning_rate": 1.6959611894284189e-07, "loss": 0.1538, "step": 18410 }, { "epoch": 2.777023971053822, "grad_norm": 2.682701349258423, "learning_rate": 1.673385315047904e-07, "loss": 0.1772, "step": 18420 }, { "epoch": 2.7785315845017338, "grad_norm": 2.7108969688415527, "learning_rate": 1.650958151749793e-07, "loss": 0.1596, "step": 18430 }, { "epoch": 2.7800391979496455, "grad_norm": 2.337172508239746, "learning_rate": 1.6286797685471368e-07, "loss": 0.1768, "step": 18440 }, { "epoch": 2.7815468113975577, "grad_norm": 2.7885591983795166, "learning_rate": 1.6065502339951745e-07, "loss": 0.1559, "step": 18450 }, { "epoch": 2.78305442484547, "grad_norm": 6.159884452819824, "learning_rate": 1.5845696161911117e-07, "loss": 0.1919, "step": 18460 }, { "epoch": 2.7845620382933816, "grad_norm": 3.314209222793579, "learning_rate": 1.562737982773893e-07, "loss": 0.1711, "step": 18470 }, { "epoch": 2.7860696517412933, "grad_norm": 2.380784749984741, "learning_rate": 1.541055400924013e-07, "loss": 0.1538, "step": 18480 }, { "epoch": 2.7875772651892055, "grad_norm": 6.039788246154785, "learning_rate": 1.5195219373633053e-07, "loss": 0.2038, "step": 18490 }, { "epoch": 2.7890848786371176, "grad_norm": 3.421523094177246, "learning_rate": 1.4981376583547258e-07, "loss": 0.2121, "step": 18500 }, { "epoch": 2.7905924920850294, "grad_norm": 3.5255966186523438, "learning_rate": 1.4769026297021705e-07, "loss": 0.2027, "step": 18510 }, { "epoch": 2.792100105532941, "grad_norm": 3.942056655883789, "learning_rate": 1.4558169167502579e-07, "loss": 0.2354, "step": 18520 }, { "epoch": 2.7936077189808532, "grad_norm": 1.7924891710281372, "learning_rate": 1.4348805843841241e-07, "loss": 0.1945, "step": 18530 }, { "epoch": 2.7951153324287654, "grad_norm": 3.9659979343414307, "learning_rate": 1.414093697029234e-07, "loss": 0.206, "step": 18540 }, { "epoch": 2.796622945876677, "grad_norm": 2.464761972427368, "learning_rate": 1.3934563186511818e-07, "loss": 0.2086, "step": 18550 }, { "epoch": 2.7981305593245893, "grad_norm": 2.2590885162353516, "learning_rate": 1.372968512755496e-07, "loss": 0.1786, "step": 18560 }, { "epoch": 2.799638172772501, "grad_norm": 2.6160314083099365, "learning_rate": 1.3526303423874233e-07, "loss": 0.2185, "step": 18570 }, { "epoch": 2.801145786220413, "grad_norm": 3.926872968673706, "learning_rate": 1.332441870131762e-07, "loss": 0.1626, "step": 18580 }, { "epoch": 2.802653399668325, "grad_norm": 5.818370342254639, "learning_rate": 1.3124031581126516e-07, "loss": 0.2169, "step": 18590 }, { "epoch": 2.804161013116237, "grad_norm": 2.5999948978424072, "learning_rate": 1.2925142679934e-07, "loss": 0.2065, "step": 18600 }, { "epoch": 2.805668626564149, "grad_norm": 2.844453811645508, "learning_rate": 1.2727752609762723e-07, "loss": 0.2426, "step": 18610 }, { "epoch": 2.807176240012061, "grad_norm": 3.4497768878936768, "learning_rate": 1.253186197802314e-07, "loss": 0.2053, "step": 18620 }, { "epoch": 2.8086838534599727, "grad_norm": 3.4090592861175537, "learning_rate": 1.2337471387511624e-07, "loss": 0.2102, "step": 18630 }, { "epoch": 2.810191466907885, "grad_norm": 2.2541327476501465, "learning_rate": 1.2144581436408565e-07, "loss": 0.1956, "step": 18640 }, { "epoch": 2.8116990803557966, "grad_norm": 2.6183440685272217, "learning_rate": 1.1953192718276606e-07, "loss": 0.2158, "step": 18650 }, { "epoch": 2.813206693803709, "grad_norm": 2.56131911277771, "learning_rate": 1.1763305822058868e-07, "loss": 0.1729, "step": 18660 }, { "epoch": 2.8147143072516205, "grad_norm": 3.1450722217559814, "learning_rate": 1.1574921332076883e-07, "loss": 0.1956, "step": 18670 }, { "epoch": 2.8162219206995327, "grad_norm": 2.528254985809326, "learning_rate": 1.1388039828029219e-07, "loss": 0.2186, "step": 18680 }, { "epoch": 2.8177295341474444, "grad_norm": 2.2274041175842285, "learning_rate": 1.120266188498903e-07, "loss": 0.2058, "step": 18690 }, { "epoch": 2.8192371475953566, "grad_norm": 2.828622817993164, "learning_rate": 1.1018788073403119e-07, "loss": 0.2035, "step": 18700 }, { "epoch": 2.8207447610432688, "grad_norm": 4.373176097869873, "learning_rate": 1.083641895908949e-07, "loss": 0.1959, "step": 18710 }, { "epoch": 2.8222523744911805, "grad_norm": 3.89735746383667, "learning_rate": 1.0655555103236125e-07, "loss": 0.1997, "step": 18720 }, { "epoch": 2.823759987939092, "grad_norm": 5.984492778778076, "learning_rate": 1.0476197062398718e-07, "loss": 0.1769, "step": 18730 }, { "epoch": 2.8252676013870044, "grad_norm": 2.421473741531372, "learning_rate": 1.0298345388499497e-07, "loss": 0.1605, "step": 18740 }, { "epoch": 2.8267752148349166, "grad_norm": 2.9252307415008545, "learning_rate": 1.012200062882518e-07, "loss": 0.1849, "step": 18750 }, { "epoch": 2.8282828282828283, "grad_norm": 3.9190523624420166, "learning_rate": 9.947163326025356e-08, "loss": 0.1882, "step": 18760 }, { "epoch": 2.82979044173074, "grad_norm": 3.981565475463867, "learning_rate": 9.77383401811094e-08, "loss": 0.1858, "step": 18770 }, { "epoch": 2.831298055178652, "grad_norm": 3.317584991455078, "learning_rate": 9.60201323845239e-08, "loss": 0.2067, "step": 18780 }, { "epoch": 2.8328056686265644, "grad_norm": 2.271526575088501, "learning_rate": 9.431701515778102e-08, "loss": 0.1965, "step": 18790 }, { "epoch": 2.834313282074476, "grad_norm": 3.4639670848846436, "learning_rate": 9.262899374172907e-08, "loss": 0.1976, "step": 18800 }, { "epoch": 2.835820895522388, "grad_norm": 2.7228710651397705, "learning_rate": 9.095607333076073e-08, "loss": 0.1611, "step": 18810 }, { "epoch": 2.8373285089703, "grad_norm": 2.191645383834839, "learning_rate": 8.929825907280199e-08, "loss": 0.1742, "step": 18820 }, { "epoch": 2.838836122418212, "grad_norm": 3.7960667610168457, "learning_rate": 8.765555606929377e-08, "loss": 0.2028, "step": 18830 }, { "epoch": 2.840343735866124, "grad_norm": 3.365687131881714, "learning_rate": 8.602796937517644e-08, "loss": 0.2083, "step": 18840 }, { "epoch": 2.841851349314036, "grad_norm": 3.159848690032959, "learning_rate": 8.441550399887311e-08, "loss": 0.1633, "step": 18850 }, { "epoch": 2.8433589627619478, "grad_norm": 3.2495322227478027, "learning_rate": 8.281816490227689e-08, "loss": 0.1801, "step": 18860 }, { "epoch": 2.84486657620986, "grad_norm": 3.4069721698760986, "learning_rate": 8.123595700073372e-08, "loss": 0.1792, "step": 18870 }, { "epoch": 2.8463741896577717, "grad_norm": 2.7665977478027344, "learning_rate": 7.966888516302728e-08, "loss": 0.1884, "step": 18880 }, { "epoch": 2.847881803105684, "grad_norm": 3.773259401321411, "learning_rate": 7.811695421136467e-08, "loss": 0.2217, "step": 18890 }, { "epoch": 2.8493894165535956, "grad_norm": 2.6209404468536377, "learning_rate": 7.658016892136189e-08, "loss": 0.1932, "step": 18900 }, { "epoch": 2.8508970300015077, "grad_norm": 3.839221477508545, "learning_rate": 7.505853402202834e-08, "loss": 0.2224, "step": 18910 }, { "epoch": 2.8524046434494195, "grad_norm": 3.456493854522705, "learning_rate": 7.355205419575185e-08, "loss": 0.2096, "step": 18920 }, { "epoch": 2.8539122568973316, "grad_norm": 3.459074020385742, "learning_rate": 7.206073407828529e-08, "loss": 0.2518, "step": 18930 }, { "epoch": 2.8554198703452434, "grad_norm": 3.1806628704071045, "learning_rate": 7.058457825873166e-08, "loss": 0.2018, "step": 18940 }, { "epoch": 2.8569274837931555, "grad_norm": 3.2925832271575928, "learning_rate": 6.912359127953128e-08, "loss": 0.2145, "step": 18950 }, { "epoch": 2.8584350972410673, "grad_norm": 3.0368525981903076, "learning_rate": 6.767777763644567e-08, "loss": 0.1298, "step": 18960 }, { "epoch": 2.8599427106889794, "grad_norm": 2.7542262077331543, "learning_rate": 6.624714177854596e-08, "loss": 0.2071, "step": 18970 }, { "epoch": 2.861450324136891, "grad_norm": 3.065699815750122, "learning_rate": 6.483168810819618e-08, "loss": 0.1754, "step": 18980 }, { "epoch": 2.8629579375848033, "grad_norm": 2.9569852352142334, "learning_rate": 6.343142098104327e-08, "loss": 0.1812, "step": 18990 }, { "epoch": 2.864465551032715, "grad_norm": 3.0794763565063477, "learning_rate": 6.204634470600212e-08, "loss": 0.2317, "step": 19000 }, { "epoch": 2.8659731644806272, "grad_norm": 2.339825391769409, "learning_rate": 6.067646354524059e-08, "loss": 0.2161, "step": 19010 }, { "epoch": 2.867480777928539, "grad_norm": 3.0979080200195312, "learning_rate": 5.932178171416947e-08, "loss": 0.1838, "step": 19020 }, { "epoch": 2.868988391376451, "grad_norm": 2.5213546752929688, "learning_rate": 5.798230338142863e-08, "loss": 0.1847, "step": 19030 }, { "epoch": 2.8704960048243633, "grad_norm": 3.3742454051971436, "learning_rate": 5.665803266887038e-08, "loss": 0.2034, "step": 19040 }, { "epoch": 2.872003618272275, "grad_norm": 2.2025163173675537, "learning_rate": 5.534897365155389e-08, "loss": 0.1928, "step": 19050 }, { "epoch": 2.8735112317201867, "grad_norm": 2.7820496559143066, "learning_rate": 5.40551303577258e-08, "loss": 0.2185, "step": 19060 }, { "epoch": 2.875018845168099, "grad_norm": 2.9924063682556152, "learning_rate": 5.2776506768813516e-08, "loss": 0.2198, "step": 19070 }, { "epoch": 2.876526458616011, "grad_norm": 2.5866243839263916, "learning_rate": 5.1513106819407464e-08, "loss": 0.2105, "step": 19080 }, { "epoch": 2.878034072063923, "grad_norm": 3.2160913944244385, "learning_rate": 5.0264934397253885e-08, "loss": 0.1712, "step": 19090 }, { "epoch": 2.8795416855118345, "grad_norm": 2.342047929763794, "learning_rate": 4.9031993343238714e-08, "loss": 0.2149, "step": 19100 }, { "epoch": 2.8810492989597467, "grad_norm": 2.7187843322753906, "learning_rate": 4.781428745138039e-08, "loss": 0.2251, "step": 19110 }, { "epoch": 2.882556912407659, "grad_norm": 2.4814705848693848, "learning_rate": 4.6611820468813185e-08, "loss": 0.1949, "step": 19120 }, { "epoch": 2.8840645258555706, "grad_norm": 3.269787549972534, "learning_rate": 4.542459609577998e-08, "loss": 0.1782, "step": 19130 }, { "epoch": 2.8855721393034823, "grad_norm": 2.9044387340545654, "learning_rate": 4.4252617985616755e-08, "loss": 0.1768, "step": 19140 }, { "epoch": 2.8870797527513945, "grad_norm": 3.418386936187744, "learning_rate": 4.3095889744745876e-08, "loss": 0.2344, "step": 19150 }, { "epoch": 2.8885873661993067, "grad_norm": 2.7028987407684326, "learning_rate": 4.195441493266117e-08, "loss": 0.2191, "step": 19160 }, { "epoch": 2.8900949796472184, "grad_norm": 5.8561577796936035, "learning_rate": 4.082819706191954e-08, "loss": 0.2332, "step": 19170 }, { "epoch": 2.8916025930951306, "grad_norm": 3.5421624183654785, "learning_rate": 3.971723959812712e-08, "loss": 0.1782, "step": 19180 }, { "epoch": 2.8931102065430423, "grad_norm": 2.6349470615386963, "learning_rate": 3.862154595993317e-08, "loss": 0.2294, "step": 19190 }, { "epoch": 2.8946178199909545, "grad_norm": 1.7092329263687134, "learning_rate": 3.754111951901562e-08, "loss": 0.2277, "step": 19200 }, { "epoch": 2.896125433438866, "grad_norm": 3.8334434032440186, "learning_rate": 3.647596360007222e-08, "loss": 0.2386, "step": 19210 }, { "epoch": 2.8976330468867784, "grad_norm": 3.797410011291504, "learning_rate": 3.542608148080939e-08, "loss": 0.1471, "step": 19220 }, { "epoch": 2.89914066033469, "grad_norm": 3.1496922969818115, "learning_rate": 3.439147639193396e-08, "loss": 0.2581, "step": 19230 }, { "epoch": 2.9006482737826023, "grad_norm": 4.098090171813965, "learning_rate": 3.337215151714202e-08, "loss": 0.233, "step": 19240 }, { "epoch": 2.902155887230514, "grad_norm": 2.0828075408935547, "learning_rate": 3.236810999310891e-08, "loss": 0.2573, "step": 19250 }, { "epoch": 2.903663500678426, "grad_norm": 3.7276837825775146, "learning_rate": 3.1379354909479855e-08, "loss": 0.2063, "step": 19260 }, { "epoch": 2.905171114126338, "grad_norm": 2.3038418292999268, "learning_rate": 3.040588930886102e-08, "loss": 0.2285, "step": 19270 }, { "epoch": 2.90667872757425, "grad_norm": 2.251199722290039, "learning_rate": 2.9447716186808973e-08, "loss": 0.1756, "step": 19280 }, { "epoch": 2.908186341022162, "grad_norm": 3.600229024887085, "learning_rate": 2.8504838491822396e-08, "loss": 0.1928, "step": 19290 }, { "epoch": 2.909693954470074, "grad_norm": 3.5759217739105225, "learning_rate": 2.7577259125333155e-08, "loss": 0.2182, "step": 19300 }, { "epoch": 2.9112015679179857, "grad_norm": 3.401257038116455, "learning_rate": 2.6664980941696338e-08, "loss": 0.1566, "step": 19310 }, { "epoch": 2.912709181365898, "grad_norm": 4.856666564941406, "learning_rate": 2.5768006748181917e-08, "loss": 0.1726, "step": 19320 }, { "epoch": 2.9142167948138096, "grad_norm": 3.120567798614502, "learning_rate": 2.488633930496809e-08, "loss": 0.242, "step": 19330 }, { "epoch": 2.9157244082617217, "grad_norm": 4.224436283111572, "learning_rate": 2.401998132512795e-08, "loss": 0.1776, "step": 19340 }, { "epoch": 2.9172320217096335, "grad_norm": 3.443085193634033, "learning_rate": 2.3168935474627285e-08, "loss": 0.1708, "step": 19350 }, { "epoch": 2.9187396351575456, "grad_norm": 3.8224339485168457, "learning_rate": 2.2333204372311235e-08, "loss": 0.1954, "step": 19360 }, { "epoch": 2.920247248605458, "grad_norm": 3.144111156463623, "learning_rate": 2.1512790589898747e-08, "loss": 0.1568, "step": 19370 }, { "epoch": 2.9217548620533695, "grad_norm": 2.432152032852173, "learning_rate": 2.070769665197425e-08, "loss": 0.1793, "step": 19380 }, { "epoch": 2.9232624755012813, "grad_norm": 4.014102935791016, "learning_rate": 1.9917925035980447e-08, "loss": 0.239, "step": 19390 }, { "epoch": 2.9247700889491934, "grad_norm": 2.984321355819702, "learning_rate": 1.914347817220885e-08, "loss": 0.185, "step": 19400 }, { "epoch": 2.9262777023971056, "grad_norm": 2.674076557159424, "learning_rate": 1.8384358443794826e-08, "loss": 0.2253, "step": 19410 }, { "epoch": 2.9277853158450173, "grad_norm": 4.467591285705566, "learning_rate": 1.7640568186707562e-08, "loss": 0.1923, "step": 19420 }, { "epoch": 2.929292929292929, "grad_norm": 4.1216654777526855, "learning_rate": 1.6912109689745104e-08, "loss": 0.2115, "step": 19430 }, { "epoch": 2.9308005427408412, "grad_norm": 2.538414716720581, "learning_rate": 1.6198985194526563e-08, "loss": 0.1814, "step": 19440 }, { "epoch": 2.9323081561887534, "grad_norm": 2.189082384109497, "learning_rate": 1.5501196895484903e-08, "loss": 0.175, "step": 19450 }, { "epoch": 2.933815769636665, "grad_norm": 2.95762300491333, "learning_rate": 1.4818746939860296e-08, "loss": 0.1732, "step": 19460 }, { "epoch": 2.935323383084577, "grad_norm": 3.419302225112915, "learning_rate": 1.4151637427693431e-08, "loss": 0.2242, "step": 19470 }, { "epoch": 2.936830996532489, "grad_norm": 1.8826903104782104, "learning_rate": 1.3499870411819993e-08, "loss": 0.1796, "step": 19480 }, { "epoch": 2.938338609980401, "grad_norm": 2.501823902130127, "learning_rate": 1.286344789786287e-08, "loss": 0.2196, "step": 19490 }, { "epoch": 2.939846223428313, "grad_norm": 1.7875875234603882, "learning_rate": 1.2242371844227719e-08, "loss": 0.2677, "step": 19500 }, { "epoch": 2.941353836876225, "grad_norm": 2.68408203125, "learning_rate": 1.163664416209409e-08, "loss": 0.1849, "step": 19510 }, { "epoch": 2.942861450324137, "grad_norm": 3.8646411895751953, "learning_rate": 1.1046266715412645e-08, "loss": 0.193, "step": 19520 }, { "epoch": 2.944369063772049, "grad_norm": 3.8750669956207275, "learning_rate": 1.047124132089905e-08, "loss": 0.2192, "step": 19530 }, { "epoch": 2.9458766772199607, "grad_norm": 2.285654306411743, "learning_rate": 9.911569748025651e-09, "loss": 0.2072, "step": 19540 }, { "epoch": 2.947384290667873, "grad_norm": 2.4493703842163086, "learning_rate": 9.367253719019253e-09, "loss": 0.1617, "step": 19550 }, { "epoch": 2.9488919041157846, "grad_norm": 3.798147439956665, "learning_rate": 8.838294908853351e-09, "loss": 0.1589, "step": 19560 }, { "epoch": 2.950399517563697, "grad_norm": 3.5094010829925537, "learning_rate": 8.324694945244793e-09, "loss": 0.206, "step": 19570 }, { "epoch": 2.9519071310116085, "grad_norm": 2.916867256164551, "learning_rate": 7.826455408648237e-09, "loss": 0.1864, "step": 19580 }, { "epoch": 2.9534147444595207, "grad_norm": 3.079050302505493, "learning_rate": 7.343577832250037e-09, "loss": 0.1895, "step": 19590 }, { "epoch": 2.9549223579074324, "grad_norm": 3.7051098346710205, "learning_rate": 6.876063701966584e-09, "loss": 0.2102, "step": 19600 }, { "epoch": 2.9564299713553446, "grad_norm": 2.050788640975952, "learning_rate": 6.423914456434866e-09, "loss": 0.1753, "step": 19610 }, { "epoch": 2.9579375848032563, "grad_norm": 3.9632387161254883, "learning_rate": 5.987131487013575e-09, "loss": 0.1651, "step": 19620 }, { "epoch": 2.9594451982511685, "grad_norm": 2.1361775398254395, "learning_rate": 5.5657161377747905e-09, "loss": 0.1797, "step": 19630 }, { "epoch": 2.96095281169908, "grad_norm": 3.433924436569214, "learning_rate": 5.159669705501746e-09, "loss": 0.223, "step": 19640 }, { "epoch": 2.9624604251469924, "grad_norm": 3.3153045177459717, "learning_rate": 4.768993439683844e-09, "loss": 0.2042, "step": 19650 }, { "epoch": 2.9639680385949045, "grad_norm": 3.906982898712158, "learning_rate": 4.393688542513319e-09, "loss": 0.1904, "step": 19660 }, { "epoch": 2.9654756520428163, "grad_norm": 2.6775619983673096, "learning_rate": 4.033756168881908e-09, "loss": 0.197, "step": 19670 }, { "epoch": 2.966983265490728, "grad_norm": 5.685617446899414, "learning_rate": 3.689197426376412e-09, "loss": 0.2185, "step": 19680 }, { "epoch": 2.96849087893864, "grad_norm": 3.3192522525787354, "learning_rate": 3.360013375277027e-09, "loss": 0.1768, "step": 19690 }, { "epoch": 2.9699984923865523, "grad_norm": 2.1422977447509766, "learning_rate": 3.046205028550131e-09, "loss": 0.2332, "step": 19700 }, { "epoch": 2.971506105834464, "grad_norm": 1.8530495166778564, "learning_rate": 2.7477733518510575e-09, "loss": 0.1998, "step": 19710 }, { "epoch": 2.973013719282376, "grad_norm": 2.612293004989624, "learning_rate": 2.4647192635157692e-09, "loss": 0.1714, "step": 19720 }, { "epoch": 2.974521332730288, "grad_norm": 3.2769341468811035, "learning_rate": 2.197043634560858e-09, "loss": 0.1855, "step": 19730 }, { "epoch": 2.9760289461782, "grad_norm": 3.228468418121338, "learning_rate": 1.94474728867966e-09, "loss": 0.1468, "step": 19740 }, { "epoch": 2.977536559626112, "grad_norm": 2.4429333209991455, "learning_rate": 1.7078310022411447e-09, "loss": 0.1845, "step": 19750 }, { "epoch": 2.9790441730740236, "grad_norm": 2.7234740257263184, "learning_rate": 1.4862955042860282e-09, "loss": 0.1843, "step": 19760 }, { "epoch": 2.9805517865219358, "grad_norm": 2.768825054168701, "learning_rate": 1.2801414765256647e-09, "loss": 0.1524, "step": 19770 }, { "epoch": 2.982059399969848, "grad_norm": 2.09014630317688, "learning_rate": 1.089369553338715e-09, "loss": 0.2129, "step": 19780 }, { "epoch": 2.9835670134177597, "grad_norm": 5.465130805969238, "learning_rate": 9.139803217700361e-10, "loss": 0.1981, "step": 19790 }, { "epoch": 2.9850746268656714, "grad_norm": 2.9086251258850098, "learning_rate": 7.539743215290163e-10, "loss": 0.2004, "step": 19800 }, { "epoch": 2.9865822403135835, "grad_norm": 2.944516658782959, "learning_rate": 6.093520449879098e-10, "loss": 0.2163, "step": 19810 }, { "epoch": 2.9880898537614957, "grad_norm": 3.097722291946411, "learning_rate": 4.801139371796159e-10, "loss": 0.2003, "step": 19820 }, { "epoch": 2.9895974672094074, "grad_norm": 2.7440078258514404, "learning_rate": 3.6626039579601424e-10, "loss": 0.1901, "step": 19830 }, { "epoch": 2.9911050806573196, "grad_norm": 2.257997989654541, "learning_rate": 2.677917711890743e-10, "loss": 0.1907, "step": 19840 }, { "epoch": 2.9926126941052313, "grad_norm": 5.857127666473389, "learning_rate": 1.847083663669702e-10, "loss": 0.2442, "step": 19850 }, { "epoch": 2.9941203075531435, "grad_norm": 2.7533628940582275, "learning_rate": 1.1701043699408055e-10, "loss": 0.1999, "step": 19860 }, { "epoch": 2.9956279210010552, "grad_norm": 4.109286308288574, "learning_rate": 6.469819139209854e-11, "loss": 0.1525, "step": 19870 }, { "epoch": 2.9971355344489674, "grad_norm": 2.7020206451416016, "learning_rate": 2.7771790536701426e-11, "loss": 0.1851, "step": 19880 }, { "epoch": 2.998643147896879, "grad_norm": 3.834559202194214, "learning_rate": 6.231348056995324e-12, "loss": 0.1713, "step": 19890 }, { "epoch": 3.0, "step": 19899, "total_flos": 2.164125637223121e+18, "train_loss": 0.22789792727810623, "train_runtime": 8452.7246, "train_samples_per_second": 9.417, "train_steps_per_second": 2.354 } ], "logging_steps": 10, "max_steps": 19899, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.164125637223121e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }