{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999599396423945, "eval_steps": 500, "global_step": 18721, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005341381014061185, "grad_norm": 24.63877296447754, "learning_rate": 3.558718861209965e-07, "loss": 2.6153, "step": 10 }, { "epoch": 0.001068276202812237, "grad_norm": 18.015764236450195, "learning_rate": 7.11743772241993e-07, "loss": 2.4432, "step": 20 }, { "epoch": 0.0016024143042183557, "grad_norm": 11.500164031982422, "learning_rate": 1.0676156583629894e-06, "loss": 2.0629, "step": 30 }, { "epoch": 0.002136552405624474, "grad_norm": 7.562989711761475, "learning_rate": 1.423487544483986e-06, "loss": 1.6082, "step": 40 }, { "epoch": 0.0026706905070305927, "grad_norm": 6.592193126678467, "learning_rate": 1.7793594306049826e-06, "loss": 1.2785, "step": 50 }, { "epoch": 0.0032048286084367114, "grad_norm": 5.542476654052734, "learning_rate": 2.135231316725979e-06, "loss": 1.1094, "step": 60 }, { "epoch": 0.0037389667098428297, "grad_norm": 4.7591962814331055, "learning_rate": 2.4911032028469755e-06, "loss": 1.0078, "step": 70 }, { "epoch": 0.004273104811248948, "grad_norm": 5.974002838134766, "learning_rate": 2.846975088967972e-06, "loss": 0.9115, "step": 80 }, { "epoch": 0.004807242912655067, "grad_norm": 4.8790812492370605, "learning_rate": 3.2028469750889684e-06, "loss": 0.8329, "step": 90 }, { "epoch": 0.0053413810140611855, "grad_norm": 5.5810933113098145, "learning_rate": 3.558718861209965e-06, "loss": 0.7884, "step": 100 }, { "epoch": 0.005875519115467304, "grad_norm": 4.7397332191467285, "learning_rate": 3.914590747330961e-06, "loss": 0.7539, "step": 110 }, { "epoch": 0.006409657216873423, "grad_norm": 4.994381427764893, "learning_rate": 4.270462633451958e-06, "loss": 0.7122, "step": 120 }, { "epoch": 0.006943795318279541, "grad_norm": 4.726022243499756, "learning_rate": 4.626334519572954e-06, "loss": 0.6996, "step": 130 }, { "epoch": 0.0074779334196856595, "grad_norm": 5.91300630569458, "learning_rate": 4.982206405693951e-06, "loss": 0.6562, "step": 140 }, { "epoch": 0.008012071521091778, "grad_norm": 4.6057047843933105, "learning_rate": 5.338078291814946e-06, "loss": 0.6473, "step": 150 }, { "epoch": 0.008546209622497896, "grad_norm": 5.253528118133545, "learning_rate": 5.693950177935944e-06, "loss": 0.662, "step": 160 }, { "epoch": 0.009080347723904016, "grad_norm": 4.646815776824951, "learning_rate": 6.049822064056941e-06, "loss": 0.614, "step": 170 }, { "epoch": 0.009614485825310133, "grad_norm": 4.555067539215088, "learning_rate": 6.405693950177937e-06, "loss": 0.6263, "step": 180 }, { "epoch": 0.010148623926716253, "grad_norm": 5.40243673324585, "learning_rate": 6.761565836298933e-06, "loss": 0.5909, "step": 190 }, { "epoch": 0.010682762028122371, "grad_norm": 5.421019077301025, "learning_rate": 7.11743772241993e-06, "loss": 0.5876, "step": 200 }, { "epoch": 0.011216900129528489, "grad_norm": 4.492127418518066, "learning_rate": 7.4733096085409265e-06, "loss": 0.5925, "step": 210 }, { "epoch": 0.011751038230934608, "grad_norm": 4.839940071105957, "learning_rate": 7.829181494661923e-06, "loss": 0.5748, "step": 220 }, { "epoch": 0.012285176332340726, "grad_norm": 4.641572952270508, "learning_rate": 8.18505338078292e-06, "loss": 0.5853, "step": 230 }, { "epoch": 0.012819314433746846, "grad_norm": 4.245635986328125, "learning_rate": 8.540925266903915e-06, "loss": 0.5495, "step": 240 }, { "epoch": 0.013353452535152964, "grad_norm": 4.625709533691406, "learning_rate": 8.896797153024912e-06, "loss": 0.544, "step": 250 }, { "epoch": 0.013887590636559081, "grad_norm": 3.908940076828003, "learning_rate": 9.252669039145908e-06, "loss": 0.5546, "step": 260 }, { "epoch": 0.014421728737965201, "grad_norm": 5.441112518310547, "learning_rate": 9.608540925266905e-06, "loss": 0.525, "step": 270 }, { "epoch": 0.014955866839371319, "grad_norm": 4.847513675689697, "learning_rate": 9.964412811387902e-06, "loss": 0.529, "step": 280 }, { "epoch": 0.015490004940777439, "grad_norm": 4.915203094482422, "learning_rate": 1.0320284697508897e-05, "loss": 0.5492, "step": 290 }, { "epoch": 0.016024143042183556, "grad_norm": 4.549454689025879, "learning_rate": 1.0676156583629893e-05, "loss": 0.5234, "step": 300 }, { "epoch": 0.016558281143589676, "grad_norm": 3.7291009426116943, "learning_rate": 1.103202846975089e-05, "loss": 0.5271, "step": 310 }, { "epoch": 0.017092419244995792, "grad_norm": 5.256275653839111, "learning_rate": 1.1387900355871889e-05, "loss": 0.525, "step": 320 }, { "epoch": 0.01762655734640191, "grad_norm": 4.500310897827148, "learning_rate": 1.1743772241992884e-05, "loss": 0.5257, "step": 330 }, { "epoch": 0.01816069544780803, "grad_norm": 3.7887609004974365, "learning_rate": 1.2099644128113881e-05, "loss": 0.502, "step": 340 }, { "epoch": 0.01869483354921415, "grad_norm": 5.051647186279297, "learning_rate": 1.2455516014234877e-05, "loss": 0.4693, "step": 350 }, { "epoch": 0.019228971650620267, "grad_norm": 5.201557636260986, "learning_rate": 1.2811387900355874e-05, "loss": 0.4895, "step": 360 }, { "epoch": 0.019763109752026387, "grad_norm": 4.449207305908203, "learning_rate": 1.3167259786476869e-05, "loss": 0.4944, "step": 370 }, { "epoch": 0.020297247853432506, "grad_norm": 4.097253322601318, "learning_rate": 1.3523131672597866e-05, "loss": 0.4636, "step": 380 }, { "epoch": 0.020831385954838622, "grad_norm": 4.542652606964111, "learning_rate": 1.3879003558718862e-05, "loss": 0.5052, "step": 390 }, { "epoch": 0.021365524056244742, "grad_norm": 3.9565999507904053, "learning_rate": 1.423487544483986e-05, "loss": 0.4903, "step": 400 }, { "epoch": 0.02189966215765086, "grad_norm": 4.017820835113525, "learning_rate": 1.4590747330960854e-05, "loss": 0.4951, "step": 410 }, { "epoch": 0.022433800259056978, "grad_norm": 4.008388996124268, "learning_rate": 1.4946619217081853e-05, "loss": 0.476, "step": 420 }, { "epoch": 0.022967938360463097, "grad_norm": 5.351808071136475, "learning_rate": 1.5302491103202847e-05, "loss": 0.4651, "step": 430 }, { "epoch": 0.023502076461869217, "grad_norm": 5.8637800216674805, "learning_rate": 1.5658362989323845e-05, "loss": 0.4791, "step": 440 }, { "epoch": 0.024036214563275336, "grad_norm": 4.3994574546813965, "learning_rate": 1.601423487544484e-05, "loss": 0.5026, "step": 450 }, { "epoch": 0.024570352664681452, "grad_norm": 3.5824458599090576, "learning_rate": 1.637010676156584e-05, "loss": 0.4702, "step": 460 }, { "epoch": 0.025104490766087572, "grad_norm": 4.551873683929443, "learning_rate": 1.6725978647686835e-05, "loss": 0.4895, "step": 470 }, { "epoch": 0.02563862886749369, "grad_norm": 3.756704092025757, "learning_rate": 1.708185053380783e-05, "loss": 0.4829, "step": 480 }, { "epoch": 0.026172766968899808, "grad_norm": 4.819668292999268, "learning_rate": 1.7437722419928826e-05, "loss": 0.4784, "step": 490 }, { "epoch": 0.026706905070305927, "grad_norm": 3.9722914695739746, "learning_rate": 1.7793594306049825e-05, "loss": 0.4987, "step": 500 }, { "epoch": 0.027241043171712047, "grad_norm": 3.4831347465515137, "learning_rate": 1.814946619217082e-05, "loss": 0.4636, "step": 510 }, { "epoch": 0.027775181273118163, "grad_norm": 4.324979782104492, "learning_rate": 1.8505338078291815e-05, "loss": 0.4369, "step": 520 }, { "epoch": 0.028309319374524283, "grad_norm": 4.316407680511475, "learning_rate": 1.886120996441281e-05, "loss": 0.4508, "step": 530 }, { "epoch": 0.028843457475930402, "grad_norm": 3.6342215538024902, "learning_rate": 1.921708185053381e-05, "loss": 0.453, "step": 540 }, { "epoch": 0.029377595577336522, "grad_norm": 3.7194764614105225, "learning_rate": 1.9572953736654805e-05, "loss": 0.4359, "step": 550 }, { "epoch": 0.029911733678742638, "grad_norm": 5.211735725402832, "learning_rate": 1.9928825622775804e-05, "loss": 0.4447, "step": 560 }, { "epoch": 0.030445871780148757, "grad_norm": 4.747473239898682, "learning_rate": 1.999999042219927e-05, "loss": 0.4587, "step": 570 }, { "epoch": 0.030980009881554877, "grad_norm": 2.9805171489715576, "learning_rate": 1.999995151241525e-05, "loss": 0.4588, "step": 580 }, { "epoch": 0.031514147982961, "grad_norm": 5.058079242706299, "learning_rate": 1.9999882672151766e-05, "loss": 0.4481, "step": 590 }, { "epoch": 0.03204828608436711, "grad_norm": 3.8155128955841064, "learning_rate": 1.9999783901614854e-05, "loss": 0.4451, "step": 600 }, { "epoch": 0.03258242418577323, "grad_norm": 4.303606986999512, "learning_rate": 1.9999655201100148e-05, "loss": 0.444, "step": 610 }, { "epoch": 0.03311656228717935, "grad_norm": 3.660501480102539, "learning_rate": 1.999949657099285e-05, "loss": 0.4344, "step": 620 }, { "epoch": 0.03365070038858547, "grad_norm": 4.342318058013916, "learning_rate": 1.9999308011767756e-05, "loss": 0.4379, "step": 630 }, { "epoch": 0.034184838489991584, "grad_norm": 4.153019428253174, "learning_rate": 1.9999089523989234e-05, "loss": 0.4542, "step": 640 }, { "epoch": 0.03471897659139771, "grad_norm": 4.421793460845947, "learning_rate": 1.999884110831123e-05, "loss": 0.4533, "step": 650 }, { "epoch": 0.03525311469280382, "grad_norm": 5.650811672210693, "learning_rate": 1.999856276547727e-05, "loss": 0.4478, "step": 660 }, { "epoch": 0.035787252794209946, "grad_norm": 5.65400505065918, "learning_rate": 1.9998254496320445e-05, "loss": 0.4585, "step": 670 }, { "epoch": 0.03632139089561606, "grad_norm": 4.007586479187012, "learning_rate": 1.999791630176343e-05, "loss": 0.4331, "step": 680 }, { "epoch": 0.03685552899702218, "grad_norm": 3.635618209838867, "learning_rate": 1.9997548182818466e-05, "loss": 0.4321, "step": 690 }, { "epoch": 0.0373896670984283, "grad_norm": 3.7454850673675537, "learning_rate": 1.9997150140587346e-05, "loss": 0.4327, "step": 700 }, { "epoch": 0.03792380519983442, "grad_norm": 4.554793357849121, "learning_rate": 1.9996722176261442e-05, "loss": 0.4319, "step": 710 }, { "epoch": 0.038457943301240534, "grad_norm": 4.4009904861450195, "learning_rate": 1.9996264291121675e-05, "loss": 0.4233, "step": 720 }, { "epoch": 0.03899208140264666, "grad_norm": 3.1878607273101807, "learning_rate": 1.9995776486538527e-05, "loss": 0.428, "step": 730 }, { "epoch": 0.03952621950405277, "grad_norm": 3.9422080516815186, "learning_rate": 1.9995258763972026e-05, "loss": 0.4382, "step": 740 }, { "epoch": 0.04006035760545889, "grad_norm": 4.085798740386963, "learning_rate": 1.9994711124971746e-05, "loss": 0.4263, "step": 750 }, { "epoch": 0.04059449570686501, "grad_norm": 6.82937479019165, "learning_rate": 1.9994133571176806e-05, "loss": 0.4419, "step": 760 }, { "epoch": 0.04112863380827113, "grad_norm": 3.6260581016540527, "learning_rate": 1.9993526104315864e-05, "loss": 0.4443, "step": 770 }, { "epoch": 0.041662771909677244, "grad_norm": 5.513408184051514, "learning_rate": 1.99928887262071e-05, "loss": 0.4136, "step": 780 }, { "epoch": 0.04219691001108337, "grad_norm": 3.920865058898926, "learning_rate": 1.999222143875823e-05, "loss": 0.431, "step": 790 }, { "epoch": 0.042731048112489484, "grad_norm": 5.597921371459961, "learning_rate": 1.9991524243966487e-05, "loss": 0.4247, "step": 800 }, { "epoch": 0.0432651862138956, "grad_norm": 3.887965440750122, "learning_rate": 1.999079714391862e-05, "loss": 0.4572, "step": 810 }, { "epoch": 0.04379932431530172, "grad_norm": 3.3102221488952637, "learning_rate": 1.9990040140790882e-05, "loss": 0.3998, "step": 820 }, { "epoch": 0.04433346241670784, "grad_norm": 2.6862614154815674, "learning_rate": 1.998925323684903e-05, "loss": 0.421, "step": 830 }, { "epoch": 0.044867600518113955, "grad_norm": 2.8465285301208496, "learning_rate": 1.9988436434448323e-05, "loss": 0.4287, "step": 840 }, { "epoch": 0.04540173861952008, "grad_norm": 3.344913959503174, "learning_rate": 1.9987589736033492e-05, "loss": 0.4008, "step": 850 }, { "epoch": 0.045935876720926194, "grad_norm": 3.295952081680298, "learning_rate": 1.9986713144138765e-05, "loss": 0.4218, "step": 860 }, { "epoch": 0.04647001482233232, "grad_norm": 5.973206043243408, "learning_rate": 1.9985806661387836e-05, "loss": 0.4457, "step": 870 }, { "epoch": 0.04700415292373843, "grad_norm": 3.6733510494232178, "learning_rate": 1.998487029049386e-05, "loss": 0.4434, "step": 880 }, { "epoch": 0.04753829102514455, "grad_norm": 3.297353506088257, "learning_rate": 1.998390403425946e-05, "loss": 0.4114, "step": 890 }, { "epoch": 0.04807242912655067, "grad_norm": 3.7135517597198486, "learning_rate": 1.9982907895576697e-05, "loss": 0.4286, "step": 900 }, { "epoch": 0.04860656722795679, "grad_norm": 3.4130375385284424, "learning_rate": 1.9981881877427075e-05, "loss": 0.4112, "step": 910 }, { "epoch": 0.049140705329362905, "grad_norm": 4.161523342132568, "learning_rate": 1.9980825982881537e-05, "loss": 0.4185, "step": 920 }, { "epoch": 0.04967484343076903, "grad_norm": 5.1869707107543945, "learning_rate": 1.9979740215100433e-05, "loss": 0.4258, "step": 930 }, { "epoch": 0.050208981532175144, "grad_norm": 4.530313014984131, "learning_rate": 1.997862457733354e-05, "loss": 0.4161, "step": 940 }, { "epoch": 0.05074311963358126, "grad_norm": 3.3689589500427246, "learning_rate": 1.997747907292003e-05, "loss": 0.4198, "step": 950 }, { "epoch": 0.05127725773498738, "grad_norm": 3.011768341064453, "learning_rate": 1.9976303705288464e-05, "loss": 0.395, "step": 960 }, { "epoch": 0.0518113958363935, "grad_norm": 3.640399932861328, "learning_rate": 1.99750984779568e-05, "loss": 0.4075, "step": 970 }, { "epoch": 0.052345533937799615, "grad_norm": 4.555020332336426, "learning_rate": 1.9973863394532352e-05, "loss": 0.4273, "step": 980 }, { "epoch": 0.05287967203920574, "grad_norm": 3.2937865257263184, "learning_rate": 1.9972598458711804e-05, "loss": 0.3877, "step": 990 }, { "epoch": 0.053413810140611855, "grad_norm": 5.399692058563232, "learning_rate": 1.9971303674281185e-05, "loss": 0.4278, "step": 1000 }, { "epoch": 0.05394794824201797, "grad_norm": 2.8460562229156494, "learning_rate": 1.996997904511587e-05, "loss": 0.3924, "step": 1010 }, { "epoch": 0.054482086343424094, "grad_norm": 3.966013193130493, "learning_rate": 1.9968624575180558e-05, "loss": 0.4071, "step": 1020 }, { "epoch": 0.05501622444483021, "grad_norm": 4.833566665649414, "learning_rate": 1.9967240268529255e-05, "loss": 0.3973, "step": 1030 }, { "epoch": 0.055550362546236326, "grad_norm": 4.089812278747559, "learning_rate": 1.9965826129305284e-05, "loss": 0.4031, "step": 1040 }, { "epoch": 0.05608450064764245, "grad_norm": 3.154456615447998, "learning_rate": 1.996438216174125e-05, "loss": 0.4197, "step": 1050 }, { "epoch": 0.056618638749048565, "grad_norm": 3.9685862064361572, "learning_rate": 1.9962908370159036e-05, "loss": 0.3921, "step": 1060 }, { "epoch": 0.05715277685045469, "grad_norm": 3.892277479171753, "learning_rate": 1.9961404758969794e-05, "loss": 0.3805, "step": 1070 }, { "epoch": 0.057686914951860804, "grad_norm": 7.212051868438721, "learning_rate": 1.9959871332673927e-05, "loss": 0.3969, "step": 1080 }, { "epoch": 0.05822105305326692, "grad_norm": 4.365306854248047, "learning_rate": 1.9958308095861074e-05, "loss": 0.4037, "step": 1090 }, { "epoch": 0.058755191154673043, "grad_norm": 3.167987823486328, "learning_rate": 1.9956715053210104e-05, "loss": 0.3777, "step": 1100 }, { "epoch": 0.05928932925607916, "grad_norm": 3.2435383796691895, "learning_rate": 1.9955092209489086e-05, "loss": 0.3967, "step": 1110 }, { "epoch": 0.059823467357485276, "grad_norm": 2.819976806640625, "learning_rate": 1.99534395695553e-05, "loss": 0.4009, "step": 1120 }, { "epoch": 0.0603576054588914, "grad_norm": 3.4456632137298584, "learning_rate": 1.9951757138355195e-05, "loss": 0.3856, "step": 1130 }, { "epoch": 0.060891743560297515, "grad_norm": 4.264998912811279, "learning_rate": 1.99500449209244e-05, "loss": 0.3867, "step": 1140 }, { "epoch": 0.06142588166170363, "grad_norm": 3.479968547821045, "learning_rate": 1.994830292238768e-05, "loss": 0.3964, "step": 1150 }, { "epoch": 0.061960019763109754, "grad_norm": 2.6845524311065674, "learning_rate": 1.9946531147958947e-05, "loss": 0.3983, "step": 1160 }, { "epoch": 0.06249415786451587, "grad_norm": 3.3149259090423584, "learning_rate": 1.994472960294124e-05, "loss": 0.3972, "step": 1170 }, { "epoch": 0.063028295965922, "grad_norm": 3.2550976276397705, "learning_rate": 1.994289829272669e-05, "loss": 0.3937, "step": 1180 }, { "epoch": 0.0635624340673281, "grad_norm": 2.429530382156372, "learning_rate": 1.9941037222796522e-05, "loss": 0.3933, "step": 1190 }, { "epoch": 0.06409657216873423, "grad_norm": 4.116220951080322, "learning_rate": 1.9939146398721034e-05, "loss": 0.3732, "step": 1200 }, { "epoch": 0.06463071027014035, "grad_norm": 5.098406791687012, "learning_rate": 1.9937225826159587e-05, "loss": 0.3939, "step": 1210 }, { "epoch": 0.06516484837154646, "grad_norm": 2.5038375854492188, "learning_rate": 1.9935275510860567e-05, "loss": 0.4033, "step": 1220 }, { "epoch": 0.06569898647295258, "grad_norm": 4.1177825927734375, "learning_rate": 1.9933295458661393e-05, "loss": 0.4118, "step": 1230 }, { "epoch": 0.0662331245743587, "grad_norm": 6.29306697845459, "learning_rate": 1.9931285675488486e-05, "loss": 0.3808, "step": 1240 }, { "epoch": 0.06676726267576481, "grad_norm": 2.8112893104553223, "learning_rate": 1.992924616735725e-05, "loss": 0.3699, "step": 1250 }, { "epoch": 0.06730140077717094, "grad_norm": 3.3691790103912354, "learning_rate": 1.992717694037206e-05, "loss": 0.386, "step": 1260 }, { "epoch": 0.06783553887857706, "grad_norm": 3.526228427886963, "learning_rate": 1.9925078000726246e-05, "loss": 0.3902, "step": 1270 }, { "epoch": 0.06836967697998317, "grad_norm": 3.1308605670928955, "learning_rate": 1.9922949354702063e-05, "loss": 0.3655, "step": 1280 }, { "epoch": 0.06890381508138929, "grad_norm": 3.8480379581451416, "learning_rate": 1.992079100867068e-05, "loss": 0.4004, "step": 1290 }, { "epoch": 0.06943795318279541, "grad_norm": 3.981570243835449, "learning_rate": 1.991860296909216e-05, "loss": 0.3861, "step": 1300 }, { "epoch": 0.06997209128420152, "grad_norm": 2.9895670413970947, "learning_rate": 1.9916385242515447e-05, "loss": 0.3839, "step": 1310 }, { "epoch": 0.07050622938560765, "grad_norm": 3.7068064212799072, "learning_rate": 1.9914137835578332e-05, "loss": 0.3662, "step": 1320 }, { "epoch": 0.07104036748701377, "grad_norm": 2.8939402103424072, "learning_rate": 1.9911860755007445e-05, "loss": 0.3831, "step": 1330 }, { "epoch": 0.07157450558841989, "grad_norm": 5.6148681640625, "learning_rate": 1.990955400761823e-05, "loss": 0.3784, "step": 1340 }, { "epoch": 0.072108643689826, "grad_norm": 3.1284894943237305, "learning_rate": 1.9907217600314928e-05, "loss": 0.3659, "step": 1350 }, { "epoch": 0.07264278179123212, "grad_norm": 2.4304819107055664, "learning_rate": 1.990485154009055e-05, "loss": 0.3781, "step": 1360 }, { "epoch": 0.07317691989263825, "grad_norm": 3.663529396057129, "learning_rate": 1.9902455834026863e-05, "loss": 0.3774, "step": 1370 }, { "epoch": 0.07371105799404436, "grad_norm": 3.560011386871338, "learning_rate": 1.9900030489294363e-05, "loss": 0.3762, "step": 1380 }, { "epoch": 0.07424519609545048, "grad_norm": 2.8052570819854736, "learning_rate": 1.989757551315226e-05, "loss": 0.3824, "step": 1390 }, { "epoch": 0.0747793341968566, "grad_norm": 2.9042539596557617, "learning_rate": 1.9895090912948453e-05, "loss": 0.3765, "step": 1400 }, { "epoch": 0.07531347229826271, "grad_norm": 2.720327615737915, "learning_rate": 1.98925766961195e-05, "loss": 0.385, "step": 1410 }, { "epoch": 0.07584761039966884, "grad_norm": 3.354515552520752, "learning_rate": 1.9890032870190613e-05, "loss": 0.3859, "step": 1420 }, { "epoch": 0.07638174850107496, "grad_norm": 4.520259380340576, "learning_rate": 1.9887459442775624e-05, "loss": 0.3799, "step": 1430 }, { "epoch": 0.07691588660248107, "grad_norm": 3.5492591857910156, "learning_rate": 1.9884856421576954e-05, "loss": 0.3831, "step": 1440 }, { "epoch": 0.07745002470388719, "grad_norm": 2.966611862182617, "learning_rate": 1.9882223814385618e-05, "loss": 0.3719, "step": 1450 }, { "epoch": 0.07798416280529331, "grad_norm": 3.529801368713379, "learning_rate": 1.9879561629081167e-05, "loss": 0.3838, "step": 1460 }, { "epoch": 0.07851830090669942, "grad_norm": 2.9840991497039795, "learning_rate": 1.9876869873631687e-05, "loss": 0.3686, "step": 1470 }, { "epoch": 0.07905243900810555, "grad_norm": 3.356109857559204, "learning_rate": 1.9874148556093774e-05, "loss": 0.3906, "step": 1480 }, { "epoch": 0.07958657710951167, "grad_norm": 3.180203437805176, "learning_rate": 1.9871397684612502e-05, "loss": 0.3596, "step": 1490 }, { "epoch": 0.08012071521091778, "grad_norm": 3.3919284343719482, "learning_rate": 1.9868617267421398e-05, "loss": 0.383, "step": 1500 }, { "epoch": 0.0806548533123239, "grad_norm": 2.9372594356536865, "learning_rate": 1.986580731284243e-05, "loss": 0.3642, "step": 1510 }, { "epoch": 0.08118899141373002, "grad_norm": 3.304171085357666, "learning_rate": 1.9862967829285956e-05, "loss": 0.3623, "step": 1520 }, { "epoch": 0.08172312951513613, "grad_norm": 2.4466633796691895, "learning_rate": 1.986009882525074e-05, "loss": 0.39, "step": 1530 }, { "epoch": 0.08225726761654226, "grad_norm": 3.968824625015259, "learning_rate": 1.9857200309323883e-05, "loss": 0.3845, "step": 1540 }, { "epoch": 0.08279140571794838, "grad_norm": 4.129607677459717, "learning_rate": 1.985427229018083e-05, "loss": 0.3618, "step": 1550 }, { "epoch": 0.08332554381935449, "grad_norm": 3.479787588119507, "learning_rate": 1.9851314776585318e-05, "loss": 0.363, "step": 1560 }, { "epoch": 0.08385968192076061, "grad_norm": 4.462396144866943, "learning_rate": 1.9848327777389375e-05, "loss": 0.3567, "step": 1570 }, { "epoch": 0.08439382002216674, "grad_norm": 3.3157875537872314, "learning_rate": 1.9845311301533275e-05, "loss": 0.3743, "step": 1580 }, { "epoch": 0.08492795812357284, "grad_norm": 3.2936484813690186, "learning_rate": 1.9842265358045515e-05, "loss": 0.3604, "step": 1590 }, { "epoch": 0.08546209622497897, "grad_norm": 2.485267162322998, "learning_rate": 1.9839189956042802e-05, "loss": 0.3856, "step": 1600 }, { "epoch": 0.08599623432638509, "grad_norm": 2.371533155441284, "learning_rate": 1.983608510473e-05, "loss": 0.3674, "step": 1610 }, { "epoch": 0.0865303724277912, "grad_norm": 2.9332618713378906, "learning_rate": 1.9832950813400123e-05, "loss": 0.3725, "step": 1620 }, { "epoch": 0.08706451052919732, "grad_norm": 2.6395232677459717, "learning_rate": 1.9829787091434305e-05, "loss": 0.3735, "step": 1630 }, { "epoch": 0.08759864863060345, "grad_norm": 3.93823504447937, "learning_rate": 1.982659394830176e-05, "loss": 0.3543, "step": 1640 }, { "epoch": 0.08813278673200955, "grad_norm": 3.257145404815674, "learning_rate": 1.9823371393559766e-05, "loss": 0.3632, "step": 1650 }, { "epoch": 0.08866692483341568, "grad_norm": 2.738173246383667, "learning_rate": 1.982011943685363e-05, "loss": 0.3818, "step": 1660 }, { "epoch": 0.0892010629348218, "grad_norm": 3.1435506343841553, "learning_rate": 1.981683808791666e-05, "loss": 0.3689, "step": 1670 }, { "epoch": 0.08973520103622791, "grad_norm": 2.9814436435699463, "learning_rate": 1.981352735657014e-05, "loss": 0.3627, "step": 1680 }, { "epoch": 0.09026933913763403, "grad_norm": 3.1710870265960693, "learning_rate": 1.9810187252723298e-05, "loss": 0.3713, "step": 1690 }, { "epoch": 0.09080347723904016, "grad_norm": 3.5158421993255615, "learning_rate": 1.980681778637327e-05, "loss": 0.3572, "step": 1700 }, { "epoch": 0.09133761534044628, "grad_norm": 3.327237367630005, "learning_rate": 1.9803418967605082e-05, "loss": 0.3575, "step": 1710 }, { "epoch": 0.09187175344185239, "grad_norm": 2.74479341506958, "learning_rate": 1.979999080659161e-05, "loss": 0.3825, "step": 1720 }, { "epoch": 0.09240589154325851, "grad_norm": 2.519000768661499, "learning_rate": 1.9796533313593552e-05, "loss": 0.3543, "step": 1730 }, { "epoch": 0.09294002964466463, "grad_norm": 3.0578513145446777, "learning_rate": 1.9793046498959402e-05, "loss": 0.368, "step": 1740 }, { "epoch": 0.09347416774607074, "grad_norm": 3.521696090698242, "learning_rate": 1.9789530373125413e-05, "loss": 0.349, "step": 1750 }, { "epoch": 0.09400830584747687, "grad_norm": 4.292337894439697, "learning_rate": 1.978598494661557e-05, "loss": 0.3721, "step": 1760 }, { "epoch": 0.09454244394888299, "grad_norm": 2.322366237640381, "learning_rate": 1.9782410230041558e-05, "loss": 0.3694, "step": 1770 }, { "epoch": 0.0950765820502891, "grad_norm": 2.7006542682647705, "learning_rate": 1.9778806234102723e-05, "loss": 0.3509, "step": 1780 }, { "epoch": 0.09561072015169522, "grad_norm": 3.400186061859131, "learning_rate": 1.977517296958605e-05, "loss": 0.3631, "step": 1790 }, { "epoch": 0.09614485825310135, "grad_norm": 3.719177007675171, "learning_rate": 1.9771510447366132e-05, "loss": 0.3368, "step": 1800 }, { "epoch": 0.09667899635450745, "grad_norm": 4.550919532775879, "learning_rate": 1.9767818678405123e-05, "loss": 0.3621, "step": 1810 }, { "epoch": 0.09721313445591358, "grad_norm": 2.492908239364624, "learning_rate": 1.976409767375272e-05, "loss": 0.3628, "step": 1820 }, { "epoch": 0.0977472725573197, "grad_norm": 7.344775199890137, "learning_rate": 1.976034744454612e-05, "loss": 0.3467, "step": 1830 }, { "epoch": 0.09828141065872581, "grad_norm": 2.216784954071045, "learning_rate": 1.975656800200999e-05, "loss": 0.3346, "step": 1840 }, { "epoch": 0.09881554876013193, "grad_norm": 3.7858848571777344, "learning_rate": 1.9752759357456448e-05, "loss": 0.3643, "step": 1850 }, { "epoch": 0.09934968686153806, "grad_norm": 4.518852233886719, "learning_rate": 1.9748921522284995e-05, "loss": 0.3444, "step": 1860 }, { "epoch": 0.09988382496294416, "grad_norm": 3.270376682281494, "learning_rate": 1.9745054507982514e-05, "loss": 0.3559, "step": 1870 }, { "epoch": 0.10041796306435029, "grad_norm": 3.1475467681884766, "learning_rate": 1.9741158326123228e-05, "loss": 0.3797, "step": 1880 }, { "epoch": 0.10095210116575641, "grad_norm": 3.485856533050537, "learning_rate": 1.973723298836864e-05, "loss": 0.3401, "step": 1890 }, { "epoch": 0.10148623926716252, "grad_norm": 2.7243599891662598, "learning_rate": 1.9733278506467538e-05, "loss": 0.3462, "step": 1900 }, { "epoch": 0.10202037736856864, "grad_norm": 5.423892021179199, "learning_rate": 1.9729294892255933e-05, "loss": 0.3573, "step": 1910 }, { "epoch": 0.10255451546997477, "grad_norm": 2.715611219406128, "learning_rate": 1.972528215765703e-05, "loss": 0.3656, "step": 1920 }, { "epoch": 0.10308865357138088, "grad_norm": 2.775381088256836, "learning_rate": 1.9721240314681196e-05, "loss": 0.3445, "step": 1930 }, { "epoch": 0.103622791672787, "grad_norm": 3.233672618865967, "learning_rate": 1.9717169375425917e-05, "loss": 0.3673, "step": 1940 }, { "epoch": 0.10415692977419312, "grad_norm": 3.027465581893921, "learning_rate": 1.971306935207577e-05, "loss": 0.3436, "step": 1950 }, { "epoch": 0.10469106787559923, "grad_norm": 3.5339982509613037, "learning_rate": 1.9708940256902382e-05, "loss": 0.3717, "step": 1960 }, { "epoch": 0.10522520597700535, "grad_norm": 4.634093284606934, "learning_rate": 1.970478210226439e-05, "loss": 0.369, "step": 1970 }, { "epoch": 0.10575934407841148, "grad_norm": 2.7980735301971436, "learning_rate": 1.9700594900607415e-05, "loss": 0.3376, "step": 1980 }, { "epoch": 0.10629348217981759, "grad_norm": 5.012727737426758, "learning_rate": 1.9696378664464008e-05, "loss": 0.3566, "step": 1990 }, { "epoch": 0.10682762028122371, "grad_norm": 3.3315470218658447, "learning_rate": 1.969213340645363e-05, "loss": 0.3821, "step": 2000 }, { "epoch": 0.10736175838262983, "grad_norm": 2.4778151512145996, "learning_rate": 1.9687859139282604e-05, "loss": 0.3612, "step": 2010 }, { "epoch": 0.10789589648403594, "grad_norm": 2.97556209564209, "learning_rate": 1.9683555875744074e-05, "loss": 0.3517, "step": 2020 }, { "epoch": 0.10843003458544206, "grad_norm": 2.5819060802459717, "learning_rate": 1.9679223628717983e-05, "loss": 0.3645, "step": 2030 }, { "epoch": 0.10896417268684819, "grad_norm": 3.486530303955078, "learning_rate": 1.9674862411171012e-05, "loss": 0.3531, "step": 2040 }, { "epoch": 0.1094983107882543, "grad_norm": 3.691188097000122, "learning_rate": 1.9670472236156558e-05, "loss": 0.3557, "step": 2050 }, { "epoch": 0.11003244888966042, "grad_norm": 2.3335423469543457, "learning_rate": 1.9666053116814693e-05, "loss": 0.363, "step": 2060 }, { "epoch": 0.11056658699106654, "grad_norm": 3.414902687072754, "learning_rate": 1.9661605066372115e-05, "loss": 0.34, "step": 2070 }, { "epoch": 0.11110072509247265, "grad_norm": 2.248997449874878, "learning_rate": 1.9657128098142115e-05, "loss": 0.3499, "step": 2080 }, { "epoch": 0.11163486319387877, "grad_norm": 3.0081093311309814, "learning_rate": 1.9652622225524537e-05, "loss": 0.345, "step": 2090 }, { "epoch": 0.1121690012952849, "grad_norm": 4.787203311920166, "learning_rate": 1.9648087462005746e-05, "loss": 0.3416, "step": 2100 }, { "epoch": 0.11270313939669102, "grad_norm": 3.1817538738250732, "learning_rate": 1.9643523821158566e-05, "loss": 0.3446, "step": 2110 }, { "epoch": 0.11323727749809713, "grad_norm": 2.8154513835906982, "learning_rate": 1.9638931316642265e-05, "loss": 0.3519, "step": 2120 }, { "epoch": 0.11377141559950325, "grad_norm": 3.6086184978485107, "learning_rate": 1.963430996220249e-05, "loss": 0.3454, "step": 2130 }, { "epoch": 0.11430555370090938, "grad_norm": 4.197864055633545, "learning_rate": 1.9629659771671244e-05, "loss": 0.3656, "step": 2140 }, { "epoch": 0.11483969180231549, "grad_norm": 3.165158271789551, "learning_rate": 1.962498075896684e-05, "loss": 0.333, "step": 2150 }, { "epoch": 0.11537382990372161, "grad_norm": 3.608457565307617, "learning_rate": 1.962027293809386e-05, "loss": 0.3557, "step": 2160 }, { "epoch": 0.11590796800512773, "grad_norm": 2.9369077682495117, "learning_rate": 1.9615536323143097e-05, "loss": 0.3539, "step": 2170 }, { "epoch": 0.11644210610653384, "grad_norm": 3.2493855953216553, "learning_rate": 1.9610770928291542e-05, "loss": 0.341, "step": 2180 }, { "epoch": 0.11697624420793996, "grad_norm": 3.7158823013305664, "learning_rate": 1.9605976767802315e-05, "loss": 0.3345, "step": 2190 }, { "epoch": 0.11751038230934609, "grad_norm": 2.6097054481506348, "learning_rate": 1.960115385602464e-05, "loss": 0.3397, "step": 2200 }, { "epoch": 0.1180445204107522, "grad_norm": 3.3834986686706543, "learning_rate": 1.959630220739379e-05, "loss": 0.3401, "step": 2210 }, { "epoch": 0.11857865851215832, "grad_norm": 2.5442283153533936, "learning_rate": 1.9591421836431057e-05, "loss": 0.3425, "step": 2220 }, { "epoch": 0.11911279661356444, "grad_norm": 3.4543728828430176, "learning_rate": 1.9586512757743693e-05, "loss": 0.3472, "step": 2230 }, { "epoch": 0.11964693471497055, "grad_norm": 2.6369924545288086, "learning_rate": 1.9581574986024882e-05, "loss": 0.33, "step": 2240 }, { "epoch": 0.12018107281637667, "grad_norm": 3.3894622325897217, "learning_rate": 1.9576608536053678e-05, "loss": 0.3398, "step": 2250 }, { "epoch": 0.1207152109177828, "grad_norm": 3.1803109645843506, "learning_rate": 1.9571613422694973e-05, "loss": 0.3463, "step": 2260 }, { "epoch": 0.1212493490191889, "grad_norm": 2.483438491821289, "learning_rate": 1.9566589660899463e-05, "loss": 0.3392, "step": 2270 }, { "epoch": 0.12178348712059503, "grad_norm": 3.298787832260132, "learning_rate": 1.9561537265703573e-05, "loss": 0.3323, "step": 2280 }, { "epoch": 0.12231762522200115, "grad_norm": 1.814156174659729, "learning_rate": 1.955645625222944e-05, "loss": 0.3497, "step": 2290 }, { "epoch": 0.12285176332340726, "grad_norm": 2.514442205429077, "learning_rate": 1.9551346635684853e-05, "loss": 0.3547, "step": 2300 }, { "epoch": 0.12338590142481339, "grad_norm": 3.6863417625427246, "learning_rate": 1.954620843136322e-05, "loss": 0.3563, "step": 2310 }, { "epoch": 0.12392003952621951, "grad_norm": 3.1592390537261963, "learning_rate": 1.9541041654643498e-05, "loss": 0.3416, "step": 2320 }, { "epoch": 0.12445417762762562, "grad_norm": 2.292806625366211, "learning_rate": 1.953584632099018e-05, "loss": 0.3428, "step": 2330 }, { "epoch": 0.12498831572903174, "grad_norm": 3.393087387084961, "learning_rate": 1.9530622445953217e-05, "loss": 0.3479, "step": 2340 }, { "epoch": 0.12552245383043786, "grad_norm": 3.725224494934082, "learning_rate": 1.9525370045168e-05, "loss": 0.3357, "step": 2350 }, { "epoch": 0.126056591931844, "grad_norm": 2.8739302158355713, "learning_rate": 1.9520089134355288e-05, "loss": 0.3391, "step": 2360 }, { "epoch": 0.1265907300332501, "grad_norm": 3.1822566986083984, "learning_rate": 1.9514779729321183e-05, "loss": 0.3424, "step": 2370 }, { "epoch": 0.1271248681346562, "grad_norm": 2.8543050289154053, "learning_rate": 1.950944184595706e-05, "loss": 0.3216, "step": 2380 }, { "epoch": 0.12765900623606233, "grad_norm": 3.0438168048858643, "learning_rate": 1.950407550023954e-05, "loss": 0.3532, "step": 2390 }, { "epoch": 0.12819314433746845, "grad_norm": 2.883988380432129, "learning_rate": 1.9498680708230426e-05, "loss": 0.3332, "step": 2400 }, { "epoch": 0.12872728243887457, "grad_norm": 1.9879076480865479, "learning_rate": 1.9493257486076675e-05, "loss": 0.3551, "step": 2410 }, { "epoch": 0.1292614205402807, "grad_norm": 2.9743425846099854, "learning_rate": 1.9487805850010323e-05, "loss": 0.3512, "step": 2420 }, { "epoch": 0.12979555864168682, "grad_norm": 2.2685039043426514, "learning_rate": 1.948232581634846e-05, "loss": 0.3344, "step": 2430 }, { "epoch": 0.13032969674309292, "grad_norm": 3.436135768890381, "learning_rate": 1.9476817401493173e-05, "loss": 0.3472, "step": 2440 }, { "epoch": 0.13086383484449904, "grad_norm": 2.7500932216644287, "learning_rate": 1.9471280621931488e-05, "loss": 0.342, "step": 2450 }, { "epoch": 0.13139797294590516, "grad_norm": 3.199063539505005, "learning_rate": 1.9465715494235332e-05, "loss": 0.3385, "step": 2460 }, { "epoch": 0.13193211104731128, "grad_norm": 2.204498291015625, "learning_rate": 1.9460122035061476e-05, "loss": 0.3337, "step": 2470 }, { "epoch": 0.1324662491487174, "grad_norm": 2.6278960704803467, "learning_rate": 1.9454500261151498e-05, "loss": 0.3201, "step": 2480 }, { "epoch": 0.13300038725012353, "grad_norm": 3.07122540473938, "learning_rate": 1.944885018933172e-05, "loss": 0.3364, "step": 2490 }, { "epoch": 0.13353452535152963, "grad_norm": 3.0265800952911377, "learning_rate": 1.9443171836513155e-05, "loss": 0.3414, "step": 2500 }, { "epoch": 0.13406866345293575, "grad_norm": 2.829035758972168, "learning_rate": 1.943746521969147e-05, "loss": 0.3559, "step": 2510 }, { "epoch": 0.13460280155434187, "grad_norm": 3.0216164588928223, "learning_rate": 1.9431730355946925e-05, "loss": 0.3517, "step": 2520 }, { "epoch": 0.135136939655748, "grad_norm": 2.604104518890381, "learning_rate": 1.9425967262444322e-05, "loss": 0.3339, "step": 2530 }, { "epoch": 0.13567107775715412, "grad_norm": 3.5679659843444824, "learning_rate": 1.9420175956432967e-05, "loss": 0.3535, "step": 2540 }, { "epoch": 0.13620521585856024, "grad_norm": 3.134125232696533, "learning_rate": 1.9414356455246597e-05, "loss": 0.3553, "step": 2550 }, { "epoch": 0.13673935395996634, "grad_norm": 2.4072985649108887, "learning_rate": 1.9408508776303342e-05, "loss": 0.3278, "step": 2560 }, { "epoch": 0.13727349206137246, "grad_norm": 10.248159408569336, "learning_rate": 1.940263293710567e-05, "loss": 0.3415, "step": 2570 }, { "epoch": 0.13780763016277858, "grad_norm": 3.167949676513672, "learning_rate": 1.9396728955240336e-05, "loss": 0.3365, "step": 2580 }, { "epoch": 0.1383417682641847, "grad_norm": 2.8755500316619873, "learning_rate": 1.9390796848378323e-05, "loss": 0.34, "step": 2590 }, { "epoch": 0.13887590636559083, "grad_norm": 3.016554355621338, "learning_rate": 1.9384836634274805e-05, "loss": 0.3177, "step": 2600 }, { "epoch": 0.13941004446699695, "grad_norm": 3.3917722702026367, "learning_rate": 1.9378848330769066e-05, "loss": 0.3315, "step": 2610 }, { "epoch": 0.13994418256840305, "grad_norm": 3.2236247062683105, "learning_rate": 1.937283195578448e-05, "loss": 0.3124, "step": 2620 }, { "epoch": 0.14047832066980917, "grad_norm": 2.9058220386505127, "learning_rate": 1.936678752732843e-05, "loss": 0.3313, "step": 2630 }, { "epoch": 0.1410124587712153, "grad_norm": 2.6476597785949707, "learning_rate": 1.9360715063492265e-05, "loss": 0.3411, "step": 2640 }, { "epoch": 0.14154659687262142, "grad_norm": 2.666013240814209, "learning_rate": 1.9354614582451253e-05, "loss": 0.3331, "step": 2650 }, { "epoch": 0.14208073497402754, "grad_norm": 1.8294079303741455, "learning_rate": 1.9348486102464513e-05, "loss": 0.3268, "step": 2660 }, { "epoch": 0.14261487307543366, "grad_norm": 2.538583517074585, "learning_rate": 1.934232964187497e-05, "loss": 0.342, "step": 2670 }, { "epoch": 0.14314901117683979, "grad_norm": 3.3661508560180664, "learning_rate": 1.9336145219109293e-05, "loss": 0.3379, "step": 2680 }, { "epoch": 0.14368314927824588, "grad_norm": 2.4903249740600586, "learning_rate": 1.9329932852677843e-05, "loss": 0.3386, "step": 2690 }, { "epoch": 0.144217287379652, "grad_norm": 3.2595815658569336, "learning_rate": 1.932369256117463e-05, "loss": 0.3247, "step": 2700 }, { "epoch": 0.14475142548105813, "grad_norm": 2.412522077560425, "learning_rate": 1.9317424363277223e-05, "loss": 0.3175, "step": 2710 }, { "epoch": 0.14528556358246425, "grad_norm": 2.359240770339966, "learning_rate": 1.9311128277746744e-05, "loss": 0.3508, "step": 2720 }, { "epoch": 0.14581970168387037, "grad_norm": 3.089524030685425, "learning_rate": 1.9304804323427757e-05, "loss": 0.3407, "step": 2730 }, { "epoch": 0.1463538397852765, "grad_norm": 1.834702491760254, "learning_rate": 1.9298452519248263e-05, "loss": 0.3294, "step": 2740 }, { "epoch": 0.1468879778866826, "grad_norm": 2.3364810943603516, "learning_rate": 1.9292072884219602e-05, "loss": 0.3269, "step": 2750 }, { "epoch": 0.14742211598808871, "grad_norm": 2.751586437225342, "learning_rate": 1.928566543743642e-05, "loss": 0.3052, "step": 2760 }, { "epoch": 0.14795625408949484, "grad_norm": 2.6108336448669434, "learning_rate": 1.9279230198076614e-05, "loss": 0.3253, "step": 2770 }, { "epoch": 0.14849039219090096, "grad_norm": 2.1829898357391357, "learning_rate": 1.9272767185401244e-05, "loss": 0.3197, "step": 2780 }, { "epoch": 0.14902453029230708, "grad_norm": 1.8864151239395142, "learning_rate": 1.9266276418754522e-05, "loss": 0.3211, "step": 2790 }, { "epoch": 0.1495586683937132, "grad_norm": 2.769913673400879, "learning_rate": 1.9259757917563714e-05, "loss": 0.323, "step": 2800 }, { "epoch": 0.1500928064951193, "grad_norm": 2.135254144668579, "learning_rate": 1.92532117013391e-05, "loss": 0.3281, "step": 2810 }, { "epoch": 0.15062694459652543, "grad_norm": 3.415153741836548, "learning_rate": 1.9246637789673913e-05, "loss": 0.3252, "step": 2820 }, { "epoch": 0.15116108269793155, "grad_norm": 2.0934529304504395, "learning_rate": 1.9240036202244282e-05, "loss": 0.3183, "step": 2830 }, { "epoch": 0.15169522079933767, "grad_norm": 2.2274978160858154, "learning_rate": 1.9233406958809174e-05, "loss": 0.3067, "step": 2840 }, { "epoch": 0.1522293589007438, "grad_norm": 2.069931745529175, "learning_rate": 1.9226750079210327e-05, "loss": 0.3203, "step": 2850 }, { "epoch": 0.15276349700214992, "grad_norm": 2.8688833713531494, "learning_rate": 1.9220065583372197e-05, "loss": 0.3313, "step": 2860 }, { "epoch": 0.153297635103556, "grad_norm": 2.8972392082214355, "learning_rate": 1.9213353491301897e-05, "loss": 0.3192, "step": 2870 }, { "epoch": 0.15383177320496214, "grad_norm": 2.599181890487671, "learning_rate": 1.9206613823089134e-05, "loss": 0.3203, "step": 2880 }, { "epoch": 0.15436591130636826, "grad_norm": 1.69851815700531, "learning_rate": 1.9199846598906163e-05, "loss": 0.3194, "step": 2890 }, { "epoch": 0.15490004940777438, "grad_norm": 2.2960007190704346, "learning_rate": 1.9193051839007707e-05, "loss": 0.3222, "step": 2900 }, { "epoch": 0.1554341875091805, "grad_norm": 1.9096348285675049, "learning_rate": 1.9186229563730905e-05, "loss": 0.3405, "step": 2910 }, { "epoch": 0.15596832561058663, "grad_norm": 3.0293571949005127, "learning_rate": 1.917937979349525e-05, "loss": 0.3231, "step": 2920 }, { "epoch": 0.15650246371199272, "grad_norm": 2.2593390941619873, "learning_rate": 1.9172502548802542e-05, "loss": 0.3221, "step": 2930 }, { "epoch": 0.15703660181339885, "grad_norm": 2.003842830657959, "learning_rate": 1.916559785023679e-05, "loss": 0.3229, "step": 2940 }, { "epoch": 0.15757073991480497, "grad_norm": 2.52433443069458, "learning_rate": 1.91586657184642e-05, "loss": 0.335, "step": 2950 }, { "epoch": 0.1581048780162111, "grad_norm": 2.268087387084961, "learning_rate": 1.915170617423307e-05, "loss": 0.3232, "step": 2960 }, { "epoch": 0.15863901611761722, "grad_norm": 2.3799333572387695, "learning_rate": 1.914471923837375e-05, "loss": 0.3144, "step": 2970 }, { "epoch": 0.15917315421902334, "grad_norm": 1.7771875858306885, "learning_rate": 1.9137704931798576e-05, "loss": 0.3026, "step": 2980 }, { "epoch": 0.15970729232042943, "grad_norm": 5.339052677154541, "learning_rate": 1.913066327550181e-05, "loss": 0.3387, "step": 2990 }, { "epoch": 0.16024143042183556, "grad_norm": 3.125415563583374, "learning_rate": 1.9123594290559563e-05, "loss": 0.3273, "step": 3000 }, { "epoch": 0.16077556852324168, "grad_norm": 1.921730637550354, "learning_rate": 1.911649799812975e-05, "loss": 0.309, "step": 3010 }, { "epoch": 0.1613097066246478, "grad_norm": 3.7113003730773926, "learning_rate": 1.9109374419452026e-05, "loss": 0.3135, "step": 3020 }, { "epoch": 0.16184384472605393, "grad_norm": 2.8721847534179688, "learning_rate": 1.9102223575847693e-05, "loss": 0.3211, "step": 3030 }, { "epoch": 0.16237798282746005, "grad_norm": 2.055187702178955, "learning_rate": 1.909504548871968e-05, "loss": 0.2891, "step": 3040 }, { "epoch": 0.16291212092886617, "grad_norm": 2.0971548557281494, "learning_rate": 1.9087840179552448e-05, "loss": 0.3183, "step": 3050 }, { "epoch": 0.16344625903027227, "grad_norm": 3.2272660732269287, "learning_rate": 1.908060766991194e-05, "loss": 0.335, "step": 3060 }, { "epoch": 0.1639803971316784, "grad_norm": 2.9197070598602295, "learning_rate": 1.9073347981445507e-05, "loss": 0.3107, "step": 3070 }, { "epoch": 0.1645145352330845, "grad_norm": 3.323416233062744, "learning_rate": 1.9066061135881852e-05, "loss": 0.3251, "step": 3080 }, { "epoch": 0.16504867333449064, "grad_norm": 2.8057186603546143, "learning_rate": 1.9058747155030956e-05, "loss": 0.308, "step": 3090 }, { "epoch": 0.16558281143589676, "grad_norm": 3.7518258094787598, "learning_rate": 1.9051406060784024e-05, "loss": 0.315, "step": 3100 }, { "epoch": 0.16611694953730288, "grad_norm": 2.0896756649017334, "learning_rate": 1.9044037875113405e-05, "loss": 0.3144, "step": 3110 }, { "epoch": 0.16665108763870898, "grad_norm": 1.6856944561004639, "learning_rate": 1.9036642620072544e-05, "loss": 0.3196, "step": 3120 }, { "epoch": 0.1671852257401151, "grad_norm": 2.2599587440490723, "learning_rate": 1.90292203177959e-05, "loss": 0.3153, "step": 3130 }, { "epoch": 0.16771936384152122, "grad_norm": 3.611912250518799, "learning_rate": 1.9021770990498884e-05, "loss": 0.3417, "step": 3140 }, { "epoch": 0.16825350194292735, "grad_norm": 2.35536789894104, "learning_rate": 1.90142946604778e-05, "loss": 0.3316, "step": 3150 }, { "epoch": 0.16878764004433347, "grad_norm": 2.74565052986145, "learning_rate": 1.900679135010977e-05, "loss": 0.309, "step": 3160 }, { "epoch": 0.1693217781457396, "grad_norm": 2.5373237133026123, "learning_rate": 1.8999261081852673e-05, "loss": 0.3215, "step": 3170 }, { "epoch": 0.1698559162471457, "grad_norm": 3.2715067863464355, "learning_rate": 1.899170387824507e-05, "loss": 0.3362, "step": 3180 }, { "epoch": 0.1703900543485518, "grad_norm": 2.4944100379943848, "learning_rate": 1.898411976190614e-05, "loss": 0.3265, "step": 3190 }, { "epoch": 0.17092419244995793, "grad_norm": 4.67669153213501, "learning_rate": 1.8976508755535627e-05, "loss": 0.3005, "step": 3200 }, { "epoch": 0.17145833055136406, "grad_norm": 2.6218225955963135, "learning_rate": 1.896887088191374e-05, "loss": 0.3268, "step": 3210 }, { "epoch": 0.17199246865277018, "grad_norm": 2.644885540008545, "learning_rate": 1.8961206163901113e-05, "loss": 0.3076, "step": 3220 }, { "epoch": 0.1725266067541763, "grad_norm": 3.3982152938842773, "learning_rate": 1.895351462443873e-05, "loss": 0.3074, "step": 3230 }, { "epoch": 0.1730607448555824, "grad_norm": 2.8020308017730713, "learning_rate": 1.894579628654784e-05, "loss": 0.2997, "step": 3240 }, { "epoch": 0.17359488295698852, "grad_norm": 2.4847612380981445, "learning_rate": 1.8938051173329924e-05, "loss": 0.3262, "step": 3250 }, { "epoch": 0.17412902105839465, "grad_norm": 1.876759648323059, "learning_rate": 1.8930279307966577e-05, "loss": 0.3058, "step": 3260 }, { "epoch": 0.17466315915980077, "grad_norm": 3.0607967376708984, "learning_rate": 1.892248071371948e-05, "loss": 0.3159, "step": 3270 }, { "epoch": 0.1751972972612069, "grad_norm": 3.056457281112671, "learning_rate": 1.891465541393032e-05, "loss": 0.3031, "step": 3280 }, { "epoch": 0.17573143536261301, "grad_norm": 3.2005832195281982, "learning_rate": 1.8906803432020703e-05, "loss": 0.3172, "step": 3290 }, { "epoch": 0.1762655734640191, "grad_norm": 2.290046453475952, "learning_rate": 1.8898924791492098e-05, "loss": 0.3046, "step": 3300 }, { "epoch": 0.17679971156542523, "grad_norm": 2.0925638675689697, "learning_rate": 1.8891019515925774e-05, "loss": 0.293, "step": 3310 }, { "epoch": 0.17733384966683136, "grad_norm": 2.9725520610809326, "learning_rate": 1.888308762898271e-05, "loss": 0.3129, "step": 3320 }, { "epoch": 0.17786798776823748, "grad_norm": 2.8957231044769287, "learning_rate": 1.8875129154403543e-05, "loss": 0.3025, "step": 3330 }, { "epoch": 0.1784021258696436, "grad_norm": 3.4979498386383057, "learning_rate": 1.8867144116008482e-05, "loss": 0.321, "step": 3340 }, { "epoch": 0.17893626397104972, "grad_norm": 2.4912140369415283, "learning_rate": 1.8859132537697245e-05, "loss": 0.3099, "step": 3350 }, { "epoch": 0.17947040207245582, "grad_norm": 2.159302234649658, "learning_rate": 1.885109444344899e-05, "loss": 0.2995, "step": 3360 }, { "epoch": 0.18000454017386194, "grad_norm": 2.479374408721924, "learning_rate": 1.884302985732223e-05, "loss": 0.3124, "step": 3370 }, { "epoch": 0.18053867827526807, "grad_norm": 2.085726261138916, "learning_rate": 1.883493880345478e-05, "loss": 0.3142, "step": 3380 }, { "epoch": 0.1810728163766742, "grad_norm": 3.4471168518066406, "learning_rate": 1.8826821306063668e-05, "loss": 0.3221, "step": 3390 }, { "epoch": 0.1816069544780803, "grad_norm": 2.7534632682800293, "learning_rate": 1.8818677389445065e-05, "loss": 0.3039, "step": 3400 }, { "epoch": 0.18214109257948644, "grad_norm": 2.717694044113159, "learning_rate": 1.8810507077974233e-05, "loss": 0.2975, "step": 3410 }, { "epoch": 0.18267523068089256, "grad_norm": 2.719078302383423, "learning_rate": 1.8802310396105415e-05, "loss": 0.3206, "step": 3420 }, { "epoch": 0.18320936878229865, "grad_norm": 2.4004297256469727, "learning_rate": 1.8794087368371798e-05, "loss": 0.3214, "step": 3430 }, { "epoch": 0.18374350688370478, "grad_norm": 2.9754390716552734, "learning_rate": 1.878583801938541e-05, "loss": 0.3252, "step": 3440 }, { "epoch": 0.1842776449851109, "grad_norm": 2.355118751525879, "learning_rate": 1.8777562373837077e-05, "loss": 0.2915, "step": 3450 }, { "epoch": 0.18481178308651702, "grad_norm": 2.6009035110473633, "learning_rate": 1.876926045649632e-05, "loss": 0.3023, "step": 3460 }, { "epoch": 0.18534592118792315, "grad_norm": 2.7690556049346924, "learning_rate": 1.876093229221129e-05, "loss": 0.3009, "step": 3470 }, { "epoch": 0.18588005928932927, "grad_norm": 2.1791882514953613, "learning_rate": 1.875257790590871e-05, "loss": 0.3183, "step": 3480 }, { "epoch": 0.18641419739073536, "grad_norm": 2.8397228717803955, "learning_rate": 1.8744197322593776e-05, "loss": 0.3136, "step": 3490 }, { "epoch": 0.1869483354921415, "grad_norm": 1.7311362028121948, "learning_rate": 1.8735790567350105e-05, "loss": 0.3049, "step": 3500 }, { "epoch": 0.1874824735935476, "grad_norm": 2.3864359855651855, "learning_rate": 1.8727357665339632e-05, "loss": 0.3019, "step": 3510 }, { "epoch": 0.18801661169495373, "grad_norm": 2.822932243347168, "learning_rate": 1.871889864180256e-05, "loss": 0.3186, "step": 3520 }, { "epoch": 0.18855074979635986, "grad_norm": 2.8305487632751465, "learning_rate": 1.8710413522057277e-05, "loss": 0.3154, "step": 3530 }, { "epoch": 0.18908488789776598, "grad_norm": 2.2257533073425293, "learning_rate": 1.8701902331500276e-05, "loss": 0.3005, "step": 3540 }, { "epoch": 0.18961902599917208, "grad_norm": 2.752640724182129, "learning_rate": 1.8693365095606087e-05, "loss": 0.3066, "step": 3550 }, { "epoch": 0.1901531641005782, "grad_norm": 2.0576658248901367, "learning_rate": 1.868480183992718e-05, "loss": 0.2935, "step": 3560 }, { "epoch": 0.19068730220198432, "grad_norm": 2.421781301498413, "learning_rate": 1.867621259009392e-05, "loss": 0.3135, "step": 3570 }, { "epoch": 0.19122144030339044, "grad_norm": 1.984196424484253, "learning_rate": 1.866759737181447e-05, "loss": 0.3111, "step": 3580 }, { "epoch": 0.19175557840479657, "grad_norm": 2.8647890090942383, "learning_rate": 1.8658956210874715e-05, "loss": 0.3132, "step": 3590 }, { "epoch": 0.1922897165062027, "grad_norm": 2.7351839542388916, "learning_rate": 1.8650289133138187e-05, "loss": 0.2976, "step": 3600 }, { "epoch": 0.19282385460760879, "grad_norm": 2.4232380390167236, "learning_rate": 1.864159616454599e-05, "loss": 0.3269, "step": 3610 }, { "epoch": 0.1933579927090149, "grad_norm": 1.921370267868042, "learning_rate": 1.863287733111673e-05, "loss": 0.2989, "step": 3620 }, { "epoch": 0.19389213081042103, "grad_norm": 3.1376841068267822, "learning_rate": 1.862413265894641e-05, "loss": 0.299, "step": 3630 }, { "epoch": 0.19442626891182715, "grad_norm": 3.6178529262542725, "learning_rate": 1.8615362174208386e-05, "loss": 0.3065, "step": 3640 }, { "epoch": 0.19496040701323328, "grad_norm": 2.5975160598754883, "learning_rate": 1.8606565903153264e-05, "loss": 0.2881, "step": 3650 }, { "epoch": 0.1954945451146394, "grad_norm": 2.196188449859619, "learning_rate": 1.8597743872108837e-05, "loss": 0.3021, "step": 3660 }, { "epoch": 0.1960286832160455, "grad_norm": 3.3378946781158447, "learning_rate": 1.8588896107479987e-05, "loss": 0.3011, "step": 3670 }, { "epoch": 0.19656282131745162, "grad_norm": 2.3805088996887207, "learning_rate": 1.8580022635748634e-05, "loss": 0.3006, "step": 3680 }, { "epoch": 0.19709695941885774, "grad_norm": 1.958283543586731, "learning_rate": 1.857112348347363e-05, "loss": 0.3242, "step": 3690 }, { "epoch": 0.19763109752026387, "grad_norm": 1.7068015336990356, "learning_rate": 1.8562198677290693e-05, "loss": 0.3049, "step": 3700 }, { "epoch": 0.19816523562167, "grad_norm": 3.316540002822876, "learning_rate": 1.8553248243912328e-05, "loss": 0.3103, "step": 3710 }, { "epoch": 0.1986993737230761, "grad_norm": 1.8615728616714478, "learning_rate": 1.8544272210127737e-05, "loss": 0.312, "step": 3720 }, { "epoch": 0.1992335118244822, "grad_norm": 2.6635255813598633, "learning_rate": 1.8535270602802754e-05, "loss": 0.3121, "step": 3730 }, { "epoch": 0.19976764992588833, "grad_norm": 1.9573800563812256, "learning_rate": 1.8526243448879747e-05, "loss": 0.298, "step": 3740 }, { "epoch": 0.20030178802729445, "grad_norm": 2.1269264221191406, "learning_rate": 1.8517190775377557e-05, "loss": 0.325, "step": 3750 }, { "epoch": 0.20083592612870058, "grad_norm": 1.8380793333053589, "learning_rate": 1.85081126093914e-05, "loss": 0.3061, "step": 3760 }, { "epoch": 0.2013700642301067, "grad_norm": 2.2267041206359863, "learning_rate": 1.849900897809279e-05, "loss": 0.3053, "step": 3770 }, { "epoch": 0.20190420233151282, "grad_norm": 3.1679797172546387, "learning_rate": 1.848987990872946e-05, "loss": 0.3036, "step": 3780 }, { "epoch": 0.20243834043291892, "grad_norm": 3.556302785873413, "learning_rate": 1.84807254286253e-05, "loss": 0.3089, "step": 3790 }, { "epoch": 0.20297247853432504, "grad_norm": 1.828749418258667, "learning_rate": 1.847154556518023e-05, "loss": 0.3134, "step": 3800 }, { "epoch": 0.20350661663573116, "grad_norm": 2.314093589782715, "learning_rate": 1.8462340345870156e-05, "loss": 0.2958, "step": 3810 }, { "epoch": 0.2040407547371373, "grad_norm": 2.3228211402893066, "learning_rate": 1.8453109798246878e-05, "loss": 0.2993, "step": 3820 }, { "epoch": 0.2045748928385434, "grad_norm": 3.1989004611968994, "learning_rate": 1.8443853949937997e-05, "loss": 0.2988, "step": 3830 }, { "epoch": 0.20510903093994953, "grad_norm": 2.0142407417297363, "learning_rate": 1.843457282864684e-05, "loss": 0.2982, "step": 3840 }, { "epoch": 0.20564316904135566, "grad_norm": 1.7243657112121582, "learning_rate": 1.84252664621524e-05, "loss": 0.3043, "step": 3850 }, { "epoch": 0.20617730714276175, "grad_norm": 2.888679027557373, "learning_rate": 1.8415934878309196e-05, "loss": 0.2982, "step": 3860 }, { "epoch": 0.20671144524416787, "grad_norm": 3.6155338287353516, "learning_rate": 1.840657810504725e-05, "loss": 0.315, "step": 3870 }, { "epoch": 0.207245583345574, "grad_norm": 2.467719554901123, "learning_rate": 1.8397196170371966e-05, "loss": 0.2819, "step": 3880 }, { "epoch": 0.20777972144698012, "grad_norm": 2.2238962650299072, "learning_rate": 1.838778910236406e-05, "loss": 0.3215, "step": 3890 }, { "epoch": 0.20831385954838624, "grad_norm": 2.7863171100616455, "learning_rate": 1.8378356929179476e-05, "loss": 0.3036, "step": 3900 }, { "epoch": 0.20884799764979237, "grad_norm": 3.77276349067688, "learning_rate": 1.83688996790493e-05, "loss": 0.2947, "step": 3910 }, { "epoch": 0.20938213575119846, "grad_norm": 3.0546200275421143, "learning_rate": 1.8359417380279666e-05, "loss": 0.3055, "step": 3920 }, { "epoch": 0.20991627385260458, "grad_norm": 2.453449249267578, "learning_rate": 1.834991006125169e-05, "loss": 0.2998, "step": 3930 }, { "epoch": 0.2104504119540107, "grad_norm": 2.077162742614746, "learning_rate": 1.834037775042137e-05, "loss": 0.298, "step": 3940 }, { "epoch": 0.21098455005541683, "grad_norm": 1.7122527360916138, "learning_rate": 1.833082047631951e-05, "loss": 0.3019, "step": 3950 }, { "epoch": 0.21151868815682295, "grad_norm": 3.5304980278015137, "learning_rate": 1.832123826755163e-05, "loss": 0.2914, "step": 3960 }, { "epoch": 0.21205282625822908, "grad_norm": 2.0481951236724854, "learning_rate": 1.8311631152797873e-05, "loss": 0.3063, "step": 3970 }, { "epoch": 0.21258696435963517, "grad_norm": 2.2833564281463623, "learning_rate": 1.8301999160812938e-05, "loss": 0.2899, "step": 3980 }, { "epoch": 0.2131211024610413, "grad_norm": 3.4269044399261475, "learning_rate": 1.8292342320425977e-05, "loss": 0.3266, "step": 3990 }, { "epoch": 0.21365524056244742, "grad_norm": 2.720754384994507, "learning_rate": 1.828266066054052e-05, "loss": 0.315, "step": 4000 }, { "epoch": 0.21418937866385354, "grad_norm": 9.334019660949707, "learning_rate": 1.8272954210134373e-05, "loss": 0.2727, "step": 4010 }, { "epoch": 0.21472351676525966, "grad_norm": 3.2519710063934326, "learning_rate": 1.8263222998259555e-05, "loss": 0.315, "step": 4020 }, { "epoch": 0.2152576548666658, "grad_norm": 3.185990571975708, "learning_rate": 1.8253467054042186e-05, "loss": 0.308, "step": 4030 }, { "epoch": 0.21579179296807188, "grad_norm": 1.896512508392334, "learning_rate": 1.824368640668242e-05, "loss": 0.2956, "step": 4040 }, { "epoch": 0.216325931069478, "grad_norm": 1.9384642839431763, "learning_rate": 1.8233881085454345e-05, "loss": 0.2991, "step": 4050 }, { "epoch": 0.21686006917088413, "grad_norm": 2.6338047981262207, "learning_rate": 1.8224051119705898e-05, "loss": 0.3014, "step": 4060 }, { "epoch": 0.21739420727229025, "grad_norm": 2.7578814029693604, "learning_rate": 1.8214196538858778e-05, "loss": 0.3286, "step": 4070 }, { "epoch": 0.21792834537369637, "grad_norm": 2.868448495864868, "learning_rate": 1.8204317372408367e-05, "loss": 0.3006, "step": 4080 }, { "epoch": 0.2184624834751025, "grad_norm": 2.4890317916870117, "learning_rate": 1.8194413649923626e-05, "loss": 0.3131, "step": 4090 }, { "epoch": 0.2189966215765086, "grad_norm": 2.3434038162231445, "learning_rate": 1.818448540104701e-05, "loss": 0.2926, "step": 4100 }, { "epoch": 0.21953075967791472, "grad_norm": 2.438072919845581, "learning_rate": 1.8174532655494394e-05, "loss": 0.2926, "step": 4110 }, { "epoch": 0.22006489777932084, "grad_norm": 3.137153148651123, "learning_rate": 1.8164555443054968e-05, "loss": 0.2876, "step": 4120 }, { "epoch": 0.22059903588072696, "grad_norm": 2.016876697540283, "learning_rate": 1.815455379359114e-05, "loss": 0.2832, "step": 4130 }, { "epoch": 0.22113317398213309, "grad_norm": 1.6930196285247803, "learning_rate": 1.8144527737038488e-05, "loss": 0.2822, "step": 4140 }, { "epoch": 0.2216673120835392, "grad_norm": 2.43959641456604, "learning_rate": 1.8134477303405617e-05, "loss": 0.2949, "step": 4150 }, { "epoch": 0.2222014501849453, "grad_norm": 3.732534646987915, "learning_rate": 1.81244025227741e-05, "loss": 0.3009, "step": 4160 }, { "epoch": 0.22273558828635143, "grad_norm": 2.196251153945923, "learning_rate": 1.811430342529839e-05, "loss": 0.2894, "step": 4170 }, { "epoch": 0.22326972638775755, "grad_norm": 1.8375812768936157, "learning_rate": 1.810418004120571e-05, "loss": 0.2943, "step": 4180 }, { "epoch": 0.22380386448916367, "grad_norm": 2.0561985969543457, "learning_rate": 1.8094032400795988e-05, "loss": 0.2928, "step": 4190 }, { "epoch": 0.2243380025905698, "grad_norm": 1.9155833721160889, "learning_rate": 1.8083860534441745e-05, "loss": 0.2936, "step": 4200 }, { "epoch": 0.22487214069197592, "grad_norm": 1.6386221647262573, "learning_rate": 1.8073664472588007e-05, "loss": 0.3033, "step": 4210 }, { "epoch": 0.22540627879338204, "grad_norm": 1.520893931388855, "learning_rate": 1.8063444245752228e-05, "loss": 0.2994, "step": 4220 }, { "epoch": 0.22594041689478814, "grad_norm": 2.4526896476745605, "learning_rate": 1.8053199884524187e-05, "loss": 0.3094, "step": 4230 }, { "epoch": 0.22647455499619426, "grad_norm": 4.6216044425964355, "learning_rate": 1.8042931419565893e-05, "loss": 0.2843, "step": 4240 }, { "epoch": 0.22700869309760038, "grad_norm": 2.241978406906128, "learning_rate": 1.8032638881611503e-05, "loss": 0.3098, "step": 4250 }, { "epoch": 0.2275428311990065, "grad_norm": 2.1523382663726807, "learning_rate": 1.802232230146723e-05, "loss": 0.2944, "step": 4260 }, { "epoch": 0.22807696930041263, "grad_norm": 2.646531581878662, "learning_rate": 1.801198171001124e-05, "loss": 0.3009, "step": 4270 }, { "epoch": 0.22861110740181875, "grad_norm": 2.4525656700134277, "learning_rate": 1.8001617138193568e-05, "loss": 0.2941, "step": 4280 }, { "epoch": 0.22914524550322485, "grad_norm": 2.497474431991577, "learning_rate": 1.7991228617036026e-05, "loss": 0.2914, "step": 4290 }, { "epoch": 0.22967938360463097, "grad_norm": 2.838111400604248, "learning_rate": 1.7980816177632106e-05, "loss": 0.3064, "step": 4300 }, { "epoch": 0.2302135217060371, "grad_norm": 2.1455724239349365, "learning_rate": 1.797037985114689e-05, "loss": 0.3023, "step": 4310 }, { "epoch": 0.23074765980744322, "grad_norm": 2.450465202331543, "learning_rate": 1.7959919668816957e-05, "loss": 0.2967, "step": 4320 }, { "epoch": 0.23128179790884934, "grad_norm": 2.2583377361297607, "learning_rate": 1.7949435661950282e-05, "loss": 0.2956, "step": 4330 }, { "epoch": 0.23181593601025546, "grad_norm": 1.8419855833053589, "learning_rate": 1.7938927861926157e-05, "loss": 0.2925, "step": 4340 }, { "epoch": 0.23235007411166156, "grad_norm": 2.5626420974731445, "learning_rate": 1.7928396300195082e-05, "loss": 0.288, "step": 4350 }, { "epoch": 0.23288421221306768, "grad_norm": 2.2265243530273438, "learning_rate": 1.7917841008278675e-05, "loss": 0.2955, "step": 4360 }, { "epoch": 0.2334183503144738, "grad_norm": 2.125892400741577, "learning_rate": 1.7907262017769592e-05, "loss": 0.2702, "step": 4370 }, { "epoch": 0.23395248841587993, "grad_norm": 2.640937566757202, "learning_rate": 1.7896659360331403e-05, "loss": 0.2861, "step": 4380 }, { "epoch": 0.23448662651728605, "grad_norm": 1.8127433061599731, "learning_rate": 1.788603306769853e-05, "loss": 0.293, "step": 4390 }, { "epoch": 0.23502076461869217, "grad_norm": 3.0571606159210205, "learning_rate": 1.787538317167613e-05, "loss": 0.2926, "step": 4400 }, { "epoch": 0.23555490272009827, "grad_norm": 2.4148664474487305, "learning_rate": 1.786470970414001e-05, "loss": 0.313, "step": 4410 }, { "epoch": 0.2360890408215044, "grad_norm": 2.3166868686676025, "learning_rate": 1.785401269703652e-05, "loss": 0.2821, "step": 4420 }, { "epoch": 0.23662317892291052, "grad_norm": 2.806251049041748, "learning_rate": 1.7843292182382467e-05, "loss": 0.2839, "step": 4430 }, { "epoch": 0.23715731702431664, "grad_norm": 2.173203468322754, "learning_rate": 1.783254819226503e-05, "loss": 0.2921, "step": 4440 }, { "epoch": 0.23769145512572276, "grad_norm": 1.659710168838501, "learning_rate": 1.7821780758841637e-05, "loss": 0.2773, "step": 4450 }, { "epoch": 0.23822559322712888, "grad_norm": 2.2198400497436523, "learning_rate": 1.781098991433989e-05, "loss": 0.3077, "step": 4460 }, { "epoch": 0.23875973132853498, "grad_norm": 2.8332021236419678, "learning_rate": 1.780017569105746e-05, "loss": 0.2922, "step": 4470 }, { "epoch": 0.2392938694299411, "grad_norm": 2.723341464996338, "learning_rate": 1.7789338121361993e-05, "loss": 0.2737, "step": 4480 }, { "epoch": 0.23982800753134723, "grad_norm": 2.113377809524536, "learning_rate": 1.777847723769101e-05, "loss": 0.2785, "step": 4490 }, { "epoch": 0.24036214563275335, "grad_norm": 1.974191665649414, "learning_rate": 1.776759307255182e-05, "loss": 0.2804, "step": 4500 }, { "epoch": 0.24089628373415947, "grad_norm": 2.1106488704681396, "learning_rate": 1.7756685658521406e-05, "loss": 0.2945, "step": 4510 }, { "epoch": 0.2414304218355656, "grad_norm": 1.9896161556243896, "learning_rate": 1.7745755028246335e-05, "loss": 0.282, "step": 4520 }, { "epoch": 0.2419645599369717, "grad_norm": 3.0781326293945312, "learning_rate": 1.7734801214442675e-05, "loss": 0.289, "step": 4530 }, { "epoch": 0.2424986980383778, "grad_norm": 2.5434863567352295, "learning_rate": 1.772382424989587e-05, "loss": 0.2801, "step": 4540 }, { "epoch": 0.24303283613978394, "grad_norm": 2.5370335578918457, "learning_rate": 1.7712824167460656e-05, "loss": 0.2748, "step": 4550 }, { "epoch": 0.24356697424119006, "grad_norm": 2.9817733764648438, "learning_rate": 1.7701801000060973e-05, "loss": 0.2778, "step": 4560 }, { "epoch": 0.24410111234259618, "grad_norm": 2.2162368297576904, "learning_rate": 1.7690754780689848e-05, "loss": 0.2905, "step": 4570 }, { "epoch": 0.2446352504440023, "grad_norm": 1.7004082202911377, "learning_rate": 1.7679685542409303e-05, "loss": 0.2781, "step": 4580 }, { "epoch": 0.24516938854540843, "grad_norm": 2.8697359561920166, "learning_rate": 1.7668593318350265e-05, "loss": 0.288, "step": 4590 }, { "epoch": 0.24570352664681452, "grad_norm": 2.6502087116241455, "learning_rate": 1.765747814171245e-05, "loss": 0.2897, "step": 4600 }, { "epoch": 0.24623766474822065, "grad_norm": 2.310603141784668, "learning_rate": 1.7646340045764276e-05, "loss": 0.2867, "step": 4610 }, { "epoch": 0.24677180284962677, "grad_norm": 1.6133915185928345, "learning_rate": 1.763517906384276e-05, "loss": 0.2884, "step": 4620 }, { "epoch": 0.2473059409510329, "grad_norm": 3.077385425567627, "learning_rate": 1.7623995229353417e-05, "loss": 0.2636, "step": 4630 }, { "epoch": 0.24784007905243902, "grad_norm": 2.629117250442505, "learning_rate": 1.7612788575770167e-05, "loss": 0.2933, "step": 4640 }, { "epoch": 0.24837421715384514, "grad_norm": 1.9995381832122803, "learning_rate": 1.7601559136635212e-05, "loss": 0.2979, "step": 4650 }, { "epoch": 0.24890835525525123, "grad_norm": 1.9694952964782715, "learning_rate": 1.759030694555898e-05, "loss": 0.3037, "step": 4660 }, { "epoch": 0.24944249335665736, "grad_norm": 2.7883951663970947, "learning_rate": 1.7579032036219976e-05, "loss": 0.286, "step": 4670 }, { "epoch": 0.24997663145806348, "grad_norm": 2.5562028884887695, "learning_rate": 1.7567734442364702e-05, "loss": 0.2698, "step": 4680 }, { "epoch": 0.2505107695594696, "grad_norm": 2.0029513835906982, "learning_rate": 1.755641419780757e-05, "loss": 0.284, "step": 4690 }, { "epoch": 0.2510449076608757, "grad_norm": 2.5123159885406494, "learning_rate": 1.754507133643077e-05, "loss": 0.2838, "step": 4700 }, { "epoch": 0.25157904576228185, "grad_norm": 2.2369096279144287, "learning_rate": 1.7533705892184206e-05, "loss": 0.3213, "step": 4710 }, { "epoch": 0.252113183863688, "grad_norm": 2.4693853855133057, "learning_rate": 1.7522317899085354e-05, "loss": 0.2807, "step": 4720 }, { "epoch": 0.2526473219650941, "grad_norm": 2.13126802444458, "learning_rate": 1.7510907391219193e-05, "loss": 0.2801, "step": 4730 }, { "epoch": 0.2531814600665002, "grad_norm": 3.3097527027130127, "learning_rate": 1.7499474402738084e-05, "loss": 0.2802, "step": 4740 }, { "epoch": 0.2537155981679063, "grad_norm": 3.427377700805664, "learning_rate": 1.7488018967861674e-05, "loss": 0.2902, "step": 4750 }, { "epoch": 0.2542497362693124, "grad_norm": 2.6044363975524902, "learning_rate": 1.74765411208768e-05, "loss": 0.2869, "step": 4760 }, { "epoch": 0.25478387437071853, "grad_norm": 2.454463481903076, "learning_rate": 1.7465040896137367e-05, "loss": 0.2881, "step": 4770 }, { "epoch": 0.25531801247212466, "grad_norm": 2.4974374771118164, "learning_rate": 1.7453518328064277e-05, "loss": 0.2801, "step": 4780 }, { "epoch": 0.2558521505735308, "grad_norm": 2.724273681640625, "learning_rate": 1.7441973451145285e-05, "loss": 0.2817, "step": 4790 }, { "epoch": 0.2563862886749369, "grad_norm": 2.444821357727051, "learning_rate": 1.7430406299934934e-05, "loss": 0.2805, "step": 4800 }, { "epoch": 0.256920426776343, "grad_norm": 1.8592000007629395, "learning_rate": 1.741881690905443e-05, "loss": 0.2859, "step": 4810 }, { "epoch": 0.25745456487774915, "grad_norm": 2.079247236251831, "learning_rate": 1.7407205313191546e-05, "loss": 0.2843, "step": 4820 }, { "epoch": 0.25798870297915527, "grad_norm": 3.3396663665771484, "learning_rate": 1.7395571547100506e-05, "loss": 0.2939, "step": 4830 }, { "epoch": 0.2585228410805614, "grad_norm": 2.6081440448760986, "learning_rate": 1.7383915645601905e-05, "loss": 0.2912, "step": 4840 }, { "epoch": 0.2590569791819675, "grad_norm": 2.6826016902923584, "learning_rate": 1.7372237643582583e-05, "loss": 0.2771, "step": 4850 }, { "epoch": 0.25959111728337364, "grad_norm": 2.218667507171631, "learning_rate": 1.736053757599553e-05, "loss": 0.2909, "step": 4860 }, { "epoch": 0.2601252553847797, "grad_norm": 1.7983503341674805, "learning_rate": 1.7348815477859777e-05, "loss": 0.2874, "step": 4870 }, { "epoch": 0.26065939348618583, "grad_norm": 2.8065969944000244, "learning_rate": 1.7337071384260296e-05, "loss": 0.3048, "step": 4880 }, { "epoch": 0.26119353158759195, "grad_norm": 2.6869609355926514, "learning_rate": 1.7325305330347893e-05, "loss": 0.2919, "step": 4890 }, { "epoch": 0.2617276696889981, "grad_norm": 2.789597988128662, "learning_rate": 1.7313517351339105e-05, "loss": 0.2668, "step": 4900 }, { "epoch": 0.2622618077904042, "grad_norm": 2.462343215942383, "learning_rate": 1.730170748251609e-05, "loss": 0.2742, "step": 4910 }, { "epoch": 0.2627959458918103, "grad_norm": 1.825661540031433, "learning_rate": 1.7289875759226517e-05, "loss": 0.2934, "step": 4920 }, { "epoch": 0.26333008399321645, "grad_norm": 2.6376566886901855, "learning_rate": 1.7278022216883474e-05, "loss": 0.2828, "step": 4930 }, { "epoch": 0.26386422209462257, "grad_norm": 2.8658931255340576, "learning_rate": 1.7266146890965358e-05, "loss": 0.2816, "step": 4940 }, { "epoch": 0.2643983601960287, "grad_norm": 2.0385499000549316, "learning_rate": 1.725424981701576e-05, "loss": 0.2834, "step": 4950 }, { "epoch": 0.2649324982974348, "grad_norm": 2.4839353561401367, "learning_rate": 1.7242331030643362e-05, "loss": 0.2717, "step": 4960 }, { "epoch": 0.26546663639884094, "grad_norm": 2.3219330310821533, "learning_rate": 1.723039056752184e-05, "loss": 0.2798, "step": 4970 }, { "epoch": 0.26600077450024706, "grad_norm": 1.8719227313995361, "learning_rate": 1.7218428463389744e-05, "loss": 0.2776, "step": 4980 }, { "epoch": 0.2665349126016532, "grad_norm": 2.2491092681884766, "learning_rate": 1.72064447540504e-05, "loss": 0.2859, "step": 4990 }, { "epoch": 0.26706905070305925, "grad_norm": 2.6476659774780273, "learning_rate": 1.7194439475371796e-05, "loss": 0.2794, "step": 5000 }, { "epoch": 0.2676031888044654, "grad_norm": 2.2501513957977295, "learning_rate": 1.7182412663286486e-05, "loss": 0.2697, "step": 5010 }, { "epoch": 0.2681373269058715, "grad_norm": 2.7113053798675537, "learning_rate": 1.7170364353791465e-05, "loss": 0.2779, "step": 5020 }, { "epoch": 0.2686714650072776, "grad_norm": 2.2244648933410645, "learning_rate": 1.7158294582948073e-05, "loss": 0.2803, "step": 5030 }, { "epoch": 0.26920560310868374, "grad_norm": 2.4210715293884277, "learning_rate": 1.7146203386881896e-05, "loss": 0.2878, "step": 5040 }, { "epoch": 0.26973974121008987, "grad_norm": 1.5157488584518433, "learning_rate": 1.7134090801782636e-05, "loss": 0.2788, "step": 5050 }, { "epoch": 0.270273879311496, "grad_norm": 2.465040922164917, "learning_rate": 1.7121956863904013e-05, "loss": 0.2871, "step": 5060 }, { "epoch": 0.2708080174129021, "grad_norm": 2.0018088817596436, "learning_rate": 1.710980160956367e-05, "loss": 0.2856, "step": 5070 }, { "epoch": 0.27134215551430824, "grad_norm": 2.0503087043762207, "learning_rate": 1.709762507514303e-05, "loss": 0.2765, "step": 5080 }, { "epoch": 0.27187629361571436, "grad_norm": 2.3763489723205566, "learning_rate": 1.708542729708723e-05, "loss": 0.3003, "step": 5090 }, { "epoch": 0.2724104317171205, "grad_norm": 2.3756635189056396, "learning_rate": 1.7073208311904978e-05, "loss": 0.2873, "step": 5100 }, { "epoch": 0.2729445698185266, "grad_norm": 2.52982759475708, "learning_rate": 1.706096815616846e-05, "loss": 0.2775, "step": 5110 }, { "epoch": 0.2734787079199327, "grad_norm": 1.5605673789978027, "learning_rate": 1.704870686651323e-05, "loss": 0.2717, "step": 5120 }, { "epoch": 0.2740128460213388, "grad_norm": 2.282503604888916, "learning_rate": 1.703642447963809e-05, "loss": 0.2982, "step": 5130 }, { "epoch": 0.2745469841227449, "grad_norm": 2.4783663749694824, "learning_rate": 1.7024121032304992e-05, "loss": 0.2931, "step": 5140 }, { "epoch": 0.27508112222415104, "grad_norm": 1.6152501106262207, "learning_rate": 1.7011796561338923e-05, "loss": 0.2666, "step": 5150 }, { "epoch": 0.27561526032555717, "grad_norm": 2.2677597999572754, "learning_rate": 1.6999451103627794e-05, "loss": 0.2927, "step": 5160 }, { "epoch": 0.2761493984269633, "grad_norm": 2.1877384185791016, "learning_rate": 1.6987084696122334e-05, "loss": 0.2862, "step": 5170 }, { "epoch": 0.2766835365283694, "grad_norm": 2.980351686477661, "learning_rate": 1.6974697375835966e-05, "loss": 0.2799, "step": 5180 }, { "epoch": 0.27721767462977553, "grad_norm": 2.0349535942077637, "learning_rate": 1.696228917984472e-05, "loss": 0.2703, "step": 5190 }, { "epoch": 0.27775181273118166, "grad_norm": 2.1837334632873535, "learning_rate": 1.6949860145287098e-05, "loss": 0.2781, "step": 5200 }, { "epoch": 0.2782859508325878, "grad_norm": 2.397613286972046, "learning_rate": 1.693741030936398e-05, "loss": 0.2756, "step": 5210 }, { "epoch": 0.2788200889339939, "grad_norm": 3.0145113468170166, "learning_rate": 1.6924939709338497e-05, "loss": 0.2793, "step": 5220 }, { "epoch": 0.2793542270354, "grad_norm": 1.854766845703125, "learning_rate": 1.691244838253594e-05, "loss": 0.2645, "step": 5230 }, { "epoch": 0.2798883651368061, "grad_norm": 1.921189546585083, "learning_rate": 1.6899936366343626e-05, "loss": 0.2524, "step": 5240 }, { "epoch": 0.2804225032382122, "grad_norm": 1.894126534461975, "learning_rate": 1.6887403698210802e-05, "loss": 0.2731, "step": 5250 }, { "epoch": 0.28095664133961834, "grad_norm": 2.150094747543335, "learning_rate": 1.6874850415648527e-05, "loss": 0.2717, "step": 5260 }, { "epoch": 0.28149077944102446, "grad_norm": 2.2324492931365967, "learning_rate": 1.686227655622956e-05, "loss": 0.27, "step": 5270 }, { "epoch": 0.2820249175424306, "grad_norm": 1.7700144052505493, "learning_rate": 1.6849682157588247e-05, "loss": 0.2737, "step": 5280 }, { "epoch": 0.2825590556438367, "grad_norm": 1.950440526008606, "learning_rate": 1.683706725742041e-05, "loss": 0.265, "step": 5290 }, { "epoch": 0.28309319374524283, "grad_norm": 3.150139331817627, "learning_rate": 1.6824431893483234e-05, "loss": 0.265, "step": 5300 }, { "epoch": 0.28362733184664896, "grad_norm": 2.657430648803711, "learning_rate": 1.681177610359515e-05, "loss": 0.2915, "step": 5310 }, { "epoch": 0.2841614699480551, "grad_norm": 2.66664457321167, "learning_rate": 1.679909992563573e-05, "loss": 0.2935, "step": 5320 }, { "epoch": 0.2846956080494612, "grad_norm": 2.1547904014587402, "learning_rate": 1.6786403397545567e-05, "loss": 0.3023, "step": 5330 }, { "epoch": 0.2852297461508673, "grad_norm": 2.157902240753174, "learning_rate": 1.6773686557326158e-05, "loss": 0.258, "step": 5340 }, { "epoch": 0.28576388425227345, "grad_norm": 2.4727182388305664, "learning_rate": 1.6760949443039805e-05, "loss": 0.2851, "step": 5350 }, { "epoch": 0.28629802235367957, "grad_norm": 2.744044542312622, "learning_rate": 1.6748192092809486e-05, "loss": 0.276, "step": 5360 }, { "epoch": 0.28683216045508564, "grad_norm": 1.9927189350128174, "learning_rate": 1.6735414544818748e-05, "loss": 0.2876, "step": 5370 }, { "epoch": 0.28736629855649176, "grad_norm": 2.281212568283081, "learning_rate": 1.6722616837311583e-05, "loss": 0.258, "step": 5380 }, { "epoch": 0.2879004366578979, "grad_norm": 2.4007697105407715, "learning_rate": 1.6709799008592337e-05, "loss": 0.2707, "step": 5390 }, { "epoch": 0.288434574759304, "grad_norm": 2.3559021949768066, "learning_rate": 1.6696961097025567e-05, "loss": 0.2705, "step": 5400 }, { "epoch": 0.28896871286071013, "grad_norm": 3.2827630043029785, "learning_rate": 1.668410314103595e-05, "loss": 0.2712, "step": 5410 }, { "epoch": 0.28950285096211625, "grad_norm": 2.5107064247131348, "learning_rate": 1.6671225179108145e-05, "loss": 0.2746, "step": 5420 }, { "epoch": 0.2900369890635224, "grad_norm": 1.862751841545105, "learning_rate": 1.6658327249786696e-05, "loss": 0.2751, "step": 5430 }, { "epoch": 0.2905711271649285, "grad_norm": 2.1066787242889404, "learning_rate": 1.664540939167592e-05, "loss": 0.2738, "step": 5440 }, { "epoch": 0.2911052652663346, "grad_norm": 2.4449520111083984, "learning_rate": 1.6632471643439764e-05, "loss": 0.2677, "step": 5450 }, { "epoch": 0.29163940336774075, "grad_norm": 2.1758930683135986, "learning_rate": 1.6619514043801718e-05, "loss": 0.2546, "step": 5460 }, { "epoch": 0.29217354146914687, "grad_norm": 2.2535574436187744, "learning_rate": 1.6606536631544692e-05, "loss": 0.2896, "step": 5470 }, { "epoch": 0.292707679570553, "grad_norm": 2.1716558933258057, "learning_rate": 1.6593539445510883e-05, "loss": 0.2843, "step": 5480 }, { "epoch": 0.29324181767195906, "grad_norm": 1.9230881929397583, "learning_rate": 1.6580522524601692e-05, "loss": 0.2722, "step": 5490 }, { "epoch": 0.2937759557733652, "grad_norm": 2.182422399520874, "learning_rate": 1.656748590777757e-05, "loss": 0.2821, "step": 5500 }, { "epoch": 0.2943100938747713, "grad_norm": 2.1532065868377686, "learning_rate": 1.6554429634057922e-05, "loss": 0.2767, "step": 5510 }, { "epoch": 0.29484423197617743, "grad_norm": 2.0772488117218018, "learning_rate": 1.6541353742520993e-05, "loss": 0.2799, "step": 5520 }, { "epoch": 0.29537837007758355, "grad_norm": 1.9692423343658447, "learning_rate": 1.6528258272303746e-05, "loss": 0.2503, "step": 5530 }, { "epoch": 0.2959125081789897, "grad_norm": 3.041440486907959, "learning_rate": 1.6515143262601743e-05, "loss": 0.2864, "step": 5540 }, { "epoch": 0.2964466462803958, "grad_norm": 2.9224579334259033, "learning_rate": 1.6502008752669022e-05, "loss": 0.2672, "step": 5550 }, { "epoch": 0.2969807843818019, "grad_norm": 2.265185832977295, "learning_rate": 1.648885478181799e-05, "loss": 0.2713, "step": 5560 }, { "epoch": 0.29751492248320804, "grad_norm": 2.625992774963379, "learning_rate": 1.647568138941931e-05, "loss": 0.2696, "step": 5570 }, { "epoch": 0.29804906058461417, "grad_norm": 2.53875470161438, "learning_rate": 1.6462488614901763e-05, "loss": 0.2805, "step": 5580 }, { "epoch": 0.2985831986860203, "grad_norm": 1.8884080648422241, "learning_rate": 1.644927649775215e-05, "loss": 0.2738, "step": 5590 }, { "epoch": 0.2991173367874264, "grad_norm": 1.8485174179077148, "learning_rate": 1.6436045077515163e-05, "loss": 0.2709, "step": 5600 }, { "epoch": 0.2996514748888325, "grad_norm": 2.0729966163635254, "learning_rate": 1.6422794393793266e-05, "loss": 0.2635, "step": 5610 }, { "epoch": 0.3001856129902386, "grad_norm": 3.0163724422454834, "learning_rate": 1.640952448624659e-05, "loss": 0.2652, "step": 5620 }, { "epoch": 0.3007197510916447, "grad_norm": 2.0768160820007324, "learning_rate": 1.639623539459279e-05, "loss": 0.2894, "step": 5630 }, { "epoch": 0.30125388919305085, "grad_norm": 3.0301129817962646, "learning_rate": 1.6382927158606946e-05, "loss": 0.2794, "step": 5640 }, { "epoch": 0.301788027294457, "grad_norm": 2.729121208190918, "learning_rate": 1.6369599818121447e-05, "loss": 0.2575, "step": 5650 }, { "epoch": 0.3023221653958631, "grad_norm": 2.1150577068328857, "learning_rate": 1.635625341302585e-05, "loss": 0.2752, "step": 5660 }, { "epoch": 0.3028563034972692, "grad_norm": 1.8983770608901978, "learning_rate": 1.6342887983266786e-05, "loss": 0.2674, "step": 5670 }, { "epoch": 0.30339044159867534, "grad_norm": 2.1588704586029053, "learning_rate": 1.6329503568847807e-05, "loss": 0.2765, "step": 5680 }, { "epoch": 0.30392457970008147, "grad_norm": 1.925016164779663, "learning_rate": 1.6316100209829314e-05, "loss": 0.2866, "step": 5690 }, { "epoch": 0.3044587178014876, "grad_norm": 2.363632917404175, "learning_rate": 1.6302677946328393e-05, "loss": 0.2801, "step": 5700 }, { "epoch": 0.3049928559028937, "grad_norm": 1.887139916419983, "learning_rate": 1.628923681851872e-05, "loss": 0.2626, "step": 5710 }, { "epoch": 0.30552699400429983, "grad_norm": 2.331603765487671, "learning_rate": 1.6275776866630425e-05, "loss": 0.2817, "step": 5720 }, { "epoch": 0.30606113210570596, "grad_norm": 1.9835960865020752, "learning_rate": 1.6262298130949992e-05, "loss": 0.2644, "step": 5730 }, { "epoch": 0.306595270207112, "grad_norm": 2.329559087753296, "learning_rate": 1.6248800651820113e-05, "loss": 0.2616, "step": 5740 }, { "epoch": 0.30712940830851815, "grad_norm": 2.0845601558685303, "learning_rate": 1.6235284469639586e-05, "loss": 0.2584, "step": 5750 }, { "epoch": 0.30766354640992427, "grad_norm": 2.3113512992858887, "learning_rate": 1.622174962486319e-05, "loss": 0.2723, "step": 5760 }, { "epoch": 0.3081976845113304, "grad_norm": 2.247990608215332, "learning_rate": 1.620819615800156e-05, "loss": 0.2612, "step": 5770 }, { "epoch": 0.3087318226127365, "grad_norm": 2.278716564178467, "learning_rate": 1.6194624109621067e-05, "loss": 0.2582, "step": 5780 }, { "epoch": 0.30926596071414264, "grad_norm": 2.3860111236572266, "learning_rate": 1.6181033520343707e-05, "loss": 0.2701, "step": 5790 }, { "epoch": 0.30980009881554876, "grad_norm": 1.8744722604751587, "learning_rate": 1.6167424430846957e-05, "loss": 0.2802, "step": 5800 }, { "epoch": 0.3103342369169549, "grad_norm": 2.765103816986084, "learning_rate": 1.6153796881863674e-05, "loss": 0.2879, "step": 5810 }, { "epoch": 0.310868375018361, "grad_norm": 2.5840275287628174, "learning_rate": 1.6140150914181957e-05, "loss": 0.2828, "step": 5820 }, { "epoch": 0.31140251311976713, "grad_norm": 2.6117844581604004, "learning_rate": 1.6126486568645046e-05, "loss": 0.2731, "step": 5830 }, { "epoch": 0.31193665122117326, "grad_norm": 2.332456111907959, "learning_rate": 1.611280388615118e-05, "loss": 0.2821, "step": 5840 }, { "epoch": 0.3124707893225794, "grad_norm": 3.1476943492889404, "learning_rate": 1.609910290765348e-05, "loss": 0.2707, "step": 5850 }, { "epoch": 0.31300492742398545, "grad_norm": 2.0097689628601074, "learning_rate": 1.6085383674159835e-05, "loss": 0.2578, "step": 5860 }, { "epoch": 0.31353906552539157, "grad_norm": 2.426935911178589, "learning_rate": 1.6071646226732762e-05, "loss": 0.268, "step": 5870 }, { "epoch": 0.3140732036267977, "grad_norm": 2.5238702297210693, "learning_rate": 1.6057890606489302e-05, "loss": 0.2722, "step": 5880 }, { "epoch": 0.3146073417282038, "grad_norm": 1.8866312503814697, "learning_rate": 1.6044116854600887e-05, "loss": 0.2623, "step": 5890 }, { "epoch": 0.31514147982960994, "grad_norm": 3.4267032146453857, "learning_rate": 1.6030325012293212e-05, "loss": 0.2668, "step": 5900 }, { "epoch": 0.31567561793101606, "grad_norm": 1.9737868309020996, "learning_rate": 1.6016515120846125e-05, "loss": 0.2605, "step": 5910 }, { "epoch": 0.3162097560324222, "grad_norm": 1.8713111877441406, "learning_rate": 1.60026872215935e-05, "loss": 0.2594, "step": 5920 }, { "epoch": 0.3167438941338283, "grad_norm": 3.3318262100219727, "learning_rate": 1.598884135592309e-05, "loss": 0.2559, "step": 5930 }, { "epoch": 0.31727803223523443, "grad_norm": 1.8477685451507568, "learning_rate": 1.5974977565276445e-05, "loss": 0.2725, "step": 5940 }, { "epoch": 0.31781217033664055, "grad_norm": 2.0850837230682373, "learning_rate": 1.5961095891148756e-05, "loss": 0.2842, "step": 5950 }, { "epoch": 0.3183463084380467, "grad_norm": 2.4099347591400146, "learning_rate": 1.5947196375088738e-05, "loss": 0.2708, "step": 5960 }, { "epoch": 0.3188804465394528, "grad_norm": 2.1157915592193604, "learning_rate": 1.593327905869851e-05, "loss": 0.2491, "step": 5970 }, { "epoch": 0.31941458464085887, "grad_norm": 1.7803987264633179, "learning_rate": 1.5919343983633476e-05, "loss": 0.264, "step": 5980 }, { "epoch": 0.319948722742265, "grad_norm": 2.3616161346435547, "learning_rate": 1.5905391191602174e-05, "loss": 0.2584, "step": 5990 }, { "epoch": 0.3204828608436711, "grad_norm": 2.318877935409546, "learning_rate": 1.5891420724366195e-05, "loss": 0.271, "step": 6000 }, { "epoch": 0.32101699894507724, "grad_norm": 2.9651107788085938, "learning_rate": 1.587743262374001e-05, "loss": 0.274, "step": 6010 }, { "epoch": 0.32155113704648336, "grad_norm": 2.3671646118164062, "learning_rate": 1.5863426931590883e-05, "loss": 0.249, "step": 6020 }, { "epoch": 0.3220852751478895, "grad_norm": 2.142096996307373, "learning_rate": 1.5849403689838726e-05, "loss": 0.2698, "step": 6030 }, { "epoch": 0.3226194132492956, "grad_norm": 1.8820281028747559, "learning_rate": 1.5835362940455975e-05, "loss": 0.2447, "step": 6040 }, { "epoch": 0.32315355135070173, "grad_norm": 2.215770721435547, "learning_rate": 1.582130472546747e-05, "loss": 0.2618, "step": 6050 }, { "epoch": 0.32368768945210785, "grad_norm": 2.5994715690612793, "learning_rate": 1.5807229086950327e-05, "loss": 0.2591, "step": 6060 }, { "epoch": 0.324221827553514, "grad_norm": 2.7101829051971436, "learning_rate": 1.579313606703381e-05, "loss": 0.2622, "step": 6070 }, { "epoch": 0.3247559656549201, "grad_norm": 2.1551461219787598, "learning_rate": 1.577902570789921e-05, "loss": 0.2691, "step": 6080 }, { "epoch": 0.3252901037563262, "grad_norm": 2.323611259460449, "learning_rate": 1.5764898051779717e-05, "loss": 0.2856, "step": 6090 }, { "epoch": 0.32582424185773234, "grad_norm": 2.165092706680298, "learning_rate": 1.575075314096028e-05, "loss": 0.2621, "step": 6100 }, { "epoch": 0.3263583799591384, "grad_norm": 2.365762710571289, "learning_rate": 1.57365910177775e-05, "loss": 0.2831, "step": 6110 }, { "epoch": 0.32689251806054453, "grad_norm": 4.697601795196533, "learning_rate": 1.5722411724619506e-05, "loss": 0.2689, "step": 6120 }, { "epoch": 0.32742665616195066, "grad_norm": 1.7116745710372925, "learning_rate": 1.5708215303925797e-05, "loss": 0.2629, "step": 6130 }, { "epoch": 0.3279607942633568, "grad_norm": 1.8926539421081543, "learning_rate": 1.569400179818716e-05, "loss": 0.2583, "step": 6140 }, { "epoch": 0.3284949323647629, "grad_norm": 1.8611681461334229, "learning_rate": 1.5679771249945493e-05, "loss": 0.2681, "step": 6150 }, { "epoch": 0.329029070466169, "grad_norm": 2.0281684398651123, "learning_rate": 1.566552370179372e-05, "loss": 0.2498, "step": 6160 }, { "epoch": 0.32956320856757515, "grad_norm": 2.241403341293335, "learning_rate": 1.5651259196375642e-05, "loss": 0.2551, "step": 6170 }, { "epoch": 0.3300973466689813, "grad_norm": 1.9789137840270996, "learning_rate": 1.563697777638582e-05, "loss": 0.2744, "step": 6180 }, { "epoch": 0.3306314847703874, "grad_norm": 1.7713334560394287, "learning_rate": 1.5622679484569427e-05, "loss": 0.2684, "step": 6190 }, { "epoch": 0.3311656228717935, "grad_norm": 1.8539912700653076, "learning_rate": 1.5608364363722154e-05, "loss": 0.2779, "step": 6200 }, { "epoch": 0.33169976097319964, "grad_norm": 2.3406591415405273, "learning_rate": 1.559403245669004e-05, "loss": 0.2622, "step": 6210 }, { "epoch": 0.33223389907460577, "grad_norm": 2.0236904621124268, "learning_rate": 1.5579683806369385e-05, "loss": 0.2801, "step": 6220 }, { "epoch": 0.33276803717601183, "grad_norm": 1.539210319519043, "learning_rate": 1.5565318455706596e-05, "loss": 0.2774, "step": 6230 }, { "epoch": 0.33330217527741796, "grad_norm": 2.49755859375, "learning_rate": 1.5550936447698064e-05, "loss": 0.2577, "step": 6240 }, { "epoch": 0.3338363133788241, "grad_norm": 2.6194007396698, "learning_rate": 1.5536537825390033e-05, "loss": 0.2741, "step": 6250 }, { "epoch": 0.3343704514802302, "grad_norm": 1.8760852813720703, "learning_rate": 1.5522122631878486e-05, "loss": 0.2536, "step": 6260 }, { "epoch": 0.3349045895816363, "grad_norm": 2.3678059577941895, "learning_rate": 1.5507690910308992e-05, "loss": 0.2676, "step": 6270 }, { "epoch": 0.33543872768304245, "grad_norm": 1.9961994886398315, "learning_rate": 1.5493242703876598e-05, "loss": 0.2614, "step": 6280 }, { "epoch": 0.33597286578444857, "grad_norm": 1.6558189392089844, "learning_rate": 1.5478778055825686e-05, "loss": 0.2683, "step": 6290 }, { "epoch": 0.3365070038858547, "grad_norm": 1.8605523109436035, "learning_rate": 1.546429700944985e-05, "loss": 0.2582, "step": 6300 }, { "epoch": 0.3370411419872608, "grad_norm": 2.3220627307891846, "learning_rate": 1.5449799608091773e-05, "loss": 0.265, "step": 6310 }, { "epoch": 0.33757528008866694, "grad_norm": 1.8735218048095703, "learning_rate": 1.543528589514308e-05, "loss": 0.2663, "step": 6320 }, { "epoch": 0.33810941819007306, "grad_norm": 2.497814893722534, "learning_rate": 1.5420755914044217e-05, "loss": 0.2668, "step": 6330 }, { "epoch": 0.3386435562914792, "grad_norm": 2.600848913192749, "learning_rate": 1.5406209708284322e-05, "loss": 0.27, "step": 6340 }, { "epoch": 0.33917769439288525, "grad_norm": 2.01554274559021, "learning_rate": 1.5391647321401104e-05, "loss": 0.2549, "step": 6350 }, { "epoch": 0.3397118324942914, "grad_norm": 2.521982431411743, "learning_rate": 1.5377068796980698e-05, "loss": 0.2631, "step": 6360 }, { "epoch": 0.3402459705956975, "grad_norm": 1.918939232826233, "learning_rate": 1.536247417865753e-05, "loss": 0.2575, "step": 6370 }, { "epoch": 0.3407801086971036, "grad_norm": 19.110347747802734, "learning_rate": 1.5347863510114213e-05, "loss": 0.2718, "step": 6380 }, { "epoch": 0.34131424679850975, "grad_norm": 2.082148313522339, "learning_rate": 1.533323683508139e-05, "loss": 0.2841, "step": 6390 }, { "epoch": 0.34184838489991587, "grad_norm": 1.9891785383224487, "learning_rate": 1.5318594197337602e-05, "loss": 0.2565, "step": 6400 }, { "epoch": 0.342382523001322, "grad_norm": 2.1734466552734375, "learning_rate": 1.5303935640709186e-05, "loss": 0.2645, "step": 6410 }, { "epoch": 0.3429166611027281, "grad_norm": 2.3558287620544434, "learning_rate": 1.528926120907012e-05, "loss": 0.264, "step": 6420 }, { "epoch": 0.34345079920413424, "grad_norm": 2.3349449634552, "learning_rate": 1.5274570946341894e-05, "loss": 0.2656, "step": 6430 }, { "epoch": 0.34398493730554036, "grad_norm": 2.077255964279175, "learning_rate": 1.5259864896493377e-05, "loss": 0.2732, "step": 6440 }, { "epoch": 0.3445190754069465, "grad_norm": 2.407076358795166, "learning_rate": 1.5245143103540696e-05, "loss": 0.2293, "step": 6450 }, { "epoch": 0.3450532135083526, "grad_norm": 2.2691733837127686, "learning_rate": 1.52304056115471e-05, "loss": 0.2672, "step": 6460 }, { "epoch": 0.34558735160975873, "grad_norm": 1.5289952754974365, "learning_rate": 1.5215652464622821e-05, "loss": 0.2462, "step": 6470 }, { "epoch": 0.3461214897111648, "grad_norm": 1.6864246129989624, "learning_rate": 1.5200883706924949e-05, "loss": 0.2705, "step": 6480 }, { "epoch": 0.3466556278125709, "grad_norm": 1.9255093336105347, "learning_rate": 1.5186099382657297e-05, "loss": 0.2643, "step": 6490 }, { "epoch": 0.34718976591397704, "grad_norm": 2.434885025024414, "learning_rate": 1.5171299536070278e-05, "loss": 0.2542, "step": 6500 }, { "epoch": 0.34772390401538317, "grad_norm": 2.051426649093628, "learning_rate": 1.5156484211460746e-05, "loss": 0.2422, "step": 6510 }, { "epoch": 0.3482580421167893, "grad_norm": 2.286144733428955, "learning_rate": 1.5141653453171901e-05, "loss": 0.2496, "step": 6520 }, { "epoch": 0.3487921802181954, "grad_norm": 2.436513662338257, "learning_rate": 1.5126807305593127e-05, "loss": 0.2477, "step": 6530 }, { "epoch": 0.34932631831960154, "grad_norm": 2.2687180042266846, "learning_rate": 1.511194581315987e-05, "loss": 0.2533, "step": 6540 }, { "epoch": 0.34986045642100766, "grad_norm": 2.084296703338623, "learning_rate": 1.5097069020353507e-05, "loss": 0.2652, "step": 6550 }, { "epoch": 0.3503945945224138, "grad_norm": 2.357975721359253, "learning_rate": 1.5082176971701205e-05, "loss": 0.2703, "step": 6560 }, { "epoch": 0.3509287326238199, "grad_norm": 2.179027795791626, "learning_rate": 1.5067269711775798e-05, "loss": 0.2486, "step": 6570 }, { "epoch": 0.35146287072522603, "grad_norm": 2.0684311389923096, "learning_rate": 1.5052347285195646e-05, "loss": 0.2763, "step": 6580 }, { "epoch": 0.35199700882663215, "grad_norm": 1.8673515319824219, "learning_rate": 1.5037409736624502e-05, "loss": 0.2638, "step": 6590 }, { "epoch": 0.3525311469280382, "grad_norm": 2.3039371967315674, "learning_rate": 1.5022457110771384e-05, "loss": 0.2553, "step": 6600 }, { "epoch": 0.35306528502944434, "grad_norm": 3.3816354274749756, "learning_rate": 1.5007489452390437e-05, "loss": 0.2623, "step": 6610 }, { "epoch": 0.35359942313085047, "grad_norm": 2.1143391132354736, "learning_rate": 1.4992506806280793e-05, "loss": 0.2636, "step": 6620 }, { "epoch": 0.3541335612322566, "grad_norm": 1.9233431816101074, "learning_rate": 1.4977509217286448e-05, "loss": 0.2545, "step": 6630 }, { "epoch": 0.3546676993336627, "grad_norm": 2.426685094833374, "learning_rate": 1.4962496730296125e-05, "loss": 0.26, "step": 6640 }, { "epoch": 0.35520183743506883, "grad_norm": 2.5181005001068115, "learning_rate": 1.4947469390243135e-05, "loss": 0.2586, "step": 6650 }, { "epoch": 0.35573597553647496, "grad_norm": 1.4983373880386353, "learning_rate": 1.4932427242105244e-05, "loss": 0.2447, "step": 6660 }, { "epoch": 0.3562701136378811, "grad_norm": 1.6722877025604248, "learning_rate": 1.491737033090454e-05, "loss": 0.2712, "step": 6670 }, { "epoch": 0.3568042517392872, "grad_norm": 1.7236862182617188, "learning_rate": 1.49022987017073e-05, "loss": 0.2503, "step": 6680 }, { "epoch": 0.3573383898406933, "grad_norm": 2.145541191101074, "learning_rate": 1.4887212399623856e-05, "loss": 0.2485, "step": 6690 }, { "epoch": 0.35787252794209945, "grad_norm": 2.542449951171875, "learning_rate": 1.4872111469808443e-05, "loss": 0.2781, "step": 6700 }, { "epoch": 0.3584066660435056, "grad_norm": 2.3643364906311035, "learning_rate": 1.4856995957459094e-05, "loss": 0.255, "step": 6710 }, { "epoch": 0.35894080414491164, "grad_norm": 3.36561918258667, "learning_rate": 1.4841865907817479e-05, "loss": 0.2748, "step": 6720 }, { "epoch": 0.35947494224631776, "grad_norm": 2.103285551071167, "learning_rate": 1.4826721366168786e-05, "loss": 0.2591, "step": 6730 }, { "epoch": 0.3600090803477239, "grad_norm": 2.4507896900177, "learning_rate": 1.4811562377841569e-05, "loss": 0.259, "step": 6740 }, { "epoch": 0.36054321844913, "grad_norm": 1.7319021224975586, "learning_rate": 1.4796388988207624e-05, "loss": 0.2704, "step": 6750 }, { "epoch": 0.36107735655053613, "grad_norm": 2.33803391456604, "learning_rate": 1.4781201242681864e-05, "loss": 0.2619, "step": 6760 }, { "epoch": 0.36161149465194226, "grad_norm": 1.7216763496398926, "learning_rate": 1.4765999186722151e-05, "loss": 0.2624, "step": 6770 }, { "epoch": 0.3621456327533484, "grad_norm": 3.762627124786377, "learning_rate": 1.4750782865829191e-05, "loss": 0.2637, "step": 6780 }, { "epoch": 0.3626797708547545, "grad_norm": 2.189223289489746, "learning_rate": 1.4735552325546385e-05, "loss": 0.268, "step": 6790 }, { "epoch": 0.3632139089561606, "grad_norm": 2.7754886150360107, "learning_rate": 1.4720307611459687e-05, "loss": 0.2661, "step": 6800 }, { "epoch": 0.36374804705756675, "grad_norm": 2.042732000350952, "learning_rate": 1.4705048769197486e-05, "loss": 0.248, "step": 6810 }, { "epoch": 0.36428218515897287, "grad_norm": 2.776440382003784, "learning_rate": 1.4689775844430438e-05, "loss": 0.2535, "step": 6820 }, { "epoch": 0.364816323260379, "grad_norm": 1.9695485830307007, "learning_rate": 1.4674488882871372e-05, "loss": 0.2734, "step": 6830 }, { "epoch": 0.3653504613617851, "grad_norm": 2.0936856269836426, "learning_rate": 1.4659187930275111e-05, "loss": 0.2622, "step": 6840 }, { "epoch": 0.3658845994631912, "grad_norm": 2.3425135612487793, "learning_rate": 1.4643873032438367e-05, "loss": 0.2464, "step": 6850 }, { "epoch": 0.3664187375645973, "grad_norm": 1.957087755203247, "learning_rate": 1.4628544235199585e-05, "loss": 0.2555, "step": 6860 }, { "epoch": 0.36695287566600343, "grad_norm": 1.330527663230896, "learning_rate": 1.461320158443881e-05, "loss": 0.2679, "step": 6870 }, { "epoch": 0.36748701376740955, "grad_norm": 2.1211888790130615, "learning_rate": 1.4597845126077557e-05, "loss": 0.2462, "step": 6880 }, { "epoch": 0.3680211518688157, "grad_norm": 1.9069117307662964, "learning_rate": 1.458247490607866e-05, "loss": 0.2452, "step": 6890 }, { "epoch": 0.3685552899702218, "grad_norm": 2.946803092956543, "learning_rate": 1.4567090970446153e-05, "loss": 0.2402, "step": 6900 }, { "epoch": 0.3690894280716279, "grad_norm": 2.6456515789031982, "learning_rate": 1.4551693365225117e-05, "loss": 0.2523, "step": 6910 }, { "epoch": 0.36962356617303405, "grad_norm": 1.467833161354065, "learning_rate": 1.4536282136501539e-05, "loss": 0.2512, "step": 6920 }, { "epoch": 0.37015770427444017, "grad_norm": 2.3340671062469482, "learning_rate": 1.4520857330402197e-05, "loss": 0.2611, "step": 6930 }, { "epoch": 0.3706918423758463, "grad_norm": 2.008423328399658, "learning_rate": 1.4505418993094491e-05, "loss": 0.2504, "step": 6940 }, { "epoch": 0.3712259804772524, "grad_norm": 2.1225826740264893, "learning_rate": 1.4489967170786337e-05, "loss": 0.25, "step": 6950 }, { "epoch": 0.37176011857865854, "grad_norm": 1.8514221906661987, "learning_rate": 1.4474501909726002e-05, "loss": 0.2625, "step": 6960 }, { "epoch": 0.3722942566800646, "grad_norm": 3.1618645191192627, "learning_rate": 1.4459023256201974e-05, "loss": 0.2513, "step": 6970 }, { "epoch": 0.37282839478147073, "grad_norm": 2.6383047103881836, "learning_rate": 1.4443531256542834e-05, "loss": 0.2398, "step": 6980 }, { "epoch": 0.37336253288287685, "grad_norm": 2.4014105796813965, "learning_rate": 1.4428025957117103e-05, "loss": 0.2563, "step": 6990 }, { "epoch": 0.373896670984283, "grad_norm": 1.9360977411270142, "learning_rate": 1.4412507404333109e-05, "loss": 0.2675, "step": 7000 }, { "epoch": 0.3744308090856891, "grad_norm": 3.968796968460083, "learning_rate": 1.4396975644638852e-05, "loss": 0.2612, "step": 7010 }, { "epoch": 0.3749649471870952, "grad_norm": 1.7205746173858643, "learning_rate": 1.4381430724521866e-05, "loss": 0.2636, "step": 7020 }, { "epoch": 0.37549908528850134, "grad_norm": 2.1856157779693604, "learning_rate": 1.4365872690509052e-05, "loss": 0.2555, "step": 7030 }, { "epoch": 0.37603322338990747, "grad_norm": 1.6627072095870972, "learning_rate": 1.4350301589166589e-05, "loss": 0.253, "step": 7040 }, { "epoch": 0.3765673614913136, "grad_norm": 1.4432976245880127, "learning_rate": 1.4334717467099753e-05, "loss": 0.2535, "step": 7050 }, { "epoch": 0.3771014995927197, "grad_norm": 2.042365550994873, "learning_rate": 1.4319120370952793e-05, "loss": 0.2646, "step": 7060 }, { "epoch": 0.37763563769412584, "grad_norm": 1.79824697971344, "learning_rate": 1.4303510347408795e-05, "loss": 0.2517, "step": 7070 }, { "epoch": 0.37816977579553196, "grad_norm": 1.7286380529403687, "learning_rate": 1.4287887443189534e-05, "loss": 0.2627, "step": 7080 }, { "epoch": 0.378703913896938, "grad_norm": 2.4688687324523926, "learning_rate": 1.4272251705055338e-05, "loss": 0.2752, "step": 7090 }, { "epoch": 0.37923805199834415, "grad_norm": 1.6724916696548462, "learning_rate": 1.4256603179804942e-05, "loss": 0.2464, "step": 7100 }, { "epoch": 0.3797721900997503, "grad_norm": 1.8701459169387817, "learning_rate": 1.4240941914275368e-05, "loss": 0.2403, "step": 7110 }, { "epoch": 0.3803063282011564, "grad_norm": 1.9864122867584229, "learning_rate": 1.4225267955341757e-05, "loss": 0.2516, "step": 7120 }, { "epoch": 0.3808404663025625, "grad_norm": 3.281242847442627, "learning_rate": 1.4209581349917247e-05, "loss": 0.2517, "step": 7130 }, { "epoch": 0.38137460440396864, "grad_norm": 2.2182374000549316, "learning_rate": 1.4193882144952835e-05, "loss": 0.2423, "step": 7140 }, { "epoch": 0.38190874250537477, "grad_norm": 2.32824969291687, "learning_rate": 1.417817038743721e-05, "loss": 0.2667, "step": 7150 }, { "epoch": 0.3824428806067809, "grad_norm": 3.180384397506714, "learning_rate": 1.4162446124396654e-05, "loss": 0.2494, "step": 7160 }, { "epoch": 0.382977018708187, "grad_norm": 1.6292178630828857, "learning_rate": 1.4146709402894861e-05, "loss": 0.2426, "step": 7170 }, { "epoch": 0.38351115680959313, "grad_norm": 1.6236523389816284, "learning_rate": 1.4130960270032828e-05, "loss": 0.2612, "step": 7180 }, { "epoch": 0.38404529491099926, "grad_norm": 2.0785157680511475, "learning_rate": 1.4115198772948686e-05, "loss": 0.2451, "step": 7190 }, { "epoch": 0.3845794330124054, "grad_norm": 2.5847954750061035, "learning_rate": 1.4099424958817588e-05, "loss": 0.2602, "step": 7200 }, { "epoch": 0.3851135711138115, "grad_norm": 3.038905620574951, "learning_rate": 1.408363887485154e-05, "loss": 0.2419, "step": 7210 }, { "epoch": 0.38564770921521757, "grad_norm": 2.501169204711914, "learning_rate": 1.4067840568299275e-05, "loss": 0.2554, "step": 7220 }, { "epoch": 0.3861818473166237, "grad_norm": 3.1191928386688232, "learning_rate": 1.405203008644612e-05, "loss": 0.2716, "step": 7230 }, { "epoch": 0.3867159854180298, "grad_norm": 2.501645565032959, "learning_rate": 1.4036207476613825e-05, "loss": 0.262, "step": 7240 }, { "epoch": 0.38725012351943594, "grad_norm": 2.1982192993164062, "learning_rate": 1.4020372786160456e-05, "loss": 0.2596, "step": 7250 }, { "epoch": 0.38778426162084206, "grad_norm": 1.75099515914917, "learning_rate": 1.400452606248023e-05, "loss": 0.2546, "step": 7260 }, { "epoch": 0.3883183997222482, "grad_norm": 2.358099937438965, "learning_rate": 1.3988667353003376e-05, "loss": 0.2574, "step": 7270 }, { "epoch": 0.3888525378236543, "grad_norm": 2.322899103164673, "learning_rate": 1.3972796705196005e-05, "loss": 0.2501, "step": 7280 }, { "epoch": 0.38938667592506043, "grad_norm": 2.745469808578491, "learning_rate": 1.395691416655996e-05, "loss": 0.2405, "step": 7290 }, { "epoch": 0.38992081402646656, "grad_norm": 1.5932797193527222, "learning_rate": 1.3941019784632665e-05, "loss": 0.2415, "step": 7300 }, { "epoch": 0.3904549521278727, "grad_norm": 2.577909231185913, "learning_rate": 1.3925113606987007e-05, "loss": 0.2519, "step": 7310 }, { "epoch": 0.3909890902292788, "grad_norm": 2.157548666000366, "learning_rate": 1.3909195681231157e-05, "loss": 0.2465, "step": 7320 }, { "epoch": 0.3915232283306849, "grad_norm": 2.062798500061035, "learning_rate": 1.3893266055008473e-05, "loss": 0.2449, "step": 7330 }, { "epoch": 0.392057366432091, "grad_norm": 1.7956074476242065, "learning_rate": 1.387732477599731e-05, "loss": 0.2637, "step": 7340 }, { "epoch": 0.3925915045334971, "grad_norm": 1.7769899368286133, "learning_rate": 1.3861371891910923e-05, "loss": 0.2522, "step": 7350 }, { "epoch": 0.39312564263490324, "grad_norm": 2.1810851097106934, "learning_rate": 1.3845407450497279e-05, "loss": 0.2586, "step": 7360 }, { "epoch": 0.39365978073630936, "grad_norm": 1.9073115587234497, "learning_rate": 1.382943149953896e-05, "loss": 0.2355, "step": 7370 }, { "epoch": 0.3941939188377155, "grad_norm": 2.5078210830688477, "learning_rate": 1.3813444086852976e-05, "loss": 0.2498, "step": 7380 }, { "epoch": 0.3947280569391216, "grad_norm": 2.251372814178467, "learning_rate": 1.3797445260290654e-05, "loss": 0.2433, "step": 7390 }, { "epoch": 0.39526219504052773, "grad_norm": 2.5520782470703125, "learning_rate": 1.3781435067737485e-05, "loss": 0.2459, "step": 7400 }, { "epoch": 0.39579633314193385, "grad_norm": 2.2263402938842773, "learning_rate": 1.3765413557112972e-05, "loss": 0.2501, "step": 7410 }, { "epoch": 0.39633047124334, "grad_norm": 2.4750075340270996, "learning_rate": 1.3749380776370497e-05, "loss": 0.2619, "step": 7420 }, { "epoch": 0.3968646093447461, "grad_norm": 2.064514398574829, "learning_rate": 1.3733336773497177e-05, "loss": 0.2406, "step": 7430 }, { "epoch": 0.3973987474461522, "grad_norm": 2.104466438293457, "learning_rate": 1.3717281596513708e-05, "loss": 0.2594, "step": 7440 }, { "epoch": 0.39793288554755835, "grad_norm": 1.781093716621399, "learning_rate": 1.3701215293474244e-05, "loss": 0.2449, "step": 7450 }, { "epoch": 0.3984670236489644, "grad_norm": 3.0183591842651367, "learning_rate": 1.368513791246623e-05, "loss": 0.2591, "step": 7460 }, { "epoch": 0.39900116175037054, "grad_norm": 2.8878283500671387, "learning_rate": 1.3669049501610277e-05, "loss": 0.2317, "step": 7470 }, { "epoch": 0.39953529985177666, "grad_norm": 1.7159744501113892, "learning_rate": 1.3652950109059994e-05, "loss": 0.241, "step": 7480 }, { "epoch": 0.4000694379531828, "grad_norm": 1.9341708421707153, "learning_rate": 1.363683978300188e-05, "loss": 0.2514, "step": 7490 }, { "epoch": 0.4006035760545889, "grad_norm": 2.1291399002075195, "learning_rate": 1.3620718571655136e-05, "loss": 0.2368, "step": 7500 }, { "epoch": 0.40113771415599503, "grad_norm": 1.9277875423431396, "learning_rate": 1.3604586523271561e-05, "loss": 0.2686, "step": 7510 }, { "epoch": 0.40167185225740115, "grad_norm": 2.042224407196045, "learning_rate": 1.3588443686135384e-05, "loss": 0.26, "step": 7520 }, { "epoch": 0.4022059903588073, "grad_norm": 2.213428020477295, "learning_rate": 1.3572290108563118e-05, "loss": 0.2458, "step": 7530 }, { "epoch": 0.4027401284602134, "grad_norm": 1.984256625175476, "learning_rate": 1.3556125838903437e-05, "loss": 0.2591, "step": 7540 }, { "epoch": 0.4032742665616195, "grad_norm": 2.5319066047668457, "learning_rate": 1.353995092553701e-05, "loss": 0.2515, "step": 7550 }, { "epoch": 0.40380840466302564, "grad_norm": 2.136357069015503, "learning_rate": 1.3523765416876357e-05, "loss": 0.2662, "step": 7560 }, { "epoch": 0.40434254276443177, "grad_norm": 1.9591666460037231, "learning_rate": 1.350756936136572e-05, "loss": 0.2146, "step": 7570 }, { "epoch": 0.40487668086583783, "grad_norm": 2.3154659271240234, "learning_rate": 1.3491362807480902e-05, "loss": 0.23, "step": 7580 }, { "epoch": 0.40541081896724396, "grad_norm": 2.2329063415527344, "learning_rate": 1.3475145803729135e-05, "loss": 0.2504, "step": 7590 }, { "epoch": 0.4059449570686501, "grad_norm": 2.357820749282837, "learning_rate": 1.3458918398648923e-05, "loss": 0.2342, "step": 7600 }, { "epoch": 0.4064790951700562, "grad_norm": 1.9621813297271729, "learning_rate": 1.3442680640809904e-05, "loss": 0.2567, "step": 7610 }, { "epoch": 0.4070132332714623, "grad_norm": 2.4515326023101807, "learning_rate": 1.3426432578812695e-05, "loss": 0.2631, "step": 7620 }, { "epoch": 0.40754737137286845, "grad_norm": 1.970691442489624, "learning_rate": 1.341017426128877e-05, "loss": 0.2321, "step": 7630 }, { "epoch": 0.4080815094742746, "grad_norm": 1.6880353689193726, "learning_rate": 1.339390573690028e-05, "loss": 0.2282, "step": 7640 }, { "epoch": 0.4086156475756807, "grad_norm": 1.8122334480285645, "learning_rate": 1.3377627054339936e-05, "loss": 0.2385, "step": 7650 }, { "epoch": 0.4091497856770868, "grad_norm": 2.516660690307617, "learning_rate": 1.3361338262330863e-05, "loss": 0.2359, "step": 7660 }, { "epoch": 0.40968392377849294, "grad_norm": 2.508638381958008, "learning_rate": 1.3345039409626418e-05, "loss": 0.2359, "step": 7670 }, { "epoch": 0.41021806187989907, "grad_norm": 2.073477029800415, "learning_rate": 1.332873054501009e-05, "loss": 0.2382, "step": 7680 }, { "epoch": 0.4107521999813052, "grad_norm": 2.500866174697876, "learning_rate": 1.3312411717295333e-05, "loss": 0.2511, "step": 7690 }, { "epoch": 0.4112863380827113, "grad_norm": 1.955191731452942, "learning_rate": 1.3296082975325415e-05, "loss": 0.2555, "step": 7700 }, { "epoch": 0.4118204761841174, "grad_norm": 1.7601314783096313, "learning_rate": 1.3279744367973279e-05, "loss": 0.2537, "step": 7710 }, { "epoch": 0.4123546142855235, "grad_norm": 2.7545766830444336, "learning_rate": 1.3263395944141404e-05, "loss": 0.2309, "step": 7720 }, { "epoch": 0.4128887523869296, "grad_norm": 2.246441125869751, "learning_rate": 1.3247037752761636e-05, "loss": 0.2425, "step": 7730 }, { "epoch": 0.41342289048833575, "grad_norm": 1.8121628761291504, "learning_rate": 1.3230669842795066e-05, "loss": 0.2533, "step": 7740 }, { "epoch": 0.41395702858974187, "grad_norm": 2.1904752254486084, "learning_rate": 1.3214292263231871e-05, "loss": 0.2352, "step": 7750 }, { "epoch": 0.414491166691148, "grad_norm": 1.8134618997573853, "learning_rate": 1.3197905063091168e-05, "loss": 0.2442, "step": 7760 }, { "epoch": 0.4150253047925541, "grad_norm": 1.7844387292861938, "learning_rate": 1.3181508291420875e-05, "loss": 0.2574, "step": 7770 }, { "epoch": 0.41555944289396024, "grad_norm": 1.9230400323867798, "learning_rate": 1.316510199729755e-05, "loss": 0.2399, "step": 7780 }, { "epoch": 0.41609358099536636, "grad_norm": 1.9353176355361938, "learning_rate": 1.3148686229826258e-05, "loss": 0.2444, "step": 7790 }, { "epoch": 0.4166277190967725, "grad_norm": 2.4505374431610107, "learning_rate": 1.3132261038140414e-05, "loss": 0.2297, "step": 7800 }, { "epoch": 0.4171618571981786, "grad_norm": 1.943070650100708, "learning_rate": 1.3115826471401645e-05, "loss": 0.2577, "step": 7810 }, { "epoch": 0.41769599529958473, "grad_norm": 1.4473865032196045, "learning_rate": 1.3099382578799635e-05, "loss": 0.2402, "step": 7820 }, { "epoch": 0.4182301334009908, "grad_norm": 1.9315630197525024, "learning_rate": 1.3082929409551981e-05, "loss": 0.2514, "step": 7830 }, { "epoch": 0.4187642715023969, "grad_norm": 4.894300937652588, "learning_rate": 1.3066467012904048e-05, "loss": 0.2465, "step": 7840 }, { "epoch": 0.41929840960380305, "grad_norm": 2.4819793701171875, "learning_rate": 1.3049995438128815e-05, "loss": 0.2547, "step": 7850 }, { "epoch": 0.41983254770520917, "grad_norm": 1.9159036874771118, "learning_rate": 1.3033514734526733e-05, "loss": 0.2448, "step": 7860 }, { "epoch": 0.4203666858066153, "grad_norm": 2.878638982772827, "learning_rate": 1.3017024951425584e-05, "loss": 0.2411, "step": 7870 }, { "epoch": 0.4209008239080214, "grad_norm": 1.5487334728240967, "learning_rate": 1.3000526138180308e-05, "loss": 0.2577, "step": 7880 }, { "epoch": 0.42143496200942754, "grad_norm": 1.8015592098236084, "learning_rate": 1.2984018344172892e-05, "loss": 0.2235, "step": 7890 }, { "epoch": 0.42196910011083366, "grad_norm": 2.967637538909912, "learning_rate": 1.2967501618812195e-05, "loss": 0.2461, "step": 7900 }, { "epoch": 0.4225032382122398, "grad_norm": 1.205308198928833, "learning_rate": 1.2950976011533802e-05, "loss": 0.2353, "step": 7910 }, { "epoch": 0.4230373763136459, "grad_norm": 1.5131487846374512, "learning_rate": 1.2934441571799893e-05, "loss": 0.2598, "step": 7920 }, { "epoch": 0.42357151441505203, "grad_norm": 2.3098154067993164, "learning_rate": 1.2917898349099074e-05, "loss": 0.2441, "step": 7930 }, { "epoch": 0.42410565251645815, "grad_norm": 2.531383752822876, "learning_rate": 1.2901346392946249e-05, "loss": 0.2544, "step": 7940 }, { "epoch": 0.4246397906178642, "grad_norm": 2.2274270057678223, "learning_rate": 1.288478575288245e-05, "loss": 0.2323, "step": 7950 }, { "epoch": 0.42517392871927034, "grad_norm": 2.1468842029571533, "learning_rate": 1.2868216478474716e-05, "loss": 0.261, "step": 7960 }, { "epoch": 0.42570806682067647, "grad_norm": 2.1782469749450684, "learning_rate": 1.2851638619315911e-05, "loss": 0.2532, "step": 7970 }, { "epoch": 0.4262422049220826, "grad_norm": 2.200592279434204, "learning_rate": 1.283505222502461e-05, "loss": 0.2386, "step": 7980 }, { "epoch": 0.4267763430234887, "grad_norm": 1.9174662828445435, "learning_rate": 1.2818457345244922e-05, "loss": 0.2434, "step": 7990 }, { "epoch": 0.42731048112489484, "grad_norm": 2.4632411003112793, "learning_rate": 1.280185402964636e-05, "loss": 0.2284, "step": 8000 }, { "epoch": 0.42784461922630096, "grad_norm": 2.361755132675171, "learning_rate": 1.278524232792369e-05, "loss": 0.2459, "step": 8010 }, { "epoch": 0.4283787573277071, "grad_norm": 1.4103230237960815, "learning_rate": 1.2768622289796763e-05, "loss": 0.2537, "step": 8020 }, { "epoch": 0.4289128954291132, "grad_norm": 1.9360517263412476, "learning_rate": 1.2751993965010398e-05, "loss": 0.2455, "step": 8030 }, { "epoch": 0.42944703353051933, "grad_norm": 1.8054049015045166, "learning_rate": 1.273535740333421e-05, "loss": 0.2463, "step": 8040 }, { "epoch": 0.42998117163192545, "grad_norm": 2.1478090286254883, "learning_rate": 1.2718712654562462e-05, "loss": 0.2464, "step": 8050 }, { "epoch": 0.4305153097333316, "grad_norm": 2.4578583240509033, "learning_rate": 1.2702059768513933e-05, "loss": 0.2314, "step": 8060 }, { "epoch": 0.4310494478347377, "grad_norm": 1.7668578624725342, "learning_rate": 1.268539879503175e-05, "loss": 0.2554, "step": 8070 }, { "epoch": 0.43158358593614377, "grad_norm": 1.727955937385559, "learning_rate": 1.266872978398324e-05, "loss": 0.2316, "step": 8080 }, { "epoch": 0.4321177240375499, "grad_norm": 1.896931767463684, "learning_rate": 1.2652052785259802e-05, "loss": 0.2394, "step": 8090 }, { "epoch": 0.432651862138956, "grad_norm": 2.0584256649017334, "learning_rate": 1.2635367848776731e-05, "loss": 0.2504, "step": 8100 }, { "epoch": 0.43318600024036213, "grad_norm": 1.7001475095748901, "learning_rate": 1.2618675024473087e-05, "loss": 0.2424, "step": 8110 }, { "epoch": 0.43372013834176826, "grad_norm": 2.050755500793457, "learning_rate": 1.2601974362311533e-05, "loss": 0.2412, "step": 8120 }, { "epoch": 0.4342542764431744, "grad_norm": 2.3158676624298096, "learning_rate": 1.25852659122782e-05, "loss": 0.2488, "step": 8130 }, { "epoch": 0.4347884145445805, "grad_norm": 1.4626377820968628, "learning_rate": 1.2568549724382513e-05, "loss": 0.2415, "step": 8140 }, { "epoch": 0.4353225526459866, "grad_norm": 2.5440309047698975, "learning_rate": 1.2551825848657073e-05, "loss": 0.2411, "step": 8150 }, { "epoch": 0.43585669074739275, "grad_norm": 2.313354015350342, "learning_rate": 1.2535094335157481e-05, "loss": 0.2283, "step": 8160 }, { "epoch": 0.4363908288487989, "grad_norm": 2.1882569789886475, "learning_rate": 1.2518355233962203e-05, "loss": 0.2282, "step": 8170 }, { "epoch": 0.436924966950205, "grad_norm": 2.0970797538757324, "learning_rate": 1.2501608595172416e-05, "loss": 0.2305, "step": 8180 }, { "epoch": 0.4374591050516111, "grad_norm": 1.6348628997802734, "learning_rate": 1.2484854468911858e-05, "loss": 0.2272, "step": 8190 }, { "epoch": 0.4379932431530172, "grad_norm": 2.3086726665496826, "learning_rate": 1.246809290532667e-05, "loss": 0.2478, "step": 8200 }, { "epoch": 0.4385273812544233, "grad_norm": 1.895745873451233, "learning_rate": 1.2451323954585259e-05, "loss": 0.2466, "step": 8210 }, { "epoch": 0.43906151935582943, "grad_norm": 2.333120107650757, "learning_rate": 1.2434547666878148e-05, "loss": 0.2407, "step": 8220 }, { "epoch": 0.43959565745723556, "grad_norm": 2.7999722957611084, "learning_rate": 1.2417764092417806e-05, "loss": 0.2422, "step": 8230 }, { "epoch": 0.4401297955586417, "grad_norm": 1.9569802284240723, "learning_rate": 1.2400973281438521e-05, "loss": 0.2476, "step": 8240 }, { "epoch": 0.4406639336600478, "grad_norm": 1.4004535675048828, "learning_rate": 1.2384175284196251e-05, "loss": 0.2425, "step": 8250 }, { "epoch": 0.4411980717614539, "grad_norm": 2.15779972076416, "learning_rate": 1.2367370150968435e-05, "loss": 0.2312, "step": 8260 }, { "epoch": 0.44173220986286005, "grad_norm": 2.0645248889923096, "learning_rate": 1.2350557932053895e-05, "loss": 0.2506, "step": 8270 }, { "epoch": 0.44226634796426617, "grad_norm": 2.28932785987854, "learning_rate": 1.2333738677772648e-05, "loss": 0.2566, "step": 8280 }, { "epoch": 0.4428004860656723, "grad_norm": 2.4740700721740723, "learning_rate": 1.2316912438465779e-05, "loss": 0.2359, "step": 8290 }, { "epoch": 0.4433346241670784, "grad_norm": 1.8925538063049316, "learning_rate": 1.2300079264495262e-05, "loss": 0.2451, "step": 8300 }, { "epoch": 0.44386876226848454, "grad_norm": 2.089332342147827, "learning_rate": 1.228323920624385e-05, "loss": 0.2263, "step": 8310 }, { "epoch": 0.4444029003698906, "grad_norm": 1.7984578609466553, "learning_rate": 1.2266392314114885e-05, "loss": 0.2378, "step": 8320 }, { "epoch": 0.44493703847129673, "grad_norm": 2.311638355255127, "learning_rate": 1.2249538638532163e-05, "loss": 0.2408, "step": 8330 }, { "epoch": 0.44547117657270285, "grad_norm": 1.8559069633483887, "learning_rate": 1.2232678229939794e-05, "loss": 0.2199, "step": 8340 }, { "epoch": 0.446005314674109, "grad_norm": 1.8695321083068848, "learning_rate": 1.221581113880203e-05, "loss": 0.2333, "step": 8350 }, { "epoch": 0.4465394527755151, "grad_norm": 1.7883480787277222, "learning_rate": 1.2198937415603132e-05, "loss": 0.2292, "step": 8360 }, { "epoch": 0.4470735908769212, "grad_norm": 2.2558138370513916, "learning_rate": 1.2182057110847203e-05, "loss": 0.2454, "step": 8370 }, { "epoch": 0.44760772897832735, "grad_norm": 2.3807525634765625, "learning_rate": 1.2165170275058049e-05, "loss": 0.2294, "step": 8380 }, { "epoch": 0.44814186707973347, "grad_norm": 1.998844861984253, "learning_rate": 1.214827695877903e-05, "loss": 0.2361, "step": 8390 }, { "epoch": 0.4486760051811396, "grad_norm": 2.4558053016662598, "learning_rate": 1.2131377212572892e-05, "loss": 0.2246, "step": 8400 }, { "epoch": 0.4492101432825457, "grad_norm": 1.6573783159255981, "learning_rate": 1.2114471087021635e-05, "loss": 0.246, "step": 8410 }, { "epoch": 0.44974428138395184, "grad_norm": 2.436753511428833, "learning_rate": 1.2097558632726345e-05, "loss": 0.2491, "step": 8420 }, { "epoch": 0.45027841948535796, "grad_norm": 2.427018165588379, "learning_rate": 1.2080639900307055e-05, "loss": 0.2258, "step": 8430 }, { "epoch": 0.4508125575867641, "grad_norm": 2.5095927715301514, "learning_rate": 1.2063714940402586e-05, "loss": 0.2699, "step": 8440 }, { "epoch": 0.45134669568817015, "grad_norm": 2.695676803588867, "learning_rate": 1.2046783803670403e-05, "loss": 0.2528, "step": 8450 }, { "epoch": 0.4518808337895763, "grad_norm": 2.136090040206909, "learning_rate": 1.2029846540786458e-05, "loss": 0.2406, "step": 8460 }, { "epoch": 0.4524149718909824, "grad_norm": 1.7964211702346802, "learning_rate": 1.201290320244503e-05, "loss": 0.234, "step": 8470 }, { "epoch": 0.4529491099923885, "grad_norm": 1.524975299835205, "learning_rate": 1.1995953839358596e-05, "loss": 0.2324, "step": 8480 }, { "epoch": 0.45348324809379464, "grad_norm": 2.3690218925476074, "learning_rate": 1.1978998502257652e-05, "loss": 0.2265, "step": 8490 }, { "epoch": 0.45401738619520077, "grad_norm": 3.2743468284606934, "learning_rate": 1.1962037241890585e-05, "loss": 0.23, "step": 8500 }, { "epoch": 0.4545515242966069, "grad_norm": 2.46572208404541, "learning_rate": 1.1945070109023506e-05, "loss": 0.2352, "step": 8510 }, { "epoch": 0.455085662398013, "grad_norm": 1.720198631286621, "learning_rate": 1.1928097154440102e-05, "loss": 0.2234, "step": 8520 }, { "epoch": 0.45561980049941914, "grad_norm": 2.5110175609588623, "learning_rate": 1.1911118428941489e-05, "loss": 0.2205, "step": 8530 }, { "epoch": 0.45615393860082526, "grad_norm": 1.9245696067810059, "learning_rate": 1.189413398334605e-05, "loss": 0.2331, "step": 8540 }, { "epoch": 0.4566880767022314, "grad_norm": 2.195054769515991, "learning_rate": 1.187714386848929e-05, "loss": 0.2299, "step": 8550 }, { "epoch": 0.4572222148036375, "grad_norm": 2.7812087535858154, "learning_rate": 1.186014813522369e-05, "loss": 0.2347, "step": 8560 }, { "epoch": 0.4577563529050436, "grad_norm": 1.7076257467269897, "learning_rate": 1.1843146834418537e-05, "loss": 0.226, "step": 8570 }, { "epoch": 0.4582904910064497, "grad_norm": 2.057361602783203, "learning_rate": 1.1826140016959786e-05, "loss": 0.2401, "step": 8580 }, { "epoch": 0.4588246291078558, "grad_norm": 2.059993028640747, "learning_rate": 1.1809127733749906e-05, "loss": 0.2235, "step": 8590 }, { "epoch": 0.45935876720926194, "grad_norm": 1.34566068649292, "learning_rate": 1.1792110035707725e-05, "loss": 0.2439, "step": 8600 }, { "epoch": 0.45989290531066807, "grad_norm": 1.7453252077102661, "learning_rate": 1.1775086973768267e-05, "loss": 0.2235, "step": 8610 }, { "epoch": 0.4604270434120742, "grad_norm": 1.9005740880966187, "learning_rate": 1.1758058598882626e-05, "loss": 0.2315, "step": 8620 }, { "epoch": 0.4609611815134803, "grad_norm": 2.3195927143096924, "learning_rate": 1.1741024962017797e-05, "loss": 0.2331, "step": 8630 }, { "epoch": 0.46149531961488643, "grad_norm": 1.458366870880127, "learning_rate": 1.1723986114156509e-05, "loss": 0.2371, "step": 8640 }, { "epoch": 0.46202945771629256, "grad_norm": 1.7080150842666626, "learning_rate": 1.1706942106297103e-05, "loss": 0.2216, "step": 8650 }, { "epoch": 0.4625635958176987, "grad_norm": 1.8504348993301392, "learning_rate": 1.1689892989453361e-05, "loss": 0.2418, "step": 8660 }, { "epoch": 0.4630977339191048, "grad_norm": 2.2403101921081543, "learning_rate": 1.1672838814654352e-05, "loss": 0.26, "step": 8670 }, { "epoch": 0.4636318720205109, "grad_norm": 2.534958600997925, "learning_rate": 1.1655779632944284e-05, "loss": 0.234, "step": 8680 }, { "epoch": 0.464166010121917, "grad_norm": 1.4856836795806885, "learning_rate": 1.1638715495382354e-05, "loss": 0.2342, "step": 8690 }, { "epoch": 0.4647001482233231, "grad_norm": 1.8507015705108643, "learning_rate": 1.1621646453042597e-05, "loss": 0.2504, "step": 8700 }, { "epoch": 0.46523428632472924, "grad_norm": 1.8989564180374146, "learning_rate": 1.1604572557013719e-05, "loss": 0.2259, "step": 8710 }, { "epoch": 0.46576842442613536, "grad_norm": 1.7216829061508179, "learning_rate": 1.1587493858398957e-05, "loss": 0.2234, "step": 8720 }, { "epoch": 0.4663025625275415, "grad_norm": 2.320866346359253, "learning_rate": 1.1570410408315921e-05, "loss": 0.227, "step": 8730 }, { "epoch": 0.4668367006289476, "grad_norm": 2.3217613697052, "learning_rate": 1.1553322257896449e-05, "loss": 0.2291, "step": 8740 }, { "epoch": 0.46737083873035373, "grad_norm": 1.6289666891098022, "learning_rate": 1.1536229458286438e-05, "loss": 0.2372, "step": 8750 }, { "epoch": 0.46790497683175986, "grad_norm": 3.058397054672241, "learning_rate": 1.1519132060645706e-05, "loss": 0.2384, "step": 8760 }, { "epoch": 0.468439114933166, "grad_norm": 2.6905791759490967, "learning_rate": 1.1502030116147836e-05, "loss": 0.2279, "step": 8770 }, { "epoch": 0.4689732530345721, "grad_norm": 1.6280781030654907, "learning_rate": 1.1484923675980009e-05, "loss": 0.2272, "step": 8780 }, { "epoch": 0.4695073911359782, "grad_norm": 2.484955072402954, "learning_rate": 1.1467812791342874e-05, "loss": 0.2204, "step": 8790 }, { "epoch": 0.47004152923738435, "grad_norm": 1.958983063697815, "learning_rate": 1.1450697513450375e-05, "loss": 0.2324, "step": 8800 }, { "epoch": 0.47057566733879047, "grad_norm": 1.7591406106948853, "learning_rate": 1.143357789352961e-05, "loss": 0.2114, "step": 8810 }, { "epoch": 0.47110980544019654, "grad_norm": 1.5119032859802246, "learning_rate": 1.1416453982820668e-05, "loss": 0.2481, "step": 8820 }, { "epoch": 0.47164394354160266, "grad_norm": 1.6546568870544434, "learning_rate": 1.1399325832576485e-05, "loss": 0.2317, "step": 8830 }, { "epoch": 0.4721780816430088, "grad_norm": 1.625227928161621, "learning_rate": 1.1382193494062685e-05, "loss": 0.226, "step": 8840 }, { "epoch": 0.4727122197444149, "grad_norm": 2.5196382999420166, "learning_rate": 1.1365057018557426e-05, "loss": 0.2408, "step": 8850 }, { "epoch": 0.47324635784582103, "grad_norm": 1.7361578941345215, "learning_rate": 1.1347916457351251e-05, "loss": 0.2296, "step": 8860 }, { "epoch": 0.47378049594722715, "grad_norm": 2.2690372467041016, "learning_rate": 1.1330771861746928e-05, "loss": 0.2282, "step": 8870 }, { "epoch": 0.4743146340486333, "grad_norm": 1.509157419204712, "learning_rate": 1.1313623283059303e-05, "loss": 0.2366, "step": 8880 }, { "epoch": 0.4748487721500394, "grad_norm": 2.4045157432556152, "learning_rate": 1.1296470772615147e-05, "loss": 0.2475, "step": 8890 }, { "epoch": 0.4753829102514455, "grad_norm": 1.9043093919754028, "learning_rate": 1.1279314381752988e-05, "loss": 0.2492, "step": 8900 }, { "epoch": 0.47591704835285165, "grad_norm": 1.5417094230651855, "learning_rate": 1.1262154161822983e-05, "loss": 0.2203, "step": 8910 }, { "epoch": 0.47645118645425777, "grad_norm": 1.8461562395095825, "learning_rate": 1.1244990164186733e-05, "loss": 0.2426, "step": 8920 }, { "epoch": 0.4769853245556639, "grad_norm": 2.366424560546875, "learning_rate": 1.1227822440217166e-05, "loss": 0.2406, "step": 8930 }, { "epoch": 0.47751946265706996, "grad_norm": 1.689052700996399, "learning_rate": 1.1210651041298343e-05, "loss": 0.2255, "step": 8940 }, { "epoch": 0.4780536007584761, "grad_norm": 1.7311588525772095, "learning_rate": 1.119347601882534e-05, "loss": 0.236, "step": 8950 }, { "epoch": 0.4785877388598822, "grad_norm": 1.7917357683181763, "learning_rate": 1.1176297424204068e-05, "loss": 0.2153, "step": 8960 }, { "epoch": 0.47912187696128833, "grad_norm": 1.8835124969482422, "learning_rate": 1.1159115308851132e-05, "loss": 0.248, "step": 8970 }, { "epoch": 0.47965601506269445, "grad_norm": 1.9323071241378784, "learning_rate": 1.1141929724193683e-05, "loss": 0.2388, "step": 8980 }, { "epoch": 0.4801901531641006, "grad_norm": 1.5817859172821045, "learning_rate": 1.112474072166924e-05, "loss": 0.228, "step": 8990 }, { "epoch": 0.4807242912655067, "grad_norm": 2.504650115966797, "learning_rate": 1.1107548352725573e-05, "loss": 0.2461, "step": 9000 }, { "epoch": 0.4812584293669128, "grad_norm": 1.8306629657745361, "learning_rate": 1.1090352668820507e-05, "loss": 0.2332, "step": 9010 }, { "epoch": 0.48179256746831894, "grad_norm": 1.7300621271133423, "learning_rate": 1.1073153721421799e-05, "loss": 0.2378, "step": 9020 }, { "epoch": 0.48232670556972507, "grad_norm": 2.2012696266174316, "learning_rate": 1.1055951562006977e-05, "loss": 0.231, "step": 9030 }, { "epoch": 0.4828608436711312, "grad_norm": 1.7074790000915527, "learning_rate": 1.1038746242063172e-05, "loss": 0.2233, "step": 9040 }, { "epoch": 0.4833949817725373, "grad_norm": 1.4047276973724365, "learning_rate": 1.102153781308699e-05, "loss": 0.233, "step": 9050 }, { "epoch": 0.4839291198739434, "grad_norm": 2.234055757522583, "learning_rate": 1.1004326326584325e-05, "loss": 0.2308, "step": 9060 }, { "epoch": 0.4844632579753495, "grad_norm": 2.3361644744873047, "learning_rate": 1.0987111834070237e-05, "loss": 0.2284, "step": 9070 }, { "epoch": 0.4849973960767556, "grad_norm": 1.8241535425186157, "learning_rate": 1.0969894387068776e-05, "loss": 0.2497, "step": 9080 }, { "epoch": 0.48553153417816175, "grad_norm": 1.9049677848815918, "learning_rate": 1.0952674037112833e-05, "loss": 0.2528, "step": 9090 }, { "epoch": 0.4860656722795679, "grad_norm": 1.836341381072998, "learning_rate": 1.0935450835743994e-05, "loss": 0.2273, "step": 9100 }, { "epoch": 0.486599810380974, "grad_norm": 1.9412810802459717, "learning_rate": 1.0918224834512376e-05, "loss": 0.2372, "step": 9110 }, { "epoch": 0.4871339484823801, "grad_norm": 1.6608407497406006, "learning_rate": 1.090099608497648e-05, "loss": 0.2303, "step": 9120 }, { "epoch": 0.48766808658378624, "grad_norm": 2.452105760574341, "learning_rate": 1.0883764638703024e-05, "loss": 0.2365, "step": 9130 }, { "epoch": 0.48820222468519237, "grad_norm": 2.0158042907714844, "learning_rate": 1.0866530547266805e-05, "loss": 0.2283, "step": 9140 }, { "epoch": 0.4887363627865985, "grad_norm": 1.9540421962738037, "learning_rate": 1.0849293862250537e-05, "loss": 0.2203, "step": 9150 }, { "epoch": 0.4892705008880046, "grad_norm": 2.843426465988159, "learning_rate": 1.0832054635244688e-05, "loss": 0.2268, "step": 9160 }, { "epoch": 0.48980463898941073, "grad_norm": 2.4183478355407715, "learning_rate": 1.0814812917847351e-05, "loss": 0.2392, "step": 9170 }, { "epoch": 0.49033877709081686, "grad_norm": 2.4311511516571045, "learning_rate": 1.0797568761664056e-05, "loss": 0.239, "step": 9180 }, { "epoch": 0.4908729151922229, "grad_norm": 1.8194931745529175, "learning_rate": 1.0780322218307644e-05, "loss": 0.2227, "step": 9190 }, { "epoch": 0.49140705329362905, "grad_norm": 3.3710336685180664, "learning_rate": 1.076307333939809e-05, "loss": 0.2326, "step": 9200 }, { "epoch": 0.49194119139503517, "grad_norm": 2.292485237121582, "learning_rate": 1.0745822176562371e-05, "loss": 0.2256, "step": 9210 }, { "epoch": 0.4924753294964413, "grad_norm": 2.2396020889282227, "learning_rate": 1.0728568781434296e-05, "loss": 0.2184, "step": 9220 }, { "epoch": 0.4930094675978474, "grad_norm": 2.181723117828369, "learning_rate": 1.0711313205654352e-05, "loss": 0.2349, "step": 9230 }, { "epoch": 0.49354360569925354, "grad_norm": 2.3042943477630615, "learning_rate": 1.0694055500869559e-05, "loss": 0.2316, "step": 9240 }, { "epoch": 0.49407774380065966, "grad_norm": 1.7766684293746948, "learning_rate": 1.0676795718733294e-05, "loss": 0.2219, "step": 9250 }, { "epoch": 0.4946118819020658, "grad_norm": 2.452115058898926, "learning_rate": 1.0659533910905178e-05, "loss": 0.233, "step": 9260 }, { "epoch": 0.4951460200034719, "grad_norm": 1.684878945350647, "learning_rate": 1.0642270129050866e-05, "loss": 0.2284, "step": 9270 }, { "epoch": 0.49568015810487803, "grad_norm": 1.9052156209945679, "learning_rate": 1.0625004424841945e-05, "loss": 0.2247, "step": 9280 }, { "epoch": 0.49621429620628416, "grad_norm": 2.0262763500213623, "learning_rate": 1.0607736849955743e-05, "loss": 0.2292, "step": 9290 }, { "epoch": 0.4967484343076903, "grad_norm": 1.7796902656555176, "learning_rate": 1.0590467456075193e-05, "loss": 0.2469, "step": 9300 }, { "epoch": 0.49728257240909635, "grad_norm": 2.110952854156494, "learning_rate": 1.0573196294888664e-05, "loss": 0.248, "step": 9310 }, { "epoch": 0.49781671051050247, "grad_norm": 1.6084312200546265, "learning_rate": 1.0555923418089821e-05, "loss": 0.2273, "step": 9320 }, { "epoch": 0.4983508486119086, "grad_norm": 3.123159170150757, "learning_rate": 1.0538648877377471e-05, "loss": 0.2409, "step": 9330 }, { "epoch": 0.4988849867133147, "grad_norm": 2.4544103145599365, "learning_rate": 1.0521372724455381e-05, "loss": 0.2328, "step": 9340 }, { "epoch": 0.49941912481472084, "grad_norm": 2.792691230773926, "learning_rate": 1.0504095011032165e-05, "loss": 0.2393, "step": 9350 }, { "epoch": 0.49995326291612696, "grad_norm": 2.273705244064331, "learning_rate": 1.0486815788821096e-05, "loss": 0.2296, "step": 9360 }, { "epoch": 0.5004874010175331, "grad_norm": 2.320528030395508, "learning_rate": 1.0469535109539965e-05, "loss": 0.2249, "step": 9370 }, { "epoch": 0.5010215391189392, "grad_norm": 1.8641767501831055, "learning_rate": 1.0452253024910924e-05, "loss": 0.2431, "step": 9380 }, { "epoch": 0.5015556772203453, "grad_norm": 1.5387927293777466, "learning_rate": 1.0434969586660331e-05, "loss": 0.2401, "step": 9390 }, { "epoch": 0.5020898153217515, "grad_norm": 1.8298964500427246, "learning_rate": 1.0417684846518598e-05, "loss": 0.2271, "step": 9400 }, { "epoch": 0.5026239534231576, "grad_norm": 1.2823607921600342, "learning_rate": 1.0400398856220032e-05, "loss": 0.2266, "step": 9410 }, { "epoch": 0.5031580915245637, "grad_norm": 2.2026023864746094, "learning_rate": 1.0383111667502678e-05, "loss": 0.2295, "step": 9420 }, { "epoch": 0.5036922296259698, "grad_norm": 1.8640315532684326, "learning_rate": 1.0365823332108177e-05, "loss": 0.2214, "step": 9430 }, { "epoch": 0.504226367727376, "grad_norm": 1.7906513214111328, "learning_rate": 1.0348533901781592e-05, "loss": 0.2359, "step": 9440 }, { "epoch": 0.5047605058287821, "grad_norm": 2.125967264175415, "learning_rate": 1.033124342827127e-05, "loss": 0.2373, "step": 9450 }, { "epoch": 0.5052946439301882, "grad_norm": 2.477053642272949, "learning_rate": 1.031395196332868e-05, "loss": 0.2336, "step": 9460 }, { "epoch": 0.5058287820315943, "grad_norm": 2.2755184173583984, "learning_rate": 1.0296659558708253e-05, "loss": 0.2325, "step": 9470 }, { "epoch": 0.5063629201330004, "grad_norm": 1.8043476343154907, "learning_rate": 1.027936626616724e-05, "loss": 0.2358, "step": 9480 }, { "epoch": 0.5068970582344066, "grad_norm": 1.846389651298523, "learning_rate": 1.0262072137465539e-05, "loss": 0.2173, "step": 9490 }, { "epoch": 0.5074311963358126, "grad_norm": 2.589195489883423, "learning_rate": 1.0244777224365566e-05, "loss": 0.2336, "step": 9500 }, { "epoch": 0.5079653344372187, "grad_norm": 2.947768449783325, "learning_rate": 1.0227481578632068e-05, "loss": 0.2381, "step": 9510 }, { "epoch": 0.5084994725386248, "grad_norm": 1.8937081098556519, "learning_rate": 1.0210185252031999e-05, "loss": 0.2116, "step": 9520 }, { "epoch": 0.5090336106400309, "grad_norm": 1.5582338571548462, "learning_rate": 1.019288829633434e-05, "loss": 0.2273, "step": 9530 }, { "epoch": 0.5095677487414371, "grad_norm": 2.1623001098632812, "learning_rate": 1.0175590763309967e-05, "loss": 0.2259, "step": 9540 }, { "epoch": 0.5101018868428432, "grad_norm": 1.6017760038375854, "learning_rate": 1.015829270473147e-05, "loss": 0.2227, "step": 9550 }, { "epoch": 0.5106360249442493, "grad_norm": 2.507993698120117, "learning_rate": 1.014099417237302e-05, "loss": 0.2356, "step": 9560 }, { "epoch": 0.5111701630456554, "grad_norm": 1.7582252025604248, "learning_rate": 1.0123695218010208e-05, "loss": 0.2284, "step": 9570 }, { "epoch": 0.5117043011470616, "grad_norm": 1.9023057222366333, "learning_rate": 1.0106395893419878e-05, "loss": 0.2303, "step": 9580 }, { "epoch": 0.5122384392484677, "grad_norm": 2.0776162147521973, "learning_rate": 1.0089096250379997e-05, "loss": 0.2203, "step": 9590 }, { "epoch": 0.5127725773498738, "grad_norm": 1.9068827629089355, "learning_rate": 1.0071796340669475e-05, "loss": 0.2175, "step": 9600 }, { "epoch": 0.5133067154512799, "grad_norm": 2.3904199600219727, "learning_rate": 1.005449621606802e-05, "loss": 0.2435, "step": 9610 }, { "epoch": 0.513840853552686, "grad_norm": 1.728954553604126, "learning_rate": 1.0037195928355985e-05, "loss": 0.2361, "step": 9620 }, { "epoch": 0.5143749916540922, "grad_norm": 2.273949146270752, "learning_rate": 1.0019895529314213e-05, "loss": 0.2301, "step": 9630 }, { "epoch": 0.5149091297554983, "grad_norm": 2.06903076171875, "learning_rate": 1.0002595070723881e-05, "loss": 0.2347, "step": 9640 }, { "epoch": 0.5154432678569044, "grad_norm": 1.4454524517059326, "learning_rate": 9.985294604366339e-06, "loss": 0.2239, "step": 9650 }, { "epoch": 0.5159774059583105, "grad_norm": 1.7149932384490967, "learning_rate": 9.967994182022966e-06, "loss": 0.2153, "step": 9660 }, { "epoch": 0.5165115440597167, "grad_norm": 2.115217924118042, "learning_rate": 9.950693855475001e-06, "loss": 0.2345, "step": 9670 }, { "epoch": 0.5170456821611228, "grad_norm": 1.7931208610534668, "learning_rate": 9.933393676503414e-06, "loss": 0.2256, "step": 9680 }, { "epoch": 0.5175798202625289, "grad_norm": 1.913888931274414, "learning_rate": 9.91609369688871e-06, "loss": 0.2481, "step": 9690 }, { "epoch": 0.518113958363935, "grad_norm": 2.5458321571350098, "learning_rate": 9.898793968410809e-06, "loss": 0.2259, "step": 9700 }, { "epoch": 0.5186480964653412, "grad_norm": 1.8786715269088745, "learning_rate": 9.881494542848885e-06, "loss": 0.2324, "step": 9710 }, { "epoch": 0.5191822345667473, "grad_norm": 1.5150341987609863, "learning_rate": 9.864195471981193e-06, "loss": 0.213, "step": 9720 }, { "epoch": 0.5197163726681534, "grad_norm": 2.2940657138824463, "learning_rate": 9.846896807584935e-06, "loss": 0.236, "step": 9730 }, { "epoch": 0.5202505107695594, "grad_norm": 2.4158716201782227, "learning_rate": 9.829598601436098e-06, "loss": 0.2334, "step": 9740 }, { "epoch": 0.5207846488709655, "grad_norm": 1.1637647151947021, "learning_rate": 9.812300905309288e-06, "loss": 0.2199, "step": 9750 }, { "epoch": 0.5213187869723717, "grad_norm": 2.0486693382263184, "learning_rate": 9.795003770977593e-06, "loss": 0.2344, "step": 9760 }, { "epoch": 0.5218529250737778, "grad_norm": 2.5651907920837402, "learning_rate": 9.777707250212413e-06, "loss": 0.2359, "step": 9770 }, { "epoch": 0.5223870631751839, "grad_norm": 2.0353634357452393, "learning_rate": 9.760411394783321e-06, "loss": 0.238, "step": 9780 }, { "epoch": 0.52292120127659, "grad_norm": 2.1958975791931152, "learning_rate": 9.743116256457887e-06, "loss": 0.2382, "step": 9790 }, { "epoch": 0.5234553393779962, "grad_norm": 1.7779262065887451, "learning_rate": 9.725821887001545e-06, "loss": 0.2385, "step": 9800 }, { "epoch": 0.5239894774794023, "grad_norm": 3.246936082839966, "learning_rate": 9.70852833817742e-06, "loss": 0.2412, "step": 9810 }, { "epoch": 0.5245236155808084, "grad_norm": 1.346787691116333, "learning_rate": 9.69123566174618e-06, "loss": 0.2284, "step": 9820 }, { "epoch": 0.5250577536822145, "grad_norm": 1.7659692764282227, "learning_rate": 9.673943909465892e-06, "loss": 0.2023, "step": 9830 }, { "epoch": 0.5255918917836206, "grad_norm": 2.27420973777771, "learning_rate": 9.656653133091848e-06, "loss": 0.1979, "step": 9840 }, { "epoch": 0.5261260298850268, "grad_norm": 1.7758007049560547, "learning_rate": 9.639363384376421e-06, "loss": 0.2179, "step": 9850 }, { "epoch": 0.5266601679864329, "grad_norm": 1.969031572341919, "learning_rate": 9.62207471506891e-06, "loss": 0.2326, "step": 9860 }, { "epoch": 0.527194306087839, "grad_norm": 1.431186556816101, "learning_rate": 9.60478717691538e-06, "loss": 0.2297, "step": 9870 }, { "epoch": 0.5277284441892451, "grad_norm": 1.848404049873352, "learning_rate": 9.587500821658516e-06, "loss": 0.2424, "step": 9880 }, { "epoch": 0.5282625822906513, "grad_norm": 2.488642930984497, "learning_rate": 9.570215701037457e-06, "loss": 0.2358, "step": 9890 }, { "epoch": 0.5287967203920574, "grad_norm": 1.7796128988265991, "learning_rate": 9.55293186678765e-06, "loss": 0.2131, "step": 9900 }, { "epoch": 0.5293308584934635, "grad_norm": 2.225304365158081, "learning_rate": 9.535649370640687e-06, "loss": 0.2275, "step": 9910 }, { "epoch": 0.5298649965948696, "grad_norm": 1.8038920164108276, "learning_rate": 9.518368264324164e-06, "loss": 0.2428, "step": 9920 }, { "epoch": 0.5303991346962758, "grad_norm": 2.61702299118042, "learning_rate": 9.50108859956151e-06, "loss": 0.2193, "step": 9930 }, { "epoch": 0.5309332727976819, "grad_norm": 1.8178497552871704, "learning_rate": 9.483810428071832e-06, "loss": 0.2213, "step": 9940 }, { "epoch": 0.531467410899088, "grad_norm": 2.0999367237091064, "learning_rate": 9.466533801569789e-06, "loss": 0.2098, "step": 9950 }, { "epoch": 0.5320015490004941, "grad_norm": 2.0983521938323975, "learning_rate": 9.449258771765394e-06, "loss": 0.2351, "step": 9960 }, { "epoch": 0.5325356871019002, "grad_norm": 2.087258815765381, "learning_rate": 9.431985390363897e-06, "loss": 0.214, "step": 9970 }, { "epoch": 0.5330698252033064, "grad_norm": 1.7866816520690918, "learning_rate": 9.414713709065602e-06, "loss": 0.2353, "step": 9980 }, { "epoch": 0.5336039633047124, "grad_norm": 1.753161907196045, "learning_rate": 9.397443779565735e-06, "loss": 0.2214, "step": 9990 }, { "epoch": 0.5341381014061185, "grad_norm": 2.8323707580566406, "learning_rate": 9.380175653554268e-06, "loss": 0.2191, "step": 10000 }, { "epoch": 0.5346722395075246, "grad_norm": 1.517661213874817, "learning_rate": 9.362909382715787e-06, "loss": 0.226, "step": 10010 }, { "epoch": 0.5352063776089308, "grad_norm": 2.0053529739379883, "learning_rate": 9.345645018729312e-06, "loss": 0.2182, "step": 10020 }, { "epoch": 0.5357405157103369, "grad_norm": 2.7930524349212646, "learning_rate": 9.328382613268172e-06, "loss": 0.229, "step": 10030 }, { "epoch": 0.536274653811743, "grad_norm": 2.1233069896698, "learning_rate": 9.311122217999816e-06, "loss": 0.216, "step": 10040 }, { "epoch": 0.5368087919131491, "grad_norm": 1.5266999006271362, "learning_rate": 9.293863884585688e-06, "loss": 0.2033, "step": 10050 }, { "epoch": 0.5373429300145552, "grad_norm": 3.0250227451324463, "learning_rate": 9.276607664681056e-06, "loss": 0.2268, "step": 10060 }, { "epoch": 0.5378770681159614, "grad_norm": 1.3228715658187866, "learning_rate": 9.25935360993487e-06, "loss": 0.2195, "step": 10070 }, { "epoch": 0.5384112062173675, "grad_norm": 1.752259373664856, "learning_rate": 9.242101771989587e-06, "loss": 0.2175, "step": 10080 }, { "epoch": 0.5389453443187736, "grad_norm": 2.080658435821533, "learning_rate": 9.224852202481041e-06, "loss": 0.2307, "step": 10090 }, { "epoch": 0.5394794824201797, "grad_norm": 1.9658247232437134, "learning_rate": 9.207604953038266e-06, "loss": 0.2205, "step": 10100 }, { "epoch": 0.5400136205215859, "grad_norm": 2.6497058868408203, "learning_rate": 9.19036007528336e-06, "loss": 0.2288, "step": 10110 }, { "epoch": 0.540547758622992, "grad_norm": 1.3912475109100342, "learning_rate": 9.173117620831317e-06, "loss": 0.2289, "step": 10120 }, { "epoch": 0.5410818967243981, "grad_norm": 2.3989152908325195, "learning_rate": 9.155877641289883e-06, "loss": 0.2329, "step": 10130 }, { "epoch": 0.5416160348258042, "grad_norm": 1.7088629007339478, "learning_rate": 9.13864018825939e-06, "loss": 0.2346, "step": 10140 }, { "epoch": 0.5421501729272103, "grad_norm": 1.3678971529006958, "learning_rate": 9.121405313332618e-06, "loss": 0.2207, "step": 10150 }, { "epoch": 0.5426843110286165, "grad_norm": 2.4927215576171875, "learning_rate": 9.104173068094618e-06, "loss": 0.2281, "step": 10160 }, { "epoch": 0.5432184491300226, "grad_norm": 2.4630587100982666, "learning_rate": 9.086943504122574e-06, "loss": 0.2082, "step": 10170 }, { "epoch": 0.5437525872314287, "grad_norm": 2.0145411491394043, "learning_rate": 9.069716672985652e-06, "loss": 0.2303, "step": 10180 }, { "epoch": 0.5442867253328348, "grad_norm": 1.9427211284637451, "learning_rate": 9.052492626244834e-06, "loss": 0.2361, "step": 10190 }, { "epoch": 0.544820863434241, "grad_norm": 1.8999853134155273, "learning_rate": 9.035271415452764e-06, "loss": 0.2149, "step": 10200 }, { "epoch": 0.5453550015356471, "grad_norm": 1.7333875894546509, "learning_rate": 9.018053092153608e-06, "loss": 0.2251, "step": 10210 }, { "epoch": 0.5458891396370532, "grad_norm": 1.908860683441162, "learning_rate": 9.000837707882874e-06, "loss": 0.2253, "step": 10220 }, { "epoch": 0.5464232777384592, "grad_norm": 1.7657915353775024, "learning_rate": 8.983625314167292e-06, "loss": 0.2096, "step": 10230 }, { "epoch": 0.5469574158398653, "grad_norm": 2.31400465965271, "learning_rate": 8.966415962524623e-06, "loss": 0.2204, "step": 10240 }, { "epoch": 0.5474915539412715, "grad_norm": 2.2185111045837402, "learning_rate": 8.949209704463538e-06, "loss": 0.2209, "step": 10250 }, { "epoch": 0.5480256920426776, "grad_norm": 2.13369083404541, "learning_rate": 8.932006591483436e-06, "loss": 0.2069, "step": 10260 }, { "epoch": 0.5485598301440837, "grad_norm": 1.7283806800842285, "learning_rate": 8.914806675074318e-06, "loss": 0.2192, "step": 10270 }, { "epoch": 0.5490939682454898, "grad_norm": 1.517518401145935, "learning_rate": 8.897610006716599e-06, "loss": 0.2308, "step": 10280 }, { "epoch": 0.549628106346896, "grad_norm": 2.128603935241699, "learning_rate": 8.880416637880981e-06, "loss": 0.22, "step": 10290 }, { "epoch": 0.5501622444483021, "grad_norm": 1.6912510395050049, "learning_rate": 8.863226620028297e-06, "loss": 0.2237, "step": 10300 }, { "epoch": 0.5506963825497082, "grad_norm": 1.7986713647842407, "learning_rate": 8.846040004609339e-06, "loss": 0.2139, "step": 10310 }, { "epoch": 0.5512305206511143, "grad_norm": 2.5891456604003906, "learning_rate": 8.828856843064726e-06, "loss": 0.2048, "step": 10320 }, { "epoch": 0.5517646587525205, "grad_norm": 2.6340396404266357, "learning_rate": 8.811677186824727e-06, "loss": 0.2228, "step": 10330 }, { "epoch": 0.5522987968539266, "grad_norm": 1.8587902784347534, "learning_rate": 8.79450108730913e-06, "loss": 0.2099, "step": 10340 }, { "epoch": 0.5528329349553327, "grad_norm": 1.910110354423523, "learning_rate": 8.777328595927075e-06, "loss": 0.2263, "step": 10350 }, { "epoch": 0.5533670730567388, "grad_norm": 1.973509430885315, "learning_rate": 8.7601597640769e-06, "loss": 0.2235, "step": 10360 }, { "epoch": 0.553901211158145, "grad_norm": 2.127013683319092, "learning_rate": 8.74299464314599e-06, "loss": 0.2311, "step": 10370 }, { "epoch": 0.5544353492595511, "grad_norm": 1.9178886413574219, "learning_rate": 8.725833284510627e-06, "loss": 0.224, "step": 10380 }, { "epoch": 0.5549694873609572, "grad_norm": 3.249615430831909, "learning_rate": 8.708675739535825e-06, "loss": 0.2211, "step": 10390 }, { "epoch": 0.5555036254623633, "grad_norm": 1.7704476118087769, "learning_rate": 8.69152205957519e-06, "loss": 0.2277, "step": 10400 }, { "epoch": 0.5560377635637694, "grad_norm": 1.5284531116485596, "learning_rate": 8.674372295970755e-06, "loss": 0.2294, "step": 10410 }, { "epoch": 0.5565719016651756, "grad_norm": 1.809374451637268, "learning_rate": 8.65722650005284e-06, "loss": 0.2198, "step": 10420 }, { "epoch": 0.5571060397665817, "grad_norm": 1.6468181610107422, "learning_rate": 8.640084723139874e-06, "loss": 0.2182, "step": 10430 }, { "epoch": 0.5576401778679878, "grad_norm": 1.7574279308319092, "learning_rate": 8.622947016538275e-06, "loss": 0.2077, "step": 10440 }, { "epoch": 0.5581743159693939, "grad_norm": 1.9037675857543945, "learning_rate": 8.605813431542262e-06, "loss": 0.2123, "step": 10450 }, { "epoch": 0.5587084540708, "grad_norm": 1.6364914178848267, "learning_rate": 8.588684019433732e-06, "loss": 0.2065, "step": 10460 }, { "epoch": 0.5592425921722062, "grad_norm": 2.4662153720855713, "learning_rate": 8.571558831482079e-06, "loss": 0.2137, "step": 10470 }, { "epoch": 0.5597767302736122, "grad_norm": 2.470003604888916, "learning_rate": 8.554437918944063e-06, "loss": 0.2141, "step": 10480 }, { "epoch": 0.5603108683750183, "grad_norm": 2.231165885925293, "learning_rate": 8.537321333063646e-06, "loss": 0.2262, "step": 10490 }, { "epoch": 0.5608450064764244, "grad_norm": 2.230396270751953, "learning_rate": 8.520209125071838e-06, "loss": 0.2334, "step": 10500 }, { "epoch": 0.5613791445778306, "grad_norm": 2.116292715072632, "learning_rate": 8.503101346186542e-06, "loss": 0.2256, "step": 10510 }, { "epoch": 0.5619132826792367, "grad_norm": 2.714449405670166, "learning_rate": 8.485998047612414e-06, "loss": 0.2133, "step": 10520 }, { "epoch": 0.5624474207806428, "grad_norm": 2.0473074913024902, "learning_rate": 8.468899280540692e-06, "loss": 0.2329, "step": 10530 }, { "epoch": 0.5629815588820489, "grad_norm": 2.815387010574341, "learning_rate": 8.451805096149056e-06, "loss": 0.2223, "step": 10540 }, { "epoch": 0.563515696983455, "grad_norm": 2.387070655822754, "learning_rate": 8.434715545601464e-06, "loss": 0.2065, "step": 10550 }, { "epoch": 0.5640498350848612, "grad_norm": 1.9377726316452026, "learning_rate": 8.417630680048014e-06, "loss": 0.2235, "step": 10560 }, { "epoch": 0.5645839731862673, "grad_norm": 1.4909231662750244, "learning_rate": 8.400550550624769e-06, "loss": 0.2029, "step": 10570 }, { "epoch": 0.5651181112876734, "grad_norm": 2.442850112915039, "learning_rate": 8.383475208453628e-06, "loss": 0.2141, "step": 10580 }, { "epoch": 0.5656522493890795, "grad_norm": 2.2836201190948486, "learning_rate": 8.366404704642154e-06, "loss": 0.2294, "step": 10590 }, { "epoch": 0.5661863874904857, "grad_norm": 1.6102449893951416, "learning_rate": 8.349339090283434e-06, "loss": 0.2122, "step": 10600 }, { "epoch": 0.5667205255918918, "grad_norm": 1.8167054653167725, "learning_rate": 8.332278416455915e-06, "loss": 0.2209, "step": 10610 }, { "epoch": 0.5672546636932979, "grad_norm": 1.6447687149047852, "learning_rate": 8.315222734223264e-06, "loss": 0.2024, "step": 10620 }, { "epoch": 0.567788801794704, "grad_norm": 2.4124929904937744, "learning_rate": 8.298172094634193e-06, "loss": 0.2147, "step": 10630 }, { "epoch": 0.5683229398961102, "grad_norm": 2.1194028854370117, "learning_rate": 8.281126548722342e-06, "loss": 0.1941, "step": 10640 }, { "epoch": 0.5688570779975163, "grad_norm": 1.8054558038711548, "learning_rate": 8.264086147506088e-06, "loss": 0.22, "step": 10650 }, { "epoch": 0.5693912160989224, "grad_norm": 1.5799001455307007, "learning_rate": 8.24705094198842e-06, "loss": 0.2224, "step": 10660 }, { "epoch": 0.5699253542003285, "grad_norm": 2.055412530899048, "learning_rate": 8.23002098315677e-06, "loss": 0.228, "step": 10670 }, { "epoch": 0.5704594923017346, "grad_norm": 2.3682734966278076, "learning_rate": 8.21299632198287e-06, "loss": 0.2318, "step": 10680 }, { "epoch": 0.5709936304031408, "grad_norm": 1.6963480710983276, "learning_rate": 8.19597700942259e-06, "loss": 0.2336, "step": 10690 }, { "epoch": 0.5715277685045469, "grad_norm": 1.7194743156433105, "learning_rate": 8.178963096415803e-06, "loss": 0.2292, "step": 10700 }, { "epoch": 0.572061906605953, "grad_norm": 2.19177508354187, "learning_rate": 8.161954633886205e-06, "loss": 0.2153, "step": 10710 }, { "epoch": 0.5725960447073591, "grad_norm": 2.1365668773651123, "learning_rate": 8.144951672741195e-06, "loss": 0.2311, "step": 10720 }, { "epoch": 0.5731301828087652, "grad_norm": 2.1766796112060547, "learning_rate": 8.127954263871693e-06, "loss": 0.2136, "step": 10730 }, { "epoch": 0.5736643209101713, "grad_norm": 2.1268510818481445, "learning_rate": 8.110962458152003e-06, "loss": 0.217, "step": 10740 }, { "epoch": 0.5741984590115774, "grad_norm": 2.5353665351867676, "learning_rate": 8.093976306439668e-06, "loss": 0.228, "step": 10750 }, { "epoch": 0.5747325971129835, "grad_norm": 1.6905584335327148, "learning_rate": 8.076995859575291e-06, "loss": 0.2254, "step": 10760 }, { "epoch": 0.5752667352143896, "grad_norm": 2.5711758136749268, "learning_rate": 8.06002116838242e-06, "loss": 0.2228, "step": 10770 }, { "epoch": 0.5758008733157958, "grad_norm": 2.1478774547576904, "learning_rate": 8.04305228366736e-06, "loss": 0.2116, "step": 10780 }, { "epoch": 0.5763350114172019, "grad_norm": 2.2196929454803467, "learning_rate": 8.026089256219043e-06, "loss": 0.2198, "step": 10790 }, { "epoch": 0.576869149518608, "grad_norm": 2.1034798622131348, "learning_rate": 8.009132136808874e-06, "loss": 0.2138, "step": 10800 }, { "epoch": 0.5774032876200141, "grad_norm": 2.351726531982422, "learning_rate": 7.992180976190565e-06, "loss": 0.2211, "step": 10810 }, { "epoch": 0.5779374257214203, "grad_norm": 1.776551604270935, "learning_rate": 7.975235825100003e-06, "loss": 0.2245, "step": 10820 }, { "epoch": 0.5784715638228264, "grad_norm": 2.2755231857299805, "learning_rate": 7.958296734255081e-06, "loss": 0.2181, "step": 10830 }, { "epoch": 0.5790057019242325, "grad_norm": 1.932604432106018, "learning_rate": 7.94136375435556e-06, "loss": 0.2283, "step": 10840 }, { "epoch": 0.5795398400256386, "grad_norm": 1.8676772117614746, "learning_rate": 7.924436936082903e-06, "loss": 0.2273, "step": 10850 }, { "epoch": 0.5800739781270448, "grad_norm": 2.1377224922180176, "learning_rate": 7.907516330100131e-06, "loss": 0.2182, "step": 10860 }, { "epoch": 0.5806081162284509, "grad_norm": 3.525808572769165, "learning_rate": 7.890601987051681e-06, "loss": 0.221, "step": 10870 }, { "epoch": 0.581142254329857, "grad_norm": 1.9583849906921387, "learning_rate": 7.873693957563232e-06, "loss": 0.2102, "step": 10880 }, { "epoch": 0.5816763924312631, "grad_norm": 1.6688528060913086, "learning_rate": 7.856792292241577e-06, "loss": 0.2062, "step": 10890 }, { "epoch": 0.5822105305326692, "grad_norm": 2.4098241329193115, "learning_rate": 7.839897041674454e-06, "loss": 0.2029, "step": 10900 }, { "epoch": 0.5827446686340754, "grad_norm": 1.5053832530975342, "learning_rate": 7.823008256430405e-06, "loss": 0.2227, "step": 10910 }, { "epoch": 0.5832788067354815, "grad_norm": 1.784575343132019, "learning_rate": 7.806125987058616e-06, "loss": 0.2145, "step": 10920 }, { "epoch": 0.5838129448368876, "grad_norm": 1.8626587390899658, "learning_rate": 7.789250284088776e-06, "loss": 0.2287, "step": 10930 }, { "epoch": 0.5843470829382937, "grad_norm": 1.95945143699646, "learning_rate": 7.77238119803092e-06, "loss": 0.2193, "step": 10940 }, { "epoch": 0.5848812210396999, "grad_norm": 1.8039461374282837, "learning_rate": 7.755518779375269e-06, "loss": 0.2121, "step": 10950 }, { "epoch": 0.585415359141106, "grad_norm": 1.7685226202011108, "learning_rate": 7.738663078592106e-06, "loss": 0.2261, "step": 10960 }, { "epoch": 0.585949497242512, "grad_norm": 2.037064552307129, "learning_rate": 7.72181414613159e-06, "loss": 0.216, "step": 10970 }, { "epoch": 0.5864836353439181, "grad_norm": 2.045543909072876, "learning_rate": 7.704972032423625e-06, "loss": 0.21, "step": 10980 }, { "epoch": 0.5870177734453242, "grad_norm": 1.7506351470947266, "learning_rate": 7.688136787877713e-06, "loss": 0.2188, "step": 10990 }, { "epoch": 0.5875519115467304, "grad_norm": 2.0683186054229736, "learning_rate": 7.671308462882793e-06, "loss": 0.2152, "step": 11000 }, { "epoch": 0.5880860496481365, "grad_norm": 1.9801336526870728, "learning_rate": 7.654487107807093e-06, "loss": 0.2101, "step": 11010 }, { "epoch": 0.5886201877495426, "grad_norm": 1.9354629516601562, "learning_rate": 7.637672772997976e-06, "loss": 0.2244, "step": 11020 }, { "epoch": 0.5891543258509487, "grad_norm": 1.5917876958847046, "learning_rate": 7.6208655087818e-06, "loss": 0.2314, "step": 11030 }, { "epoch": 0.5896884639523549, "grad_norm": 2.160581350326538, "learning_rate": 7.604065365463755e-06, "loss": 0.2157, "step": 11040 }, { "epoch": 0.590222602053761, "grad_norm": 2.1802144050598145, "learning_rate": 7.587272393327721e-06, "loss": 0.2189, "step": 11050 }, { "epoch": 0.5907567401551671, "grad_norm": 2.4242048263549805, "learning_rate": 7.570486642636109e-06, "loss": 0.2233, "step": 11060 }, { "epoch": 0.5912908782565732, "grad_norm": 2.1621463298797607, "learning_rate": 7.55370816362972e-06, "loss": 0.2131, "step": 11070 }, { "epoch": 0.5918250163579793, "grad_norm": 1.783928632736206, "learning_rate": 7.536937006527595e-06, "loss": 0.2297, "step": 11080 }, { "epoch": 0.5923591544593855, "grad_norm": 1.5802172422409058, "learning_rate": 7.520173221526852e-06, "loss": 0.2053, "step": 11090 }, { "epoch": 0.5928932925607916, "grad_norm": 1.973418116569519, "learning_rate": 7.50341685880254e-06, "loss": 0.2289, "step": 11100 }, { "epoch": 0.5934274306621977, "grad_norm": 2.4228341579437256, "learning_rate": 7.486667968507508e-06, "loss": 0.2048, "step": 11110 }, { "epoch": 0.5939615687636038, "grad_norm": 2.010727643966675, "learning_rate": 7.469926600772224e-06, "loss": 0.22, "step": 11120 }, { "epoch": 0.59449570686501, "grad_norm": 2.1708192825317383, "learning_rate": 7.453192805704654e-06, "loss": 0.2055, "step": 11130 }, { "epoch": 0.5950298449664161, "grad_norm": 1.8470309972763062, "learning_rate": 7.436466633390085e-06, "loss": 0.1948, "step": 11140 }, { "epoch": 0.5955639830678222, "grad_norm": 2.9264347553253174, "learning_rate": 7.419748133891e-06, "loss": 0.2324, "step": 11150 }, { "epoch": 0.5960981211692283, "grad_norm": 1.714800238609314, "learning_rate": 7.403037357246909e-06, "loss": 0.2122, "step": 11160 }, { "epoch": 0.5966322592706345, "grad_norm": 1.86958909034729, "learning_rate": 7.386334353474216e-06, "loss": 0.2145, "step": 11170 }, { "epoch": 0.5971663973720406, "grad_norm": 1.6828930377960205, "learning_rate": 7.3696391725660475e-06, "loss": 0.2179, "step": 11180 }, { "epoch": 0.5977005354734467, "grad_norm": 1.3941845893859863, "learning_rate": 7.352951864492128e-06, "loss": 0.2199, "step": 11190 }, { "epoch": 0.5982346735748528, "grad_norm": 2.001188039779663, "learning_rate": 7.336272479198614e-06, "loss": 0.2041, "step": 11200 }, { "epoch": 0.598768811676259, "grad_norm": 2.2270281314849854, "learning_rate": 7.31960106660794e-06, "loss": 0.2106, "step": 11210 }, { "epoch": 0.599302949777665, "grad_norm": 2.3216140270233154, "learning_rate": 7.302937676618691e-06, "loss": 0.2183, "step": 11220 }, { "epoch": 0.5998370878790711, "grad_norm": 2.280552864074707, "learning_rate": 7.28628235910543e-06, "loss": 0.2139, "step": 11230 }, { "epoch": 0.6003712259804772, "grad_norm": 2.2088541984558105, "learning_rate": 7.2696351639185645e-06, "loss": 0.2056, "step": 11240 }, { "epoch": 0.6009053640818833, "grad_norm": 1.7380043268203735, "learning_rate": 7.252996140884195e-06, "loss": 0.222, "step": 11250 }, { "epoch": 0.6014395021832895, "grad_norm": 2.2230114936828613, "learning_rate": 7.2363653398039465e-06, "loss": 0.2207, "step": 11260 }, { "epoch": 0.6019736402846956, "grad_norm": 5.5054216384887695, "learning_rate": 7.219742810454855e-06, "loss": 0.2205, "step": 11270 }, { "epoch": 0.6025077783861017, "grad_norm": 1.888136863708496, "learning_rate": 7.20312860258918e-06, "loss": 0.2157, "step": 11280 }, { "epoch": 0.6030419164875078, "grad_norm": 1.5002096891403198, "learning_rate": 7.186522765934292e-06, "loss": 0.2105, "step": 11290 }, { "epoch": 0.603576054588914, "grad_norm": 1.9556044340133667, "learning_rate": 7.169925350192491e-06, "loss": 0.2229, "step": 11300 }, { "epoch": 0.6041101926903201, "grad_norm": 1.9226624965667725, "learning_rate": 7.153336405040884e-06, "loss": 0.213, "step": 11310 }, { "epoch": 0.6046443307917262, "grad_norm": 1.8958394527435303, "learning_rate": 7.136755980131218e-06, "loss": 0.2201, "step": 11320 }, { "epoch": 0.6051784688931323, "grad_norm": 1.6764745712280273, "learning_rate": 7.120184125089737e-06, "loss": 0.2, "step": 11330 }, { "epoch": 0.6057126069945384, "grad_norm": 2.1489813327789307, "learning_rate": 7.103620889517042e-06, "loss": 0.2091, "step": 11340 }, { "epoch": 0.6062467450959446, "grad_norm": 1.993537425994873, "learning_rate": 7.087066322987929e-06, "loss": 0.2138, "step": 11350 }, { "epoch": 0.6067808831973507, "grad_norm": 1.5977764129638672, "learning_rate": 7.0705204750512524e-06, "loss": 0.2162, "step": 11360 }, { "epoch": 0.6073150212987568, "grad_norm": 2.252270221710205, "learning_rate": 7.053983395229765e-06, "loss": 0.2129, "step": 11370 }, { "epoch": 0.6078491594001629, "grad_norm": 2.5477890968322754, "learning_rate": 7.037455133019984e-06, "loss": 0.2054, "step": 11380 }, { "epoch": 0.608383297501569, "grad_norm": 1.835484504699707, "learning_rate": 7.020935737892028e-06, "loss": 0.2148, "step": 11390 }, { "epoch": 0.6089174356029752, "grad_norm": 1.9355418682098389, "learning_rate": 7.0044252592894765e-06, "loss": 0.2166, "step": 11400 }, { "epoch": 0.6094515737043813, "grad_norm": 2.474160671234131, "learning_rate": 6.987923746629229e-06, "loss": 0.2351, "step": 11410 }, { "epoch": 0.6099857118057874, "grad_norm": 1.5369596481323242, "learning_rate": 6.971431249301337e-06, "loss": 0.2231, "step": 11420 }, { "epoch": 0.6105198499071935, "grad_norm": 1.9047356843948364, "learning_rate": 6.95494781666888e-06, "loss": 0.2051, "step": 11430 }, { "epoch": 0.6110539880085997, "grad_norm": 2.0702626705169678, "learning_rate": 6.9384734980678015e-06, "loss": 0.2197, "step": 11440 }, { "epoch": 0.6115881261100058, "grad_norm": 2.3336985111236572, "learning_rate": 6.922008342806761e-06, "loss": 0.2148, "step": 11450 }, { "epoch": 0.6121222642114119, "grad_norm": 1.8266174793243408, "learning_rate": 6.9055524001670016e-06, "loss": 0.2231, "step": 11460 }, { "epoch": 0.6126564023128179, "grad_norm": 1.6749603748321533, "learning_rate": 6.889105719402183e-06, "loss": 0.1985, "step": 11470 }, { "epoch": 0.613190540414224, "grad_norm": 2.4108126163482666, "learning_rate": 6.872668349738252e-06, "loss": 0.2101, "step": 11480 }, { "epoch": 0.6137246785156302, "grad_norm": 2.194828510284424, "learning_rate": 6.856240340373279e-06, "loss": 0.2148, "step": 11490 }, { "epoch": 0.6142588166170363, "grad_norm": 2.4629597663879395, "learning_rate": 6.839821740477328e-06, "loss": 0.2159, "step": 11500 }, { "epoch": 0.6147929547184424, "grad_norm": 1.8051860332489014, "learning_rate": 6.823412599192287e-06, "loss": 0.2279, "step": 11510 }, { "epoch": 0.6153270928198485, "grad_norm": 1.770883321762085, "learning_rate": 6.807012965631743e-06, "loss": 0.1953, "step": 11520 }, { "epoch": 0.6158612309212547, "grad_norm": 1.9584269523620605, "learning_rate": 6.790622888880829e-06, "loss": 0.2142, "step": 11530 }, { "epoch": 0.6163953690226608, "grad_norm": 1.7800967693328857, "learning_rate": 6.774242417996059e-06, "loss": 0.2217, "step": 11540 }, { "epoch": 0.6169295071240669, "grad_norm": 2.1063544750213623, "learning_rate": 6.757871602005217e-06, "loss": 0.2069, "step": 11550 }, { "epoch": 0.617463645225473, "grad_norm": 1.9224251508712769, "learning_rate": 6.741510489907168e-06, "loss": 0.2179, "step": 11560 }, { "epoch": 0.6179977833268792, "grad_norm": 2.2406389713287354, "learning_rate": 6.725159130671746e-06, "loss": 0.2089, "step": 11570 }, { "epoch": 0.6185319214282853, "grad_norm": 1.7932913303375244, "learning_rate": 6.70881757323959e-06, "loss": 0.2161, "step": 11580 }, { "epoch": 0.6190660595296914, "grad_norm": 2.0157363414764404, "learning_rate": 6.692485866521999e-06, "loss": 0.2067, "step": 11590 }, { "epoch": 0.6196001976310975, "grad_norm": 2.3990602493286133, "learning_rate": 6.676164059400798e-06, "loss": 0.2064, "step": 11600 }, { "epoch": 0.6201343357325036, "grad_norm": 1.5343892574310303, "learning_rate": 6.6598522007281695e-06, "loss": 0.2175, "step": 11610 }, { "epoch": 0.6206684738339098, "grad_norm": 2.0749294757843018, "learning_rate": 6.64355033932653e-06, "loss": 0.2107, "step": 11620 }, { "epoch": 0.6212026119353159, "grad_norm": 1.849394679069519, "learning_rate": 6.627258523988364e-06, "loss": 0.2145, "step": 11630 }, { "epoch": 0.621736750036722, "grad_norm": 1.8927348852157593, "learning_rate": 6.610976803476097e-06, "loss": 0.205, "step": 11640 }, { "epoch": 0.6222708881381281, "grad_norm": 2.1021714210510254, "learning_rate": 6.594705226521931e-06, "loss": 0.2232, "step": 11650 }, { "epoch": 0.6228050262395343, "grad_norm": 1.3157902956008911, "learning_rate": 6.578443841827713e-06, "loss": 0.2083, "step": 11660 }, { "epoch": 0.6233391643409404, "grad_norm": 2.3738174438476562, "learning_rate": 6.562192698064793e-06, "loss": 0.1994, "step": 11670 }, { "epoch": 0.6238733024423465, "grad_norm": 1.9522268772125244, "learning_rate": 6.545951843873844e-06, "loss": 0.2167, "step": 11680 }, { "epoch": 0.6244074405437526, "grad_norm": 1.3353922367095947, "learning_rate": 6.529721327864762e-06, "loss": 0.2032, "step": 11690 }, { "epoch": 0.6249415786451588, "grad_norm": 1.4787133932113647, "learning_rate": 6.513501198616498e-06, "loss": 0.1946, "step": 11700 }, { "epoch": 0.6254757167465648, "grad_norm": 2.0699350833892822, "learning_rate": 6.4972915046769124e-06, "loss": 0.2236, "step": 11710 }, { "epoch": 0.6260098548479709, "grad_norm": 1.957674503326416, "learning_rate": 6.48109229456263e-06, "loss": 0.2005, "step": 11720 }, { "epoch": 0.626543992949377, "grad_norm": 2.353870391845703, "learning_rate": 6.464903616758896e-06, "loss": 0.2119, "step": 11730 }, { "epoch": 0.6270781310507831, "grad_norm": 1.5013461112976074, "learning_rate": 6.448725519719439e-06, "loss": 0.2006, "step": 11740 }, { "epoch": 0.6276122691521893, "grad_norm": 1.5558239221572876, "learning_rate": 6.432558051866311e-06, "loss": 0.2138, "step": 11750 }, { "epoch": 0.6281464072535954, "grad_norm": 2.1621413230895996, "learning_rate": 6.416401261589753e-06, "loss": 0.211, "step": 11760 }, { "epoch": 0.6286805453550015, "grad_norm": 2.213204860687256, "learning_rate": 6.400255197248049e-06, "loss": 0.2068, "step": 11770 }, { "epoch": 0.6292146834564076, "grad_norm": 1.9236663579940796, "learning_rate": 6.384119907167376e-06, "loss": 0.1921, "step": 11780 }, { "epoch": 0.6297488215578138, "grad_norm": 1.8709219694137573, "learning_rate": 6.367995439641671e-06, "loss": 0.2134, "step": 11790 }, { "epoch": 0.6302829596592199, "grad_norm": 2.056267738342285, "learning_rate": 6.351881842932462e-06, "loss": 0.2067, "step": 11800 }, { "epoch": 0.630817097760626, "grad_norm": 2.080949544906616, "learning_rate": 6.3357791652687586e-06, "loss": 0.2136, "step": 11810 }, { "epoch": 0.6313512358620321, "grad_norm": 1.9530695676803589, "learning_rate": 6.3196874548468765e-06, "loss": 0.1943, "step": 11820 }, { "epoch": 0.6318853739634382, "grad_norm": 1.4830729961395264, "learning_rate": 6.303606759830312e-06, "loss": 0.2226, "step": 11830 }, { "epoch": 0.6324195120648444, "grad_norm": 2.4522206783294678, "learning_rate": 6.28753712834959e-06, "loss": 0.2134, "step": 11840 }, { "epoch": 0.6329536501662505, "grad_norm": 1.911330223083496, "learning_rate": 6.27147860850212e-06, "loss": 0.2054, "step": 11850 }, { "epoch": 0.6334877882676566, "grad_norm": 2.157397747039795, "learning_rate": 6.255431248352057e-06, "loss": 0.2118, "step": 11860 }, { "epoch": 0.6340219263690627, "grad_norm": 2.0032198429107666, "learning_rate": 6.239395095930148e-06, "loss": 0.2171, "step": 11870 }, { "epoch": 0.6345560644704689, "grad_norm": 1.7035309076309204, "learning_rate": 6.2233701992336045e-06, "loss": 0.2133, "step": 11880 }, { "epoch": 0.635090202571875, "grad_norm": 1.8360484838485718, "learning_rate": 6.207356606225938e-06, "loss": 0.2087, "step": 11890 }, { "epoch": 0.6356243406732811, "grad_norm": 2.014369010925293, "learning_rate": 6.191354364836839e-06, "loss": 0.2073, "step": 11900 }, { "epoch": 0.6361584787746872, "grad_norm": 2.3682775497436523, "learning_rate": 6.17536352296201e-06, "loss": 0.2152, "step": 11910 }, { "epoch": 0.6366926168760934, "grad_norm": 1.6390857696533203, "learning_rate": 6.159384128463037e-06, "loss": 0.22, "step": 11920 }, { "epoch": 0.6372267549774995, "grad_norm": 1.7781578302383423, "learning_rate": 6.143416229167254e-06, "loss": 0.2174, "step": 11930 }, { "epoch": 0.6377608930789056, "grad_norm": 1.9814543724060059, "learning_rate": 6.127459872867574e-06, "loss": 0.2112, "step": 11940 }, { "epoch": 0.6382950311803117, "grad_norm": 1.8278793096542358, "learning_rate": 6.111515107322371e-06, "loss": 0.1889, "step": 11950 }, { "epoch": 0.6388291692817177, "grad_norm": 2.6934030055999756, "learning_rate": 6.095581980255323e-06, "loss": 0.2161, "step": 11960 }, { "epoch": 0.6393633073831239, "grad_norm": 1.7933999300003052, "learning_rate": 6.079660539355276e-06, "loss": 0.2074, "step": 11970 }, { "epoch": 0.63989744548453, "grad_norm": 1.7992496490478516, "learning_rate": 6.063750832276096e-06, "loss": 0.2102, "step": 11980 }, { "epoch": 0.6404315835859361, "grad_norm": 1.898014783859253, "learning_rate": 6.047852906636531e-06, "loss": 0.1964, "step": 11990 }, { "epoch": 0.6409657216873422, "grad_norm": 2.1823856830596924, "learning_rate": 6.031966810020064e-06, "loss": 0.2237, "step": 12000 }, { "epoch": 0.6414998597887484, "grad_norm": 1.1829626560211182, "learning_rate": 6.016092589974776e-06, "loss": 0.2125, "step": 12010 }, { "epoch": 0.6420339978901545, "grad_norm": 2.4170658588409424, "learning_rate": 6.000230294013201e-06, "loss": 0.204, "step": 12020 }, { "epoch": 0.6425681359915606, "grad_norm": 2.594216823577881, "learning_rate": 5.984379969612177e-06, "loss": 0.2195, "step": 12030 }, { "epoch": 0.6431022740929667, "grad_norm": 1.7258776426315308, "learning_rate": 5.968541664212715e-06, "loss": 0.1992, "step": 12040 }, { "epoch": 0.6436364121943728, "grad_norm": 2.7268295288085938, "learning_rate": 5.952715425219855e-06, "loss": 0.2265, "step": 12050 }, { "epoch": 0.644170550295779, "grad_norm": 1.9044744968414307, "learning_rate": 5.936901300002516e-06, "loss": 0.2092, "step": 12060 }, { "epoch": 0.6447046883971851, "grad_norm": 1.83115816116333, "learning_rate": 5.921099335893366e-06, "loss": 0.2141, "step": 12070 }, { "epoch": 0.6452388264985912, "grad_norm": 1.655148983001709, "learning_rate": 5.905309580188669e-06, "loss": 0.187, "step": 12080 }, { "epoch": 0.6457729645999973, "grad_norm": 2.0083398818969727, "learning_rate": 5.889532080148151e-06, "loss": 0.1943, "step": 12090 }, { "epoch": 0.6463071027014035, "grad_norm": 2.236048936843872, "learning_rate": 5.8737668829948516e-06, "loss": 0.2076, "step": 12100 }, { "epoch": 0.6468412408028096, "grad_norm": 2.1637110710144043, "learning_rate": 5.858014035914992e-06, "loss": 0.221, "step": 12110 }, { "epoch": 0.6473753789042157, "grad_norm": 2.1592493057250977, "learning_rate": 5.842273586057832e-06, "loss": 0.2, "step": 12120 }, { "epoch": 0.6479095170056218, "grad_norm": 1.7240618467330933, "learning_rate": 5.826545580535514e-06, "loss": 0.2001, "step": 12130 }, { "epoch": 0.648443655107028, "grad_norm": 2.4007160663604736, "learning_rate": 5.810830066422952e-06, "loss": 0.202, "step": 12140 }, { "epoch": 0.6489777932084341, "grad_norm": 2.3673088550567627, "learning_rate": 5.795127090757645e-06, "loss": 0.2079, "step": 12150 }, { "epoch": 0.6495119313098402, "grad_norm": 2.8149428367614746, "learning_rate": 5.779436700539594e-06, "loss": 0.2203, "step": 12160 }, { "epoch": 0.6500460694112463, "grad_norm": 1.9289332628250122, "learning_rate": 5.76375894273111e-06, "loss": 0.2045, "step": 12170 }, { "epoch": 0.6505802075126524, "grad_norm": 2.1372241973876953, "learning_rate": 5.748093864256703e-06, "loss": 0.1939, "step": 12180 }, { "epoch": 0.6511143456140586, "grad_norm": 2.091510772705078, "learning_rate": 5.732441512002928e-06, "loss": 0.1993, "step": 12190 }, { "epoch": 0.6516484837154647, "grad_norm": 1.4995598793029785, "learning_rate": 5.716801932818261e-06, "loss": 0.223, "step": 12200 }, { "epoch": 0.6521826218168707, "grad_norm": 1.81698477268219, "learning_rate": 5.701175173512934e-06, "loss": 0.2121, "step": 12210 }, { "epoch": 0.6527167599182768, "grad_norm": 1.5562748908996582, "learning_rate": 5.685561280858813e-06, "loss": 0.2036, "step": 12220 }, { "epoch": 0.653250898019683, "grad_norm": 2.8458197116851807, "learning_rate": 5.6699603015892535e-06, "loss": 0.2052, "step": 12230 }, { "epoch": 0.6537850361210891, "grad_norm": 1.7876349687576294, "learning_rate": 5.654372282398966e-06, "loss": 0.2184, "step": 12240 }, { "epoch": 0.6543191742224952, "grad_norm": 2.1311659812927246, "learning_rate": 5.638797269943861e-06, "loss": 0.1951, "step": 12250 }, { "epoch": 0.6548533123239013, "grad_norm": 1.9263982772827148, "learning_rate": 5.623235310840924e-06, "loss": 0.204, "step": 12260 }, { "epoch": 0.6553874504253074, "grad_norm": 1.6871339082717896, "learning_rate": 5.607686451668073e-06, "loss": 0.2132, "step": 12270 }, { "epoch": 0.6559215885267136, "grad_norm": 1.7853329181671143, "learning_rate": 5.592150738964013e-06, "loss": 0.2068, "step": 12280 }, { "epoch": 0.6564557266281197, "grad_norm": 2.168503761291504, "learning_rate": 5.576628219228098e-06, "loss": 0.1997, "step": 12290 }, { "epoch": 0.6569898647295258, "grad_norm": 2.462963104248047, "learning_rate": 5.561118938920207e-06, "loss": 0.2174, "step": 12300 }, { "epoch": 0.6575240028309319, "grad_norm": 1.6161651611328125, "learning_rate": 5.54562294446058e-06, "loss": 0.2215, "step": 12310 }, { "epoch": 0.658058140932338, "grad_norm": 2.1460437774658203, "learning_rate": 5.5301402822297e-06, "loss": 0.2136, "step": 12320 }, { "epoch": 0.6585922790337442, "grad_norm": 1.5240051746368408, "learning_rate": 5.514670998568135e-06, "loss": 0.2156, "step": 12330 }, { "epoch": 0.6591264171351503, "grad_norm": 1.5435818433761597, "learning_rate": 5.499215139776425e-06, "loss": 0.2139, "step": 12340 }, { "epoch": 0.6596605552365564, "grad_norm": 2.042482852935791, "learning_rate": 5.483772752114917e-06, "loss": 0.2146, "step": 12350 }, { "epoch": 0.6601946933379625, "grad_norm": 1.5750585794448853, "learning_rate": 5.468343881803644e-06, "loss": 0.1996, "step": 12360 }, { "epoch": 0.6607288314393687, "grad_norm": 1.9457638263702393, "learning_rate": 5.452928575022175e-06, "loss": 0.213, "step": 12370 }, { "epoch": 0.6612629695407748, "grad_norm": 2.141218900680542, "learning_rate": 5.437526877909489e-06, "loss": 0.2236, "step": 12380 }, { "epoch": 0.6617971076421809, "grad_norm": 2.3358991146087646, "learning_rate": 5.422138836563826e-06, "loss": 0.1959, "step": 12390 }, { "epoch": 0.662331245743587, "grad_norm": 1.7100086212158203, "learning_rate": 5.4067644970425515e-06, "loss": 0.1989, "step": 12400 }, { "epoch": 0.6628653838449932, "grad_norm": 2.0736658573150635, "learning_rate": 5.39140390536203e-06, "loss": 0.2145, "step": 12410 }, { "epoch": 0.6633995219463993, "grad_norm": 2.0246505737304688, "learning_rate": 5.37605710749747e-06, "loss": 0.2107, "step": 12420 }, { "epoch": 0.6639336600478054, "grad_norm": 1.893479347229004, "learning_rate": 5.360724149382792e-06, "loss": 0.1987, "step": 12430 }, { "epoch": 0.6644677981492115, "grad_norm": 1.6024492979049683, "learning_rate": 5.345405076910506e-06, "loss": 0.2127, "step": 12440 }, { "epoch": 0.6650019362506175, "grad_norm": 1.6761924028396606, "learning_rate": 5.330099935931545e-06, "loss": 0.2072, "step": 12450 }, { "epoch": 0.6655360743520237, "grad_norm": 1.9555671215057373, "learning_rate": 5.3148087722551555e-06, "loss": 0.2101, "step": 12460 }, { "epoch": 0.6660702124534298, "grad_norm": 1.9206242561340332, "learning_rate": 5.299531631648741e-06, "loss": 0.2141, "step": 12470 }, { "epoch": 0.6666043505548359, "grad_norm": 1.5600340366363525, "learning_rate": 5.284268559837744e-06, "loss": 0.2078, "step": 12480 }, { "epoch": 0.667138488656242, "grad_norm": 2.2750704288482666, "learning_rate": 5.269019602505494e-06, "loss": 0.1962, "step": 12490 }, { "epoch": 0.6676726267576482, "grad_norm": 1.577258825302124, "learning_rate": 5.253784805293058e-06, "loss": 0.2085, "step": 12500 }, { "epoch": 0.6682067648590543, "grad_norm": 2.2018072605133057, "learning_rate": 5.238564213799149e-06, "loss": 0.1941, "step": 12510 }, { "epoch": 0.6687409029604604, "grad_norm": 2.3488030433654785, "learning_rate": 5.223357873579944e-06, "loss": 0.2034, "step": 12520 }, { "epoch": 0.6692750410618665, "grad_norm": 1.745232343673706, "learning_rate": 5.2081658301489705e-06, "loss": 0.2108, "step": 12530 }, { "epoch": 0.6698091791632726, "grad_norm": 1.7250596284866333, "learning_rate": 5.192988128976957e-06, "loss": 0.1963, "step": 12540 }, { "epoch": 0.6703433172646788, "grad_norm": 1.650227427482605, "learning_rate": 5.1778248154917184e-06, "loss": 0.1947, "step": 12550 }, { "epoch": 0.6708774553660849, "grad_norm": 1.5390517711639404, "learning_rate": 5.162675935077998e-06, "loss": 0.1971, "step": 12560 }, { "epoch": 0.671411593467491, "grad_norm": 1.5685296058654785, "learning_rate": 5.147541533077338e-06, "loss": 0.21, "step": 12570 }, { "epoch": 0.6719457315688971, "grad_norm": 1.4924248456954956, "learning_rate": 5.132421654787951e-06, "loss": 0.1988, "step": 12580 }, { "epoch": 0.6724798696703033, "grad_norm": 2.9624857902526855, "learning_rate": 5.117316345464579e-06, "loss": 0.2168, "step": 12590 }, { "epoch": 0.6730140077717094, "grad_norm": 2.0990185737609863, "learning_rate": 5.102225650318353e-06, "loss": 0.227, "step": 12600 }, { "epoch": 0.6735481458731155, "grad_norm": 2.0988457202911377, "learning_rate": 5.087149614516668e-06, "loss": 0.1993, "step": 12610 }, { "epoch": 0.6740822839745216, "grad_norm": 1.97731614112854, "learning_rate": 5.072088283183039e-06, "loss": 0.1971, "step": 12620 }, { "epoch": 0.6746164220759278, "grad_norm": 1.6458598375320435, "learning_rate": 5.057041701396972e-06, "loss": 0.2257, "step": 12630 }, { "epoch": 0.6751505601773339, "grad_norm": 2.248922824859619, "learning_rate": 5.0420099141938235e-06, "loss": 0.2232, "step": 12640 }, { "epoch": 0.67568469827874, "grad_norm": 1.6646889448165894, "learning_rate": 5.0269929665646775e-06, "loss": 0.2074, "step": 12650 }, { "epoch": 0.6762188363801461, "grad_norm": 1.9893475770950317, "learning_rate": 5.011990903456194e-06, "loss": 0.2026, "step": 12660 }, { "epoch": 0.6767529744815522, "grad_norm": 2.3458757400512695, "learning_rate": 4.997003769770483e-06, "loss": 0.1947, "step": 12670 }, { "epoch": 0.6772871125829584, "grad_norm": 2.165609836578369, "learning_rate": 4.982031610364969e-06, "loss": 0.2052, "step": 12680 }, { "epoch": 0.6778212506843645, "grad_norm": 1.8230030536651611, "learning_rate": 4.967074470052269e-06, "loss": 0.2026, "step": 12690 }, { "epoch": 0.6783553887857705, "grad_norm": 1.760373830795288, "learning_rate": 4.952132393600031e-06, "loss": 0.187, "step": 12700 }, { "epoch": 0.6788895268871766, "grad_norm": 1.9163260459899902, "learning_rate": 4.937205425730823e-06, "loss": 0.1883, "step": 12710 }, { "epoch": 0.6794236649885828, "grad_norm": 1.524936556816101, "learning_rate": 4.922293611122001e-06, "loss": 0.1955, "step": 12720 }, { "epoch": 0.6799578030899889, "grad_norm": 1.6496570110321045, "learning_rate": 4.907396994405548e-06, "loss": 0.2039, "step": 12730 }, { "epoch": 0.680491941191395, "grad_norm": 2.1172661781311035, "learning_rate": 4.8925156201679654e-06, "loss": 0.2045, "step": 12740 }, { "epoch": 0.6810260792928011, "grad_norm": 1.7260173559188843, "learning_rate": 4.8776495329501476e-06, "loss": 0.1999, "step": 12750 }, { "epoch": 0.6815602173942072, "grad_norm": 1.7151517868041992, "learning_rate": 4.862798777247215e-06, "loss": 0.2107, "step": 12760 }, { "epoch": 0.6820943554956134, "grad_norm": 1.8780659437179565, "learning_rate": 4.847963397508409e-06, "loss": 0.2158, "step": 12770 }, { "epoch": 0.6826284935970195, "grad_norm": 2.6927716732025146, "learning_rate": 4.833143438136941e-06, "loss": 0.2036, "step": 12780 }, { "epoch": 0.6831626316984256, "grad_norm": 1.8529044389724731, "learning_rate": 4.818338943489886e-06, "loss": 0.2121, "step": 12790 }, { "epoch": 0.6836967697998317, "grad_norm": 2.4254133701324463, "learning_rate": 4.803549957878015e-06, "loss": 0.2022, "step": 12800 }, { "epoch": 0.6842309079012379, "grad_norm": 2.359133005142212, "learning_rate": 4.7887765255656885e-06, "loss": 0.2079, "step": 12810 }, { "epoch": 0.684765046002644, "grad_norm": 2.5197179317474365, "learning_rate": 4.774018690770705e-06, "loss": 0.2038, "step": 12820 }, { "epoch": 0.6852991841040501, "grad_norm": 2.0649399757385254, "learning_rate": 4.759276497664196e-06, "loss": 0.2082, "step": 12830 }, { "epoch": 0.6858333222054562, "grad_norm": 2.3130805492401123, "learning_rate": 4.744549990370464e-06, "loss": 0.213, "step": 12840 }, { "epoch": 0.6863674603068624, "grad_norm": 2.29787540435791, "learning_rate": 4.7298392129668555e-06, "loss": 0.2117, "step": 12850 }, { "epoch": 0.6869015984082685, "grad_norm": 1.6488378047943115, "learning_rate": 4.715144209483657e-06, "loss": 0.1928, "step": 12860 }, { "epoch": 0.6874357365096746, "grad_norm": 1.4000784158706665, "learning_rate": 4.700465023903927e-06, "loss": 0.2015, "step": 12870 }, { "epoch": 0.6879698746110807, "grad_norm": 2.2760636806488037, "learning_rate": 4.685801700163384e-06, "loss": 0.2099, "step": 12880 }, { "epoch": 0.6885040127124868, "grad_norm": 1.930611491203308, "learning_rate": 4.671154282150276e-06, "loss": 0.2006, "step": 12890 }, { "epoch": 0.689038150813893, "grad_norm": 1.4668408632278442, "learning_rate": 4.656522813705236e-06, "loss": 0.2096, "step": 12900 }, { "epoch": 0.6895722889152991, "grad_norm": 2.007418155670166, "learning_rate": 4.641907338621166e-06, "loss": 0.1941, "step": 12910 }, { "epoch": 0.6901064270167052, "grad_norm": 2.085430145263672, "learning_rate": 4.62730790064309e-06, "loss": 0.2118, "step": 12920 }, { "epoch": 0.6906405651181113, "grad_norm": 1.8515369892120361, "learning_rate": 4.6127245434680455e-06, "loss": 0.191, "step": 12930 }, { "epoch": 0.6911747032195175, "grad_norm": 2.514357566833496, "learning_rate": 4.598157310744929e-06, "loss": 0.2006, "step": 12940 }, { "epoch": 0.6917088413209235, "grad_norm": 1.4674506187438965, "learning_rate": 4.583606246074376e-06, "loss": 0.1986, "step": 12950 }, { "epoch": 0.6922429794223296, "grad_norm": 2.0963077545166016, "learning_rate": 4.569071393008632e-06, "loss": 0.2079, "step": 12960 }, { "epoch": 0.6927771175237357, "grad_norm": 1.6925208568572998, "learning_rate": 4.554552795051421e-06, "loss": 0.1794, "step": 12970 }, { "epoch": 0.6933112556251418, "grad_norm": 2.3441359996795654, "learning_rate": 4.540050495657813e-06, "loss": 0.1996, "step": 12980 }, { "epoch": 0.693845393726548, "grad_norm": 1.8115828037261963, "learning_rate": 4.525564538234093e-06, "loss": 0.1909, "step": 12990 }, { "epoch": 0.6943795318279541, "grad_norm": 1.978729486465454, "learning_rate": 4.5110949661376425e-06, "loss": 0.1836, "step": 13000 }, { "epoch": 0.6949136699293602, "grad_norm": 2.4729833602905273, "learning_rate": 4.496641822676792e-06, "loss": 0.2037, "step": 13010 }, { "epoch": 0.6954478080307663, "grad_norm": 2.0595285892486572, "learning_rate": 4.482205151110698e-06, "loss": 0.1992, "step": 13020 }, { "epoch": 0.6959819461321725, "grad_norm": 1.6468031406402588, "learning_rate": 4.467784994649228e-06, "loss": 0.2035, "step": 13030 }, { "epoch": 0.6965160842335786, "grad_norm": 1.9793294668197632, "learning_rate": 4.453381396452807e-06, "loss": 0.1929, "step": 13040 }, { "epoch": 0.6970502223349847, "grad_norm": 1.8945449590682983, "learning_rate": 4.438994399632305e-06, "loss": 0.2141, "step": 13050 }, { "epoch": 0.6975843604363908, "grad_norm": 1.517954707145691, "learning_rate": 4.424624047248897e-06, "loss": 0.1927, "step": 13060 }, { "epoch": 0.698118498537797, "grad_norm": 2.188405752182007, "learning_rate": 4.410270382313957e-06, "loss": 0.1997, "step": 13070 }, { "epoch": 0.6986526366392031, "grad_norm": 2.3569412231445312, "learning_rate": 4.3959334477888905e-06, "loss": 0.1954, "step": 13080 }, { "epoch": 0.6991867747406092, "grad_norm": 2.147780656814575, "learning_rate": 4.381613286585036e-06, "loss": 0.2085, "step": 13090 }, { "epoch": 0.6997209128420153, "grad_norm": 2.80957293510437, "learning_rate": 4.367309941563539e-06, "loss": 0.2181, "step": 13100 }, { "epoch": 0.7002550509434214, "grad_norm": 1.8738983869552612, "learning_rate": 4.353023455535202e-06, "loss": 0.1946, "step": 13110 }, { "epoch": 0.7007891890448276, "grad_norm": 2.308129072189331, "learning_rate": 4.338753871260367e-06, "loss": 0.202, "step": 13120 }, { "epoch": 0.7013233271462337, "grad_norm": 1.8727205991744995, "learning_rate": 4.32450123144879e-06, "loss": 0.1936, "step": 13130 }, { "epoch": 0.7018574652476398, "grad_norm": 2.123415946960449, "learning_rate": 4.310265578759518e-06, "loss": 0.2094, "step": 13140 }, { "epoch": 0.7023916033490459, "grad_norm": 2.1013638973236084, "learning_rate": 4.296046955800747e-06, "loss": 0.2148, "step": 13150 }, { "epoch": 0.7029257414504521, "grad_norm": 2.5936830043792725, "learning_rate": 4.281845405129697e-06, "loss": 0.1998, "step": 13160 }, { "epoch": 0.7034598795518582, "grad_norm": 2.412802219390869, "learning_rate": 4.267660969252505e-06, "loss": 0.1851, "step": 13170 }, { "epoch": 0.7039940176532643, "grad_norm": 2.0266475677490234, "learning_rate": 4.253493690624071e-06, "loss": 0.2067, "step": 13180 }, { "epoch": 0.7045281557546703, "grad_norm": 1.716863989830017, "learning_rate": 4.239343611647942e-06, "loss": 0.1973, "step": 13190 }, { "epoch": 0.7050622938560764, "grad_norm": 1.7210581302642822, "learning_rate": 4.225210774676192e-06, "loss": 0.1995, "step": 13200 }, { "epoch": 0.7055964319574826, "grad_norm": 2.3087141513824463, "learning_rate": 4.211095222009282e-06, "loss": 0.1982, "step": 13210 }, { "epoch": 0.7061305700588887, "grad_norm": 2.2863657474517822, "learning_rate": 4.196996995895942e-06, "loss": 0.2031, "step": 13220 }, { "epoch": 0.7066647081602948, "grad_norm": 2.3175318241119385, "learning_rate": 4.182916138533042e-06, "loss": 0.2059, "step": 13230 }, { "epoch": 0.7071988462617009, "grad_norm": 1.7953799962997437, "learning_rate": 4.168852692065474e-06, "loss": 0.2144, "step": 13240 }, { "epoch": 0.707732984363107, "grad_norm": 2.697636127471924, "learning_rate": 4.154806698586008e-06, "loss": 0.214, "step": 13250 }, { "epoch": 0.7082671224645132, "grad_norm": 3.193804979324341, "learning_rate": 4.1407782001351805e-06, "loss": 0.2164, "step": 13260 }, { "epoch": 0.7088012605659193, "grad_norm": 1.8827204704284668, "learning_rate": 4.126767238701158e-06, "loss": 0.1966, "step": 13270 }, { "epoch": 0.7093353986673254, "grad_norm": 1.7728564739227295, "learning_rate": 4.112773856219632e-06, "loss": 0.1957, "step": 13280 }, { "epoch": 0.7098695367687315, "grad_norm": 2.271893262863159, "learning_rate": 4.098798094573666e-06, "loss": 0.2096, "step": 13290 }, { "epoch": 0.7104036748701377, "grad_norm": 2.0471224784851074, "learning_rate": 4.084839995593584e-06, "loss": 0.1964, "step": 13300 }, { "epoch": 0.7109378129715438, "grad_norm": 1.9206962585449219, "learning_rate": 4.0708996010568605e-06, "loss": 0.1964, "step": 13310 }, { "epoch": 0.7114719510729499, "grad_norm": 2.3047261238098145, "learning_rate": 4.056976952687954e-06, "loss": 0.2066, "step": 13320 }, { "epoch": 0.712006089174356, "grad_norm": 1.7425634860992432, "learning_rate": 4.043072092158221e-06, "loss": 0.1944, "step": 13330 }, { "epoch": 0.7125402272757622, "grad_norm": 1.773015022277832, "learning_rate": 4.029185061085786e-06, "loss": 0.1878, "step": 13340 }, { "epoch": 0.7130743653771683, "grad_norm": 1.92392897605896, "learning_rate": 4.015315901035396e-06, "loss": 0.1989, "step": 13350 }, { "epoch": 0.7136085034785744, "grad_norm": 1.6133384704589844, "learning_rate": 4.001464653518313e-06, "loss": 0.1853, "step": 13360 }, { "epoch": 0.7141426415799805, "grad_norm": 2.264131784439087, "learning_rate": 3.9876313599921836e-06, "loss": 0.1836, "step": 13370 }, { "epoch": 0.7146767796813867, "grad_norm": 1.5634881258010864, "learning_rate": 3.973816061860925e-06, "loss": 0.2009, "step": 13380 }, { "epoch": 0.7152109177827928, "grad_norm": 2.3214104175567627, "learning_rate": 3.960018800474585e-06, "loss": 0.1891, "step": 13390 }, { "epoch": 0.7157450558841989, "grad_norm": 1.8820472955703735, "learning_rate": 3.946239617129226e-06, "loss": 0.2016, "step": 13400 }, { "epoch": 0.716279193985605, "grad_norm": 1.5946075916290283, "learning_rate": 3.932478553066804e-06, "loss": 0.2033, "step": 13410 }, { "epoch": 0.7168133320870111, "grad_norm": 1.8241169452667236, "learning_rate": 3.91873564947505e-06, "loss": 0.2023, "step": 13420 }, { "epoch": 0.7173474701884173, "grad_norm": 1.9655125141143799, "learning_rate": 3.905010947487332e-06, "loss": 0.1957, "step": 13430 }, { "epoch": 0.7178816082898233, "grad_norm": 1.6499611139297485, "learning_rate": 3.89130448818253e-06, "loss": 0.1903, "step": 13440 }, { "epoch": 0.7184157463912294, "grad_norm": 1.8614602088928223, "learning_rate": 3.877616312584943e-06, "loss": 0.1946, "step": 13450 }, { "epoch": 0.7189498844926355, "grad_norm": 1.983534812927246, "learning_rate": 3.863946461664132e-06, "loss": 0.2004, "step": 13460 }, { "epoch": 0.7194840225940417, "grad_norm": 1.9103089570999146, "learning_rate": 3.850294976334811e-06, "loss": 0.1823, "step": 13470 }, { "epoch": 0.7200181606954478, "grad_norm": 1.7057238817214966, "learning_rate": 3.836661897456736e-06, "loss": 0.1978, "step": 13480 }, { "epoch": 0.7205522987968539, "grad_norm": 1.511348843574524, "learning_rate": 3.82304726583456e-06, "loss": 0.1866, "step": 13490 }, { "epoch": 0.72108643689826, "grad_norm": 1.9422800540924072, "learning_rate": 3.8094511222177244e-06, "loss": 0.2091, "step": 13500 }, { "epoch": 0.7216205749996661, "grad_norm": 2.4030849933624268, "learning_rate": 3.795873507300334e-06, "loss": 0.2031, "step": 13510 }, { "epoch": 0.7221547131010723, "grad_norm": 1.4094882011413574, "learning_rate": 3.782314461721045e-06, "loss": 0.1977, "step": 13520 }, { "epoch": 0.7226888512024784, "grad_norm": 1.5510730743408203, "learning_rate": 3.7687740260629246e-06, "loss": 0.202, "step": 13530 }, { "epoch": 0.7232229893038845, "grad_norm": 2.0395989418029785, "learning_rate": 3.755252240853343e-06, "loss": 0.1872, "step": 13540 }, { "epoch": 0.7237571274052906, "grad_norm": 2.067761182785034, "learning_rate": 3.7417491465638456e-06, "loss": 0.1803, "step": 13550 }, { "epoch": 0.7242912655066968, "grad_norm": 2.0252134799957275, "learning_rate": 3.7282647836100393e-06, "loss": 0.1877, "step": 13560 }, { "epoch": 0.7248254036081029, "grad_norm": 2.1944291591644287, "learning_rate": 3.7147991923514636e-06, "loss": 0.1967, "step": 13570 }, { "epoch": 0.725359541709509, "grad_norm": 1.6751203536987305, "learning_rate": 3.7013524130914714e-06, "loss": 0.1986, "step": 13580 }, { "epoch": 0.7258936798109151, "grad_norm": 1.572409749031067, "learning_rate": 3.687924486077119e-06, "loss": 0.1942, "step": 13590 }, { "epoch": 0.7264278179123212, "grad_norm": 1.3165833950042725, "learning_rate": 3.6745154514990265e-06, "loss": 0.1976, "step": 13600 }, { "epoch": 0.7269619560137274, "grad_norm": 2.2623045444488525, "learning_rate": 3.6611253494912714e-06, "loss": 0.1921, "step": 13610 }, { "epoch": 0.7274960941151335, "grad_norm": 1.8448021411895752, "learning_rate": 3.647754220131269e-06, "loss": 0.202, "step": 13620 }, { "epoch": 0.7280302322165396, "grad_norm": 1.9587323665618896, "learning_rate": 3.6344021034396427e-06, "loss": 0.193, "step": 13630 }, { "epoch": 0.7285643703179457, "grad_norm": 1.5247585773468018, "learning_rate": 3.621069039380112e-06, "loss": 0.2046, "step": 13640 }, { "epoch": 0.7290985084193519, "grad_norm": 2.161699056625366, "learning_rate": 3.6077550678593663e-06, "loss": 0.1854, "step": 13650 }, { "epoch": 0.729632646520758, "grad_norm": 1.7899532318115234, "learning_rate": 3.594460228726965e-06, "loss": 0.1857, "step": 13660 }, { "epoch": 0.7301667846221641, "grad_norm": 1.8384979963302612, "learning_rate": 3.5811845617751795e-06, "loss": 0.1924, "step": 13670 }, { "epoch": 0.7307009227235702, "grad_norm": 2.0494439601898193, "learning_rate": 3.567928106738913e-06, "loss": 0.2027, "step": 13680 }, { "epoch": 0.7312350608249762, "grad_norm": 2.368408441543579, "learning_rate": 3.554690903295569e-06, "loss": 0.2016, "step": 13690 }, { "epoch": 0.7317691989263824, "grad_norm": 2.1683664321899414, "learning_rate": 3.5414729910649215e-06, "loss": 0.2057, "step": 13700 }, { "epoch": 0.7323033370277885, "grad_norm": 1.5416229963302612, "learning_rate": 3.528274409609006e-06, "loss": 0.1981, "step": 13710 }, { "epoch": 0.7328374751291946, "grad_norm": 1.3828449249267578, "learning_rate": 3.515095198431998e-06, "loss": 0.2093, "step": 13720 }, { "epoch": 0.7333716132306007, "grad_norm": 1.8000141382217407, "learning_rate": 3.5019353969801086e-06, "loss": 0.188, "step": 13730 }, { "epoch": 0.7339057513320069, "grad_norm": 1.7602670192718506, "learning_rate": 3.4887950446414388e-06, "loss": 0.1897, "step": 13740 }, { "epoch": 0.734439889433413, "grad_norm": 2.134612560272217, "learning_rate": 3.4756741807458836e-06, "loss": 0.2104, "step": 13750 }, { "epoch": 0.7349740275348191, "grad_norm": 1.9061700105667114, "learning_rate": 3.462572844565013e-06, "loss": 0.2009, "step": 13760 }, { "epoch": 0.7355081656362252, "grad_norm": 1.5581552982330322, "learning_rate": 3.4494910753119416e-06, "loss": 0.1909, "step": 13770 }, { "epoch": 0.7360423037376314, "grad_norm": 1.540329098701477, "learning_rate": 3.436428912141222e-06, "loss": 0.1922, "step": 13780 }, { "epoch": 0.7365764418390375, "grad_norm": 1.4296460151672363, "learning_rate": 3.4233863941487243e-06, "loss": 0.1962, "step": 13790 }, { "epoch": 0.7371105799404436, "grad_norm": 2.089782953262329, "learning_rate": 3.41036356037152e-06, "loss": 0.1872, "step": 13800 }, { "epoch": 0.7376447180418497, "grad_norm": 2.1125032901763916, "learning_rate": 3.3973604497877634e-06, "loss": 0.2069, "step": 13810 }, { "epoch": 0.7381788561432558, "grad_norm": 2.0742554664611816, "learning_rate": 3.3843771013165726e-06, "loss": 0.207, "step": 13820 }, { "epoch": 0.738712994244662, "grad_norm": 2.265233278274536, "learning_rate": 3.3714135538179293e-06, "loss": 0.1926, "step": 13830 }, { "epoch": 0.7392471323460681, "grad_norm": 1.83545982837677, "learning_rate": 3.3584698460925356e-06, "loss": 0.1966, "step": 13840 }, { "epoch": 0.7397812704474742, "grad_norm": 1.8797460794448853, "learning_rate": 3.3455460168817177e-06, "loss": 0.2066, "step": 13850 }, { "epoch": 0.7403154085488803, "grad_norm": 2.36694598197937, "learning_rate": 3.3326421048673007e-06, "loss": 0.2033, "step": 13860 }, { "epoch": 0.7408495466502865, "grad_norm": 2.38443660736084, "learning_rate": 3.3197581486715046e-06, "loss": 0.1969, "step": 13870 }, { "epoch": 0.7413836847516926, "grad_norm": 2.4761970043182373, "learning_rate": 3.306894186856814e-06, "loss": 0.1994, "step": 13880 }, { "epoch": 0.7419178228530987, "grad_norm": 1.8733409643173218, "learning_rate": 3.2940502579258682e-06, "loss": 0.2022, "step": 13890 }, { "epoch": 0.7424519609545048, "grad_norm": 2.109523057937622, "learning_rate": 3.281226400321349e-06, "loss": 0.192, "step": 13900 }, { "epoch": 0.742986099055911, "grad_norm": 1.6331853866577148, "learning_rate": 3.2684226524258644e-06, "loss": 0.2065, "step": 13910 }, { "epoch": 0.7435202371573171, "grad_norm": 2.084440231323242, "learning_rate": 3.2556390525618264e-06, "loss": 0.1995, "step": 13920 }, { "epoch": 0.7440543752587231, "grad_norm": 1.7966901063919067, "learning_rate": 3.242875638991356e-06, "loss": 0.1959, "step": 13930 }, { "epoch": 0.7445885133601292, "grad_norm": 1.9185980558395386, "learning_rate": 3.2301324499161433e-06, "loss": 0.183, "step": 13940 }, { "epoch": 0.7451226514615353, "grad_norm": 2.4990077018737793, "learning_rate": 3.217409523477348e-06, "loss": 0.1966, "step": 13950 }, { "epoch": 0.7456567895629415, "grad_norm": 1.4074572324752808, "learning_rate": 3.204706897755483e-06, "loss": 0.1882, "step": 13960 }, { "epoch": 0.7461909276643476, "grad_norm": 1.9081840515136719, "learning_rate": 3.1920246107703055e-06, "loss": 0.2025, "step": 13970 }, { "epoch": 0.7467250657657537, "grad_norm": 1.6681355237960815, "learning_rate": 3.179362700480688e-06, "loss": 0.1853, "step": 13980 }, { "epoch": 0.7472592038671598, "grad_norm": 1.6587789058685303, "learning_rate": 3.166721204784521e-06, "loss": 0.1976, "step": 13990 }, { "epoch": 0.747793341968566, "grad_norm": 2.537449359893799, "learning_rate": 3.1541001615185862e-06, "loss": 0.1958, "step": 14000 }, { "epoch": 0.7483274800699721, "grad_norm": 1.9216731786727905, "learning_rate": 3.141499608458465e-06, "loss": 0.1951, "step": 14010 }, { "epoch": 0.7488616181713782, "grad_norm": 1.6860138177871704, "learning_rate": 3.1289195833183893e-06, "loss": 0.1923, "step": 14020 }, { "epoch": 0.7493957562727843, "grad_norm": 2.3117315769195557, "learning_rate": 3.1163601237511596e-06, "loss": 0.207, "step": 14030 }, { "epoch": 0.7499298943741904, "grad_norm": 2.670199394226074, "learning_rate": 3.1038212673480306e-06, "loss": 0.1952, "step": 14040 }, { "epoch": 0.7504640324755966, "grad_norm": 2.5368621349334717, "learning_rate": 3.091303051638579e-06, "loss": 0.1841, "step": 14050 }, { "epoch": 0.7509981705770027, "grad_norm": 1.865332841873169, "learning_rate": 3.0788055140906024e-06, "loss": 0.2053, "step": 14060 }, { "epoch": 0.7515323086784088, "grad_norm": 2.097813606262207, "learning_rate": 3.0663286921100187e-06, "loss": 0.1836, "step": 14070 }, { "epoch": 0.7520664467798149, "grad_norm": 1.8209867477416992, "learning_rate": 3.053872623040732e-06, "loss": 0.1894, "step": 14080 }, { "epoch": 0.7526005848812211, "grad_norm": 1.63919198513031, "learning_rate": 3.041437344164535e-06, "loss": 0.1905, "step": 14090 }, { "epoch": 0.7531347229826272, "grad_norm": 1.4309654235839844, "learning_rate": 3.0290228927009902e-06, "loss": 0.1919, "step": 14100 }, { "epoch": 0.7536688610840333, "grad_norm": 1.8823355436325073, "learning_rate": 3.0166293058073325e-06, "loss": 0.1968, "step": 14110 }, { "epoch": 0.7542029991854394, "grad_norm": 1.470423936843872, "learning_rate": 3.0042566205783384e-06, "loss": 0.187, "step": 14120 }, { "epoch": 0.7547371372868455, "grad_norm": 1.8692309856414795, "learning_rate": 2.9919048740462296e-06, "loss": 0.1929, "step": 14130 }, { "epoch": 0.7552712753882517, "grad_norm": 2.151658773422241, "learning_rate": 2.9795741031805527e-06, "loss": 0.1953, "step": 14140 }, { "epoch": 0.7558054134896578, "grad_norm": 1.8074941635131836, "learning_rate": 2.9672643448880757e-06, "loss": 0.1986, "step": 14150 }, { "epoch": 0.7563395515910639, "grad_norm": 1.6774282455444336, "learning_rate": 2.954975636012675e-06, "loss": 0.1894, "step": 14160 }, { "epoch": 0.75687368969247, "grad_norm": 2.046113967895508, "learning_rate": 2.9427080133352183e-06, "loss": 0.1953, "step": 14170 }, { "epoch": 0.757407827793876, "grad_norm": 1.7747138738632202, "learning_rate": 2.930461513573475e-06, "loss": 0.1946, "step": 14180 }, { "epoch": 0.7579419658952822, "grad_norm": 1.7473129034042358, "learning_rate": 2.918236173381981e-06, "loss": 0.2002, "step": 14190 }, { "epoch": 0.7584761039966883, "grad_norm": 2.1527345180511475, "learning_rate": 2.9060320293519374e-06, "loss": 0.201, "step": 14200 }, { "epoch": 0.7590102420980944, "grad_norm": 1.4832205772399902, "learning_rate": 2.893849118011117e-06, "loss": 0.1851, "step": 14210 }, { "epoch": 0.7595443801995005, "grad_norm": 2.2976646423339844, "learning_rate": 2.8816874758237335e-06, "loss": 0.205, "step": 14220 }, { "epoch": 0.7600785183009067, "grad_norm": 1.9310762882232666, "learning_rate": 2.8695471391903383e-06, "loss": 0.1962, "step": 14230 }, { "epoch": 0.7606126564023128, "grad_norm": 1.9517450332641602, "learning_rate": 2.8574281444477183e-06, "loss": 0.1961, "step": 14240 }, { "epoch": 0.7611467945037189, "grad_norm": 1.9862440824508667, "learning_rate": 2.8453305278687826e-06, "loss": 0.1916, "step": 14250 }, { "epoch": 0.761680932605125, "grad_norm": 2.451794385910034, "learning_rate": 2.8332543256624525e-06, "loss": 0.1801, "step": 14260 }, { "epoch": 0.7622150707065312, "grad_norm": 1.695159673690796, "learning_rate": 2.821199573973552e-06, "loss": 0.1867, "step": 14270 }, { "epoch": 0.7627492088079373, "grad_norm": 1.8122671842575073, "learning_rate": 2.8091663088827124e-06, "loss": 0.1883, "step": 14280 }, { "epoch": 0.7632833469093434, "grad_norm": 1.556504726409912, "learning_rate": 2.797154566406245e-06, "loss": 0.2084, "step": 14290 }, { "epoch": 0.7638174850107495, "grad_norm": 2.0173401832580566, "learning_rate": 2.785164382496045e-06, "loss": 0.1947, "step": 14300 }, { "epoch": 0.7643516231121557, "grad_norm": 2.1507465839385986, "learning_rate": 2.7731957930394784e-06, "loss": 0.1975, "step": 14310 }, { "epoch": 0.7648857612135618, "grad_norm": 1.8325512409210205, "learning_rate": 2.761248833859288e-06, "loss": 0.1881, "step": 14320 }, { "epoch": 0.7654198993149679, "grad_norm": 2.01290225982666, "learning_rate": 2.7493235407134643e-06, "loss": 0.1983, "step": 14330 }, { "epoch": 0.765954037416374, "grad_norm": 1.6847450733184814, "learning_rate": 2.737419949295156e-06, "loss": 0.1969, "step": 14340 }, { "epoch": 0.7664881755177801, "grad_norm": 1.816990613937378, "learning_rate": 2.725538095232552e-06, "loss": 0.1912, "step": 14350 }, { "epoch": 0.7670223136191863, "grad_norm": 1.4999099969863892, "learning_rate": 2.713678014088792e-06, "loss": 0.1951, "step": 14360 }, { "epoch": 0.7675564517205924, "grad_norm": 2.0938215255737305, "learning_rate": 2.701839741361829e-06, "loss": 0.1812, "step": 14370 }, { "epoch": 0.7680905898219985, "grad_norm": 1.940747857093811, "learning_rate": 2.690023312484361e-06, "loss": 0.2002, "step": 14380 }, { "epoch": 0.7686247279234046, "grad_norm": 2.5149874687194824, "learning_rate": 2.6782287628236946e-06, "loss": 0.1955, "step": 14390 }, { "epoch": 0.7691588660248108, "grad_norm": 1.7078354358673096, "learning_rate": 2.6664561276816536e-06, "loss": 0.1903, "step": 14400 }, { "epoch": 0.7696930041262169, "grad_norm": 2.4997429847717285, "learning_rate": 2.6547054422944674e-06, "loss": 0.2036, "step": 14410 }, { "epoch": 0.770227142227623, "grad_norm": 1.1207189559936523, "learning_rate": 2.642976741832678e-06, "loss": 0.194, "step": 14420 }, { "epoch": 0.770761280329029, "grad_norm": 1.4716769456863403, "learning_rate": 2.631270061401013e-06, "loss": 0.1995, "step": 14430 }, { "epoch": 0.7712954184304351, "grad_norm": 1.651985764503479, "learning_rate": 2.6195854360383024e-06, "loss": 0.1935, "step": 14440 }, { "epoch": 0.7718295565318413, "grad_norm": 1.9286710023880005, "learning_rate": 2.6079229007173523e-06, "loss": 0.2118, "step": 14450 }, { "epoch": 0.7723636946332474, "grad_norm": 2.3074588775634766, "learning_rate": 2.596282490344868e-06, "loss": 0.1924, "step": 14460 }, { "epoch": 0.7728978327346535, "grad_norm": 1.4237712621688843, "learning_rate": 2.584664239761321e-06, "loss": 0.1769, "step": 14470 }, { "epoch": 0.7734319708360596, "grad_norm": 2.3851616382598877, "learning_rate": 2.5730681837408598e-06, "loss": 0.186, "step": 14480 }, { "epoch": 0.7739661089374658, "grad_norm": 1.808867335319519, "learning_rate": 2.5614943569912067e-06, "loss": 0.1927, "step": 14490 }, { "epoch": 0.7745002470388719, "grad_norm": 2.1080849170684814, "learning_rate": 2.549942794153547e-06, "loss": 0.1918, "step": 14500 }, { "epoch": 0.775034385140278, "grad_norm": 2.024184226989746, "learning_rate": 2.538413529802427e-06, "loss": 0.1867, "step": 14510 }, { "epoch": 0.7755685232416841, "grad_norm": 2.672346591949463, "learning_rate": 2.52690659844566e-06, "loss": 0.2084, "step": 14520 }, { "epoch": 0.7761026613430902, "grad_norm": 1.650956630706787, "learning_rate": 2.5154220345242077e-06, "loss": 0.1987, "step": 14530 }, { "epoch": 0.7766367994444964, "grad_norm": 1.470323920249939, "learning_rate": 2.5039598724120885e-06, "loss": 0.1911, "step": 14540 }, { "epoch": 0.7771709375459025, "grad_norm": 2.246478319168091, "learning_rate": 2.4925201464162653e-06, "loss": 0.1952, "step": 14550 }, { "epoch": 0.7777050756473086, "grad_norm": 1.7607316970825195, "learning_rate": 2.4811028907765576e-06, "loss": 0.2018, "step": 14560 }, { "epoch": 0.7782392137487147, "grad_norm": 1.4754760265350342, "learning_rate": 2.469708139665524e-06, "loss": 0.1931, "step": 14570 }, { "epoch": 0.7787733518501209, "grad_norm": 1.643049955368042, "learning_rate": 2.4583359271883643e-06, "loss": 0.1977, "step": 14580 }, { "epoch": 0.779307489951527, "grad_norm": 1.985589623451233, "learning_rate": 2.4469862873828186e-06, "loss": 0.1962, "step": 14590 }, { "epoch": 0.7798416280529331, "grad_norm": 2.055016040802002, "learning_rate": 2.435659254219077e-06, "loss": 0.1956, "step": 14600 }, { "epoch": 0.7803757661543392, "grad_norm": 1.357635498046875, "learning_rate": 2.424354861599648e-06, "loss": 0.204, "step": 14610 }, { "epoch": 0.7809099042557454, "grad_norm": 2.088031053543091, "learning_rate": 2.4130731433592846e-06, "loss": 0.1924, "step": 14620 }, { "epoch": 0.7814440423571515, "grad_norm": 1.705014944076538, "learning_rate": 2.4018141332648803e-06, "loss": 0.2057, "step": 14630 }, { "epoch": 0.7819781804585576, "grad_norm": 1.440266728401184, "learning_rate": 2.390577865015352e-06, "loss": 0.1906, "step": 14640 }, { "epoch": 0.7825123185599637, "grad_norm": 1.8188496828079224, "learning_rate": 2.3793643722415505e-06, "loss": 0.2059, "step": 14650 }, { "epoch": 0.7830464566613698, "grad_norm": 2.06721830368042, "learning_rate": 2.3681736885061648e-06, "loss": 0.1981, "step": 14660 }, { "epoch": 0.7835805947627759, "grad_norm": 1.7937973737716675, "learning_rate": 2.3570058473036086e-06, "loss": 0.1953, "step": 14670 }, { "epoch": 0.784114732864182, "grad_norm": 2.0717413425445557, "learning_rate": 2.345860882059926e-06, "loss": 0.1936, "step": 14680 }, { "epoch": 0.7846488709655881, "grad_norm": 1.9936023950576782, "learning_rate": 2.3347388261326944e-06, "loss": 0.2015, "step": 14690 }, { "epoch": 0.7851830090669942, "grad_norm": 2.1958553791046143, "learning_rate": 2.3236397128109245e-06, "loss": 0.2089, "step": 14700 }, { "epoch": 0.7857171471684004, "grad_norm": 2.2984092235565186, "learning_rate": 2.3125635753149567e-06, "loss": 0.2023, "step": 14710 }, { "epoch": 0.7862512852698065, "grad_norm": 2.565155506134033, "learning_rate": 2.3015104467963557e-06, "loss": 0.2012, "step": 14720 }, { "epoch": 0.7867854233712126, "grad_norm": 1.6860542297363281, "learning_rate": 2.2904803603378313e-06, "loss": 0.2017, "step": 14730 }, { "epoch": 0.7873195614726187, "grad_norm": 1.9908113479614258, "learning_rate": 2.2794733489531194e-06, "loss": 0.1809, "step": 14740 }, { "epoch": 0.7878536995740248, "grad_norm": 1.8822989463806152, "learning_rate": 2.268489445586893e-06, "loss": 0.187, "step": 14750 }, { "epoch": 0.788387837675431, "grad_norm": 2.277846097946167, "learning_rate": 2.257528683114658e-06, "loss": 0.1857, "step": 14760 }, { "epoch": 0.7889219757768371, "grad_norm": 2.029287099838257, "learning_rate": 2.246591094342667e-06, "loss": 0.1883, "step": 14770 }, { "epoch": 0.7894561138782432, "grad_norm": 1.9442925453186035, "learning_rate": 2.2356767120078027e-06, "loss": 0.1979, "step": 14780 }, { "epoch": 0.7899902519796493, "grad_norm": 2.277160167694092, "learning_rate": 2.2247855687774933e-06, "loss": 0.2003, "step": 14790 }, { "epoch": 0.7905243900810555, "grad_norm": 1.813984990119934, "learning_rate": 2.2139176972496077e-06, "loss": 0.1817, "step": 14800 }, { "epoch": 0.7910585281824616, "grad_norm": 1.5820233821868896, "learning_rate": 2.203073129952371e-06, "loss": 0.1844, "step": 14810 }, { "epoch": 0.7915926662838677, "grad_norm": 1.9238145351409912, "learning_rate": 2.1922518993442463e-06, "loss": 0.1973, "step": 14820 }, { "epoch": 0.7921268043852738, "grad_norm": 1.9056835174560547, "learning_rate": 2.1814540378138536e-06, "loss": 0.2068, "step": 14830 }, { "epoch": 0.79266094248668, "grad_norm": 2.0028278827667236, "learning_rate": 2.170679577679866e-06, "loss": 0.2063, "step": 14840 }, { "epoch": 0.7931950805880861, "grad_norm": 2.1335413455963135, "learning_rate": 2.159928551190917e-06, "loss": 0.1957, "step": 14850 }, { "epoch": 0.7937292186894922, "grad_norm": 2.7127907276153564, "learning_rate": 2.1492009905254964e-06, "loss": 0.1941, "step": 14860 }, { "epoch": 0.7942633567908983, "grad_norm": 2.582970142364502, "learning_rate": 2.13849692779187e-06, "loss": 0.1891, "step": 14870 }, { "epoch": 0.7947974948923044, "grad_norm": 1.9578105211257935, "learning_rate": 2.127816395027962e-06, "loss": 0.2063, "step": 14880 }, { "epoch": 0.7953316329937106, "grad_norm": 1.9177343845367432, "learning_rate": 2.1171594242012737e-06, "loss": 0.1864, "step": 14890 }, { "epoch": 0.7958657710951167, "grad_norm": 1.9610730409622192, "learning_rate": 2.1065260472087845e-06, "loss": 0.1947, "step": 14900 }, { "epoch": 0.7963999091965228, "grad_norm": 1.2381418943405151, "learning_rate": 2.0959162958768587e-06, "loss": 0.1995, "step": 14910 }, { "epoch": 0.7969340472979288, "grad_norm": 2.4555797576904297, "learning_rate": 2.085330201961142e-06, "loss": 0.1951, "step": 14920 }, { "epoch": 0.797468185399335, "grad_norm": 2.1510167121887207, "learning_rate": 2.0747677971464766e-06, "loss": 0.2025, "step": 14930 }, { "epoch": 0.7980023235007411, "grad_norm": 2.7530791759490967, "learning_rate": 2.064229113046796e-06, "loss": 0.1854, "step": 14940 }, { "epoch": 0.7985364616021472, "grad_norm": 2.6532673835754395, "learning_rate": 2.0537141812050476e-06, "loss": 0.1977, "step": 14950 }, { "epoch": 0.7990705997035533, "grad_norm": 1.8843119144439697, "learning_rate": 2.0432230330930692e-06, "loss": 0.1807, "step": 14960 }, { "epoch": 0.7996047378049594, "grad_norm": 1.6866428852081299, "learning_rate": 2.0327557001115293e-06, "loss": 0.197, "step": 14970 }, { "epoch": 0.8001388759063656, "grad_norm": 2.018805742263794, "learning_rate": 2.0223122135898088e-06, "loss": 0.18, "step": 14980 }, { "epoch": 0.8006730140077717, "grad_norm": 2.365093231201172, "learning_rate": 2.011892604785913e-06, "loss": 0.1777, "step": 14990 }, { "epoch": 0.8012071521091778, "grad_norm": 1.8096531629562378, "learning_rate": 2.0014969048863798e-06, "loss": 0.1971, "step": 15000 }, { "epoch": 0.8017412902105839, "grad_norm": 1.646672248840332, "learning_rate": 1.9911251450061943e-06, "loss": 0.1959, "step": 15010 }, { "epoch": 0.8022754283119901, "grad_norm": 2.43733286857605, "learning_rate": 1.980777356188678e-06, "loss": 0.1873, "step": 15020 }, { "epoch": 0.8028095664133962, "grad_norm": 2.4419779777526855, "learning_rate": 1.9704535694054095e-06, "loss": 0.1972, "step": 15030 }, { "epoch": 0.8033437045148023, "grad_norm": 1.9384167194366455, "learning_rate": 1.960153815556125e-06, "loss": 0.185, "step": 15040 }, { "epoch": 0.8038778426162084, "grad_norm": 2.3505780696868896, "learning_rate": 1.949878125468636e-06, "loss": 0.2067, "step": 15050 }, { "epoch": 0.8044119807176145, "grad_norm": 1.7631889581680298, "learning_rate": 1.939626529898725e-06, "loss": 0.1922, "step": 15060 }, { "epoch": 0.8049461188190207, "grad_norm": 1.756872534751892, "learning_rate": 1.9293990595300505e-06, "loss": 0.1974, "step": 15070 }, { "epoch": 0.8054802569204268, "grad_norm": 1.8049086332321167, "learning_rate": 1.9191957449740773e-06, "loss": 0.1981, "step": 15080 }, { "epoch": 0.8060143950218329, "grad_norm": 2.458258628845215, "learning_rate": 1.909016616769961e-06, "loss": 0.2059, "step": 15090 }, { "epoch": 0.806548533123239, "grad_norm": 1.9878307580947876, "learning_rate": 1.8988617053844638e-06, "loss": 0.1831, "step": 15100 }, { "epoch": 0.8070826712246452, "grad_norm": 1.614086627960205, "learning_rate": 1.8887310412118775e-06, "loss": 0.1928, "step": 15110 }, { "epoch": 0.8076168093260513, "grad_norm": 2.1613776683807373, "learning_rate": 1.8786246545739095e-06, "loss": 0.2093, "step": 15120 }, { "epoch": 0.8081509474274574, "grad_norm": 1.7683864831924438, "learning_rate": 1.8685425757196053e-06, "loss": 0.1886, "step": 15130 }, { "epoch": 0.8086850855288635, "grad_norm": 1.7480674982070923, "learning_rate": 1.8584848348252537e-06, "loss": 0.203, "step": 15140 }, { "epoch": 0.8092192236302697, "grad_norm": 1.6333253383636475, "learning_rate": 1.8484514619943072e-06, "loss": 0.1994, "step": 15150 }, { "epoch": 0.8097533617316757, "grad_norm": 2.0683858394622803, "learning_rate": 1.838442487257276e-06, "loss": 0.2077, "step": 15160 }, { "epoch": 0.8102874998330818, "grad_norm": 1.6661045551300049, "learning_rate": 1.8284579405716452e-06, "loss": 0.1936, "step": 15170 }, { "epoch": 0.8108216379344879, "grad_norm": 1.7199368476867676, "learning_rate": 1.8184978518217867e-06, "loss": 0.1952, "step": 15180 }, { "epoch": 0.811355776035894, "grad_norm": 1.6820697784423828, "learning_rate": 1.8085622508188683e-06, "loss": 0.1784, "step": 15190 }, { "epoch": 0.8118899141373002, "grad_norm": 1.5260549783706665, "learning_rate": 1.7986511673007655e-06, "loss": 0.1836, "step": 15200 }, { "epoch": 0.8124240522387063, "grad_norm": 2.01175856590271, "learning_rate": 1.7887646309319662e-06, "loss": 0.185, "step": 15210 }, { "epoch": 0.8129581903401124, "grad_norm": 1.4435689449310303, "learning_rate": 1.7789026713034973e-06, "loss": 0.1907, "step": 15220 }, { "epoch": 0.8134923284415185, "grad_norm": 1.7505128383636475, "learning_rate": 1.7690653179328166e-06, "loss": 0.1936, "step": 15230 }, { "epoch": 0.8140264665429247, "grad_norm": 2.0108530521392822, "learning_rate": 1.7592526002637377e-06, "loss": 0.191, "step": 15240 }, { "epoch": 0.8145606046443308, "grad_norm": 2.1345746517181396, "learning_rate": 1.7494645476663352e-06, "loss": 0.1961, "step": 15250 }, { "epoch": 0.8150947427457369, "grad_norm": 1.8194632530212402, "learning_rate": 1.7397011894368666e-06, "loss": 0.2046, "step": 15260 }, { "epoch": 0.815628880847143, "grad_norm": 2.2342379093170166, "learning_rate": 1.7299625547976707e-06, "loss": 0.2016, "step": 15270 }, { "epoch": 0.8161630189485491, "grad_norm": 2.1676619052886963, "learning_rate": 1.7202486728970858e-06, "loss": 0.1979, "step": 15280 }, { "epoch": 0.8166971570499553, "grad_norm": 1.8677483797073364, "learning_rate": 1.7105595728093737e-06, "loss": 0.185, "step": 15290 }, { "epoch": 0.8172312951513614, "grad_norm": 2.608194589614868, "learning_rate": 1.7008952835346148e-06, "loss": 0.1976, "step": 15300 }, { "epoch": 0.8177654332527675, "grad_norm": 1.8137328624725342, "learning_rate": 1.691255833998624e-06, "loss": 0.1841, "step": 15310 }, { "epoch": 0.8182995713541736, "grad_norm": 2.180060625076294, "learning_rate": 1.6816412530528815e-06, "loss": 0.1977, "step": 15320 }, { "epoch": 0.8188337094555798, "grad_norm": 2.1153762340545654, "learning_rate": 1.6720515694744289e-06, "loss": 0.1916, "step": 15330 }, { "epoch": 0.8193678475569859, "grad_norm": 1.6253478527069092, "learning_rate": 1.6624868119657856e-06, "loss": 0.1847, "step": 15340 }, { "epoch": 0.819901985658392, "grad_norm": 2.4919135570526123, "learning_rate": 1.652947009154867e-06, "loss": 0.2007, "step": 15350 }, { "epoch": 0.8204361237597981, "grad_norm": 1.9389845132827759, "learning_rate": 1.6434321895949034e-06, "loss": 0.1825, "step": 15360 }, { "epoch": 0.8209702618612043, "grad_norm": 1.809741735458374, "learning_rate": 1.6339423817643418e-06, "loss": 0.2009, "step": 15370 }, { "epoch": 0.8215043999626104, "grad_norm": 2.0170931816101074, "learning_rate": 1.6244776140667695e-06, "loss": 0.193, "step": 15380 }, { "epoch": 0.8220385380640165, "grad_norm": 1.1400176286697388, "learning_rate": 1.615037914830826e-06, "loss": 0.1849, "step": 15390 }, { "epoch": 0.8225726761654226, "grad_norm": 2.6955411434173584, "learning_rate": 1.605623312310124e-06, "loss": 0.2101, "step": 15400 }, { "epoch": 0.8231068142668286, "grad_norm": 2.7265326976776123, "learning_rate": 1.596233834683154e-06, "loss": 0.1871, "step": 15410 }, { "epoch": 0.8236409523682348, "grad_norm": 2.3386309146881104, "learning_rate": 1.5868695100532116e-06, "loss": 0.2059, "step": 15420 }, { "epoch": 0.8241750904696409, "grad_norm": 2.06644868850708, "learning_rate": 1.5775303664483032e-06, "loss": 0.1786, "step": 15430 }, { "epoch": 0.824709228571047, "grad_norm": 1.8337013721466064, "learning_rate": 1.5682164318210691e-06, "loss": 0.1901, "step": 15440 }, { "epoch": 0.8252433666724531, "grad_norm": 1.6303783655166626, "learning_rate": 1.5589277340486964e-06, "loss": 0.1841, "step": 15450 }, { "epoch": 0.8257775047738592, "grad_norm": 2.7498836517333984, "learning_rate": 1.5496643009328404e-06, "loss": 0.2003, "step": 15460 }, { "epoch": 0.8263116428752654, "grad_norm": 2.5089802742004395, "learning_rate": 1.540426160199534e-06, "loss": 0.1933, "step": 15470 }, { "epoch": 0.8268457809766715, "grad_norm": 2.792116641998291, "learning_rate": 1.5312133394991091e-06, "loss": 0.1831, "step": 15480 }, { "epoch": 0.8273799190780776, "grad_norm": 2.1863296031951904, "learning_rate": 1.5220258664061117e-06, "loss": 0.1972, "step": 15490 }, { "epoch": 0.8279140571794837, "grad_norm": 1.9472436904907227, "learning_rate": 1.512863768419226e-06, "loss": 0.189, "step": 15500 }, { "epoch": 0.8284481952808899, "grad_norm": 2.3212087154388428, "learning_rate": 1.503727072961182e-06, "loss": 0.187, "step": 15510 }, { "epoch": 0.828982333382296, "grad_norm": 1.8435794115066528, "learning_rate": 1.4946158073786797e-06, "loss": 0.1967, "step": 15520 }, { "epoch": 0.8295164714837021, "grad_norm": 2.218397855758667, "learning_rate": 1.4855299989423066e-06, "loss": 0.1823, "step": 15530 }, { "epoch": 0.8300506095851082, "grad_norm": 1.9504783153533936, "learning_rate": 1.4764696748464547e-06, "loss": 0.2079, "step": 15540 }, { "epoch": 0.8305847476865144, "grad_norm": 1.9759118556976318, "learning_rate": 1.4674348622092372e-06, "loss": 0.1838, "step": 15550 }, { "epoch": 0.8311188857879205, "grad_norm": 2.0831799507141113, "learning_rate": 1.4584255880724175e-06, "loss": 0.1822, "step": 15560 }, { "epoch": 0.8316530238893266, "grad_norm": 1.9594889879226685, "learning_rate": 1.4494418794013122e-06, "loss": 0.1888, "step": 15570 }, { "epoch": 0.8321871619907327, "grad_norm": 1.3435486555099487, "learning_rate": 1.4404837630847246e-06, "loss": 0.187, "step": 15580 }, { "epoch": 0.8327213000921388, "grad_norm": 2.3167057037353516, "learning_rate": 1.431551265934853e-06, "loss": 0.179, "step": 15590 }, { "epoch": 0.833255438193545, "grad_norm": 1.9253957271575928, "learning_rate": 1.4226444146872242e-06, "loss": 0.1802, "step": 15600 }, { "epoch": 0.8337895762949511, "grad_norm": 2.1387100219726562, "learning_rate": 1.4137632360005982e-06, "loss": 0.1888, "step": 15610 }, { "epoch": 0.8343237143963572, "grad_norm": 1.5886762142181396, "learning_rate": 1.4049077564568958e-06, "loss": 0.1726, "step": 15620 }, { "epoch": 0.8348578524977633, "grad_norm": 2.3915352821350098, "learning_rate": 1.3960780025611198e-06, "loss": 0.1834, "step": 15630 }, { "epoch": 0.8353919905991695, "grad_norm": 1.7665464878082275, "learning_rate": 1.38727400074128e-06, "loss": 0.1977, "step": 15640 }, { "epoch": 0.8359261287005756, "grad_norm": 1.5928136110305786, "learning_rate": 1.3784957773483032e-06, "loss": 0.2039, "step": 15650 }, { "epoch": 0.8364602668019816, "grad_norm": 1.930435299873352, "learning_rate": 1.3697433586559528e-06, "loss": 0.189, "step": 15660 }, { "epoch": 0.8369944049033877, "grad_norm": 1.365477442741394, "learning_rate": 1.3610167708607714e-06, "loss": 0.1785, "step": 15670 }, { "epoch": 0.8375285430047938, "grad_norm": 2.31242036819458, "learning_rate": 1.3523160400819802e-06, "loss": 0.1852, "step": 15680 }, { "epoch": 0.8380626811062, "grad_norm": 1.987921953201294, "learning_rate": 1.343641192361409e-06, "loss": 0.2016, "step": 15690 }, { "epoch": 0.8385968192076061, "grad_norm": 2.0820467472076416, "learning_rate": 1.3349922536634164e-06, "loss": 0.1883, "step": 15700 }, { "epoch": 0.8391309573090122, "grad_norm": 3.0412142276763916, "learning_rate": 1.3263692498748216e-06, "loss": 0.1975, "step": 15710 }, { "epoch": 0.8396650954104183, "grad_norm": 3.02032208442688, "learning_rate": 1.3177722068048116e-06, "loss": 0.1935, "step": 15720 }, { "epoch": 0.8401992335118245, "grad_norm": 2.207077741622925, "learning_rate": 1.309201150184869e-06, "loss": 0.1948, "step": 15730 }, { "epoch": 0.8407333716132306, "grad_norm": 2.386183738708496, "learning_rate": 1.3006561056687073e-06, "loss": 0.1998, "step": 15740 }, { "epoch": 0.8412675097146367, "grad_norm": 1.1369651556015015, "learning_rate": 1.2921370988321768e-06, "loss": 0.1812, "step": 15750 }, { "epoch": 0.8418016478160428, "grad_norm": 2.1222147941589355, "learning_rate": 1.283644155173196e-06, "loss": 0.1895, "step": 15760 }, { "epoch": 0.842335785917449, "grad_norm": 1.8976614475250244, "learning_rate": 1.2751773001116763e-06, "loss": 0.1853, "step": 15770 }, { "epoch": 0.8428699240188551, "grad_norm": 2.2440550327301025, "learning_rate": 1.266736558989443e-06, "loss": 0.1801, "step": 15780 }, { "epoch": 0.8434040621202612, "grad_norm": 2.4571006298065186, "learning_rate": 1.2583219570701634e-06, "loss": 0.1818, "step": 15790 }, { "epoch": 0.8439382002216673, "grad_norm": 2.006486654281616, "learning_rate": 1.2499335195392625e-06, "loss": 0.2, "step": 15800 }, { "epoch": 0.8444723383230734, "grad_norm": 1.647672414779663, "learning_rate": 1.241571271503862e-06, "loss": 0.196, "step": 15810 }, { "epoch": 0.8450064764244796, "grad_norm": 1.9208781719207764, "learning_rate": 1.2332352379926927e-06, "loss": 0.1993, "step": 15820 }, { "epoch": 0.8455406145258857, "grad_norm": 1.7370686531066895, "learning_rate": 1.2249254439560209e-06, "loss": 0.1834, "step": 15830 }, { "epoch": 0.8460747526272918, "grad_norm": 1.821781873703003, "learning_rate": 1.2166419142655784e-06, "loss": 0.1855, "step": 15840 }, { "epoch": 0.8466088907286979, "grad_norm": 2.148794174194336, "learning_rate": 1.20838467371449e-06, "loss": 0.1903, "step": 15850 }, { "epoch": 0.8471430288301041, "grad_norm": 1.6122615337371826, "learning_rate": 1.200153747017193e-06, "loss": 0.1834, "step": 15860 }, { "epoch": 0.8476771669315102, "grad_norm": 1.9573323726654053, "learning_rate": 1.19194915880936e-06, "loss": 0.1895, "step": 15870 }, { "epoch": 0.8482113050329163, "grad_norm": 3.3346667289733887, "learning_rate": 1.1837709336478442e-06, "loss": 0.1928, "step": 15880 }, { "epoch": 0.8487454431343224, "grad_norm": 1.9923675060272217, "learning_rate": 1.1756190960105763e-06, "loss": 0.1904, "step": 15890 }, { "epoch": 0.8492795812357284, "grad_norm": 2.316518545150757, "learning_rate": 1.1674936702965178e-06, "loss": 0.1938, "step": 15900 }, { "epoch": 0.8498137193371346, "grad_norm": 1.8290090560913086, "learning_rate": 1.1593946808255774e-06, "loss": 0.181, "step": 15910 }, { "epoch": 0.8503478574385407, "grad_norm": 1.9949698448181152, "learning_rate": 1.1513221518385354e-06, "loss": 0.1845, "step": 15920 }, { "epoch": 0.8508819955399468, "grad_norm": 1.7710309028625488, "learning_rate": 1.1432761074969756e-06, "loss": 0.1878, "step": 15930 }, { "epoch": 0.8514161336413529, "grad_norm": 2.64086651802063, "learning_rate": 1.1352565718832077e-06, "loss": 0.189, "step": 15940 }, { "epoch": 0.8519502717427591, "grad_norm": 1.7557547092437744, "learning_rate": 1.1272635690002087e-06, "loss": 0.1793, "step": 15950 }, { "epoch": 0.8524844098441652, "grad_norm": 1.8455740213394165, "learning_rate": 1.1192971227715322e-06, "loss": 0.188, "step": 15960 }, { "epoch": 0.8530185479455713, "grad_norm": 2.0141634941101074, "learning_rate": 1.11135725704125e-06, "loss": 0.1821, "step": 15970 }, { "epoch": 0.8535526860469774, "grad_norm": 1.5579683780670166, "learning_rate": 1.103443995573873e-06, "loss": 0.1874, "step": 15980 }, { "epoch": 0.8540868241483835, "grad_norm": 1.3364542722702026, "learning_rate": 1.0955573620542925e-06, "loss": 0.2028, "step": 15990 }, { "epoch": 0.8546209622497897, "grad_norm": 1.7439391613006592, "learning_rate": 1.0876973800876944e-06, "loss": 0.2089, "step": 16000 }, { "epoch": 0.8551551003511958, "grad_norm": 1.5360907316207886, "learning_rate": 1.0798640731994892e-06, "loss": 0.1856, "step": 16010 }, { "epoch": 0.8556892384526019, "grad_norm": 2.8496618270874023, "learning_rate": 1.072057464835261e-06, "loss": 0.1852, "step": 16020 }, { "epoch": 0.856223376554008, "grad_norm": 1.9631704092025757, "learning_rate": 1.0642775783606729e-06, "loss": 0.1832, "step": 16030 }, { "epoch": 0.8567575146554142, "grad_norm": 2.298893451690674, "learning_rate": 1.0565244370614103e-06, "loss": 0.1902, "step": 16040 }, { "epoch": 0.8572916527568203, "grad_norm": 1.7732148170471191, "learning_rate": 1.0487980641431128e-06, "loss": 0.1963, "step": 16050 }, { "epoch": 0.8578257908582264, "grad_norm": 2.152968406677246, "learning_rate": 1.0410984827312953e-06, "loss": 0.1926, "step": 16060 }, { "epoch": 0.8583599289596325, "grad_norm": 1.780741810798645, "learning_rate": 1.0334257158712847e-06, "loss": 0.1906, "step": 16070 }, { "epoch": 0.8588940670610387, "grad_norm": 1.8721725940704346, "learning_rate": 1.0257797865281504e-06, "loss": 0.1883, "step": 16080 }, { "epoch": 0.8594282051624448, "grad_norm": 2.350572347640991, "learning_rate": 1.0181607175866414e-06, "loss": 0.1909, "step": 16090 }, { "epoch": 0.8599623432638509, "grad_norm": 1.8676543235778809, "learning_rate": 1.0105685318511027e-06, "loss": 0.1987, "step": 16100 }, { "epoch": 0.860496481365257, "grad_norm": 2.050701141357422, "learning_rate": 1.0030032520454225e-06, "loss": 0.1991, "step": 16110 }, { "epoch": 0.8610306194666631, "grad_norm": 2.8425140380859375, "learning_rate": 9.954649008129547e-07, "loss": 0.1915, "step": 16120 }, { "epoch": 0.8615647575680693, "grad_norm": 1.7374526262283325, "learning_rate": 9.879535007164565e-07, "loss": 0.203, "step": 16130 }, { "epoch": 0.8620988956694754, "grad_norm": 1.6487489938735962, "learning_rate": 9.804690742380152e-07, "loss": 0.1797, "step": 16140 }, { "epoch": 0.8626330337708814, "grad_norm": 2.5397281646728516, "learning_rate": 9.730116437789872e-07, "loss": 0.2049, "step": 16150 }, { "epoch": 0.8631671718722875, "grad_norm": 1.8596101999282837, "learning_rate": 9.655812316599311e-07, "loss": 0.1771, "step": 16160 }, { "epoch": 0.8637013099736937, "grad_norm": 2.256027936935425, "learning_rate": 9.581778601205304e-07, "loss": 0.1852, "step": 16170 }, { "epoch": 0.8642354480750998, "grad_norm": 1.6021047830581665, "learning_rate": 9.508015513195401e-07, "loss": 0.1899, "step": 16180 }, { "epoch": 0.8647695861765059, "grad_norm": 1.441133737564087, "learning_rate": 9.434523273347141e-07, "loss": 0.183, "step": 16190 }, { "epoch": 0.865303724277912, "grad_norm": 1.3575091361999512, "learning_rate": 9.361302101627378e-07, "loss": 0.1878, "step": 16200 }, { "epoch": 0.8658378623793181, "grad_norm": 1.3963885307312012, "learning_rate": 9.288352217191654e-07, "loss": 0.1842, "step": 16210 }, { "epoch": 0.8663720004807243, "grad_norm": 2.5100231170654297, "learning_rate": 9.215673838383521e-07, "loss": 0.1996, "step": 16220 }, { "epoch": 0.8669061385821304, "grad_norm": 2.1049153804779053, "learning_rate": 9.143267182733939e-07, "loss": 0.1823, "step": 16230 }, { "epoch": 0.8674402766835365, "grad_norm": 2.4029719829559326, "learning_rate": 9.071132466960509e-07, "loss": 0.1904, "step": 16240 }, { "epoch": 0.8679744147849426, "grad_norm": 2.5730371475219727, "learning_rate": 8.999269906966946e-07, "loss": 0.1851, "step": 16250 }, { "epoch": 0.8685085528863488, "grad_norm": 1.6669591665267944, "learning_rate": 8.927679717842408e-07, "loss": 0.1926, "step": 16260 }, { "epoch": 0.8690426909877549, "grad_norm": 1.6460821628570557, "learning_rate": 8.856362113860784e-07, "loss": 0.1871, "step": 16270 }, { "epoch": 0.869576829089161, "grad_norm": 2.2781178951263428, "learning_rate": 8.785317308480124e-07, "loss": 0.1998, "step": 16280 }, { "epoch": 0.8701109671905671, "grad_norm": 1.3480076789855957, "learning_rate": 8.714545514341943e-07, "loss": 0.1876, "step": 16290 }, { "epoch": 0.8706451052919733, "grad_norm": 1.8846122026443481, "learning_rate": 8.64404694327069e-07, "loss": 0.1958, "step": 16300 }, { "epoch": 0.8711792433933794, "grad_norm": 2.454969882965088, "learning_rate": 8.573821806272953e-07, "loss": 0.2039, "step": 16310 }, { "epoch": 0.8717133814947855, "grad_norm": 2.8521759510040283, "learning_rate": 8.503870313536944e-07, "loss": 0.1973, "step": 16320 }, { "epoch": 0.8722475195961916, "grad_norm": 1.405978798866272, "learning_rate": 8.434192674431896e-07, "loss": 0.17, "step": 16330 }, { "epoch": 0.8727816576975977, "grad_norm": 1.9971487522125244, "learning_rate": 8.364789097507309e-07, "loss": 0.183, "step": 16340 }, { "epoch": 0.8733157957990039, "grad_norm": 1.598315954208374, "learning_rate": 8.295659790492416e-07, "loss": 0.1967, "step": 16350 }, { "epoch": 0.87384993390041, "grad_norm": 2.0706303119659424, "learning_rate": 8.226804960295564e-07, "loss": 0.1925, "step": 16360 }, { "epoch": 0.8743840720018161, "grad_norm": 2.8099141120910645, "learning_rate": 8.158224813003557e-07, "loss": 0.2002, "step": 16370 }, { "epoch": 0.8749182101032222, "grad_norm": 1.952162265777588, "learning_rate": 8.089919553881054e-07, "loss": 0.1721, "step": 16380 }, { "epoch": 0.8754523482046284, "grad_norm": 2.706237316131592, "learning_rate": 8.021889387369941e-07, "loss": 0.1996, "step": 16390 }, { "epoch": 0.8759864863060344, "grad_norm": 1.569981575012207, "learning_rate": 7.954134517088807e-07, "loss": 0.1954, "step": 16400 }, { "epoch": 0.8765206244074405, "grad_norm": 1.5747971534729004, "learning_rate": 7.886655145832179e-07, "loss": 0.1751, "step": 16410 }, { "epoch": 0.8770547625088466, "grad_norm": 2.2496185302734375, "learning_rate": 7.819451475570028e-07, "loss": 0.1799, "step": 16420 }, { "epoch": 0.8775889006102527, "grad_norm": 1.9302244186401367, "learning_rate": 7.752523707447135e-07, "loss": 0.1868, "step": 16430 }, { "epoch": 0.8781230387116589, "grad_norm": 1.8425142765045166, "learning_rate": 7.685872041782505e-07, "loss": 0.189, "step": 16440 }, { "epoch": 0.878657176813065, "grad_norm": 2.0477631092071533, "learning_rate": 7.619496678068739e-07, "loss": 0.1971, "step": 16450 }, { "epoch": 0.8791913149144711, "grad_norm": 2.1096863746643066, "learning_rate": 7.553397814971408e-07, "loss": 0.1912, "step": 16460 }, { "epoch": 0.8797254530158772, "grad_norm": 1.4019670486450195, "learning_rate": 7.487575650328605e-07, "loss": 0.1862, "step": 16470 }, { "epoch": 0.8802595911172834, "grad_norm": 1.501746654510498, "learning_rate": 7.422030381150124e-07, "loss": 0.1901, "step": 16480 }, { "epoch": 0.8807937292186895, "grad_norm": 1.8003556728363037, "learning_rate": 7.356762203617051e-07, "loss": 0.1911, "step": 16490 }, { "epoch": 0.8813278673200956, "grad_norm": 1.7622851133346558, "learning_rate": 7.291771313081153e-07, "loss": 0.1854, "step": 16500 }, { "epoch": 0.8818620054215017, "grad_norm": 1.4670850038528442, "learning_rate": 7.227057904064205e-07, "loss": 0.1898, "step": 16510 }, { "epoch": 0.8823961435229078, "grad_norm": 2.17773175239563, "learning_rate": 7.162622170257483e-07, "loss": 0.1948, "step": 16520 }, { "epoch": 0.882930281624314, "grad_norm": 1.8604589700698853, "learning_rate": 7.098464304521135e-07, "loss": 0.1896, "step": 16530 }, { "epoch": 0.8834644197257201, "grad_norm": 1.981903314590454, "learning_rate": 7.034584498883712e-07, "loss": 0.2095, "step": 16540 }, { "epoch": 0.8839985578271262, "grad_norm": 1.7902601957321167, "learning_rate": 6.970982944541438e-07, "loss": 0.1826, "step": 16550 }, { "epoch": 0.8845326959285323, "grad_norm": 2.006857395172119, "learning_rate": 6.907659831857738e-07, "loss": 0.1775, "step": 16560 }, { "epoch": 0.8850668340299385, "grad_norm": 2.0654683113098145, "learning_rate": 6.844615350362626e-07, "loss": 0.1863, "step": 16570 }, { "epoch": 0.8856009721313446, "grad_norm": 2.291034460067749, "learning_rate": 6.781849688752218e-07, "loss": 0.2102, "step": 16580 }, { "epoch": 0.8861351102327507, "grad_norm": 1.6059356927871704, "learning_rate": 6.719363034888016e-07, "loss": 0.1806, "step": 16590 }, { "epoch": 0.8866692483341568, "grad_norm": 1.7951716184616089, "learning_rate": 6.657155575796481e-07, "loss": 0.1829, "step": 16600 }, { "epoch": 0.887203386435563, "grad_norm": 1.8367642164230347, "learning_rate": 6.595227497668455e-07, "loss": 0.1787, "step": 16610 }, { "epoch": 0.8877375245369691, "grad_norm": 1.3339611291885376, "learning_rate": 6.533578985858513e-07, "loss": 0.1897, "step": 16620 }, { "epoch": 0.8882716626383752, "grad_norm": 1.94770348072052, "learning_rate": 6.472210224884512e-07, "loss": 0.1789, "step": 16630 }, { "epoch": 0.8888058007397812, "grad_norm": 2.028087615966797, "learning_rate": 6.41112139842699e-07, "loss": 0.1943, "step": 16640 }, { "epoch": 0.8893399388411873, "grad_norm": 1.726192593574524, "learning_rate": 6.350312689328619e-07, "loss": 0.1934, "step": 16650 }, { "epoch": 0.8898740769425935, "grad_norm": 1.7308710813522339, "learning_rate": 6.289784279593669e-07, "loss": 0.1954, "step": 16660 }, { "epoch": 0.8904082150439996, "grad_norm": 1.5285080671310425, "learning_rate": 6.229536350387422e-07, "loss": 0.1772, "step": 16670 }, { "epoch": 0.8909423531454057, "grad_norm": 2.104999542236328, "learning_rate": 6.169569082035731e-07, "loss": 0.1876, "step": 16680 }, { "epoch": 0.8914764912468118, "grad_norm": 2.6350157260894775, "learning_rate": 6.109882654024357e-07, "loss": 0.1929, "step": 16690 }, { "epoch": 0.892010629348218, "grad_norm": 2.0990569591522217, "learning_rate": 6.050477244998521e-07, "loss": 0.1855, "step": 16700 }, { "epoch": 0.8925447674496241, "grad_norm": 2.6033666133880615, "learning_rate": 5.991353032762304e-07, "loss": 0.1956, "step": 16710 }, { "epoch": 0.8930789055510302, "grad_norm": 1.5166584253311157, "learning_rate": 5.932510194278174e-07, "loss": 0.1935, "step": 16720 }, { "epoch": 0.8936130436524363, "grad_norm": 1.4861478805541992, "learning_rate": 5.873948905666416e-07, "loss": 0.189, "step": 16730 }, { "epoch": 0.8941471817538424, "grad_norm": 2.2394187450408936, "learning_rate": 5.815669342204622e-07, "loss": 0.1842, "step": 16740 }, { "epoch": 0.8946813198552486, "grad_norm": 2.523660659790039, "learning_rate": 5.757671678327182e-07, "loss": 0.1885, "step": 16750 }, { "epoch": 0.8952154579566547, "grad_norm": 2.0616846084594727, "learning_rate": 5.699956087624725e-07, "loss": 0.1939, "step": 16760 }, { "epoch": 0.8957495960580608, "grad_norm": 2.5158727169036865, "learning_rate": 5.642522742843593e-07, "loss": 0.1796, "step": 16770 }, { "epoch": 0.8962837341594669, "grad_norm": 1.7550495862960815, "learning_rate": 5.58537181588541e-07, "loss": 0.177, "step": 16780 }, { "epoch": 0.8968178722608731, "grad_norm": 2.0261199474334717, "learning_rate": 5.528503477806446e-07, "loss": 0.2028, "step": 16790 }, { "epoch": 0.8973520103622792, "grad_norm": 1.9887953996658325, "learning_rate": 5.471917898817203e-07, "loss": 0.1894, "step": 16800 }, { "epoch": 0.8978861484636853, "grad_norm": 1.8601176738739014, "learning_rate": 5.415615248281836e-07, "loss": 0.185, "step": 16810 }, { "epoch": 0.8984202865650914, "grad_norm": 1.8554942607879639, "learning_rate": 5.359595694717723e-07, "loss": 0.188, "step": 16820 }, { "epoch": 0.8989544246664976, "grad_norm": 1.9678667783737183, "learning_rate": 5.303859405794854e-07, "loss": 0.1778, "step": 16830 }, { "epoch": 0.8994885627679037, "grad_norm": 2.0422017574310303, "learning_rate": 5.248406548335427e-07, "loss": 0.1967, "step": 16840 }, { "epoch": 0.9000227008693098, "grad_norm": 2.0389740467071533, "learning_rate": 5.193237288313336e-07, "loss": 0.1999, "step": 16850 }, { "epoch": 0.9005568389707159, "grad_norm": 1.7121886014938354, "learning_rate": 5.138351790853591e-07, "loss": 0.1905, "step": 16860 }, { "epoch": 0.901090977072122, "grad_norm": 1.9639217853546143, "learning_rate": 5.083750220231942e-07, "loss": 0.2036, "step": 16870 }, { "epoch": 0.9016251151735282, "grad_norm": 1.7823457717895508, "learning_rate": 5.029432739874263e-07, "loss": 0.1805, "step": 16880 }, { "epoch": 0.9021592532749342, "grad_norm": 2.196033477783203, "learning_rate": 4.975399512356205e-07, "loss": 0.1874, "step": 16890 }, { "epoch": 0.9026933913763403, "grad_norm": 1.6100378036499023, "learning_rate": 4.921650699402581e-07, "loss": 0.1846, "step": 16900 }, { "epoch": 0.9032275294777464, "grad_norm": 1.896017074584961, "learning_rate": 4.868186461886914e-07, "loss": 0.2044, "step": 16910 }, { "epoch": 0.9037616675791526, "grad_norm": 2.1476900577545166, "learning_rate": 4.815006959831059e-07, "loss": 0.1747, "step": 16920 }, { "epoch": 0.9042958056805587, "grad_norm": 2.1895558834075928, "learning_rate": 4.7621123524045463e-07, "loss": 0.1845, "step": 16930 }, { "epoch": 0.9048299437819648, "grad_norm": 2.1143078804016113, "learning_rate": 4.709502797924259e-07, "loss": 0.1969, "step": 16940 }, { "epoch": 0.9053640818833709, "grad_norm": 2.42761492729187, "learning_rate": 4.657178453853883e-07, "loss": 0.1895, "step": 16950 }, { "epoch": 0.905898219984777, "grad_norm": 1.7737916707992554, "learning_rate": 4.605139476803433e-07, "loss": 0.1826, "step": 16960 }, { "epoch": 0.9064323580861832, "grad_norm": 2.2270278930664062, "learning_rate": 4.5533860225288276e-07, "loss": 0.1796, "step": 16970 }, { "epoch": 0.9069664961875893, "grad_norm": 1.4609014987945557, "learning_rate": 4.501918245931369e-07, "loss": 0.2011, "step": 16980 }, { "epoch": 0.9075006342889954, "grad_norm": 1.2693812847137451, "learning_rate": 4.450736301057346e-07, "loss": 0.1743, "step": 16990 }, { "epoch": 0.9080347723904015, "grad_norm": 1.9739468097686768, "learning_rate": 4.399840341097528e-07, "loss": 0.1912, "step": 17000 }, { "epoch": 0.9085689104918077, "grad_norm": 2.2540767192840576, "learning_rate": 4.349230518386682e-07, "loss": 0.1759, "step": 17010 }, { "epoch": 0.9091030485932138, "grad_norm": 2.1931941509246826, "learning_rate": 4.298906984403162e-07, "loss": 0.1894, "step": 17020 }, { "epoch": 0.9096371866946199, "grad_norm": 1.6749526262283325, "learning_rate": 4.24886988976847e-07, "loss": 0.18, "step": 17030 }, { "epoch": 0.910171324796026, "grad_norm": 1.7118310928344727, "learning_rate": 4.1991193842467526e-07, "loss": 0.2042, "step": 17040 }, { "epoch": 0.9107054628974321, "grad_norm": 1.8347235918045044, "learning_rate": 4.1496556167443633e-07, "loss": 0.1804, "step": 17050 }, { "epoch": 0.9112396009988383, "grad_norm": 1.437732458114624, "learning_rate": 4.1004787353094657e-07, "loss": 0.1931, "step": 17060 }, { "epoch": 0.9117737391002444, "grad_norm": 2.4131076335906982, "learning_rate": 4.051588887131519e-07, "loss": 0.2024, "step": 17070 }, { "epoch": 0.9123078772016505, "grad_norm": 2.3994522094726562, "learning_rate": 4.002986218540905e-07, "loss": 0.1848, "step": 17080 }, { "epoch": 0.9128420153030566, "grad_norm": 2.041776180267334, "learning_rate": 3.954670875008448e-07, "loss": 0.1938, "step": 17090 }, { "epoch": 0.9133761534044628, "grad_norm": 1.7319960594177246, "learning_rate": 3.906643001144983e-07, "loss": 0.195, "step": 17100 }, { "epoch": 0.9139102915058689, "grad_norm": 2.2646522521972656, "learning_rate": 3.8589027407009426e-07, "loss": 0.1997, "step": 17110 }, { "epoch": 0.914444429607275, "grad_norm": 1.977262020111084, "learning_rate": 3.8114502365658944e-07, "loss": 0.1886, "step": 17120 }, { "epoch": 0.9149785677086811, "grad_norm": 1.6513243913650513, "learning_rate": 3.7642856307681606e-07, "loss": 0.191, "step": 17130 }, { "epoch": 0.9155127058100871, "grad_norm": 2.022233724594116, "learning_rate": 3.7174090644743533e-07, "loss": 0.1756, "step": 17140 }, { "epoch": 0.9160468439114933, "grad_norm": 2.328139543533325, "learning_rate": 3.6708206779889623e-07, "loss": 0.1775, "step": 17150 }, { "epoch": 0.9165809820128994, "grad_norm": 2.2181718349456787, "learning_rate": 3.624520610753923e-07, "loss": 0.1975, "step": 17160 }, { "epoch": 0.9171151201143055, "grad_norm": 1.7477359771728516, "learning_rate": 3.5785090013482517e-07, "loss": 0.1919, "step": 17170 }, { "epoch": 0.9176492582157116, "grad_norm": 1.9032509326934814, "learning_rate": 3.5327859874875637e-07, "loss": 0.1944, "step": 17180 }, { "epoch": 0.9181833963171178, "grad_norm": 2.6710636615753174, "learning_rate": 3.487351706023656e-07, "loss": 0.1921, "step": 17190 }, { "epoch": 0.9187175344185239, "grad_norm": 2.6287708282470703, "learning_rate": 3.442206292944228e-07, "loss": 0.1893, "step": 17200 }, { "epoch": 0.91925167251993, "grad_norm": 1.442428708076477, "learning_rate": 3.397349883372303e-07, "loss": 0.177, "step": 17210 }, { "epoch": 0.9197858106213361, "grad_norm": 1.6243791580200195, "learning_rate": 3.3527826115658856e-07, "loss": 0.1942, "step": 17220 }, { "epoch": 0.9203199487227423, "grad_norm": 1.9001173973083496, "learning_rate": 3.308504610917651e-07, "loss": 0.1977, "step": 17230 }, { "epoch": 0.9208540868241484, "grad_norm": 1.901663064956665, "learning_rate": 3.2645160139543887e-07, "loss": 0.1792, "step": 17240 }, { "epoch": 0.9213882249255545, "grad_norm": 2.1770408153533936, "learning_rate": 3.220816952336725e-07, "loss": 0.1823, "step": 17250 }, { "epoch": 0.9219223630269606, "grad_norm": 2.0881235599517822, "learning_rate": 3.1774075568586807e-07, "loss": 0.183, "step": 17260 }, { "epoch": 0.9224565011283667, "grad_norm": 2.0422818660736084, "learning_rate": 3.1342879574472904e-07, "loss": 0.193, "step": 17270 }, { "epoch": 0.9229906392297729, "grad_norm": 1.4658859968185425, "learning_rate": 3.0914582831622185e-07, "loss": 0.1864, "step": 17280 }, { "epoch": 0.923524777331179, "grad_norm": 1.9932317733764648, "learning_rate": 3.048918662195333e-07, "loss": 0.1736, "step": 17290 }, { "epoch": 0.9240589154325851, "grad_norm": 1.7939445972442627, "learning_rate": 3.0066692218703955e-07, "loss": 0.191, "step": 17300 }, { "epoch": 0.9245930535339912, "grad_norm": 2.5735878944396973, "learning_rate": 2.964710088642608e-07, "loss": 0.1636, "step": 17310 }, { "epoch": 0.9251271916353974, "grad_norm": 2.337960958480835, "learning_rate": 2.92304138809828e-07, "loss": 0.1939, "step": 17320 }, { "epoch": 0.9256613297368035, "grad_norm": 1.7035138607025146, "learning_rate": 2.8816632449544246e-07, "loss": 0.1825, "step": 17330 }, { "epoch": 0.9261954678382096, "grad_norm": 2.102066993713379, "learning_rate": 2.84057578305843e-07, "loss": 0.1878, "step": 17340 }, { "epoch": 0.9267296059396157, "grad_norm": 2.1644673347473145, "learning_rate": 2.799779125387625e-07, "loss": 0.1993, "step": 17350 }, { "epoch": 0.9272637440410219, "grad_norm": 1.6726597547531128, "learning_rate": 2.7592733940489445e-07, "loss": 0.1853, "step": 17360 }, { "epoch": 0.927797882142428, "grad_norm": 2.182621717453003, "learning_rate": 2.7190587102785993e-07, "loss": 0.1789, "step": 17370 }, { "epoch": 0.928332020243834, "grad_norm": 1.689612627029419, "learning_rate": 2.679135194441629e-07, "loss": 0.1953, "step": 17380 }, { "epoch": 0.9288661583452401, "grad_norm": 1.6947296857833862, "learning_rate": 2.639502966031615e-07, "loss": 0.1883, "step": 17390 }, { "epoch": 0.9294002964466462, "grad_norm": 2.23433256149292, "learning_rate": 2.6001621436702816e-07, "loss": 0.1733, "step": 17400 }, { "epoch": 0.9299344345480524, "grad_norm": 2.372011423110962, "learning_rate": 2.5611128451071834e-07, "loss": 0.1907, "step": 17410 }, { "epoch": 0.9304685726494585, "grad_norm": 1.9081236124038696, "learning_rate": 2.522355187219305e-07, "loss": 0.1948, "step": 17420 }, { "epoch": 0.9310027107508646, "grad_norm": 2.2157649993896484, "learning_rate": 2.483889286010732e-07, "loss": 0.1883, "step": 17430 }, { "epoch": 0.9315368488522707, "grad_norm": 3.0452721118927, "learning_rate": 2.4457152566123135e-07, "loss": 0.1861, "step": 17440 }, { "epoch": 0.9320709869536768, "grad_norm": 1.894029140472412, "learning_rate": 2.4078332132813076e-07, "loss": 0.1816, "step": 17450 }, { "epoch": 0.932605125055083, "grad_norm": 1.9587960243225098, "learning_rate": 2.370243269401029e-07, "loss": 0.1883, "step": 17460 }, { "epoch": 0.9331392631564891, "grad_norm": 1.8781605958938599, "learning_rate": 2.3329455374805243e-07, "loss": 0.1943, "step": 17470 }, { "epoch": 0.9336734012578952, "grad_norm": 1.892554759979248, "learning_rate": 2.2959401291542504e-07, "loss": 0.1913, "step": 17480 }, { "epoch": 0.9342075393593013, "grad_norm": 1.8163005113601685, "learning_rate": 2.259227155181687e-07, "loss": 0.1716, "step": 17490 }, { "epoch": 0.9347416774607075, "grad_norm": 2.2035508155822754, "learning_rate": 2.2228067254470686e-07, "loss": 0.1923, "step": 17500 }, { "epoch": 0.9352758155621136, "grad_norm": 1.8636702299118042, "learning_rate": 2.18667894895902e-07, "loss": 0.1847, "step": 17510 }, { "epoch": 0.9358099536635197, "grad_norm": 2.3409037590026855, "learning_rate": 2.150843933850244e-07, "loss": 0.1852, "step": 17520 }, { "epoch": 0.9363440917649258, "grad_norm": 1.816667914390564, "learning_rate": 2.115301787377133e-07, "loss": 0.1894, "step": 17530 }, { "epoch": 0.936878229866332, "grad_norm": 1.9482243061065674, "learning_rate": 2.0800526159196034e-07, "loss": 0.2046, "step": 17540 }, { "epoch": 0.9374123679677381, "grad_norm": 2.2246570587158203, "learning_rate": 2.0450965249805943e-07, "loss": 0.189, "step": 17550 }, { "epoch": 0.9379465060691442, "grad_norm": 1.8416835069656372, "learning_rate": 2.010433619185881e-07, "loss": 0.2046, "step": 17560 }, { "epoch": 0.9384806441705503, "grad_norm": 2.3019700050354004, "learning_rate": 1.9760640022837064e-07, "loss": 0.1971, "step": 17570 }, { "epoch": 0.9390147822719564, "grad_norm": 2.3936755657196045, "learning_rate": 1.9419877771444716e-07, "loss": 0.1763, "step": 17580 }, { "epoch": 0.9395489203733626, "grad_norm": 1.7260682582855225, "learning_rate": 1.9082050457604695e-07, "loss": 0.1826, "step": 17590 }, { "epoch": 0.9400830584747687, "grad_norm": 2.2817585468292236, "learning_rate": 1.8747159092454948e-07, "loss": 0.1799, "step": 17600 }, { "epoch": 0.9406171965761748, "grad_norm": 2.082606315612793, "learning_rate": 1.8415204678346456e-07, "loss": 0.1888, "step": 17610 }, { "epoch": 0.9411513346775809, "grad_norm": 2.1301021575927734, "learning_rate": 1.808618820883945e-07, "loss": 0.1785, "step": 17620 }, { "epoch": 0.941685472778987, "grad_norm": 2.3053362369537354, "learning_rate": 1.7760110668700758e-07, "loss": 0.1847, "step": 17630 }, { "epoch": 0.9422196108803931, "grad_norm": 1.9152814149856567, "learning_rate": 1.7436973033900794e-07, "loss": 0.2008, "step": 17640 }, { "epoch": 0.9427537489817992, "grad_norm": 2.090841054916382, "learning_rate": 1.711677627161079e-07, "loss": 0.1721, "step": 17650 }, { "epoch": 0.9432878870832053, "grad_norm": 2.1163957118988037, "learning_rate": 1.6799521340199688e-07, "loss": 0.1851, "step": 17660 }, { "epoch": 0.9438220251846114, "grad_norm": 1.4737683534622192, "learning_rate": 1.648520918923102e-07, "loss": 0.1861, "step": 17670 }, { "epoch": 0.9443561632860176, "grad_norm": 1.39410400390625, "learning_rate": 1.6173840759461047e-07, "loss": 0.1898, "step": 17680 }, { "epoch": 0.9448903013874237, "grad_norm": 1.6344431638717651, "learning_rate": 1.5865416982834615e-07, "loss": 0.1914, "step": 17690 }, { "epoch": 0.9454244394888298, "grad_norm": 1.9974606037139893, "learning_rate": 1.555993878248363e-07, "loss": 0.1961, "step": 17700 }, { "epoch": 0.9459585775902359, "grad_norm": 2.471661329269409, "learning_rate": 1.5257407072723273e-07, "loss": 0.1861, "step": 17710 }, { "epoch": 0.9464927156916421, "grad_norm": 2.121466875076294, "learning_rate": 1.495782275904978e-07, "loss": 0.1874, "step": 17720 }, { "epoch": 0.9470268537930482, "grad_norm": 1.76398503780365, "learning_rate": 1.466118673813799e-07, "loss": 0.1767, "step": 17730 }, { "epoch": 0.9475609918944543, "grad_norm": 2.6827852725982666, "learning_rate": 1.43674998978377e-07, "loss": 0.1776, "step": 17740 }, { "epoch": 0.9480951299958604, "grad_norm": 2.599290609359741, "learning_rate": 1.407676311717221e-07, "loss": 0.1921, "step": 17750 }, { "epoch": 0.9486292680972666, "grad_norm": 1.9904946088790894, "learning_rate": 1.3788977266334768e-07, "loss": 0.175, "step": 17760 }, { "epoch": 0.9491634061986727, "grad_norm": 1.5285472869873047, "learning_rate": 1.350414320668636e-07, "loss": 0.1941, "step": 17770 }, { "epoch": 0.9496975443000788, "grad_norm": 1.675504207611084, "learning_rate": 1.3222261790753145e-07, "loss": 0.1672, "step": 17780 }, { "epoch": 0.9502316824014849, "grad_norm": 2.212117910385132, "learning_rate": 1.2943333862223906e-07, "loss": 0.1788, "step": 17790 }, { "epoch": 0.950765820502891, "grad_norm": 1.8942610025405884, "learning_rate": 1.266736025594717e-07, "loss": 0.1893, "step": 17800 }, { "epoch": 0.9512999586042972, "grad_norm": 2.145646333694458, "learning_rate": 1.239434179792931e-07, "loss": 0.1941, "step": 17810 }, { "epoch": 0.9518340967057033, "grad_norm": 1.7522659301757812, "learning_rate": 1.2124279305331666e-07, "loss": 0.1924, "step": 17820 }, { "epoch": 0.9523682348071094, "grad_norm": 2.163177013397217, "learning_rate": 1.1857173586468096e-07, "loss": 0.1857, "step": 17830 }, { "epoch": 0.9529023729085155, "grad_norm": 1.848199725151062, "learning_rate": 1.1593025440802652e-07, "loss": 0.1722, "step": 17840 }, { "epoch": 0.9534365110099217, "grad_norm": 1.4711211919784546, "learning_rate": 1.1331835658947244e-07, "loss": 0.1814, "step": 17850 }, { "epoch": 0.9539706491113278, "grad_norm": 1.6050140857696533, "learning_rate": 1.1073605022659196e-07, "loss": 0.1736, "step": 17860 }, { "epoch": 0.9545047872127339, "grad_norm": 2.0338876247406006, "learning_rate": 1.0818334304839029e-07, "loss": 0.1855, "step": 17870 }, { "epoch": 0.9550389253141399, "grad_norm": 2.5119521617889404, "learning_rate": 1.0566024269527797e-07, "loss": 0.1918, "step": 17880 }, { "epoch": 0.955573063415546, "grad_norm": 2.297657012939453, "learning_rate": 1.0316675671905307e-07, "loss": 0.1871, "step": 17890 }, { "epoch": 0.9561072015169522, "grad_norm": 2.1027920246124268, "learning_rate": 1.007028925828757e-07, "loss": 0.1857, "step": 17900 }, { "epoch": 0.9566413396183583, "grad_norm": 1.868464708328247, "learning_rate": 9.826865766124239e-08, "loss": 0.1817, "step": 17910 }, { "epoch": 0.9571754777197644, "grad_norm": 1.4649710655212402, "learning_rate": 9.586405923997177e-08, "loss": 0.1796, "step": 17920 }, { "epoch": 0.9577096158211705, "grad_norm": 2.4451744556427, "learning_rate": 9.348910451617787e-08, "loss": 0.1825, "step": 17930 }, { "epoch": 0.9582437539225767, "grad_norm": 1.2706432342529297, "learning_rate": 9.114380059824679e-08, "loss": 0.1763, "step": 17940 }, { "epoch": 0.9587778920239828, "grad_norm": 1.829909324645996, "learning_rate": 8.882815450582117e-08, "loss": 0.1778, "step": 17950 }, { "epoch": 0.9593120301253889, "grad_norm": 1.8753759860992432, "learning_rate": 8.654217316977465e-08, "loss": 0.1824, "step": 17960 }, { "epoch": 0.959846168226795, "grad_norm": 1.550188422203064, "learning_rate": 8.42858634321908e-08, "loss": 0.1711, "step": 17970 }, { "epoch": 0.9603803063282011, "grad_norm": 1.9117953777313232, "learning_rate": 8.205923204634647e-08, "loss": 0.1936, "step": 17980 }, { "epoch": 0.9609144444296073, "grad_norm": 2.9609880447387695, "learning_rate": 7.98622856766873e-08, "loss": 0.1989, "step": 17990 }, { "epoch": 0.9614485825310134, "grad_norm": 2.214887857437134, "learning_rate": 7.769503089881337e-08, "loss": 0.1934, "step": 18000 }, { "epoch": 0.9619827206324195, "grad_norm": 1.415905237197876, "learning_rate": 7.555747419945137e-08, "loss": 0.1819, "step": 18010 }, { "epoch": 0.9625168587338256, "grad_norm": 1.8760441541671753, "learning_rate": 7.344962197644134e-08, "loss": 0.1884, "step": 18020 }, { "epoch": 0.9630509968352318, "grad_norm": 2.0158016681671143, "learning_rate": 7.137148053872e-08, "loss": 0.1914, "step": 18030 }, { "epoch": 0.9635851349366379, "grad_norm": 2.0778441429138184, "learning_rate": 6.932305610629186e-08, "loss": 0.1891, "step": 18040 }, { "epoch": 0.964119273038044, "grad_norm": 2.7099874019622803, "learning_rate": 6.730435481021925e-08, "loss": 0.1773, "step": 18050 }, { "epoch": 0.9646534111394501, "grad_norm": 1.9147902727127075, "learning_rate": 6.531538269260229e-08, "loss": 0.1898, "step": 18060 }, { "epoch": 0.9651875492408563, "grad_norm": 1.674359679222107, "learning_rate": 6.33561457065579e-08, "loss": 0.1935, "step": 18070 }, { "epoch": 0.9657216873422624, "grad_norm": 1.6331712007522583, "learning_rate": 6.142664971620637e-08, "loss": 0.1834, "step": 18080 }, { "epoch": 0.9662558254436685, "grad_norm": 1.6614782810211182, "learning_rate": 5.9526900496647e-08, "loss": 0.1868, "step": 18090 }, { "epoch": 0.9667899635450746, "grad_norm": 2.1908490657806396, "learning_rate": 5.7656903733950274e-08, "loss": 0.1886, "step": 18100 }, { "epoch": 0.9673241016464807, "grad_norm": 2.695566415786743, "learning_rate": 5.5816665025132386e-08, "loss": 0.1873, "step": 18110 }, { "epoch": 0.9678582397478868, "grad_norm": 1.1855067014694214, "learning_rate": 5.400618987814188e-08, "loss": 0.1831, "step": 18120 }, { "epoch": 0.9683923778492929, "grad_norm": 2.448911428451538, "learning_rate": 5.2225483711845216e-08, "loss": 0.1749, "step": 18130 }, { "epoch": 0.968926515950699, "grad_norm": 2.373223304748535, "learning_rate": 5.0474551856006804e-08, "loss": 0.183, "step": 18140 }, { "epoch": 0.9694606540521051, "grad_norm": 1.2666983604431152, "learning_rate": 4.875339955127567e-08, "loss": 0.1739, "step": 18150 }, { "epoch": 0.9699947921535113, "grad_norm": 2.336507558822632, "learning_rate": 4.7062031949166587e-08, "loss": 0.1868, "step": 18160 }, { "epoch": 0.9705289302549174, "grad_norm": 1.3432281017303467, "learning_rate": 4.540045411205007e-08, "loss": 0.1851, "step": 18170 }, { "epoch": 0.9710630683563235, "grad_norm": 1.754212498664856, "learning_rate": 4.376867101313131e-08, "loss": 0.1961, "step": 18180 }, { "epoch": 0.9715972064577296, "grad_norm": 1.8174265623092651, "learning_rate": 4.216668753643904e-08, "loss": 0.1953, "step": 18190 }, { "epoch": 0.9721313445591357, "grad_norm": 1.4010118246078491, "learning_rate": 4.0594508476810016e-08, "loss": 0.1899, "step": 18200 }, { "epoch": 0.9726654826605419, "grad_norm": 2.141456127166748, "learning_rate": 3.905213853987455e-08, "loss": 0.1848, "step": 18210 }, { "epoch": 0.973199620761948, "grad_norm": 2.2945010662078857, "learning_rate": 3.753958234204324e-08, "loss": 0.1983, "step": 18220 }, { "epoch": 0.9737337588633541, "grad_norm": 1.7072279453277588, "learning_rate": 3.605684441048918e-08, "loss": 0.1869, "step": 18230 }, { "epoch": 0.9742678969647602, "grad_norm": 2.0583059787750244, "learning_rate": 3.460392918314126e-08, "loss": 0.1916, "step": 18240 }, { "epoch": 0.9748020350661664, "grad_norm": 1.362908124923706, "learning_rate": 3.318084100866426e-08, "loss": 0.1852, "step": 18250 }, { "epoch": 0.9753361731675725, "grad_norm": 1.696120023727417, "learning_rate": 3.1787584146451004e-08, "loss": 0.1873, "step": 18260 }, { "epoch": 0.9758703112689786, "grad_norm": 2.0516366958618164, "learning_rate": 3.042416276660576e-08, "loss": 0.1803, "step": 18270 }, { "epoch": 0.9764044493703847, "grad_norm": 1.7424355745315552, "learning_rate": 2.9090580949934223e-08, "loss": 0.2001, "step": 18280 }, { "epoch": 0.9769385874717909, "grad_norm": 2.2013845443725586, "learning_rate": 2.7786842687929083e-08, "loss": 0.1829, "step": 18290 }, { "epoch": 0.977472725573197, "grad_norm": 2.2366421222686768, "learning_rate": 2.6512951882761152e-08, "loss": 0.1797, "step": 18300 }, { "epoch": 0.9780068636746031, "grad_norm": 2.1376302242279053, "learning_rate": 2.526891234726603e-08, "loss": 0.1846, "step": 18310 }, { "epoch": 0.9785410017760092, "grad_norm": 1.9640488624572754, "learning_rate": 2.405472780493079e-08, "loss": 0.188, "step": 18320 }, { "epoch": 0.9790751398774153, "grad_norm": 1.8197063207626343, "learning_rate": 2.2870401889885096e-08, "loss": 0.1796, "step": 18330 }, { "epoch": 0.9796092779788215, "grad_norm": 1.6603585481643677, "learning_rate": 2.1715938146892322e-08, "loss": 0.1864, "step": 18340 }, { "epoch": 0.9801434160802276, "grad_norm": 2.199291229248047, "learning_rate": 2.0591340031331787e-08, "loss": 0.1854, "step": 18350 }, { "epoch": 0.9806775541816337, "grad_norm": 1.8905895948410034, "learning_rate": 1.9496610909197632e-08, "loss": 0.1894, "step": 18360 }, { "epoch": 0.9812116922830397, "grad_norm": 2.5007681846618652, "learning_rate": 1.8431754057082196e-08, "loss": 0.1916, "step": 18370 }, { "epoch": 0.9817458303844459, "grad_norm": 2.31174635887146, "learning_rate": 1.7396772662169327e-08, "loss": 0.1879, "step": 18380 }, { "epoch": 0.982279968485852, "grad_norm": 1.798823595046997, "learning_rate": 1.639166982222107e-08, "loss": 0.1693, "step": 18390 }, { "epoch": 0.9828141065872581, "grad_norm": 1.9509938955307007, "learning_rate": 1.5416448545574336e-08, "loss": 0.1747, "step": 18400 }, { "epoch": 0.9833482446886642, "grad_norm": 2.527303695678711, "learning_rate": 1.4471111751127587e-08, "loss": 0.1821, "step": 18410 }, { "epoch": 0.9838823827900703, "grad_norm": 2.673835515975952, "learning_rate": 1.3555662268331937e-08, "loss": 0.1831, "step": 18420 }, { "epoch": 0.9844165208914765, "grad_norm": 1.5855772495269775, "learning_rate": 1.2670102837185617e-08, "loss": 0.1775, "step": 18430 }, { "epoch": 0.9849506589928826, "grad_norm": 2.064337730407715, "learning_rate": 1.181443610822397e-08, "loss": 0.1841, "step": 18440 }, { "epoch": 0.9854847970942887, "grad_norm": 2.5309932231903076, "learning_rate": 1.0988664642508362e-08, "loss": 0.1795, "step": 18450 }, { "epoch": 0.9860189351956948, "grad_norm": 1.8916162252426147, "learning_rate": 1.0192790911627281e-08, "loss": 0.1883, "step": 18460 }, { "epoch": 0.986553073297101, "grad_norm": 1.7224555015563965, "learning_rate": 9.42681729767858e-09, "loss": 0.189, "step": 18470 }, { "epoch": 0.9870872113985071, "grad_norm": 1.944437026977539, "learning_rate": 8.690746093270586e-09, "loss": 0.1957, "step": 18480 }, { "epoch": 0.9876213494999132, "grad_norm": 1.5178524255752563, "learning_rate": 7.98457950150877e-09, "loss": 0.1963, "step": 18490 }, { "epoch": 0.9881554876013193, "grad_norm": 1.7362507581710815, "learning_rate": 7.3083196359957645e-09, "loss": 0.1826, "step": 18500 }, { "epoch": 0.9886896257027254, "grad_norm": 1.716273307800293, "learning_rate": 6.661968520816909e-09, "loss": 0.1926, "step": 18510 }, { "epoch": 0.9892237638041316, "grad_norm": 2.0119266510009766, "learning_rate": 6.045528090544705e-09, "loss": 0.1936, "step": 18520 }, { "epoch": 0.9897579019055377, "grad_norm": 1.300718069076538, "learning_rate": 5.459000190221053e-09, "loss": 0.1742, "step": 18530 }, { "epoch": 0.9902920400069438, "grad_norm": 1.6036467552185059, "learning_rate": 4.902386575362794e-09, "loss": 0.1905, "step": 18540 }, { "epoch": 0.9908261781083499, "grad_norm": 1.5090267658233643, "learning_rate": 4.375688911947285e-09, "loss": 0.1742, "step": 18550 }, { "epoch": 0.9913603162097561, "grad_norm": 2.0474181175231934, "learning_rate": 3.8789087764146135e-09, "loss": 0.1878, "step": 18560 }, { "epoch": 0.9918944543111622, "grad_norm": 2.2800419330596924, "learning_rate": 3.4120476556598337e-09, "loss": 0.1899, "step": 18570 }, { "epoch": 0.9924285924125683, "grad_norm": 1.3325692415237427, "learning_rate": 2.975106947025186e-09, "loss": 0.1799, "step": 18580 }, { "epoch": 0.9929627305139744, "grad_norm": 2.4395177364349365, "learning_rate": 2.5680879583023234e-09, "loss": 0.1765, "step": 18590 }, { "epoch": 0.9934968686153806, "grad_norm": 1.7495089769363403, "learning_rate": 2.190991907724538e-09, "loss": 0.1784, "step": 18600 }, { "epoch": 0.9940310067167867, "grad_norm": 1.7918496131896973, "learning_rate": 1.8438199239645404e-09, "loss": 0.1897, "step": 18610 }, { "epoch": 0.9945651448181927, "grad_norm": 1.985832929611206, "learning_rate": 1.52657304613002e-09, "loss": 0.1871, "step": 18620 }, { "epoch": 0.9950992829195988, "grad_norm": 2.1160855293273926, "learning_rate": 1.239252223759202e-09, "loss": 0.1871, "step": 18630 }, { "epoch": 0.9956334210210049, "grad_norm": 2.2004637718200684, "learning_rate": 9.818583168219598e-10, "loss": 0.198, "step": 18640 }, { "epoch": 0.9961675591224111, "grad_norm": 2.338730573654175, "learning_rate": 7.543920957142625e-10, "loss": 0.1871, "step": 18650 }, { "epoch": 0.9967016972238172, "grad_norm": 1.8197417259216309, "learning_rate": 5.568542412570654e-10, "loss": 0.1916, "step": 18660 }, { "epoch": 0.9972358353252233, "grad_norm": 2.7354938983917236, "learning_rate": 3.892453446929789e-10, "loss": 0.1843, "step": 18670 }, { "epoch": 0.9977699734266294, "grad_norm": 2.048656463623047, "learning_rate": 2.515659076862687e-10, "loss": 0.1786, "step": 18680 }, { "epoch": 0.9983041115280356, "grad_norm": 2.941896677017212, "learning_rate": 1.438163423195249e-10, "loss": 0.1945, "step": 18690 }, { "epoch": 0.9988382496294417, "grad_norm": 2.057656764984131, "learning_rate": 6.599697109477277e-11, "loss": 0.1858, "step": 18700 }, { "epoch": 0.9993723877308478, "grad_norm": 2.0866317749023438, "learning_rate": 1.8108026930141464e-11, "loss": 0.1851, "step": 18710 }, { "epoch": 0.9999065258322539, "grad_norm": 2.5661983489990234, "learning_rate": 1.4965315986437134e-13, "loss": 0.2027, "step": 18720 }, { "epoch": 0.9999599396423945, "step": 18721, "total_flos": 1.6329913003556209e+18, "train_loss": 0.26040137486703363, "train_runtime": 103455.5903, "train_samples_per_second": 17.372, "train_steps_per_second": 0.181 } ], "logging_steps": 10, "max_steps": 18721, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6329913003556209e+18, "train_batch_size": 12, "trial_name": null, "trial_params": null }