{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.49999647479359915,
"eval_steps": 500,
"global_step": 26594,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000376022016089042,
"grad_norm": 69.5,
"learning_rate": 1.9843342036553526e-08,
"loss": 2.5216,
"step": 20
},
{
"epoch": 0.000752044032178084,
"grad_norm": 75.0,
"learning_rate": 4.073107049608355e-08,
"loss": 2.4632,
"step": 40
},
{
"epoch": 0.001128066048267126,
"grad_norm": 109.0,
"learning_rate": 6.161879895561358e-08,
"loss": 2.5604,
"step": 60
},
{
"epoch": 0.001504088064356168,
"grad_norm": 48.5,
"learning_rate": 8.250652741514362e-08,
"loss": 2.4744,
"step": 80
},
{
"epoch": 0.0018801100804452101,
"grad_norm": 45.25,
"learning_rate": 1.0339425587467364e-07,
"loss": 2.5512,
"step": 100
},
{
"epoch": 0.002256132096534252,
"grad_norm": 38.25,
"learning_rate": 1.2428198433420367e-07,
"loss": 2.4959,
"step": 120
},
{
"epoch": 0.002632154112623294,
"grad_norm": 30.5,
"learning_rate": 1.451697127937337e-07,
"loss": 2.5159,
"step": 140
},
{
"epoch": 0.003008176128712336,
"grad_norm": 29.0,
"learning_rate": 1.660574412532637e-07,
"loss": 2.5399,
"step": 160
},
{
"epoch": 0.003384198144801378,
"grad_norm": 29.625,
"learning_rate": 1.8694516971279375e-07,
"loss": 2.4812,
"step": 180
},
{
"epoch": 0.0037602201608904202,
"grad_norm": 25.0,
"learning_rate": 2.0783289817232378e-07,
"loss": 2.4797,
"step": 200
},
{
"epoch": 0.004136242176979462,
"grad_norm": 24.625,
"learning_rate": 2.2872062663185383e-07,
"loss": 2.4898,
"step": 220
},
{
"epoch": 0.004512264193068504,
"grad_norm": 17.0,
"learning_rate": 2.4960835509138383e-07,
"loss": 2.4359,
"step": 240
},
{
"epoch": 0.004888286209157546,
"grad_norm": 39.75,
"learning_rate": 2.7049608355091385e-07,
"loss": 2.4451,
"step": 260
},
{
"epoch": 0.005264308225246588,
"grad_norm": 17.75,
"learning_rate": 2.913838120104439e-07,
"loss": 2.4747,
"step": 280
},
{
"epoch": 0.00564033024133563,
"grad_norm": 19.75,
"learning_rate": 3.122715404699739e-07,
"loss": 2.4672,
"step": 300
},
{
"epoch": 0.006016352257424672,
"grad_norm": 27.125,
"learning_rate": 3.3315926892950393e-07,
"loss": 2.44,
"step": 320
},
{
"epoch": 0.006392374273513714,
"grad_norm": 19.75,
"learning_rate": 3.5404699738903396e-07,
"loss": 2.494,
"step": 340
},
{
"epoch": 0.006768396289602756,
"grad_norm": 39.25,
"learning_rate": 3.7493472584856404e-07,
"loss": 2.4068,
"step": 360
},
{
"epoch": 0.007144418305691798,
"grad_norm": 27.75,
"learning_rate": 3.95822454308094e-07,
"loss": 2.3509,
"step": 380
},
{
"epoch": 0.0075204403217808405,
"grad_norm": 14.9375,
"learning_rate": 4.1671018276762403e-07,
"loss": 2.3596,
"step": 400
},
{
"epoch": 0.007896462337869883,
"grad_norm": 21.5,
"learning_rate": 4.375979112271541e-07,
"loss": 2.4322,
"step": 420
},
{
"epoch": 0.008272484353958925,
"grad_norm": 17.25,
"learning_rate": 4.584856396866841e-07,
"loss": 2.4769,
"step": 440
},
{
"epoch": 0.008648506370047966,
"grad_norm": 24.375,
"learning_rate": 4.793733681462142e-07,
"loss": 2.3957,
"step": 460
},
{
"epoch": 0.009024528386137008,
"grad_norm": 16.875,
"learning_rate": 5.002610966057442e-07,
"loss": 2.4445,
"step": 480
},
{
"epoch": 0.00940055040222605,
"grad_norm": 21.75,
"learning_rate": 5.211488250652742e-07,
"loss": 2.4009,
"step": 500
},
{
"epoch": 0.009776572418315092,
"grad_norm": 26.375,
"learning_rate": 5.420365535248042e-07,
"loss": 2.3618,
"step": 520
},
{
"epoch": 0.010152594434404135,
"grad_norm": 24.5,
"learning_rate": 5.629242819843343e-07,
"loss": 2.3718,
"step": 540
},
{
"epoch": 0.010528616450493177,
"grad_norm": 19.875,
"learning_rate": 5.838120104438643e-07,
"loss": 2.3708,
"step": 560
},
{
"epoch": 0.010904638466582219,
"grad_norm": 18.875,
"learning_rate": 6.046997389033943e-07,
"loss": 2.4253,
"step": 580
},
{
"epoch": 0.01128066048267126,
"grad_norm": 28.0,
"learning_rate": 6.255874673629243e-07,
"loss": 2.3592,
"step": 600
},
{
"epoch": 0.011656682498760302,
"grad_norm": 32.25,
"learning_rate": 6.464751958224544e-07,
"loss": 2.3199,
"step": 620
},
{
"epoch": 0.012032704514849344,
"grad_norm": 33.25,
"learning_rate": 6.673629242819844e-07,
"loss": 2.3505,
"step": 640
},
{
"epoch": 0.012408726530938387,
"grad_norm": 41.0,
"learning_rate": 6.882506527415145e-07,
"loss": 2.3872,
"step": 660
},
{
"epoch": 0.012784748547027429,
"grad_norm": 15.625,
"learning_rate": 7.091383812010443e-07,
"loss": 2.3008,
"step": 680
},
{
"epoch": 0.01316077056311647,
"grad_norm": 31.5,
"learning_rate": 7.300261096605745e-07,
"loss": 2.168,
"step": 700
},
{
"epoch": 0.013536792579205512,
"grad_norm": 71.0,
"learning_rate": 7.509138381201045e-07,
"loss": 2.2318,
"step": 720
},
{
"epoch": 0.013912814595294554,
"grad_norm": 32.25,
"learning_rate": 7.718015665796345e-07,
"loss": 2.2759,
"step": 740
},
{
"epoch": 0.014288836611383596,
"grad_norm": 71.0,
"learning_rate": 7.926892950391646e-07,
"loss": 2.2838,
"step": 760
},
{
"epoch": 0.01466485862747264,
"grad_norm": 37.0,
"learning_rate": 8.135770234986947e-07,
"loss": 2.2449,
"step": 780
},
{
"epoch": 0.015040880643561681,
"grad_norm": 100.0,
"learning_rate": 8.344647519582245e-07,
"loss": 2.2566,
"step": 800
},
{
"epoch": 0.015416902659650723,
"grad_norm": 52.5,
"learning_rate": 8.553524804177546e-07,
"loss": 2.2765,
"step": 820
},
{
"epoch": 0.015792924675739766,
"grad_norm": 30.0,
"learning_rate": 8.762402088772847e-07,
"loss": 2.2282,
"step": 840
},
{
"epoch": 0.016168946691828806,
"grad_norm": 22.75,
"learning_rate": 8.971279373368147e-07,
"loss": 2.2817,
"step": 860
},
{
"epoch": 0.01654496870791785,
"grad_norm": 70.5,
"learning_rate": 9.180156657963447e-07,
"loss": 2.209,
"step": 880
},
{
"epoch": 0.01692099072400689,
"grad_norm": 107.0,
"learning_rate": 9.389033942558748e-07,
"loss": 2.2978,
"step": 900
},
{
"epoch": 0.017297012740095933,
"grad_norm": 52.5,
"learning_rate": 9.597911227154048e-07,
"loss": 2.2589,
"step": 920
},
{
"epoch": 0.017673034756184973,
"grad_norm": 90.0,
"learning_rate": 9.806788511749348e-07,
"loss": 2.1987,
"step": 940
},
{
"epoch": 0.018049056772274016,
"grad_norm": 45.75,
"learning_rate": 1.0015665796344648e-06,
"loss": 2.1721,
"step": 960
},
{
"epoch": 0.01842507878836306,
"grad_norm": 191.0,
"learning_rate": 1.0224543080939948e-06,
"loss": 2.2062,
"step": 980
},
{
"epoch": 0.0188011008044521,
"grad_norm": 67.5,
"learning_rate": 1.0433420365535249e-06,
"loss": 2.1984,
"step": 1000
},
{
"epoch": 0.019177122820541143,
"grad_norm": 61.25,
"learning_rate": 1.0642297650130549e-06,
"loss": 2.2189,
"step": 1020
},
{
"epoch": 0.019553144836630183,
"grad_norm": 49.5,
"learning_rate": 1.085117493472585e-06,
"loss": 2.2025,
"step": 1040
},
{
"epoch": 0.019929166852719227,
"grad_norm": 80.5,
"learning_rate": 1.1060052219321151e-06,
"loss": 2.1776,
"step": 1060
},
{
"epoch": 0.02030518886880827,
"grad_norm": 139.0,
"learning_rate": 1.126892950391645e-06,
"loss": 2.1829,
"step": 1080
},
{
"epoch": 0.02068121088489731,
"grad_norm": 127.0,
"learning_rate": 1.147780678851175e-06,
"loss": 2.1865,
"step": 1100
},
{
"epoch": 0.021057232900986354,
"grad_norm": 163.0,
"learning_rate": 1.168668407310705e-06,
"loss": 2.1854,
"step": 1120
},
{
"epoch": 0.021433254917075394,
"grad_norm": 71.5,
"learning_rate": 1.189556135770235e-06,
"loss": 2.163,
"step": 1140
},
{
"epoch": 0.021809276933164437,
"grad_norm": 98.0,
"learning_rate": 1.210443864229765e-06,
"loss": 2.071,
"step": 1160
},
{
"epoch": 0.02218529894925348,
"grad_norm": 165.0,
"learning_rate": 1.2313315926892953e-06,
"loss": 2.145,
"step": 1180
},
{
"epoch": 0.02256132096534252,
"grad_norm": 35.5,
"learning_rate": 1.2522193211488251e-06,
"loss": 2.0652,
"step": 1200
},
{
"epoch": 0.022937342981431564,
"grad_norm": 115.5,
"learning_rate": 1.2731070496083554e-06,
"loss": 2.1296,
"step": 1220
},
{
"epoch": 0.023313364997520604,
"grad_norm": 90.0,
"learning_rate": 1.2939947780678852e-06,
"loss": 2.0437,
"step": 1240
},
{
"epoch": 0.023689387013609647,
"grad_norm": 45.0,
"learning_rate": 1.3148825065274152e-06,
"loss": 2.0833,
"step": 1260
},
{
"epoch": 0.024065409029698687,
"grad_norm": 32.75,
"learning_rate": 1.3357702349869452e-06,
"loss": 2.0352,
"step": 1280
},
{
"epoch": 0.02444143104578773,
"grad_norm": 153.0,
"learning_rate": 1.3566579634464752e-06,
"loss": 2.054,
"step": 1300
},
{
"epoch": 0.024817453061876774,
"grad_norm": 149.0,
"learning_rate": 1.3775456919060055e-06,
"loss": 2.0609,
"step": 1320
},
{
"epoch": 0.025193475077965814,
"grad_norm": 50.75,
"learning_rate": 1.3984334203655353e-06,
"loss": 1.97,
"step": 1340
},
{
"epoch": 0.025569497094054858,
"grad_norm": 121.5,
"learning_rate": 1.4193211488250655e-06,
"loss": 1.9576,
"step": 1360
},
{
"epoch": 0.025945519110143898,
"grad_norm": 31.0,
"learning_rate": 1.4402088772845953e-06,
"loss": 2.0156,
"step": 1380
},
{
"epoch": 0.02632154112623294,
"grad_norm": 133.0,
"learning_rate": 1.4610966057441254e-06,
"loss": 2.0576,
"step": 1400
},
{
"epoch": 0.026697563142321985,
"grad_norm": 41.75,
"learning_rate": 1.4819843342036556e-06,
"loss": 2.0427,
"step": 1420
},
{
"epoch": 0.027073585158411025,
"grad_norm": 89.5,
"learning_rate": 1.5028720626631854e-06,
"loss": 1.9948,
"step": 1440
},
{
"epoch": 0.027449607174500068,
"grad_norm": 123.5,
"learning_rate": 1.5237597911227157e-06,
"loss": 1.987,
"step": 1460
},
{
"epoch": 0.027825629190589108,
"grad_norm": 72.0,
"learning_rate": 1.5446475195822455e-06,
"loss": 2.011,
"step": 1480
},
{
"epoch": 0.02820165120667815,
"grad_norm": 58.5,
"learning_rate": 1.5655352480417757e-06,
"loss": 1.9926,
"step": 1500
},
{
"epoch": 0.02857767322276719,
"grad_norm": 50.5,
"learning_rate": 1.5864229765013055e-06,
"loss": 1.9495,
"step": 1520
},
{
"epoch": 0.028953695238856235,
"grad_norm": 58.5,
"learning_rate": 1.6073107049608356e-06,
"loss": 1.9988,
"step": 1540
},
{
"epoch": 0.02932971725494528,
"grad_norm": 76.0,
"learning_rate": 1.6281984334203658e-06,
"loss": 1.98,
"step": 1560
},
{
"epoch": 0.02970573927103432,
"grad_norm": 99.0,
"learning_rate": 1.6490861618798956e-06,
"loss": 1.9849,
"step": 1580
},
{
"epoch": 0.030081761287123362,
"grad_norm": 116.0,
"learning_rate": 1.6699738903394258e-06,
"loss": 1.9464,
"step": 1600
},
{
"epoch": 0.030457783303212402,
"grad_norm": 119.0,
"learning_rate": 1.6908616187989557e-06,
"loss": 1.9654,
"step": 1620
},
{
"epoch": 0.030833805319301445,
"grad_norm": 130.0,
"learning_rate": 1.7117493472584859e-06,
"loss": 1.9718,
"step": 1640
},
{
"epoch": 0.03120982733539049,
"grad_norm": 200.0,
"learning_rate": 1.732637075718016e-06,
"loss": 1.9127,
"step": 1660
},
{
"epoch": 0.03158584935147953,
"grad_norm": 228.0,
"learning_rate": 1.7535248041775457e-06,
"loss": 1.9649,
"step": 1680
},
{
"epoch": 0.03196187136756857,
"grad_norm": 159.0,
"learning_rate": 1.774412532637076e-06,
"loss": 1.8952,
"step": 1700
},
{
"epoch": 0.03233789338365761,
"grad_norm": 87.0,
"learning_rate": 1.7953002610966058e-06,
"loss": 1.9328,
"step": 1720
},
{
"epoch": 0.032713915399746656,
"grad_norm": 129.0,
"learning_rate": 1.816187989556136e-06,
"loss": 1.9531,
"step": 1740
},
{
"epoch": 0.0330899374158357,
"grad_norm": 187.0,
"learning_rate": 1.8370757180156658e-06,
"loss": 1.8911,
"step": 1760
},
{
"epoch": 0.03346595943192474,
"grad_norm": 140.0,
"learning_rate": 1.857963446475196e-06,
"loss": 1.9228,
"step": 1780
},
{
"epoch": 0.03384198144801378,
"grad_norm": 152.0,
"learning_rate": 1.878851174934726e-06,
"loss": 1.9264,
"step": 1800
},
{
"epoch": 0.03421800346410282,
"grad_norm": 112.0,
"learning_rate": 1.899738903394256e-06,
"loss": 1.957,
"step": 1820
},
{
"epoch": 0.034594025480191866,
"grad_norm": 81.5,
"learning_rate": 1.920626631853786e-06,
"loss": 1.9084,
"step": 1840
},
{
"epoch": 0.03497004749628091,
"grad_norm": 52.25,
"learning_rate": 1.941514360313316e-06,
"loss": 1.895,
"step": 1860
},
{
"epoch": 0.035346069512369946,
"grad_norm": 52.75,
"learning_rate": 1.9624020887728464e-06,
"loss": 1.8667,
"step": 1880
},
{
"epoch": 0.03572209152845899,
"grad_norm": 73.0,
"learning_rate": 1.9832898172323762e-06,
"loss": 1.8782,
"step": 1900
},
{
"epoch": 0.03609811354454803,
"grad_norm": 157.0,
"learning_rate": 2.004177545691906e-06,
"loss": 1.8986,
"step": 1920
},
{
"epoch": 0.036474135560637076,
"grad_norm": 124.5,
"learning_rate": 2.0250652741514363e-06,
"loss": 1.8863,
"step": 1940
},
{
"epoch": 0.03685015757672612,
"grad_norm": 211.0,
"learning_rate": 2.045953002610966e-06,
"loss": 1.8851,
"step": 1960
},
{
"epoch": 0.037226179592815156,
"grad_norm": 157.0,
"learning_rate": 2.0668407310704963e-06,
"loss": 1.8669,
"step": 1980
},
{
"epoch": 0.0376022016089042,
"grad_norm": 163.0,
"learning_rate": 2.087728459530026e-06,
"loss": 1.873,
"step": 2000
},
{
"epoch": 0.03797822362499324,
"grad_norm": 76.5,
"learning_rate": 2.1086161879895564e-06,
"loss": 1.8493,
"step": 2020
},
{
"epoch": 0.03835424564108229,
"grad_norm": 250.0,
"learning_rate": 2.129503916449086e-06,
"loss": 1.8835,
"step": 2040
},
{
"epoch": 0.03873026765717133,
"grad_norm": 86.0,
"learning_rate": 2.1503916449086164e-06,
"loss": 1.8291,
"step": 2060
},
{
"epoch": 0.03910628967326037,
"grad_norm": 81.5,
"learning_rate": 2.1712793733681462e-06,
"loss": 1.8068,
"step": 2080
},
{
"epoch": 0.03948231168934941,
"grad_norm": 94.5,
"learning_rate": 2.1921671018276765e-06,
"loss": 1.7797,
"step": 2100
},
{
"epoch": 0.03985833370543845,
"grad_norm": 153.0,
"learning_rate": 2.2130548302872067e-06,
"loss": 1.8606,
"step": 2120
},
{
"epoch": 0.0402343557215275,
"grad_norm": 160.0,
"learning_rate": 2.2339425587467365e-06,
"loss": 1.8179,
"step": 2140
},
{
"epoch": 0.04061037773761654,
"grad_norm": 252.0,
"learning_rate": 2.2548302872062668e-06,
"loss": 1.8003,
"step": 2160
},
{
"epoch": 0.04098639975370558,
"grad_norm": 272.0,
"learning_rate": 2.2757180156657966e-06,
"loss": 1.7933,
"step": 2180
},
{
"epoch": 0.04136242176979462,
"grad_norm": 66.5,
"learning_rate": 2.2966057441253264e-06,
"loss": 1.8021,
"step": 2200
},
{
"epoch": 0.041738443785883664,
"grad_norm": 132.0,
"learning_rate": 2.3174934725848566e-06,
"loss": 1.757,
"step": 2220
},
{
"epoch": 0.04211446580197271,
"grad_norm": 101.0,
"learning_rate": 2.3383812010443865e-06,
"loss": 1.7466,
"step": 2240
},
{
"epoch": 0.04249048781806175,
"grad_norm": 111.5,
"learning_rate": 2.3592689295039167e-06,
"loss": 1.7771,
"step": 2260
},
{
"epoch": 0.04286650983415079,
"grad_norm": 64.0,
"learning_rate": 2.3801566579634465e-06,
"loss": 1.754,
"step": 2280
},
{
"epoch": 0.04324253185023983,
"grad_norm": 220.0,
"learning_rate": 2.4010443864229767e-06,
"loss": 1.7484,
"step": 2300
},
{
"epoch": 0.043618553866328874,
"grad_norm": 97.5,
"learning_rate": 2.4219321148825066e-06,
"loss": 1.7204,
"step": 2320
},
{
"epoch": 0.04399457588241792,
"grad_norm": 110.5,
"learning_rate": 2.442819843342037e-06,
"loss": 1.7732,
"step": 2340
},
{
"epoch": 0.04437059789850696,
"grad_norm": 73.5,
"learning_rate": 2.463707571801567e-06,
"loss": 1.7447,
"step": 2360
},
{
"epoch": 0.044746619914596,
"grad_norm": 78.5,
"learning_rate": 2.484595300261097e-06,
"loss": 1.7127,
"step": 2380
},
{
"epoch": 0.04512264193068504,
"grad_norm": 63.25,
"learning_rate": 2.5054830287206267e-06,
"loss": 1.6951,
"step": 2400
},
{
"epoch": 0.045498663946774084,
"grad_norm": 56.25,
"learning_rate": 2.5263707571801573e-06,
"loss": 1.6848,
"step": 2420
},
{
"epoch": 0.04587468596286313,
"grad_norm": 69.0,
"learning_rate": 2.547258485639687e-06,
"loss": 1.7051,
"step": 2440
},
{
"epoch": 0.046250707978952164,
"grad_norm": 83.0,
"learning_rate": 2.568146214099217e-06,
"loss": 1.6354,
"step": 2460
},
{
"epoch": 0.04662672999504121,
"grad_norm": 90.0,
"learning_rate": 2.5890339425587468e-06,
"loss": 1.643,
"step": 2480
},
{
"epoch": 0.04700275201113025,
"grad_norm": 62.5,
"learning_rate": 2.6099216710182766e-06,
"loss": 1.6811,
"step": 2500
},
{
"epoch": 0.047378774027219295,
"grad_norm": 199.0,
"learning_rate": 2.6308093994778072e-06,
"loss": 1.6851,
"step": 2520
},
{
"epoch": 0.04775479604330834,
"grad_norm": 57.75,
"learning_rate": 2.651697127937337e-06,
"loss": 1.6055,
"step": 2540
},
{
"epoch": 0.048130818059397375,
"grad_norm": 196.0,
"learning_rate": 2.672584856396867e-06,
"loss": 1.6079,
"step": 2560
},
{
"epoch": 0.04850684007548642,
"grad_norm": 149.0,
"learning_rate": 2.693472584856397e-06,
"loss": 1.6273,
"step": 2580
},
{
"epoch": 0.04888286209157546,
"grad_norm": 95.5,
"learning_rate": 2.714360313315927e-06,
"loss": 1.6333,
"step": 2600
},
{
"epoch": 0.049258884107664505,
"grad_norm": 72.0,
"learning_rate": 2.735248041775457e-06,
"loss": 1.6026,
"step": 2620
},
{
"epoch": 0.04963490612375355,
"grad_norm": 338.0,
"learning_rate": 2.7561357702349874e-06,
"loss": 1.5909,
"step": 2640
},
{
"epoch": 0.050010928139842585,
"grad_norm": 65.5,
"learning_rate": 2.777023498694517e-06,
"loss": 1.6058,
"step": 2660
},
{
"epoch": 0.05038695015593163,
"grad_norm": 126.0,
"learning_rate": 2.797911227154047e-06,
"loss": 1.5821,
"step": 2680
},
{
"epoch": 0.05076297217202067,
"grad_norm": 142.0,
"learning_rate": 2.8187989556135777e-06,
"loss": 1.5928,
"step": 2700
},
{
"epoch": 0.051138994188109715,
"grad_norm": 59.0,
"learning_rate": 2.8396866840731075e-06,
"loss": 1.5513,
"step": 2720
},
{
"epoch": 0.05151501620419876,
"grad_norm": 173.0,
"learning_rate": 2.8605744125326373e-06,
"loss": 1.519,
"step": 2740
},
{
"epoch": 0.051891038220287795,
"grad_norm": 118.5,
"learning_rate": 2.881462140992167e-06,
"loss": 1.5389,
"step": 2760
},
{
"epoch": 0.05226706023637684,
"grad_norm": 121.5,
"learning_rate": 2.9023498694516974e-06,
"loss": 1.5027,
"step": 2780
},
{
"epoch": 0.05264308225246588,
"grad_norm": 71.5,
"learning_rate": 2.9232375979112276e-06,
"loss": 1.5588,
"step": 2800
},
{
"epoch": 0.053019104268554926,
"grad_norm": 148.0,
"learning_rate": 2.9441253263707574e-06,
"loss": 1.544,
"step": 2820
},
{
"epoch": 0.05339512628464397,
"grad_norm": 98.5,
"learning_rate": 2.9650130548302876e-06,
"loss": 1.4796,
"step": 2840
},
{
"epoch": 0.053771148300733006,
"grad_norm": 119.5,
"learning_rate": 2.9859007832898175e-06,
"loss": 1.5498,
"step": 2860
},
{
"epoch": 0.05414717031682205,
"grad_norm": 68.0,
"learning_rate": 3.0067885117493473e-06,
"loss": 1.5174,
"step": 2880
},
{
"epoch": 0.05452319233291109,
"grad_norm": 81.0,
"learning_rate": 3.027676240208878e-06,
"loss": 1.5218,
"step": 2900
},
{
"epoch": 0.054899214349000136,
"grad_norm": 89.5,
"learning_rate": 3.0485639686684078e-06,
"loss": 1.4837,
"step": 2920
},
{
"epoch": 0.05527523636508917,
"grad_norm": 175.0,
"learning_rate": 3.0694516971279376e-06,
"loss": 1.469,
"step": 2940
},
{
"epoch": 0.055651258381178216,
"grad_norm": 188.0,
"learning_rate": 3.0903394255874674e-06,
"loss": 1.4704,
"step": 2960
},
{
"epoch": 0.05602728039726726,
"grad_norm": 53.5,
"learning_rate": 3.111227154046997e-06,
"loss": 1.4528,
"step": 2980
},
{
"epoch": 0.0564033024133563,
"grad_norm": 91.0,
"learning_rate": 3.132114882506528e-06,
"loss": 1.4783,
"step": 3000
},
{
"epoch": 0.056779324429445346,
"grad_norm": 81.5,
"learning_rate": 3.1530026109660577e-06,
"loss": 1.4367,
"step": 3020
},
{
"epoch": 0.05715534644553438,
"grad_norm": 69.0,
"learning_rate": 3.1738903394255875e-06,
"loss": 1.4717,
"step": 3040
},
{
"epoch": 0.057531368461623426,
"grad_norm": 207.0,
"learning_rate": 3.1947780678851177e-06,
"loss": 1.4713,
"step": 3060
},
{
"epoch": 0.05790739047771247,
"grad_norm": 87.5,
"learning_rate": 3.215665796344648e-06,
"loss": 1.4269,
"step": 3080
},
{
"epoch": 0.05828341249380151,
"grad_norm": 87.0,
"learning_rate": 3.2365535248041778e-06,
"loss": 1.4116,
"step": 3100
},
{
"epoch": 0.05865943450989056,
"grad_norm": 133.0,
"learning_rate": 3.257441253263708e-06,
"loss": 1.423,
"step": 3120
},
{
"epoch": 0.05903545652597959,
"grad_norm": 66.0,
"learning_rate": 3.278328981723238e-06,
"loss": 1.3921,
"step": 3140
},
{
"epoch": 0.05941147854206864,
"grad_norm": 70.0,
"learning_rate": 3.2992167101827676e-06,
"loss": 1.4027,
"step": 3160
},
{
"epoch": 0.05978750055815768,
"grad_norm": 114.5,
"learning_rate": 3.3201044386422983e-06,
"loss": 1.4118,
"step": 3180
},
{
"epoch": 0.060163522574246724,
"grad_norm": 64.5,
"learning_rate": 3.340992167101828e-06,
"loss": 1.3805,
"step": 3200
},
{
"epoch": 0.06053954459033577,
"grad_norm": 76.0,
"learning_rate": 3.361879895561358e-06,
"loss": 1.3966,
"step": 3220
},
{
"epoch": 0.060915566606424804,
"grad_norm": 52.5,
"learning_rate": 3.3827676240208877e-06,
"loss": 1.3991,
"step": 3240
},
{
"epoch": 0.06129158862251385,
"grad_norm": 139.0,
"learning_rate": 3.403655352480418e-06,
"loss": 1.3851,
"step": 3260
},
{
"epoch": 0.06166761063860289,
"grad_norm": 56.25,
"learning_rate": 3.4245430809399482e-06,
"loss": 1.3506,
"step": 3280
},
{
"epoch": 0.062043632654691934,
"grad_norm": 86.0,
"learning_rate": 3.445430809399478e-06,
"loss": 1.3288,
"step": 3300
},
{
"epoch": 0.06241965467078098,
"grad_norm": 108.5,
"learning_rate": 3.4663185378590083e-06,
"loss": 1.3767,
"step": 3320
},
{
"epoch": 0.06279567668687001,
"grad_norm": 46.5,
"learning_rate": 3.487206266318538e-06,
"loss": 1.339,
"step": 3340
},
{
"epoch": 0.06317169870295906,
"grad_norm": 127.5,
"learning_rate": 3.5080939947780683e-06,
"loss": 1.3316,
"step": 3360
},
{
"epoch": 0.0635477207190481,
"grad_norm": 53.0,
"learning_rate": 3.5289817232375986e-06,
"loss": 1.362,
"step": 3380
},
{
"epoch": 0.06392374273513714,
"grad_norm": 75.5,
"learning_rate": 3.5498694516971284e-06,
"loss": 1.3073,
"step": 3400
},
{
"epoch": 0.06429976475122619,
"grad_norm": 103.5,
"learning_rate": 3.570757180156658e-06,
"loss": 1.3008,
"step": 3420
},
{
"epoch": 0.06467578676731522,
"grad_norm": 50.25,
"learning_rate": 3.591644908616188e-06,
"loss": 1.3438,
"step": 3440
},
{
"epoch": 0.06505180878340427,
"grad_norm": 108.0,
"learning_rate": 3.6125326370757187e-06,
"loss": 1.3175,
"step": 3460
},
{
"epoch": 0.06542783079949331,
"grad_norm": 97.0,
"learning_rate": 3.6334203655352485e-06,
"loss": 1.3031,
"step": 3480
},
{
"epoch": 0.06580385281558235,
"grad_norm": 124.5,
"learning_rate": 3.6543080939947783e-06,
"loss": 1.3161,
"step": 3500
},
{
"epoch": 0.0661798748316714,
"grad_norm": 78.5,
"learning_rate": 3.675195822454308e-06,
"loss": 1.2822,
"step": 3520
},
{
"epoch": 0.06655589684776043,
"grad_norm": 81.5,
"learning_rate": 3.6960835509138383e-06,
"loss": 1.3111,
"step": 3540
},
{
"epoch": 0.06693191886384948,
"grad_norm": 94.5,
"learning_rate": 3.7169712793733686e-06,
"loss": 1.2909,
"step": 3560
},
{
"epoch": 0.06730794087993852,
"grad_norm": 86.0,
"learning_rate": 3.7378590078328984e-06,
"loss": 1.2535,
"step": 3580
},
{
"epoch": 0.06768396289602756,
"grad_norm": 142.0,
"learning_rate": 3.7587467362924286e-06,
"loss": 1.2963,
"step": 3600
},
{
"epoch": 0.06805998491211661,
"grad_norm": 58.25,
"learning_rate": 3.7796344647519584e-06,
"loss": 1.2354,
"step": 3620
},
{
"epoch": 0.06843600692820564,
"grad_norm": 67.5,
"learning_rate": 3.8005221932114883e-06,
"loss": 1.2719,
"step": 3640
},
{
"epoch": 0.0688120289442947,
"grad_norm": 103.0,
"learning_rate": 3.821409921671019e-06,
"loss": 1.246,
"step": 3660
},
{
"epoch": 0.06918805096038373,
"grad_norm": 110.0,
"learning_rate": 3.842297650130548e-06,
"loss": 1.2397,
"step": 3680
},
{
"epoch": 0.06956407297647277,
"grad_norm": 45.5,
"learning_rate": 3.8631853785900785e-06,
"loss": 1.2576,
"step": 3700
},
{
"epoch": 0.06994009499256182,
"grad_norm": 62.0,
"learning_rate": 3.884073107049609e-06,
"loss": 1.2273,
"step": 3720
},
{
"epoch": 0.07031611700865086,
"grad_norm": 77.5,
"learning_rate": 3.904960835509139e-06,
"loss": 1.2366,
"step": 3740
},
{
"epoch": 0.07069213902473989,
"grad_norm": 89.5,
"learning_rate": 3.925848563968669e-06,
"loss": 1.2027,
"step": 3760
},
{
"epoch": 0.07106816104082894,
"grad_norm": 84.0,
"learning_rate": 3.946736292428199e-06,
"loss": 1.2029,
"step": 3780
},
{
"epoch": 0.07144418305691798,
"grad_norm": 51.0,
"learning_rate": 3.967624020887729e-06,
"loss": 1.2181,
"step": 3800
},
{
"epoch": 0.07182020507300703,
"grad_norm": 71.0,
"learning_rate": 3.988511749347258e-06,
"loss": 1.2405,
"step": 3820
},
{
"epoch": 0.07219622708909607,
"grad_norm": 78.0,
"learning_rate": 4.009399477806789e-06,
"loss": 1.1956,
"step": 3840
},
{
"epoch": 0.0725722491051851,
"grad_norm": 76.0,
"learning_rate": 4.030287206266319e-06,
"loss": 1.2008,
"step": 3860
},
{
"epoch": 0.07294827112127415,
"grad_norm": 71.0,
"learning_rate": 4.051174934725849e-06,
"loss": 1.209,
"step": 3880
},
{
"epoch": 0.07332429313736319,
"grad_norm": 91.0,
"learning_rate": 4.072062663185378e-06,
"loss": 1.2152,
"step": 3900
},
{
"epoch": 0.07370031515345224,
"grad_norm": 65.5,
"learning_rate": 4.092950391644909e-06,
"loss": 1.2116,
"step": 3920
},
{
"epoch": 0.07407633716954128,
"grad_norm": 84.0,
"learning_rate": 4.113838120104439e-06,
"loss": 1.1892,
"step": 3940
},
{
"epoch": 0.07445235918563031,
"grad_norm": 80.0,
"learning_rate": 4.134725848563969e-06,
"loss": 1.1591,
"step": 3960
},
{
"epoch": 0.07482838120171936,
"grad_norm": 93.0,
"learning_rate": 4.155613577023499e-06,
"loss": 1.1767,
"step": 3980
},
{
"epoch": 0.0752044032178084,
"grad_norm": 83.0,
"learning_rate": 4.176501305483029e-06,
"loss": 1.1395,
"step": 4000
},
{
"epoch": 0.07558042523389745,
"grad_norm": 72.5,
"learning_rate": 4.197389033942559e-06,
"loss": 1.1689,
"step": 4020
},
{
"epoch": 0.07595644724998649,
"grad_norm": 58.5,
"learning_rate": 4.218276762402089e-06,
"loss": 1.1351,
"step": 4040
},
{
"epoch": 0.07633246926607552,
"grad_norm": 64.5,
"learning_rate": 4.2391644908616194e-06,
"loss": 1.1314,
"step": 4060
},
{
"epoch": 0.07670849128216457,
"grad_norm": 66.5,
"learning_rate": 4.260052219321149e-06,
"loss": 1.124,
"step": 4080
},
{
"epoch": 0.07708451329825361,
"grad_norm": 42.75,
"learning_rate": 4.280939947780679e-06,
"loss": 1.0991,
"step": 4100
},
{
"epoch": 0.07746053531434266,
"grad_norm": 63.25,
"learning_rate": 4.301827676240209e-06,
"loss": 1.1353,
"step": 4120
},
{
"epoch": 0.0778365573304317,
"grad_norm": 47.5,
"learning_rate": 4.3227154046997395e-06,
"loss": 1.1189,
"step": 4140
},
{
"epoch": 0.07821257934652073,
"grad_norm": 34.0,
"learning_rate": 4.343603133159269e-06,
"loss": 1.1196,
"step": 4160
},
{
"epoch": 0.07858860136260978,
"grad_norm": 140.0,
"learning_rate": 4.364490861618799e-06,
"loss": 1.0964,
"step": 4180
},
{
"epoch": 0.07896462337869882,
"grad_norm": 60.75,
"learning_rate": 4.385378590078329e-06,
"loss": 1.1315,
"step": 4200
},
{
"epoch": 0.07934064539478787,
"grad_norm": 57.75,
"learning_rate": 4.40626631853786e-06,
"loss": 1.0911,
"step": 4220
},
{
"epoch": 0.0797166674108769,
"grad_norm": 70.5,
"learning_rate": 4.42715404699739e-06,
"loss": 1.084,
"step": 4240
},
{
"epoch": 0.08009268942696594,
"grad_norm": 29.25,
"learning_rate": 4.448041775456919e-06,
"loss": 1.0646,
"step": 4260
},
{
"epoch": 0.080468711443055,
"grad_norm": 33.0,
"learning_rate": 4.4689295039164495e-06,
"loss": 1.0657,
"step": 4280
},
{
"epoch": 0.08084473345914403,
"grad_norm": 75.5,
"learning_rate": 4.489817232375979e-06,
"loss": 1.0553,
"step": 4300
},
{
"epoch": 0.08122075547523308,
"grad_norm": 49.75,
"learning_rate": 4.51070496083551e-06,
"loss": 1.0675,
"step": 4320
},
{
"epoch": 0.08159677749132212,
"grad_norm": 40.25,
"learning_rate": 4.531592689295039e-06,
"loss": 1.0539,
"step": 4340
},
{
"epoch": 0.08197279950741115,
"grad_norm": 40.5,
"learning_rate": 4.55248041775457e-06,
"loss": 1.0381,
"step": 4360
},
{
"epoch": 0.0823488215235002,
"grad_norm": 51.75,
"learning_rate": 4.573368146214099e-06,
"loss": 1.062,
"step": 4380
},
{
"epoch": 0.08272484353958924,
"grad_norm": 120.5,
"learning_rate": 4.59425587467363e-06,
"loss": 1.0553,
"step": 4400
},
{
"epoch": 0.08310086555567829,
"grad_norm": 51.75,
"learning_rate": 4.6151436031331595e-06,
"loss": 1.0386,
"step": 4420
},
{
"epoch": 0.08347688757176733,
"grad_norm": 29.125,
"learning_rate": 4.63603133159269e-06,
"loss": 1.0334,
"step": 4440
},
{
"epoch": 0.08385290958785636,
"grad_norm": 43.25,
"learning_rate": 4.65691906005222e-06,
"loss": 1.0358,
"step": 4460
},
{
"epoch": 0.08422893160394541,
"grad_norm": 34.5,
"learning_rate": 4.677806788511749e-06,
"loss": 1.0064,
"step": 4480
},
{
"epoch": 0.08460495362003445,
"grad_norm": 40.25,
"learning_rate": 4.6986945169712796e-06,
"loss": 1.0205,
"step": 4500
},
{
"epoch": 0.0849809756361235,
"grad_norm": 37.75,
"learning_rate": 4.71958224543081e-06,
"loss": 0.9937,
"step": 4520
},
{
"epoch": 0.08535699765221254,
"grad_norm": 41.25,
"learning_rate": 4.74046997389034e-06,
"loss": 1.0081,
"step": 4540
},
{
"epoch": 0.08573301966830157,
"grad_norm": 59.25,
"learning_rate": 4.7613577023498694e-06,
"loss": 0.9894,
"step": 4560
},
{
"epoch": 0.08610904168439062,
"grad_norm": 57.5,
"learning_rate": 4.7822454308094e-06,
"loss": 0.9949,
"step": 4580
},
{
"epoch": 0.08648506370047966,
"grad_norm": 46.5,
"learning_rate": 4.80313315926893e-06,
"loss": 1.0066,
"step": 4600
},
{
"epoch": 0.08686108571656871,
"grad_norm": 46.0,
"learning_rate": 4.82402088772846e-06,
"loss": 1.001,
"step": 4620
},
{
"epoch": 0.08723710773265775,
"grad_norm": 34.5,
"learning_rate": 4.8449086161879895e-06,
"loss": 0.9981,
"step": 4640
},
{
"epoch": 0.08761312974874678,
"grad_norm": 49.75,
"learning_rate": 4.86579634464752e-06,
"loss": 0.9975,
"step": 4660
},
{
"epoch": 0.08798915176483584,
"grad_norm": 25.125,
"learning_rate": 4.88668407310705e-06,
"loss": 0.9697,
"step": 4680
},
{
"epoch": 0.08836517378092487,
"grad_norm": 68.0,
"learning_rate": 4.90757180156658e-06,
"loss": 0.9851,
"step": 4700
},
{
"epoch": 0.08874119579701392,
"grad_norm": 122.0,
"learning_rate": 4.9284595300261105e-06,
"loss": 0.9624,
"step": 4720
},
{
"epoch": 0.08911721781310296,
"grad_norm": 68.5,
"learning_rate": 4.94934725848564e-06,
"loss": 0.9812,
"step": 4740
},
{
"epoch": 0.089493239829192,
"grad_norm": 43.5,
"learning_rate": 4.97023498694517e-06,
"loss": 0.9594,
"step": 4760
},
{
"epoch": 0.08986926184528105,
"grad_norm": 26.625,
"learning_rate": 4.9911227154047e-06,
"loss": 0.9582,
"step": 4780
},
{
"epoch": 0.09024528386137008,
"grad_norm": 30.375,
"learning_rate": 5.012010443864231e-06,
"loss": 0.9374,
"step": 4800
},
{
"epoch": 0.09062130587745912,
"grad_norm": 43.75,
"learning_rate": 5.03289817232376e-06,
"loss": 0.9564,
"step": 4820
},
{
"epoch": 0.09099732789354817,
"grad_norm": 61.5,
"learning_rate": 5.05378590078329e-06,
"loss": 0.9293,
"step": 4840
},
{
"epoch": 0.0913733499096372,
"grad_norm": 34.25,
"learning_rate": 5.07467362924282e-06,
"loss": 0.9345,
"step": 4860
},
{
"epoch": 0.09174937192572626,
"grad_norm": 27.375,
"learning_rate": 5.09556135770235e-06,
"loss": 0.9374,
"step": 4880
},
{
"epoch": 0.09212539394181529,
"grad_norm": 42.0,
"learning_rate": 5.11644908616188e-06,
"loss": 0.9305,
"step": 4900
},
{
"epoch": 0.09250141595790433,
"grad_norm": 27.875,
"learning_rate": 5.137336814621411e-06,
"loss": 0.9216,
"step": 4920
},
{
"epoch": 0.09287743797399338,
"grad_norm": 54.0,
"learning_rate": 5.1582245430809406e-06,
"loss": 0.9187,
"step": 4940
},
{
"epoch": 0.09325345999008242,
"grad_norm": 33.5,
"learning_rate": 5.179112271540471e-06,
"loss": 0.9057,
"step": 4960
},
{
"epoch": 0.09362948200617147,
"grad_norm": 34.5,
"learning_rate": 5.2e-06,
"loss": 0.8961,
"step": 4980
},
{
"epoch": 0.0940055040222605,
"grad_norm": 33.75,
"learning_rate": 5.2208877284595304e-06,
"loss": 0.9232,
"step": 5000
},
{
"epoch": 0.09438152603834954,
"grad_norm": 21.375,
"learning_rate": 5.241775456919061e-06,
"loss": 0.9002,
"step": 5020
},
{
"epoch": 0.09475754805443859,
"grad_norm": 37.0,
"learning_rate": 5.26266318537859e-06,
"loss": 0.9141,
"step": 5040
},
{
"epoch": 0.09513357007052763,
"grad_norm": 31.75,
"learning_rate": 5.28355091383812e-06,
"loss": 0.9062,
"step": 5060
},
{
"epoch": 0.09550959208661668,
"grad_norm": 25.0,
"learning_rate": 5.3044386422976505e-06,
"loss": 0.8911,
"step": 5080
},
{
"epoch": 0.09588561410270571,
"grad_norm": 39.75,
"learning_rate": 5.32532637075718e-06,
"loss": 0.8961,
"step": 5100
},
{
"epoch": 0.09626163611879475,
"grad_norm": 21.375,
"learning_rate": 5.346214099216711e-06,
"loss": 0.8682,
"step": 5120
},
{
"epoch": 0.0966376581348838,
"grad_norm": 23.625,
"learning_rate": 5.367101827676241e-06,
"loss": 0.8722,
"step": 5140
},
{
"epoch": 0.09701368015097284,
"grad_norm": 32.75,
"learning_rate": 5.387989556135771e-06,
"loss": 0.8664,
"step": 5160
},
{
"epoch": 0.09738970216706189,
"grad_norm": 22.875,
"learning_rate": 5.408877284595301e-06,
"loss": 0.8557,
"step": 5180
},
{
"epoch": 0.09776572418315092,
"grad_norm": 43.5,
"learning_rate": 5.429765013054831e-06,
"loss": 0.8546,
"step": 5200
},
{
"epoch": 0.09814174619923996,
"grad_norm": 22.375,
"learning_rate": 5.4506527415143605e-06,
"loss": 0.8568,
"step": 5220
},
{
"epoch": 0.09851776821532901,
"grad_norm": 24.75,
"learning_rate": 5.471540469973891e-06,
"loss": 0.8628,
"step": 5240
},
{
"epoch": 0.09889379023141805,
"grad_norm": 23.75,
"learning_rate": 5.49242819843342e-06,
"loss": 0.8456,
"step": 5260
},
{
"epoch": 0.0992698122475071,
"grad_norm": 23.5,
"learning_rate": 5.51331592689295e-06,
"loss": 0.8357,
"step": 5280
},
{
"epoch": 0.09964583426359613,
"grad_norm": 14.9375,
"learning_rate": 5.5342036553524814e-06,
"loss": 0.8189,
"step": 5300
},
{
"epoch": 0.10002185627968517,
"grad_norm": 40.25,
"learning_rate": 5.555091383812012e-06,
"loss": 0.8384,
"step": 5320
},
{
"epoch": 0.10039787829577422,
"grad_norm": 48.0,
"learning_rate": 5.575979112271541e-06,
"loss": 0.8441,
"step": 5340
},
{
"epoch": 0.10077390031186326,
"grad_norm": 25.0,
"learning_rate": 5.596866840731071e-06,
"loss": 0.81,
"step": 5360
},
{
"epoch": 0.10114992232795231,
"grad_norm": 53.25,
"learning_rate": 5.617754569190601e-06,
"loss": 0.846,
"step": 5380
},
{
"epoch": 0.10152594434404134,
"grad_norm": 21.25,
"learning_rate": 5.638642297650131e-06,
"loss": 0.8235,
"step": 5400
},
{
"epoch": 0.10190196636013038,
"grad_norm": 22.75,
"learning_rate": 5.659530026109661e-06,
"loss": 0.8416,
"step": 5420
},
{
"epoch": 0.10227798837621943,
"grad_norm": 10.4375,
"learning_rate": 5.6804177545691906e-06,
"loss": 0.8025,
"step": 5440
},
{
"epoch": 0.10265401039230847,
"grad_norm": 11.3125,
"learning_rate": 5.701305483028721e-06,
"loss": 0.8071,
"step": 5460
},
{
"epoch": 0.10303003240839752,
"grad_norm": 13.0,
"learning_rate": 5.72219321148825e-06,
"loss": 0.819,
"step": 5480
},
{
"epoch": 0.10340605442448655,
"grad_norm": 36.25,
"learning_rate": 5.743080939947781e-06,
"loss": 0.809,
"step": 5500
},
{
"epoch": 0.10378207644057559,
"grad_norm": 22.0,
"learning_rate": 5.7639686684073115e-06,
"loss": 0.8222,
"step": 5520
},
{
"epoch": 0.10415809845666464,
"grad_norm": 20.375,
"learning_rate": 5.784856396866842e-06,
"loss": 0.7892,
"step": 5540
},
{
"epoch": 0.10453412047275368,
"grad_norm": 23.875,
"learning_rate": 5.805744125326371e-06,
"loss": 0.8109,
"step": 5560
},
{
"epoch": 0.10491014248884273,
"grad_norm": 43.5,
"learning_rate": 5.826631853785901e-06,
"loss": 0.7978,
"step": 5580
},
{
"epoch": 0.10528616450493176,
"grad_norm": 22.25,
"learning_rate": 5.847519582245431e-06,
"loss": 0.7947,
"step": 5600
},
{
"epoch": 0.1056621865210208,
"grad_norm": 9.5,
"learning_rate": 5.868407310704961e-06,
"loss": 0.8045,
"step": 5620
},
{
"epoch": 0.10603820853710985,
"grad_norm": 12.375,
"learning_rate": 5.889295039164491e-06,
"loss": 0.8083,
"step": 5640
},
{
"epoch": 0.10641423055319889,
"grad_norm": 30.125,
"learning_rate": 5.910182767624021e-06,
"loss": 0.8052,
"step": 5660
},
{
"epoch": 0.10679025256928794,
"grad_norm": 20.125,
"learning_rate": 5.931070496083552e-06,
"loss": 0.7854,
"step": 5680
},
{
"epoch": 0.10716627458537697,
"grad_norm": 15.25,
"learning_rate": 5.951958224543082e-06,
"loss": 0.7947,
"step": 5700
},
{
"epoch": 0.10754229660146601,
"grad_norm": 8.5625,
"learning_rate": 5.972845953002611e-06,
"loss": 0.7992,
"step": 5720
},
{
"epoch": 0.10791831861755506,
"grad_norm": 20.625,
"learning_rate": 5.993733681462142e-06,
"loss": 0.7855,
"step": 5740
},
{
"epoch": 0.1082943406336441,
"grad_norm": 10.125,
"learning_rate": 6.014621409921672e-06,
"loss": 0.7857,
"step": 5760
},
{
"epoch": 0.10867036264973315,
"grad_norm": 33.5,
"learning_rate": 6.035509138381201e-06,
"loss": 0.7908,
"step": 5780
},
{
"epoch": 0.10904638466582219,
"grad_norm": 12.75,
"learning_rate": 6.0563968668407315e-06,
"loss": 0.771,
"step": 5800
},
{
"epoch": 0.10942240668191122,
"grad_norm": 23.5,
"learning_rate": 6.077284595300262e-06,
"loss": 0.7734,
"step": 5820
},
{
"epoch": 0.10979842869800027,
"grad_norm": 12.125,
"learning_rate": 6.098172323759791e-06,
"loss": 0.7723,
"step": 5840
},
{
"epoch": 0.11017445071408931,
"grad_norm": 14.0,
"learning_rate": 6.119060052219322e-06,
"loss": 0.7602,
"step": 5860
},
{
"epoch": 0.11055047273017835,
"grad_norm": 32.0,
"learning_rate": 6.139947780678852e-06,
"loss": 0.7809,
"step": 5880
},
{
"epoch": 0.1109264947462674,
"grad_norm": 15.4375,
"learning_rate": 6.160835509138382e-06,
"loss": 0.7905,
"step": 5900
},
{
"epoch": 0.11130251676235643,
"grad_norm": 24.875,
"learning_rate": 6.181723237597912e-06,
"loss": 0.7764,
"step": 5920
},
{
"epoch": 0.11167853877844548,
"grad_norm": 19.625,
"learning_rate": 6.202610966057441e-06,
"loss": 0.7877,
"step": 5940
},
{
"epoch": 0.11205456079453452,
"grad_norm": 13.5625,
"learning_rate": 6.223498694516972e-06,
"loss": 0.7779,
"step": 5960
},
{
"epoch": 0.11243058281062356,
"grad_norm": 15.9375,
"learning_rate": 6.244386422976502e-06,
"loss": 0.7711,
"step": 5980
},
{
"epoch": 0.1128066048267126,
"grad_norm": 7.90625,
"learning_rate": 6.265274151436031e-06,
"loss": 0.7661,
"step": 6000
},
{
"epoch": 0.11318262684280164,
"grad_norm": 7.75,
"learning_rate": 6.2861618798955615e-06,
"loss": 0.76,
"step": 6020
},
{
"epoch": 0.11355864885889069,
"grad_norm": 9.9375,
"learning_rate": 6.307049608355092e-06,
"loss": 0.7445,
"step": 6040
},
{
"epoch": 0.11393467087497973,
"grad_norm": 12.125,
"learning_rate": 6.327937336814622e-06,
"loss": 0.7601,
"step": 6060
},
{
"epoch": 0.11431069289106877,
"grad_norm": 10.5,
"learning_rate": 6.348825065274152e-06,
"loss": 0.7641,
"step": 6080
},
{
"epoch": 0.11468671490715782,
"grad_norm": 26.0,
"learning_rate": 6.3697127937336825e-06,
"loss": 0.7501,
"step": 6100
},
{
"epoch": 0.11506273692324685,
"grad_norm": 18.625,
"learning_rate": 6.390600522193212e-06,
"loss": 0.7625,
"step": 6120
},
{
"epoch": 0.1154387589393359,
"grad_norm": 12.375,
"learning_rate": 6.411488250652742e-06,
"loss": 0.752,
"step": 6140
},
{
"epoch": 0.11581478095542494,
"grad_norm": 7.21875,
"learning_rate": 6.432375979112272e-06,
"loss": 0.7546,
"step": 6160
},
{
"epoch": 0.11619080297151398,
"grad_norm": 13.5625,
"learning_rate": 6.453263707571802e-06,
"loss": 0.7533,
"step": 6180
},
{
"epoch": 0.11656682498760303,
"grad_norm": 7.78125,
"learning_rate": 6.474151436031332e-06,
"loss": 0.7427,
"step": 6200
},
{
"epoch": 0.11694284700369206,
"grad_norm": 7.34375,
"learning_rate": 6.495039164490861e-06,
"loss": 0.7569,
"step": 6220
},
{
"epoch": 0.11731886901978111,
"grad_norm": 11.5,
"learning_rate": 6.5159268929503924e-06,
"loss": 0.7568,
"step": 6240
},
{
"epoch": 0.11769489103587015,
"grad_norm": 9.125,
"learning_rate": 6.536814621409923e-06,
"loss": 0.7583,
"step": 6260
},
{
"epoch": 0.11807091305195919,
"grad_norm": 6.0,
"learning_rate": 6.557702349869453e-06,
"loss": 0.745,
"step": 6280
},
{
"epoch": 0.11844693506804824,
"grad_norm": 8.9375,
"learning_rate": 6.578590078328982e-06,
"loss": 0.7423,
"step": 6300
},
{
"epoch": 0.11882295708413727,
"grad_norm": 17.375,
"learning_rate": 6.5994778067885125e-06,
"loss": 0.7417,
"step": 6320
},
{
"epoch": 0.11919897910022632,
"grad_norm": 6.28125,
"learning_rate": 6.620365535248042e-06,
"loss": 0.7414,
"step": 6340
},
{
"epoch": 0.11957500111631536,
"grad_norm": 18.375,
"learning_rate": 6.641253263707572e-06,
"loss": 0.7489,
"step": 6360
},
{
"epoch": 0.1199510231324044,
"grad_norm": 18.75,
"learning_rate": 6.662140992167102e-06,
"loss": 0.748,
"step": 6380
},
{
"epoch": 0.12032704514849345,
"grad_norm": 7.46875,
"learning_rate": 6.683028720626632e-06,
"loss": 0.7259,
"step": 6400
},
{
"epoch": 0.12070306716458248,
"grad_norm": 6.78125,
"learning_rate": 6.703916449086162e-06,
"loss": 0.7454,
"step": 6420
},
{
"epoch": 0.12107908918067153,
"grad_norm": 12.0,
"learning_rate": 6.724804177545693e-06,
"loss": 0.7378,
"step": 6440
},
{
"epoch": 0.12145511119676057,
"grad_norm": 10.375,
"learning_rate": 6.7456919060052225e-06,
"loss": 0.7508,
"step": 6460
},
{
"epoch": 0.12183113321284961,
"grad_norm": 8.875,
"learning_rate": 6.766579634464753e-06,
"loss": 0.7262,
"step": 6480
},
{
"epoch": 0.12220715522893866,
"grad_norm": 10.375,
"learning_rate": 6.787467362924283e-06,
"loss": 0.7445,
"step": 6500
},
{
"epoch": 0.1225831772450277,
"grad_norm": 7.46875,
"learning_rate": 6.808355091383812e-06,
"loss": 0.7337,
"step": 6520
},
{
"epoch": 0.12295919926111674,
"grad_norm": 20.375,
"learning_rate": 6.829242819843343e-06,
"loss": 0.7305,
"step": 6540
},
{
"epoch": 0.12333522127720578,
"grad_norm": 6.875,
"learning_rate": 6.850130548302872e-06,
"loss": 0.7247,
"step": 6560
},
{
"epoch": 0.12371124329329482,
"grad_norm": 9.1875,
"learning_rate": 6.871018276762402e-06,
"loss": 0.7185,
"step": 6580
},
{
"epoch": 0.12408726530938387,
"grad_norm": 8.125,
"learning_rate": 6.8919060052219325e-06,
"loss": 0.7359,
"step": 6600
},
{
"epoch": 0.1244632873254729,
"grad_norm": 11.3125,
"learning_rate": 6.9127937336814636e-06,
"loss": 0.7158,
"step": 6620
},
{
"epoch": 0.12483930934156195,
"grad_norm": 17.75,
"learning_rate": 6.933681462140993e-06,
"loss": 0.7367,
"step": 6640
},
{
"epoch": 0.12521533135765098,
"grad_norm": 9.375,
"learning_rate": 6.954569190600523e-06,
"loss": 0.718,
"step": 6660
},
{
"epoch": 0.12559135337374003,
"grad_norm": 8.375,
"learning_rate": 6.975456919060053e-06,
"loss": 0.7174,
"step": 6680
},
{
"epoch": 0.12596737538982908,
"grad_norm": 6.90625,
"learning_rate": 6.996344647519583e-06,
"loss": 0.7147,
"step": 6700
},
{
"epoch": 0.12634339740591813,
"grad_norm": 7.40625,
"learning_rate": 7.017232375979113e-06,
"loss": 0.7146,
"step": 6720
},
{
"epoch": 0.12671941942200715,
"grad_norm": 9.0,
"learning_rate": 7.0381201044386425e-06,
"loss": 0.7205,
"step": 6740
},
{
"epoch": 0.1270954414380962,
"grad_norm": 4.9375,
"learning_rate": 7.059007832898173e-06,
"loss": 0.715,
"step": 6760
},
{
"epoch": 0.12747146345418525,
"grad_norm": 7.375,
"learning_rate": 7.079895561357703e-06,
"loss": 0.7167,
"step": 6780
},
{
"epoch": 0.12784748547027427,
"grad_norm": 6.3125,
"learning_rate": 7.100783289817232e-06,
"loss": 0.7125,
"step": 6800
},
{
"epoch": 0.12822350748636333,
"grad_norm": 6.5625,
"learning_rate": 7.121671018276763e-06,
"loss": 0.7231,
"step": 6820
},
{
"epoch": 0.12859952950245238,
"grad_norm": 7.96875,
"learning_rate": 7.142558746736294e-06,
"loss": 0.7154,
"step": 6840
},
{
"epoch": 0.1289755515185414,
"grad_norm": 6.46875,
"learning_rate": 7.163446475195823e-06,
"loss": 0.7118,
"step": 6860
},
{
"epoch": 0.12935157353463045,
"grad_norm": 12.8125,
"learning_rate": 7.184334203655353e-06,
"loss": 0.6987,
"step": 6880
},
{
"epoch": 0.1297275955507195,
"grad_norm": 6.375,
"learning_rate": 7.205221932114883e-06,
"loss": 0.7044,
"step": 6900
},
{
"epoch": 0.13010361756680855,
"grad_norm": 6.6875,
"learning_rate": 7.226109660574413e-06,
"loss": 0.6975,
"step": 6920
},
{
"epoch": 0.13047963958289757,
"grad_norm": 5.875,
"learning_rate": 7.246997389033943e-06,
"loss": 0.7044,
"step": 6940
},
{
"epoch": 0.13085566159898662,
"grad_norm": 10.8125,
"learning_rate": 7.2678851174934725e-06,
"loss": 0.6952,
"step": 6960
},
{
"epoch": 0.13123168361507567,
"grad_norm": 5.25,
"learning_rate": 7.288772845953003e-06,
"loss": 0.7118,
"step": 6980
},
{
"epoch": 0.1316077056311647,
"grad_norm": 5.28125,
"learning_rate": 7.309660574412534e-06,
"loss": 0.713,
"step": 7000
},
{
"epoch": 0.13198372764725375,
"grad_norm": 8.0625,
"learning_rate": 7.330548302872063e-06,
"loss": 0.7068,
"step": 7020
},
{
"epoch": 0.1323597496633428,
"grad_norm": 6.21875,
"learning_rate": 7.3514360313315935e-06,
"loss": 0.7073,
"step": 7040
},
{
"epoch": 0.13273577167943182,
"grad_norm": 9.9375,
"learning_rate": 7.372323759791124e-06,
"loss": 0.7097,
"step": 7060
},
{
"epoch": 0.13311179369552087,
"grad_norm": 5.09375,
"learning_rate": 7.393211488250653e-06,
"loss": 0.6954,
"step": 7080
},
{
"epoch": 0.13348781571160992,
"grad_norm": 5.0,
"learning_rate": 7.414099216710183e-06,
"loss": 0.7081,
"step": 7100
},
{
"epoch": 0.13386383772769897,
"grad_norm": 4.6875,
"learning_rate": 7.4349869451697136e-06,
"loss": 0.702,
"step": 7120
},
{
"epoch": 0.134239859743788,
"grad_norm": 5.09375,
"learning_rate": 7.455874673629243e-06,
"loss": 0.7005,
"step": 7140
},
{
"epoch": 0.13461588175987704,
"grad_norm": 7.4375,
"learning_rate": 7.476762402088773e-06,
"loss": 0.6976,
"step": 7160
},
{
"epoch": 0.1349919037759661,
"grad_norm": 4.46875,
"learning_rate": 7.497650130548304e-06,
"loss": 0.6871,
"step": 7180
},
{
"epoch": 0.13536792579205512,
"grad_norm": 4.71875,
"learning_rate": 7.518537859007834e-06,
"loss": 0.6885,
"step": 7200
},
{
"epoch": 0.13574394780814417,
"grad_norm": 4.65625,
"learning_rate": 7.539425587467364e-06,
"loss": 0.6964,
"step": 7220
},
{
"epoch": 0.13611996982423322,
"grad_norm": 8.6875,
"learning_rate": 7.560313315926894e-06,
"loss": 0.7011,
"step": 7240
},
{
"epoch": 0.13649599184032224,
"grad_norm": 8.75,
"learning_rate": 7.5812010443864235e-06,
"loss": 0.687,
"step": 7260
},
{
"epoch": 0.1368720138564113,
"grad_norm": 4.03125,
"learning_rate": 7.602088772845954e-06,
"loss": 0.6915,
"step": 7280
},
{
"epoch": 0.13724803587250034,
"grad_norm": 5.84375,
"learning_rate": 7.622976501305483e-06,
"loss": 0.6777,
"step": 7300
},
{
"epoch": 0.1376240578885894,
"grad_norm": 4.78125,
"learning_rate": 7.643864229765013e-06,
"loss": 0.6925,
"step": 7320
},
{
"epoch": 0.1380000799046784,
"grad_norm": 13.3125,
"learning_rate": 7.664751958224544e-06,
"loss": 0.6797,
"step": 7340
},
{
"epoch": 0.13837610192076746,
"grad_norm": 7.5625,
"learning_rate": 7.685639686684074e-06,
"loss": 0.6898,
"step": 7360
},
{
"epoch": 0.13875212393685651,
"grad_norm": 4.34375,
"learning_rate": 7.706527415143604e-06,
"loss": 0.689,
"step": 7380
},
{
"epoch": 0.13912814595294554,
"grad_norm": 4.0,
"learning_rate": 7.727415143603134e-06,
"loss": 0.6946,
"step": 7400
},
{
"epoch": 0.1395041679690346,
"grad_norm": 5.53125,
"learning_rate": 7.748302872062665e-06,
"loss": 0.6817,
"step": 7420
},
{
"epoch": 0.13988018998512364,
"grad_norm": 6.03125,
"learning_rate": 7.769190600522193e-06,
"loss": 0.6864,
"step": 7440
},
{
"epoch": 0.14025621200121266,
"grad_norm": 4.84375,
"learning_rate": 7.790078328981723e-06,
"loss": 0.6869,
"step": 7460
},
{
"epoch": 0.1406322340173017,
"grad_norm": 4.4375,
"learning_rate": 7.810966057441254e-06,
"loss": 0.6908,
"step": 7480
},
{
"epoch": 0.14100825603339076,
"grad_norm": 8.625,
"learning_rate": 7.831853785900784e-06,
"loss": 0.6784,
"step": 7500
},
{
"epoch": 0.14138427804947978,
"grad_norm": 3.25,
"learning_rate": 7.852741514360314e-06,
"loss": 0.6762,
"step": 7520
},
{
"epoch": 0.14176030006556883,
"grad_norm": 8.4375,
"learning_rate": 7.873629242819844e-06,
"loss": 0.6726,
"step": 7540
},
{
"epoch": 0.14213632208165788,
"grad_norm": 4.1875,
"learning_rate": 7.894516971279375e-06,
"loss": 0.6635,
"step": 7560
},
{
"epoch": 0.14251234409774693,
"grad_norm": 5.40625,
"learning_rate": 7.915404699738905e-06,
"loss": 0.6875,
"step": 7580
},
{
"epoch": 0.14288836611383596,
"grad_norm": 4.5625,
"learning_rate": 7.936292428198435e-06,
"loss": 0.6747,
"step": 7600
},
{
"epoch": 0.143264388129925,
"grad_norm": 3.53125,
"learning_rate": 7.957180156657964e-06,
"loss": 0.6749,
"step": 7620
},
{
"epoch": 0.14364041014601406,
"grad_norm": 4.125,
"learning_rate": 7.978067885117494e-06,
"loss": 0.6701,
"step": 7640
},
{
"epoch": 0.14401643216210308,
"grad_norm": 5.59375,
"learning_rate": 7.998955613577024e-06,
"loss": 0.6641,
"step": 7660
},
{
"epoch": 0.14439245417819213,
"grad_norm": 10.75,
"learning_rate": 8.019843342036554e-06,
"loss": 0.661,
"step": 7680
},
{
"epoch": 0.14476847619428118,
"grad_norm": 4.84375,
"learning_rate": 8.040731070496085e-06,
"loss": 0.6687,
"step": 7700
},
{
"epoch": 0.1451444982103702,
"grad_norm": 4.875,
"learning_rate": 8.061618798955613e-06,
"loss": 0.6559,
"step": 7720
},
{
"epoch": 0.14552052022645925,
"grad_norm": 10.6875,
"learning_rate": 8.082506527415143e-06,
"loss": 0.6601,
"step": 7740
},
{
"epoch": 0.1458965422425483,
"grad_norm": 4.46875,
"learning_rate": 8.103394255874675e-06,
"loss": 0.6761,
"step": 7760
},
{
"epoch": 0.14627256425863736,
"grad_norm": 3.609375,
"learning_rate": 8.124281984334205e-06,
"loss": 0.6663,
"step": 7780
},
{
"epoch": 0.14664858627472638,
"grad_norm": 4.0625,
"learning_rate": 8.145169712793734e-06,
"loss": 0.6693,
"step": 7800
},
{
"epoch": 0.14702460829081543,
"grad_norm": 7.25,
"learning_rate": 8.166057441253264e-06,
"loss": 0.6543,
"step": 7820
},
{
"epoch": 0.14740063030690448,
"grad_norm": 4.59375,
"learning_rate": 8.186945169712795e-06,
"loss": 0.6718,
"step": 7840
},
{
"epoch": 0.1477766523229935,
"grad_norm": 4.5,
"learning_rate": 8.207832898172325e-06,
"loss": 0.6534,
"step": 7860
},
{
"epoch": 0.14815267433908255,
"grad_norm": 5.21875,
"learning_rate": 8.228720626631855e-06,
"loss": 0.6576,
"step": 7880
},
{
"epoch": 0.1485286963551716,
"grad_norm": 3.0,
"learning_rate": 8.249608355091384e-06,
"loss": 0.646,
"step": 7900
},
{
"epoch": 0.14890471837126062,
"grad_norm": 4.21875,
"learning_rate": 8.270496083550914e-06,
"loss": 0.6576,
"step": 7920
},
{
"epoch": 0.14928074038734968,
"grad_norm": 4.875,
"learning_rate": 8.291383812010446e-06,
"loss": 0.6728,
"step": 7940
},
{
"epoch": 0.14965676240343873,
"grad_norm": 6.1875,
"learning_rate": 8.312271540469974e-06,
"loss": 0.6676,
"step": 7960
},
{
"epoch": 0.15003278441952778,
"grad_norm": 11.6875,
"learning_rate": 8.333159268929504e-06,
"loss": 0.6537,
"step": 7980
},
{
"epoch": 0.1504088064356168,
"grad_norm": 5.40625,
"learning_rate": 8.354046997389035e-06,
"loss": 0.6551,
"step": 8000
},
{
"epoch": 0.15078482845170585,
"grad_norm": 3.703125,
"learning_rate": 8.374934725848565e-06,
"loss": 0.6531,
"step": 8020
},
{
"epoch": 0.1511608504677949,
"grad_norm": 4.0,
"learning_rate": 8.395822454308095e-06,
"loss": 0.654,
"step": 8040
},
{
"epoch": 0.15153687248388392,
"grad_norm": 7.3125,
"learning_rate": 8.416710182767624e-06,
"loss": 0.658,
"step": 8060
},
{
"epoch": 0.15191289449997297,
"grad_norm": 3.71875,
"learning_rate": 8.437597911227154e-06,
"loss": 0.6516,
"step": 8080
},
{
"epoch": 0.15228891651606202,
"grad_norm": 6.34375,
"learning_rate": 8.458485639686684e-06,
"loss": 0.6584,
"step": 8100
},
{
"epoch": 0.15266493853215105,
"grad_norm": 3.171875,
"learning_rate": 8.479373368146214e-06,
"loss": 0.6544,
"step": 8120
},
{
"epoch": 0.1530409605482401,
"grad_norm": 3.765625,
"learning_rate": 8.500261096605745e-06,
"loss": 0.6579,
"step": 8140
},
{
"epoch": 0.15341698256432915,
"grad_norm": 3.03125,
"learning_rate": 8.521148825065275e-06,
"loss": 0.6413,
"step": 8160
},
{
"epoch": 0.1537930045804182,
"grad_norm": 2.734375,
"learning_rate": 8.542036553524805e-06,
"loss": 0.6537,
"step": 8180
},
{
"epoch": 0.15416902659650722,
"grad_norm": 2.53125,
"learning_rate": 8.562924281984335e-06,
"loss": 0.6568,
"step": 8200
},
{
"epoch": 0.15454504861259627,
"grad_norm": 3.203125,
"learning_rate": 8.583812010443866e-06,
"loss": 0.636,
"step": 8220
},
{
"epoch": 0.15492107062868532,
"grad_norm": 2.296875,
"learning_rate": 8.604699738903394e-06,
"loss": 0.6439,
"step": 8240
},
{
"epoch": 0.15529709264477434,
"grad_norm": 3.296875,
"learning_rate": 8.625587467362924e-06,
"loss": 0.6409,
"step": 8260
},
{
"epoch": 0.1556731146608634,
"grad_norm": 3.015625,
"learning_rate": 8.646475195822455e-06,
"loss": 0.6533,
"step": 8280
},
{
"epoch": 0.15604913667695244,
"grad_norm": 3.3125,
"learning_rate": 8.667362924281985e-06,
"loss": 0.648,
"step": 8300
},
{
"epoch": 0.15642515869304147,
"grad_norm": 2.5625,
"learning_rate": 8.688250652741515e-06,
"loss": 0.6388,
"step": 8320
},
{
"epoch": 0.15680118070913052,
"grad_norm": 5.125,
"learning_rate": 8.709138381201045e-06,
"loss": 0.6439,
"step": 8340
},
{
"epoch": 0.15717720272521957,
"grad_norm": 2.359375,
"learning_rate": 8.730026109660576e-06,
"loss": 0.6397,
"step": 8360
},
{
"epoch": 0.15755322474130862,
"grad_norm": 2.5625,
"learning_rate": 8.750913838120106e-06,
"loss": 0.6342,
"step": 8380
},
{
"epoch": 0.15792924675739764,
"grad_norm": 2.46875,
"learning_rate": 8.771801566579634e-06,
"loss": 0.6388,
"step": 8400
},
{
"epoch": 0.1583052687734867,
"grad_norm": 2.953125,
"learning_rate": 8.792689295039165e-06,
"loss": 0.6361,
"step": 8420
},
{
"epoch": 0.15868129078957574,
"grad_norm": 3.96875,
"learning_rate": 8.813577023498695e-06,
"loss": 0.6334,
"step": 8440
},
{
"epoch": 0.15905731280566476,
"grad_norm": 2.546875,
"learning_rate": 8.834464751958225e-06,
"loss": 0.6556,
"step": 8460
},
{
"epoch": 0.1594333348217538,
"grad_norm": 3.015625,
"learning_rate": 8.855352480417755e-06,
"loss": 0.6357,
"step": 8480
},
{
"epoch": 0.15980935683784286,
"grad_norm": 3.0,
"learning_rate": 8.876240208877286e-06,
"loss": 0.6383,
"step": 8500
},
{
"epoch": 0.1601853788539319,
"grad_norm": 2.625,
"learning_rate": 8.897127937336816e-06,
"loss": 0.6429,
"step": 8520
},
{
"epoch": 0.16056140087002094,
"grad_norm": 3.609375,
"learning_rate": 8.918015665796346e-06,
"loss": 0.6338,
"step": 8540
},
{
"epoch": 0.16093742288611,
"grad_norm": 2.171875,
"learning_rate": 8.938903394255876e-06,
"loss": 0.649,
"step": 8560
},
{
"epoch": 0.161313444902199,
"grad_norm": 3.984375,
"learning_rate": 8.959791122715405e-06,
"loss": 0.6272,
"step": 8580
},
{
"epoch": 0.16168946691828806,
"grad_norm": 3.015625,
"learning_rate": 8.980678851174935e-06,
"loss": 0.6372,
"step": 8600
},
{
"epoch": 0.1620654889343771,
"grad_norm": 2.3125,
"learning_rate": 9.001566579634465e-06,
"loss": 0.6178,
"step": 8620
},
{
"epoch": 0.16244151095046616,
"grad_norm": 4.875,
"learning_rate": 9.022454308093996e-06,
"loss": 0.6354,
"step": 8640
},
{
"epoch": 0.16281753296655518,
"grad_norm": 2.671875,
"learning_rate": 9.043342036553526e-06,
"loss": 0.6427,
"step": 8660
},
{
"epoch": 0.16319355498264423,
"grad_norm": 2.828125,
"learning_rate": 9.064229765013054e-06,
"loss": 0.6283,
"step": 8680
},
{
"epoch": 0.16356957699873328,
"grad_norm": 2.703125,
"learning_rate": 9.085117493472586e-06,
"loss": 0.6364,
"step": 8700
},
{
"epoch": 0.1639455990148223,
"grad_norm": 2.21875,
"learning_rate": 9.106005221932116e-06,
"loss": 0.6289,
"step": 8720
},
{
"epoch": 0.16432162103091136,
"grad_norm": 2.484375,
"learning_rate": 9.126892950391647e-06,
"loss": 0.6357,
"step": 8740
},
{
"epoch": 0.1646976430470004,
"grad_norm": 4.0625,
"learning_rate": 9.147780678851175e-06,
"loss": 0.6392,
"step": 8760
},
{
"epoch": 0.16507366506308943,
"grad_norm": 1.6953125,
"learning_rate": 9.168668407310705e-06,
"loss": 0.6211,
"step": 8780
},
{
"epoch": 0.16544968707917848,
"grad_norm": 2.34375,
"learning_rate": 9.189556135770236e-06,
"loss": 0.6395,
"step": 8800
},
{
"epoch": 0.16582570909526753,
"grad_norm": 2.765625,
"learning_rate": 9.210443864229766e-06,
"loss": 0.625,
"step": 8820
},
{
"epoch": 0.16620173111135658,
"grad_norm": 2.296875,
"learning_rate": 9.231331592689296e-06,
"loss": 0.6307,
"step": 8840
},
{
"epoch": 0.1665777531274456,
"grad_norm": 2.0625,
"learning_rate": 9.252219321148825e-06,
"loss": 0.6201,
"step": 8860
},
{
"epoch": 0.16695377514353466,
"grad_norm": 1.96875,
"learning_rate": 9.273107049608357e-06,
"loss": 0.6211,
"step": 8880
},
{
"epoch": 0.1673297971596237,
"grad_norm": 1.9765625,
"learning_rate": 9.293994778067887e-06,
"loss": 0.6242,
"step": 8900
},
{
"epoch": 0.16770581917571273,
"grad_norm": 2.0625,
"learning_rate": 9.314882506527415e-06,
"loss": 0.6216,
"step": 8920
},
{
"epoch": 0.16808184119180178,
"grad_norm": 2.21875,
"learning_rate": 9.335770234986946e-06,
"loss": 0.6271,
"step": 8940
},
{
"epoch": 0.16845786320789083,
"grad_norm": 1.7265625,
"learning_rate": 9.356657963446476e-06,
"loss": 0.6366,
"step": 8960
},
{
"epoch": 0.16883388522397985,
"grad_norm": 2.234375,
"learning_rate": 9.377545691906006e-06,
"loss": 0.6155,
"step": 8980
},
{
"epoch": 0.1692099072400689,
"grad_norm": 2.421875,
"learning_rate": 9.398433420365536e-06,
"loss": 0.6162,
"step": 9000
},
{
"epoch": 0.16958592925615795,
"grad_norm": 3.578125,
"learning_rate": 9.419321148825065e-06,
"loss": 0.6269,
"step": 9020
},
{
"epoch": 0.169961951272247,
"grad_norm": 1.953125,
"learning_rate": 9.440208877284595e-06,
"loss": 0.6306,
"step": 9040
},
{
"epoch": 0.17033797328833603,
"grad_norm": 1.7265625,
"learning_rate": 9.461096605744125e-06,
"loss": 0.6247,
"step": 9060
},
{
"epoch": 0.17071399530442508,
"grad_norm": 1.9140625,
"learning_rate": 9.481984334203657e-06,
"loss": 0.613,
"step": 9080
},
{
"epoch": 0.17109001732051413,
"grad_norm": 3.953125,
"learning_rate": 9.502872062663186e-06,
"loss": 0.6187,
"step": 9100
},
{
"epoch": 0.17146603933660315,
"grad_norm": 2.875,
"learning_rate": 9.523759791122716e-06,
"loss": 0.6215,
"step": 9120
},
{
"epoch": 0.1718420613526922,
"grad_norm": 2.484375,
"learning_rate": 9.544647519582246e-06,
"loss": 0.6234,
"step": 9140
},
{
"epoch": 0.17221808336878125,
"grad_norm": 2.421875,
"learning_rate": 9.565535248041777e-06,
"loss": 0.618,
"step": 9160
},
{
"epoch": 0.17259410538487027,
"grad_norm": 1.7734375,
"learning_rate": 9.586422976501307e-06,
"loss": 0.6134,
"step": 9180
},
{
"epoch": 0.17297012740095932,
"grad_norm": 2.296875,
"learning_rate": 9.607310704960835e-06,
"loss": 0.6127,
"step": 9200
},
{
"epoch": 0.17334614941704837,
"grad_norm": 2.625,
"learning_rate": 9.628198433420366e-06,
"loss": 0.6139,
"step": 9220
},
{
"epoch": 0.17372217143313742,
"grad_norm": 2.875,
"learning_rate": 9.649086161879896e-06,
"loss": 0.6121,
"step": 9240
},
{
"epoch": 0.17409819344922645,
"grad_norm": 2.09375,
"learning_rate": 9.669973890339426e-06,
"loss": 0.6126,
"step": 9260
},
{
"epoch": 0.1744742154653155,
"grad_norm": 1.2265625,
"learning_rate": 9.690861618798956e-06,
"loss": 0.6162,
"step": 9280
},
{
"epoch": 0.17485023748140455,
"grad_norm": 2.796875,
"learning_rate": 9.711749347258487e-06,
"loss": 0.6072,
"step": 9300
},
{
"epoch": 0.17522625949749357,
"grad_norm": 1.5234375,
"learning_rate": 9.732637075718017e-06,
"loss": 0.6122,
"step": 9320
},
{
"epoch": 0.17560228151358262,
"grad_norm": 1.515625,
"learning_rate": 9.753524804177547e-06,
"loss": 0.6053,
"step": 9340
},
{
"epoch": 0.17597830352967167,
"grad_norm": 1.4765625,
"learning_rate": 9.774412532637077e-06,
"loss": 0.6149,
"step": 9360
},
{
"epoch": 0.1763543255457607,
"grad_norm": 1.5,
"learning_rate": 9.795300261096606e-06,
"loss": 0.6229,
"step": 9380
},
{
"epoch": 0.17673034756184974,
"grad_norm": 1.609375,
"learning_rate": 9.816187989556136e-06,
"loss": 0.6134,
"step": 9400
},
{
"epoch": 0.1771063695779388,
"grad_norm": 2.140625,
"learning_rate": 9.837075718015666e-06,
"loss": 0.6155,
"step": 9420
},
{
"epoch": 0.17748239159402784,
"grad_norm": 1.6015625,
"learning_rate": 9.857963446475197e-06,
"loss": 0.6042,
"step": 9440
},
{
"epoch": 0.17785841361011687,
"grad_norm": 1.78125,
"learning_rate": 9.878851174934727e-06,
"loss": 0.6182,
"step": 9460
},
{
"epoch": 0.17823443562620592,
"grad_norm": 1.3515625,
"learning_rate": 9.899738903394257e-06,
"loss": 0.6036,
"step": 9480
},
{
"epoch": 0.17861045764229497,
"grad_norm": 1.984375,
"learning_rate": 9.920626631853787e-06,
"loss": 0.6027,
"step": 9500
},
{
"epoch": 0.178986479658384,
"grad_norm": 1.5,
"learning_rate": 9.941514360313318e-06,
"loss": 0.6089,
"step": 9520
},
{
"epoch": 0.17936250167447304,
"grad_norm": 1.7734375,
"learning_rate": 9.962402088772846e-06,
"loss": 0.604,
"step": 9540
},
{
"epoch": 0.1797385236905621,
"grad_norm": 1.28125,
"learning_rate": 9.983289817232376e-06,
"loss": 0.6004,
"step": 9560
},
{
"epoch": 0.1801145457066511,
"grad_norm": 1.5234375,
"learning_rate": 9.999999995880232e-06,
"loss": 0.6019,
"step": 9580
},
{
"epoch": 0.18049056772274016,
"grad_norm": 1.421875,
"learning_rate": 9.999999851688318e-06,
"loss": 0.6145,
"step": 9600
},
{
"epoch": 0.18086658973882921,
"grad_norm": 1.78125,
"learning_rate": 9.999999501507959e-06,
"loss": 0.6105,
"step": 9620
},
{
"epoch": 0.18124261175491824,
"grad_norm": 2.109375,
"learning_rate": 9.999998945339171e-06,
"loss": 0.6139,
"step": 9640
},
{
"epoch": 0.1816186337710073,
"grad_norm": 1.8671875,
"learning_rate": 9.999998183181976e-06,
"loss": 0.6122,
"step": 9660
},
{
"epoch": 0.18199465578709634,
"grad_norm": 1.6171875,
"learning_rate": 9.999997215036408e-06,
"loss": 0.6095,
"step": 9680
},
{
"epoch": 0.1823706778031854,
"grad_norm": 1.359375,
"learning_rate": 9.999996040902503e-06,
"loss": 0.5928,
"step": 9700
},
{
"epoch": 0.1827466998192744,
"grad_norm": 1.21875,
"learning_rate": 9.999994660780312e-06,
"loss": 0.6034,
"step": 9720
},
{
"epoch": 0.18312272183536346,
"grad_norm": 1.3984375,
"learning_rate": 9.99999307466989e-06,
"loss": 0.6012,
"step": 9740
},
{
"epoch": 0.1834987438514525,
"grad_norm": 2.09375,
"learning_rate": 9.999991282571304e-06,
"loss": 0.605,
"step": 9760
},
{
"epoch": 0.18387476586754153,
"grad_norm": 1.421875,
"learning_rate": 9.999989284484629e-06,
"loss": 0.6093,
"step": 9780
},
{
"epoch": 0.18425078788363058,
"grad_norm": 1.4765625,
"learning_rate": 9.999987080409942e-06,
"loss": 0.6032,
"step": 9800
},
{
"epoch": 0.18462680989971963,
"grad_norm": 1.2421875,
"learning_rate": 9.99998467034734e-06,
"loss": 0.6019,
"step": 9820
},
{
"epoch": 0.18500283191580866,
"grad_norm": 1.84375,
"learning_rate": 9.99998205429692e-06,
"loss": 0.6006,
"step": 9840
},
{
"epoch": 0.1853788539318977,
"grad_norm": 1.328125,
"learning_rate": 9.999979232258787e-06,
"loss": 0.586,
"step": 9860
},
{
"epoch": 0.18575487594798676,
"grad_norm": 1.375,
"learning_rate": 9.999976204233062e-06,
"loss": 0.614,
"step": 9880
},
{
"epoch": 0.1861308979640758,
"grad_norm": 1.2421875,
"learning_rate": 9.999972970219865e-06,
"loss": 0.6049,
"step": 9900
},
{
"epoch": 0.18650691998016483,
"grad_norm": 1.3984375,
"learning_rate": 9.999969530219333e-06,
"loss": 0.6048,
"step": 9920
},
{
"epoch": 0.18688294199625388,
"grad_norm": 1.3828125,
"learning_rate": 9.999965884231607e-06,
"loss": 0.608,
"step": 9940
},
{
"epoch": 0.18725896401234293,
"grad_norm": 1.609375,
"learning_rate": 9.999962032256836e-06,
"loss": 0.6017,
"step": 9960
},
{
"epoch": 0.18763498602843195,
"grad_norm": 1.265625,
"learning_rate": 9.99995797429518e-06,
"loss": 0.592,
"step": 9980
},
{
"epoch": 0.188011008044521,
"grad_norm": 1.421875,
"learning_rate": 9.999953710346804e-06,
"loss": 0.602,
"step": 10000
},
{
"epoch": 0.18838703006061006,
"grad_norm": 1.3125,
"learning_rate": 9.999949240411886e-06,
"loss": 0.5894,
"step": 10020
},
{
"epoch": 0.18876305207669908,
"grad_norm": 1.375,
"learning_rate": 9.99994456449061e-06,
"loss": 0.5908,
"step": 10040
},
{
"epoch": 0.18913907409278813,
"grad_norm": 1.2890625,
"learning_rate": 9.999939682583166e-06,
"loss": 0.5914,
"step": 10060
},
{
"epoch": 0.18951509610887718,
"grad_norm": 1.6328125,
"learning_rate": 9.999934594689759e-06,
"loss": 0.5951,
"step": 10080
},
{
"epoch": 0.18989111812496623,
"grad_norm": 1.1796875,
"learning_rate": 9.999929300810595e-06,
"loss": 0.5925,
"step": 10100
},
{
"epoch": 0.19026714014105525,
"grad_norm": 1.1953125,
"learning_rate": 9.999923800945895e-06,
"loss": 0.5982,
"step": 10120
},
{
"epoch": 0.1906431621571443,
"grad_norm": 1.1640625,
"learning_rate": 9.999918095095884e-06,
"loss": 0.6023,
"step": 10140
},
{
"epoch": 0.19101918417323335,
"grad_norm": 1.171875,
"learning_rate": 9.999912183260798e-06,
"loss": 0.5926,
"step": 10160
},
{
"epoch": 0.19139520618932238,
"grad_norm": 1.28125,
"learning_rate": 9.999906065440878e-06,
"loss": 0.5869,
"step": 10180
},
{
"epoch": 0.19177122820541143,
"grad_norm": 1.296875,
"learning_rate": 9.999899741636381e-06,
"loss": 0.5965,
"step": 10200
},
{
"epoch": 0.19214725022150048,
"grad_norm": 1.0234375,
"learning_rate": 9.999893211847563e-06,
"loss": 0.601,
"step": 10220
},
{
"epoch": 0.1925232722375895,
"grad_norm": 1.0390625,
"learning_rate": 9.999886476074694e-06,
"loss": 0.5916,
"step": 10240
},
{
"epoch": 0.19289929425367855,
"grad_norm": 1.078125,
"learning_rate": 9.999879534318051e-06,
"loss": 0.5947,
"step": 10260
},
{
"epoch": 0.1932753162697676,
"grad_norm": 1.109375,
"learning_rate": 9.999872386577923e-06,
"loss": 0.5979,
"step": 10280
},
{
"epoch": 0.19365133828585665,
"grad_norm": 1.21875,
"learning_rate": 9.9998650328546e-06,
"loss": 0.5927,
"step": 10300
},
{
"epoch": 0.19402736030194567,
"grad_norm": 3.171875,
"learning_rate": 9.99985747314839e-06,
"loss": 0.5999,
"step": 10320
},
{
"epoch": 0.19440338231803472,
"grad_norm": 1.1640625,
"learning_rate": 9.999849707459601e-06,
"loss": 0.6072,
"step": 10340
},
{
"epoch": 0.19477940433412377,
"grad_norm": 1.6171875,
"learning_rate": 9.999841735788555e-06,
"loss": 0.601,
"step": 10360
},
{
"epoch": 0.1951554263502128,
"grad_norm": 1.484375,
"learning_rate": 9.999833558135578e-06,
"loss": 0.5996,
"step": 10380
},
{
"epoch": 0.19553144836630185,
"grad_norm": 1.015625,
"learning_rate": 9.999825174501009e-06,
"loss": 0.5907,
"step": 10400
},
{
"epoch": 0.1959074703823909,
"grad_norm": 1.265625,
"learning_rate": 9.999816584885192e-06,
"loss": 0.5888,
"step": 10420
},
{
"epoch": 0.19628349239847992,
"grad_norm": 1.515625,
"learning_rate": 9.99980778928848e-06,
"loss": 0.5894,
"step": 10440
},
{
"epoch": 0.19665951441456897,
"grad_norm": 2.25,
"learning_rate": 9.999798787711239e-06,
"loss": 0.5938,
"step": 10460
},
{
"epoch": 0.19703553643065802,
"grad_norm": 1.1796875,
"learning_rate": 9.999789580153835e-06,
"loss": 0.5832,
"step": 10480
},
{
"epoch": 0.19741155844674707,
"grad_norm": 1.1015625,
"learning_rate": 9.999780166616652e-06,
"loss": 0.579,
"step": 10500
},
{
"epoch": 0.1977875804628361,
"grad_norm": 1.015625,
"learning_rate": 9.999770547100073e-06,
"loss": 0.596,
"step": 10520
},
{
"epoch": 0.19816360247892514,
"grad_norm": 1.453125,
"learning_rate": 9.9997607216045e-06,
"loss": 0.5934,
"step": 10540
},
{
"epoch": 0.1985396244950142,
"grad_norm": 1.4140625,
"learning_rate": 9.999750690130335e-06,
"loss": 0.5884,
"step": 10560
},
{
"epoch": 0.19891564651110322,
"grad_norm": 1.390625,
"learning_rate": 9.99974045267799e-06,
"loss": 0.5949,
"step": 10580
},
{
"epoch": 0.19929166852719227,
"grad_norm": 1.2265625,
"learning_rate": 9.999730009247888e-06,
"loss": 0.5935,
"step": 10600
},
{
"epoch": 0.19966769054328132,
"grad_norm": 1.265625,
"learning_rate": 9.999719359840459e-06,
"loss": 0.5904,
"step": 10620
},
{
"epoch": 0.20004371255937034,
"grad_norm": 1.296875,
"learning_rate": 9.99970850445614e-06,
"loss": 0.5811,
"step": 10640
},
{
"epoch": 0.2004197345754594,
"grad_norm": 0.98828125,
"learning_rate": 9.999697443095383e-06,
"loss": 0.584,
"step": 10660
},
{
"epoch": 0.20079575659154844,
"grad_norm": 1.125,
"learning_rate": 9.999686175758639e-06,
"loss": 0.586,
"step": 10680
},
{
"epoch": 0.20117177860763746,
"grad_norm": 1.0234375,
"learning_rate": 9.999674702446375e-06,
"loss": 0.5924,
"step": 10700
},
{
"epoch": 0.20154780062372651,
"grad_norm": 1.1796875,
"learning_rate": 9.999663023159062e-06,
"loss": 0.5876,
"step": 10720
},
{
"epoch": 0.20192382263981556,
"grad_norm": 0.97265625,
"learning_rate": 9.999651137897182e-06,
"loss": 0.5857,
"step": 10740
},
{
"epoch": 0.20229984465590461,
"grad_norm": 1.0234375,
"learning_rate": 9.999639046661226e-06,
"loss": 0.5847,
"step": 10760
},
{
"epoch": 0.20267586667199364,
"grad_norm": 1.1953125,
"learning_rate": 9.999626749451688e-06,
"loss": 0.5865,
"step": 10780
},
{
"epoch": 0.2030518886880827,
"grad_norm": 1.0234375,
"learning_rate": 9.999614246269076e-06,
"loss": 0.5876,
"step": 10800
},
{
"epoch": 0.20342791070417174,
"grad_norm": 1.4296875,
"learning_rate": 9.999601537113908e-06,
"loss": 0.5751,
"step": 10820
},
{
"epoch": 0.20380393272026076,
"grad_norm": 1.125,
"learning_rate": 9.999588621986707e-06,
"loss": 0.5764,
"step": 10840
},
{
"epoch": 0.2041799547363498,
"grad_norm": 1.671875,
"learning_rate": 9.999575500888004e-06,
"loss": 0.5752,
"step": 10860
},
{
"epoch": 0.20455597675243886,
"grad_norm": 0.921875,
"learning_rate": 9.999562173818338e-06,
"loss": 0.5858,
"step": 10880
},
{
"epoch": 0.20493199876852788,
"grad_norm": 1.09375,
"learning_rate": 9.999548640778259e-06,
"loss": 0.5932,
"step": 10900
},
{
"epoch": 0.20530802078461693,
"grad_norm": 0.97265625,
"learning_rate": 9.999534901768326e-06,
"loss": 0.5797,
"step": 10920
},
{
"epoch": 0.20568404280070599,
"grad_norm": 1.0234375,
"learning_rate": 9.999520956789104e-06,
"loss": 0.5839,
"step": 10940
},
{
"epoch": 0.20606006481679504,
"grad_norm": 0.96484375,
"learning_rate": 9.999506805841169e-06,
"loss": 0.5883,
"step": 10960
},
{
"epoch": 0.20643608683288406,
"grad_norm": 1.125,
"learning_rate": 9.999492448925102e-06,
"loss": 0.5793,
"step": 10980
},
{
"epoch": 0.2068121088489731,
"grad_norm": 1.078125,
"learning_rate": 9.999477886041493e-06,
"loss": 0.5795,
"step": 11000
},
{
"epoch": 0.20718813086506216,
"grad_norm": 1.0859375,
"learning_rate": 9.999463117190945e-06,
"loss": 0.5798,
"step": 11020
},
{
"epoch": 0.20756415288115118,
"grad_norm": 1.15625,
"learning_rate": 9.999448142374066e-06,
"loss": 0.5855,
"step": 11040
},
{
"epoch": 0.20794017489724023,
"grad_norm": 0.85546875,
"learning_rate": 9.999432961591472e-06,
"loss": 0.6086,
"step": 11060
},
{
"epoch": 0.20831619691332928,
"grad_norm": 1.2265625,
"learning_rate": 9.999417574843788e-06,
"loss": 0.5777,
"step": 11080
},
{
"epoch": 0.2086922189294183,
"grad_norm": 1.0,
"learning_rate": 9.99940198213165e-06,
"loss": 0.5858,
"step": 11100
},
{
"epoch": 0.20906824094550736,
"grad_norm": 0.86328125,
"learning_rate": 9.9993861834557e-06,
"loss": 0.5761,
"step": 11120
},
{
"epoch": 0.2094442629615964,
"grad_norm": 1.4140625,
"learning_rate": 9.999370178816586e-06,
"loss": 0.5777,
"step": 11140
},
{
"epoch": 0.20982028497768546,
"grad_norm": 1.453125,
"learning_rate": 9.999353968214969e-06,
"loss": 0.5853,
"step": 11160
},
{
"epoch": 0.21019630699377448,
"grad_norm": 1.359375,
"learning_rate": 9.999337551651517e-06,
"loss": 0.5951,
"step": 11180
},
{
"epoch": 0.21057232900986353,
"grad_norm": 0.8828125,
"learning_rate": 9.999320929126909e-06,
"loss": 0.5874,
"step": 11200
},
{
"epoch": 0.21094835102595258,
"grad_norm": 0.859375,
"learning_rate": 9.999304100641824e-06,
"loss": 0.5924,
"step": 11220
},
{
"epoch": 0.2113243730420416,
"grad_norm": 1.015625,
"learning_rate": 9.99928706619696e-06,
"loss": 0.5927,
"step": 11240
},
{
"epoch": 0.21170039505813065,
"grad_norm": 0.91015625,
"learning_rate": 9.999269825793018e-06,
"loss": 0.5941,
"step": 11260
},
{
"epoch": 0.2120764170742197,
"grad_norm": 0.9296875,
"learning_rate": 9.999252379430707e-06,
"loss": 0.5873,
"step": 11280
},
{
"epoch": 0.21245243909030873,
"grad_norm": 0.8671875,
"learning_rate": 9.999234727110746e-06,
"loss": 0.586,
"step": 11300
},
{
"epoch": 0.21282846110639778,
"grad_norm": 0.8515625,
"learning_rate": 9.999216868833864e-06,
"loss": 0.5901,
"step": 11320
},
{
"epoch": 0.21320448312248683,
"grad_norm": 1.03125,
"learning_rate": 9.999198804600793e-06,
"loss": 0.5738,
"step": 11340
},
{
"epoch": 0.21358050513857588,
"grad_norm": 1.1328125,
"learning_rate": 9.999180534412281e-06,
"loss": 0.5837,
"step": 11360
},
{
"epoch": 0.2139565271546649,
"grad_norm": 0.98046875,
"learning_rate": 9.999162058269079e-06,
"loss": 0.58,
"step": 11380
},
{
"epoch": 0.21433254917075395,
"grad_norm": 0.8359375,
"learning_rate": 9.99914337617195e-06,
"loss": 0.5803,
"step": 11400
},
{
"epoch": 0.214708571186843,
"grad_norm": 0.94140625,
"learning_rate": 9.999124488121658e-06,
"loss": 0.5759,
"step": 11420
},
{
"epoch": 0.21508459320293202,
"grad_norm": 0.9921875,
"learning_rate": 9.999105394118988e-06,
"loss": 0.5867,
"step": 11440
},
{
"epoch": 0.21546061521902107,
"grad_norm": 0.8203125,
"learning_rate": 9.999086094164724e-06,
"loss": 0.5784,
"step": 11460
},
{
"epoch": 0.21583663723511012,
"grad_norm": 0.90625,
"learning_rate": 9.99906658825966e-06,
"loss": 0.5796,
"step": 11480
},
{
"epoch": 0.21621265925119915,
"grad_norm": 0.890625,
"learning_rate": 9.999046876404602e-06,
"loss": 0.5758,
"step": 11500
},
{
"epoch": 0.2165886812672882,
"grad_norm": 0.921875,
"learning_rate": 9.999026958600358e-06,
"loss": 0.5852,
"step": 11520
},
{
"epoch": 0.21696470328337725,
"grad_norm": 1.109375,
"learning_rate": 9.999006834847752e-06,
"loss": 0.576,
"step": 11540
},
{
"epoch": 0.2173407252994663,
"grad_norm": 0.83203125,
"learning_rate": 9.998986505147612e-06,
"loss": 0.5848,
"step": 11560
},
{
"epoch": 0.21771674731555532,
"grad_norm": 1.015625,
"learning_rate": 9.998965969500779e-06,
"loss": 0.5871,
"step": 11580
},
{
"epoch": 0.21809276933164437,
"grad_norm": 0.86328125,
"learning_rate": 9.99894522790809e-06,
"loss": 0.5829,
"step": 11600
},
{
"epoch": 0.21846879134773342,
"grad_norm": 0.94921875,
"learning_rate": 9.99892428037041e-06,
"loss": 0.5742,
"step": 11620
},
{
"epoch": 0.21884481336382244,
"grad_norm": 0.89453125,
"learning_rate": 9.998903126888595e-06,
"loss": 0.5841,
"step": 11640
},
{
"epoch": 0.2192208353799115,
"grad_norm": 1.0625,
"learning_rate": 9.998881767463519e-06,
"loss": 0.5819,
"step": 11660
},
{
"epoch": 0.21959685739600054,
"grad_norm": 1.0078125,
"learning_rate": 9.998860202096063e-06,
"loss": 0.5805,
"step": 11680
},
{
"epoch": 0.21997287941208957,
"grad_norm": 0.79296875,
"learning_rate": 9.998838430787112e-06,
"loss": 0.5785,
"step": 11700
},
{
"epoch": 0.22034890142817862,
"grad_norm": 1.0078125,
"learning_rate": 9.998816453537568e-06,
"loss": 0.5804,
"step": 11720
},
{
"epoch": 0.22072492344426767,
"grad_norm": 0.91796875,
"learning_rate": 9.998794270348331e-06,
"loss": 0.5854,
"step": 11740
},
{
"epoch": 0.2211009454603567,
"grad_norm": 0.890625,
"learning_rate": 9.998771881220319e-06,
"loss": 0.5857,
"step": 11760
},
{
"epoch": 0.22147696747644574,
"grad_norm": 0.8125,
"learning_rate": 9.99874928615445e-06,
"loss": 0.5855,
"step": 11780
},
{
"epoch": 0.2218529894925348,
"grad_norm": 0.9375,
"learning_rate": 9.99872648515166e-06,
"loss": 0.5736,
"step": 11800
},
{
"epoch": 0.22222901150862384,
"grad_norm": 0.86328125,
"learning_rate": 9.998703478212885e-06,
"loss": 0.5792,
"step": 11820
},
{
"epoch": 0.22260503352471286,
"grad_norm": 0.85546875,
"learning_rate": 9.998680265339076e-06,
"loss": 0.5709,
"step": 11840
},
{
"epoch": 0.22298105554080191,
"grad_norm": 0.93359375,
"learning_rate": 9.998656846531185e-06,
"loss": 0.5717,
"step": 11860
},
{
"epoch": 0.22335707755689096,
"grad_norm": 1.0,
"learning_rate": 9.99863322179018e-06,
"loss": 0.5719,
"step": 11880
},
{
"epoch": 0.22373309957298,
"grad_norm": 0.86328125,
"learning_rate": 9.99860939111703e-06,
"loss": 0.5874,
"step": 11900
},
{
"epoch": 0.22410912158906904,
"grad_norm": 0.90234375,
"learning_rate": 9.998585354512725e-06,
"loss": 0.5723,
"step": 11920
},
{
"epoch": 0.2244851436051581,
"grad_norm": 0.96875,
"learning_rate": 9.998561111978246e-06,
"loss": 0.5899,
"step": 11940
},
{
"epoch": 0.2248611656212471,
"grad_norm": 0.76953125,
"learning_rate": 9.998536663514599e-06,
"loss": 0.5824,
"step": 11960
},
{
"epoch": 0.22523718763733616,
"grad_norm": 0.98046875,
"learning_rate": 9.998512009122787e-06,
"loss": 0.5668,
"step": 11980
},
{
"epoch": 0.2256132096534252,
"grad_norm": 1.0234375,
"learning_rate": 9.998487148803826e-06,
"loss": 0.5701,
"step": 12000
},
{
"epoch": 0.22598923166951426,
"grad_norm": 1.0078125,
"learning_rate": 9.998462082558741e-06,
"loss": 0.576,
"step": 12020
},
{
"epoch": 0.22636525368560328,
"grad_norm": 0.90625,
"learning_rate": 9.998436810388566e-06,
"loss": 0.5761,
"step": 12040
},
{
"epoch": 0.22674127570169234,
"grad_norm": 1.34375,
"learning_rate": 9.998411332294341e-06,
"loss": 0.5786,
"step": 12060
},
{
"epoch": 0.22711729771778139,
"grad_norm": 0.8125,
"learning_rate": 9.998385648277116e-06,
"loss": 0.5758,
"step": 12080
},
{
"epoch": 0.2274933197338704,
"grad_norm": 0.9921875,
"learning_rate": 9.998359758337947e-06,
"loss": 0.5769,
"step": 12100
},
{
"epoch": 0.22786934174995946,
"grad_norm": 0.76953125,
"learning_rate": 9.998333662477903e-06,
"loss": 0.5666,
"step": 12120
},
{
"epoch": 0.2282453637660485,
"grad_norm": 0.79296875,
"learning_rate": 9.998307360698059e-06,
"loss": 0.5754,
"step": 12140
},
{
"epoch": 0.22862138578213753,
"grad_norm": 0.8359375,
"learning_rate": 9.998280852999496e-06,
"loss": 0.5627,
"step": 12160
},
{
"epoch": 0.22899740779822658,
"grad_norm": 0.8359375,
"learning_rate": 9.99825413938331e-06,
"loss": 0.5797,
"step": 12180
},
{
"epoch": 0.22937342981431563,
"grad_norm": 0.8515625,
"learning_rate": 9.998227219850597e-06,
"loss": 0.5875,
"step": 12200
},
{
"epoch": 0.22974945183040468,
"grad_norm": 0.8359375,
"learning_rate": 9.998200094402471e-06,
"loss": 0.5809,
"step": 12220
},
{
"epoch": 0.2301254738464937,
"grad_norm": 1.046875,
"learning_rate": 9.998172763040048e-06,
"loss": 0.5714,
"step": 12240
},
{
"epoch": 0.23050149586258276,
"grad_norm": 0.94140625,
"learning_rate": 9.99814522576445e-06,
"loss": 0.5755,
"step": 12260
},
{
"epoch": 0.2308775178786718,
"grad_norm": 0.9453125,
"learning_rate": 9.998117482576816e-06,
"loss": 0.5764,
"step": 12280
},
{
"epoch": 0.23125353989476083,
"grad_norm": 0.9375,
"learning_rate": 9.998089533478287e-06,
"loss": 0.5699,
"step": 12300
},
{
"epoch": 0.23162956191084988,
"grad_norm": 0.83203125,
"learning_rate": 9.998061378470016e-06,
"loss": 0.5814,
"step": 12320
},
{
"epoch": 0.23200558392693893,
"grad_norm": 0.78125,
"learning_rate": 9.998033017553162e-06,
"loss": 0.5776,
"step": 12340
},
{
"epoch": 0.23238160594302795,
"grad_norm": 0.88671875,
"learning_rate": 9.99800445072889e-06,
"loss": 0.5776,
"step": 12360
},
{
"epoch": 0.232757627959117,
"grad_norm": 0.87890625,
"learning_rate": 9.997975677998385e-06,
"loss": 0.574,
"step": 12380
},
{
"epoch": 0.23313364997520605,
"grad_norm": 1.328125,
"learning_rate": 9.997946699362825e-06,
"loss": 0.5668,
"step": 12400
},
{
"epoch": 0.2335096719912951,
"grad_norm": 0.80859375,
"learning_rate": 9.997917514823406e-06,
"loss": 0.5711,
"step": 12420
},
{
"epoch": 0.23388569400738413,
"grad_norm": 0.76171875,
"learning_rate": 9.99788812438133e-06,
"loss": 0.5556,
"step": 12440
},
{
"epoch": 0.23426171602347318,
"grad_norm": 0.96484375,
"learning_rate": 9.99785852803781e-06,
"loss": 0.5841,
"step": 12460
},
{
"epoch": 0.23463773803956223,
"grad_norm": 0.91015625,
"learning_rate": 9.997828725794061e-06,
"loss": 0.5763,
"step": 12480
},
{
"epoch": 0.23501376005565125,
"grad_norm": 0.79296875,
"learning_rate": 9.997798717651316e-06,
"loss": 0.5698,
"step": 12500
},
{
"epoch": 0.2353897820717403,
"grad_norm": 0.9140625,
"learning_rate": 9.99776850361081e-06,
"loss": 0.5708,
"step": 12520
},
{
"epoch": 0.23576580408782935,
"grad_norm": 0.75,
"learning_rate": 9.997738083673785e-06,
"loss": 0.5727,
"step": 12540
},
{
"epoch": 0.23614182610391837,
"grad_norm": 0.95703125,
"learning_rate": 9.997707457841496e-06,
"loss": 0.5596,
"step": 12560
},
{
"epoch": 0.23651784812000742,
"grad_norm": 1.1015625,
"learning_rate": 9.997676626115205e-06,
"loss": 0.5688,
"step": 12580
},
{
"epoch": 0.23689387013609647,
"grad_norm": 0.94140625,
"learning_rate": 9.997645588496181e-06,
"loss": 0.5598,
"step": 12600
},
{
"epoch": 0.23726989215218552,
"grad_norm": 0.84375,
"learning_rate": 9.997614344985705e-06,
"loss": 0.5573,
"step": 12620
},
{
"epoch": 0.23764591416827455,
"grad_norm": 0.8203125,
"learning_rate": 9.99758289558506e-06,
"loss": 0.5708,
"step": 12640
},
{
"epoch": 0.2380219361843636,
"grad_norm": 0.8984375,
"learning_rate": 9.997551240295546e-06,
"loss": 0.5752,
"step": 12660
},
{
"epoch": 0.23839795820045265,
"grad_norm": 0.73046875,
"learning_rate": 9.997519379118465e-06,
"loss": 0.5741,
"step": 12680
},
{
"epoch": 0.23877398021654167,
"grad_norm": 0.75390625,
"learning_rate": 9.99748731205513e-06,
"loss": 0.5625,
"step": 12700
},
{
"epoch": 0.23915000223263072,
"grad_norm": 1.09375,
"learning_rate": 9.997455039106861e-06,
"loss": 0.5751,
"step": 12720
},
{
"epoch": 0.23952602424871977,
"grad_norm": 0.84765625,
"learning_rate": 9.99742256027499e-06,
"loss": 0.5627,
"step": 12740
},
{
"epoch": 0.2399020462648088,
"grad_norm": 1.15625,
"learning_rate": 9.997389875560853e-06,
"loss": 0.5675,
"step": 12760
},
{
"epoch": 0.24027806828089784,
"grad_norm": 0.90234375,
"learning_rate": 9.997356984965798e-06,
"loss": 0.5751,
"step": 12780
},
{
"epoch": 0.2406540902969869,
"grad_norm": 0.83203125,
"learning_rate": 9.997323888491178e-06,
"loss": 0.5762,
"step": 12800
},
{
"epoch": 0.24103011231307592,
"grad_norm": 0.80859375,
"learning_rate": 9.997290586138357e-06,
"loss": 0.5744,
"step": 12820
},
{
"epoch": 0.24140613432916497,
"grad_norm": 0.703125,
"learning_rate": 9.99725707790871e-06,
"loss": 0.5676,
"step": 12840
},
{
"epoch": 0.24178215634525402,
"grad_norm": 0.76953125,
"learning_rate": 9.997223363803615e-06,
"loss": 0.5817,
"step": 12860
},
{
"epoch": 0.24215817836134307,
"grad_norm": 0.80859375,
"learning_rate": 9.99718944382446e-06,
"loss": 0.5763,
"step": 12880
},
{
"epoch": 0.2425342003774321,
"grad_norm": 0.8125,
"learning_rate": 9.997155317972643e-06,
"loss": 0.5745,
"step": 12900
},
{
"epoch": 0.24291022239352114,
"grad_norm": 0.7578125,
"learning_rate": 9.99712098624957e-06,
"loss": 0.5663,
"step": 12920
},
{
"epoch": 0.2432862444096102,
"grad_norm": 0.86328125,
"learning_rate": 9.997086448656658e-06,
"loss": 0.5695,
"step": 12940
},
{
"epoch": 0.24366226642569921,
"grad_norm": 0.71875,
"learning_rate": 9.997051705195326e-06,
"loss": 0.573,
"step": 12960
},
{
"epoch": 0.24403828844178826,
"grad_norm": 0.88671875,
"learning_rate": 9.997016755867008e-06,
"loss": 0.5698,
"step": 12980
},
{
"epoch": 0.24441431045787732,
"grad_norm": 0.921875,
"learning_rate": 9.996981600673144e-06,
"loss": 0.5666,
"step": 13000
},
{
"epoch": 0.24479033247396634,
"grad_norm": 0.75390625,
"learning_rate": 9.99694623961518e-06,
"loss": 0.5694,
"step": 13020
},
{
"epoch": 0.2451663544900554,
"grad_norm": 0.9140625,
"learning_rate": 9.996910672694573e-06,
"loss": 0.5574,
"step": 13040
},
{
"epoch": 0.24554237650614444,
"grad_norm": 0.94140625,
"learning_rate": 9.99687489991279e-06,
"loss": 0.5564,
"step": 13060
},
{
"epoch": 0.2459183985222335,
"grad_norm": 0.9140625,
"learning_rate": 9.996838921271304e-06,
"loss": 0.5666,
"step": 13080
},
{
"epoch": 0.2462944205383225,
"grad_norm": 0.890625,
"learning_rate": 9.996802736771597e-06,
"loss": 0.5758,
"step": 13100
},
{
"epoch": 0.24667044255441156,
"grad_norm": 0.69140625,
"learning_rate": 9.99676634641516e-06,
"loss": 0.5619,
"step": 13120
},
{
"epoch": 0.2470464645705006,
"grad_norm": 0.8203125,
"learning_rate": 9.996729750203493e-06,
"loss": 0.5817,
"step": 13140
},
{
"epoch": 0.24742248658658964,
"grad_norm": 0.84375,
"learning_rate": 9.996692948138102e-06,
"loss": 0.5705,
"step": 13160
},
{
"epoch": 0.24779850860267869,
"grad_norm": 0.8203125,
"learning_rate": 9.996655940220504e-06,
"loss": 0.5713,
"step": 13180
},
{
"epoch": 0.24817453061876774,
"grad_norm": 0.89453125,
"learning_rate": 9.996618726452223e-06,
"loss": 0.5715,
"step": 13200
},
{
"epoch": 0.24855055263485676,
"grad_norm": 0.8046875,
"learning_rate": 9.996581306834793e-06,
"loss": 0.5622,
"step": 13220
},
{
"epoch": 0.2489265746509458,
"grad_norm": 0.71875,
"learning_rate": 9.996543681369756e-06,
"loss": 0.5636,
"step": 13240
},
{
"epoch": 0.24930259666703486,
"grad_norm": 1.0703125,
"learning_rate": 9.996505850058663e-06,
"loss": 0.5753,
"step": 13260
},
{
"epoch": 0.2496786186831239,
"grad_norm": 0.796875,
"learning_rate": 9.996467812903067e-06,
"loss": 0.5774,
"step": 13280
},
{
"epoch": 0.25005464069921296,
"grad_norm": 0.83984375,
"learning_rate": 9.996429569904542e-06,
"loss": 0.5687,
"step": 13300
},
{
"epoch": 0.25043066271530195,
"grad_norm": 1.09375,
"learning_rate": 9.99639112106466e-06,
"loss": 0.5652,
"step": 13320
},
{
"epoch": 0.250806684731391,
"grad_norm": 0.8671875,
"learning_rate": 9.996352466385006e-06,
"loss": 0.5579,
"step": 13340
},
{
"epoch": 0.25118270674748006,
"grad_norm": 0.79296875,
"learning_rate": 9.996313605867172e-06,
"loss": 0.5663,
"step": 13360
},
{
"epoch": 0.2515587287635691,
"grad_norm": 0.69140625,
"learning_rate": 9.996274539512759e-06,
"loss": 0.5653,
"step": 13380
},
{
"epoch": 0.25193475077965816,
"grad_norm": 0.8203125,
"learning_rate": 9.996235267323375e-06,
"loss": 0.5658,
"step": 13400
},
{
"epoch": 0.2523107727957472,
"grad_norm": 0.8828125,
"learning_rate": 9.99619578930064e-06,
"loss": 0.5738,
"step": 13420
},
{
"epoch": 0.25268679481183626,
"grad_norm": 0.72265625,
"learning_rate": 9.996156105446182e-06,
"loss": 0.572,
"step": 13440
},
{
"epoch": 0.25306281682792525,
"grad_norm": 0.73046875,
"learning_rate": 9.99611621576163e-06,
"loss": 0.5811,
"step": 13460
},
{
"epoch": 0.2534388388440143,
"grad_norm": 0.7421875,
"learning_rate": 9.996076120248634e-06,
"loss": 0.5669,
"step": 13480
},
{
"epoch": 0.25381486086010335,
"grad_norm": 0.71875,
"learning_rate": 9.996035818908842e-06,
"loss": 0.5716,
"step": 13500
},
{
"epoch": 0.2541908828761924,
"grad_norm": 0.76953125,
"learning_rate": 9.995995311743915e-06,
"loss": 0.5712,
"step": 13520
},
{
"epoch": 0.25456690489228145,
"grad_norm": 0.7734375,
"learning_rate": 9.995954598755522e-06,
"loss": 0.5702,
"step": 13540
},
{
"epoch": 0.2549429269083705,
"grad_norm": 0.81640625,
"learning_rate": 9.99591367994534e-06,
"loss": 0.5655,
"step": 13560
},
{
"epoch": 0.25531894892445955,
"grad_norm": 0.75,
"learning_rate": 9.995872555315056e-06,
"loss": 0.5704,
"step": 13580
},
{
"epoch": 0.25569497094054855,
"grad_norm": 0.76171875,
"learning_rate": 9.995831224866363e-06,
"loss": 0.5744,
"step": 13600
},
{
"epoch": 0.2560709929566376,
"grad_norm": 0.94140625,
"learning_rate": 9.995789688600964e-06,
"loss": 0.5598,
"step": 13620
},
{
"epoch": 0.25644701497272665,
"grad_norm": 0.78125,
"learning_rate": 9.995747946520569e-06,
"loss": 0.5758,
"step": 13640
},
{
"epoch": 0.2568230369888157,
"grad_norm": 0.75390625,
"learning_rate": 9.995705998626898e-06,
"loss": 0.5645,
"step": 13660
},
{
"epoch": 0.25719905900490475,
"grad_norm": 0.86328125,
"learning_rate": 9.995663844921684e-06,
"loss": 0.5619,
"step": 13680
},
{
"epoch": 0.2575750810209938,
"grad_norm": 0.76171875,
"learning_rate": 9.995621485406658e-06,
"loss": 0.5648,
"step": 13700
},
{
"epoch": 0.2579511030370828,
"grad_norm": 0.8203125,
"learning_rate": 9.995578920083565e-06,
"loss": 0.5713,
"step": 13720
},
{
"epoch": 0.25832712505317185,
"grad_norm": 0.67578125,
"learning_rate": 9.995536148954162e-06,
"loss": 0.5617,
"step": 13740
},
{
"epoch": 0.2587031470692609,
"grad_norm": 0.8984375,
"learning_rate": 9.995493172020208e-06,
"loss": 0.5586,
"step": 13760
},
{
"epoch": 0.25907916908534995,
"grad_norm": 0.921875,
"learning_rate": 9.995449989283477e-06,
"loss": 0.5664,
"step": 13780
},
{
"epoch": 0.259455191101439,
"grad_norm": 0.73828125,
"learning_rate": 9.995406600745745e-06,
"loss": 0.5642,
"step": 13800
},
{
"epoch": 0.25983121311752805,
"grad_norm": 0.6796875,
"learning_rate": 9.9953630064088e-06,
"loss": 0.5631,
"step": 13820
},
{
"epoch": 0.2602072351336171,
"grad_norm": 0.71875,
"learning_rate": 9.99531920627444e-06,
"loss": 0.5721,
"step": 13840
},
{
"epoch": 0.2605832571497061,
"grad_norm": 0.765625,
"learning_rate": 9.995275200344467e-06,
"loss": 0.572,
"step": 13860
},
{
"epoch": 0.26095927916579514,
"grad_norm": 0.82421875,
"learning_rate": 9.995230988620694e-06,
"loss": 0.5649,
"step": 13880
},
{
"epoch": 0.2613353011818842,
"grad_norm": 0.828125,
"learning_rate": 9.995186571104945e-06,
"loss": 0.571,
"step": 13900
},
{
"epoch": 0.26171132319797324,
"grad_norm": 0.89453125,
"learning_rate": 9.995141947799047e-06,
"loss": 0.5718,
"step": 13920
},
{
"epoch": 0.2620873452140623,
"grad_norm": 0.71875,
"learning_rate": 9.995097118704843e-06,
"loss": 0.5686,
"step": 13940
},
{
"epoch": 0.26246336723015135,
"grad_norm": 0.6484375,
"learning_rate": 9.995052083824173e-06,
"loss": 0.5607,
"step": 13960
},
{
"epoch": 0.26283938924624034,
"grad_norm": 0.70703125,
"learning_rate": 9.995006843158896e-06,
"loss": 0.5693,
"step": 13980
},
{
"epoch": 0.2632154112623294,
"grad_norm": 0.79296875,
"learning_rate": 9.994961396710876e-06,
"loss": 0.5632,
"step": 14000
},
{
"epoch": 0.26359143327841844,
"grad_norm": 0.80078125,
"learning_rate": 9.994915744481985e-06,
"loss": 0.5622,
"step": 14020
},
{
"epoch": 0.2639674552945075,
"grad_norm": 0.7890625,
"learning_rate": 9.994869886474103e-06,
"loss": 0.5606,
"step": 14040
},
{
"epoch": 0.26434347731059654,
"grad_norm": 0.7265625,
"learning_rate": 9.994823822689121e-06,
"loss": 0.5725,
"step": 14060
},
{
"epoch": 0.2647194993266856,
"grad_norm": 0.7109375,
"learning_rate": 9.994777553128935e-06,
"loss": 0.568,
"step": 14080
},
{
"epoch": 0.26509552134277464,
"grad_norm": 0.8984375,
"learning_rate": 9.994731077795454e-06,
"loss": 0.5614,
"step": 14100
},
{
"epoch": 0.26547154335886364,
"grad_norm": 0.90625,
"learning_rate": 9.994684396690588e-06,
"loss": 0.5747,
"step": 14120
},
{
"epoch": 0.2658475653749527,
"grad_norm": 0.74609375,
"learning_rate": 9.994637509816263e-06,
"loss": 0.5703,
"step": 14140
},
{
"epoch": 0.26622358739104174,
"grad_norm": 0.7578125,
"learning_rate": 9.994590417174411e-06,
"loss": 0.5579,
"step": 14160
},
{
"epoch": 0.2665996094071308,
"grad_norm": 0.7734375,
"learning_rate": 9.994543118766972e-06,
"loss": 0.575,
"step": 14180
},
{
"epoch": 0.26697563142321984,
"grad_norm": 0.7578125,
"learning_rate": 9.994495614595892e-06,
"loss": 0.5576,
"step": 14200
},
{
"epoch": 0.2673516534393089,
"grad_norm": 0.7421875,
"learning_rate": 9.994447904663132e-06,
"loss": 0.5698,
"step": 14220
},
{
"epoch": 0.26772767545539794,
"grad_norm": 0.7265625,
"learning_rate": 9.994399988970654e-06,
"loss": 0.5643,
"step": 14240
},
{
"epoch": 0.26810369747148693,
"grad_norm": 0.734375,
"learning_rate": 9.994351867520436e-06,
"loss": 0.5491,
"step": 14260
},
{
"epoch": 0.268479719487576,
"grad_norm": 0.74609375,
"learning_rate": 9.994303540314457e-06,
"loss": 0.5591,
"step": 14280
},
{
"epoch": 0.26885574150366504,
"grad_norm": 0.69140625,
"learning_rate": 9.994255007354708e-06,
"loss": 0.558,
"step": 14300
},
{
"epoch": 0.2692317635197541,
"grad_norm": 0.76953125,
"learning_rate": 9.994206268643189e-06,
"loss": 0.5541,
"step": 14320
},
{
"epoch": 0.26960778553584314,
"grad_norm": 0.76171875,
"learning_rate": 9.99415732418191e-06,
"loss": 0.561,
"step": 14340
},
{
"epoch": 0.2699838075519322,
"grad_norm": 0.72265625,
"learning_rate": 9.994108173972885e-06,
"loss": 0.5578,
"step": 14360
},
{
"epoch": 0.2703598295680212,
"grad_norm": 0.7109375,
"learning_rate": 9.99405881801814e-06,
"loss": 0.5665,
"step": 14380
},
{
"epoch": 0.27073585158411023,
"grad_norm": 0.9375,
"learning_rate": 9.99400925631971e-06,
"loss": 0.5671,
"step": 14400
},
{
"epoch": 0.2711118736001993,
"grad_norm": 0.75390625,
"learning_rate": 9.993959488879632e-06,
"loss": 0.5585,
"step": 14420
},
{
"epoch": 0.27148789561628833,
"grad_norm": 0.88671875,
"learning_rate": 9.99390951569996e-06,
"loss": 0.569,
"step": 14440
},
{
"epoch": 0.2718639176323774,
"grad_norm": 0.765625,
"learning_rate": 9.993859336782752e-06,
"loss": 0.5634,
"step": 14460
},
{
"epoch": 0.27223993964846643,
"grad_norm": 0.6953125,
"learning_rate": 9.993808952130076e-06,
"loss": 0.5823,
"step": 14480
},
{
"epoch": 0.2726159616645555,
"grad_norm": 0.72265625,
"learning_rate": 9.993758361744007e-06,
"loss": 0.5762,
"step": 14500
},
{
"epoch": 0.2729919836806445,
"grad_norm": 0.90234375,
"learning_rate": 9.99370756562663e-06,
"loss": 0.5647,
"step": 14520
},
{
"epoch": 0.27336800569673353,
"grad_norm": 0.59375,
"learning_rate": 9.993656563780034e-06,
"loss": 0.5638,
"step": 14540
},
{
"epoch": 0.2737440277128226,
"grad_norm": 0.7421875,
"learning_rate": 9.993605356206324e-06,
"loss": 0.5812,
"step": 14560
},
{
"epoch": 0.27412004972891163,
"grad_norm": 0.6875,
"learning_rate": 9.99355394290761e-06,
"loss": 0.5694,
"step": 14580
},
{
"epoch": 0.2744960717450007,
"grad_norm": 0.6875,
"learning_rate": 9.993502323886008e-06,
"loss": 0.5601,
"step": 14600
},
{
"epoch": 0.27487209376108973,
"grad_norm": 0.69921875,
"learning_rate": 9.993450499143646e-06,
"loss": 0.5712,
"step": 14620
},
{
"epoch": 0.2752481157771788,
"grad_norm": 0.77734375,
"learning_rate": 9.993398468682657e-06,
"loss": 0.5676,
"step": 14640
},
{
"epoch": 0.2756241377932678,
"grad_norm": 0.7421875,
"learning_rate": 9.993346232505186e-06,
"loss": 0.5554,
"step": 14660
},
{
"epoch": 0.2760001598093568,
"grad_norm": 0.69140625,
"learning_rate": 9.993293790613386e-06,
"loss": 0.576,
"step": 14680
},
{
"epoch": 0.2763761818254459,
"grad_norm": 0.75,
"learning_rate": 9.993241143009416e-06,
"loss": 0.5732,
"step": 14700
},
{
"epoch": 0.2767522038415349,
"grad_norm": 0.765625,
"learning_rate": 9.993188289695446e-06,
"loss": 0.5655,
"step": 14720
},
{
"epoch": 0.277128225857624,
"grad_norm": 0.66796875,
"learning_rate": 9.993135230673651e-06,
"loss": 0.5572,
"step": 14740
},
{
"epoch": 0.27750424787371303,
"grad_norm": 0.60546875,
"learning_rate": 9.993081965946221e-06,
"loss": 0.5657,
"step": 14760
},
{
"epoch": 0.277880269889802,
"grad_norm": 0.80859375,
"learning_rate": 9.993028495515347e-06,
"loss": 0.5521,
"step": 14780
},
{
"epoch": 0.2782562919058911,
"grad_norm": 0.7890625,
"learning_rate": 9.992974819383233e-06,
"loss": 0.5614,
"step": 14800
},
{
"epoch": 0.2786323139219801,
"grad_norm": 0.7265625,
"learning_rate": 9.99292093755209e-06,
"loss": 0.5646,
"step": 14820
},
{
"epoch": 0.2790083359380692,
"grad_norm": 0.7890625,
"learning_rate": 9.992866850024138e-06,
"loss": 0.5673,
"step": 14840
},
{
"epoch": 0.2793843579541582,
"grad_norm": 0.6875,
"learning_rate": 9.992812556801607e-06,
"loss": 0.5631,
"step": 14860
},
{
"epoch": 0.2797603799702473,
"grad_norm": 0.80859375,
"learning_rate": 9.992758057886732e-06,
"loss": 0.5607,
"step": 14880
},
{
"epoch": 0.2801364019863363,
"grad_norm": 0.921875,
"learning_rate": 9.992703353281757e-06,
"loss": 0.5581,
"step": 14900
},
{
"epoch": 0.2805124240024253,
"grad_norm": 0.86328125,
"learning_rate": 9.992648442988937e-06,
"loss": 0.5556,
"step": 14920
},
{
"epoch": 0.28088844601851437,
"grad_norm": 0.6796875,
"learning_rate": 9.992593327010536e-06,
"loss": 0.5565,
"step": 14940
},
{
"epoch": 0.2812644680346034,
"grad_norm": 0.671875,
"learning_rate": 9.99253800534882e-06,
"loss": 0.5663,
"step": 14960
},
{
"epoch": 0.28164049005069247,
"grad_norm": 0.68359375,
"learning_rate": 9.992482478006073e-06,
"loss": 0.5658,
"step": 14980
},
{
"epoch": 0.2820165120667815,
"grad_norm": 0.6953125,
"learning_rate": 9.992426744984582e-06,
"loss": 0.561,
"step": 15000
},
{
"epoch": 0.28239253408287057,
"grad_norm": 0.65625,
"learning_rate": 9.99237080628664e-06,
"loss": 0.558,
"step": 15020
},
{
"epoch": 0.28276855609895957,
"grad_norm": 0.7109375,
"learning_rate": 9.992314661914553e-06,
"loss": 0.569,
"step": 15040
},
{
"epoch": 0.2831445781150486,
"grad_norm": 0.79296875,
"learning_rate": 9.992258311870636e-06,
"loss": 0.558,
"step": 15060
},
{
"epoch": 0.28352060013113767,
"grad_norm": 0.6484375,
"learning_rate": 9.992201756157207e-06,
"loss": 0.5582,
"step": 15080
},
{
"epoch": 0.2838966221472267,
"grad_norm": 0.7421875,
"learning_rate": 9.992144994776597e-06,
"loss": 0.5632,
"step": 15100
},
{
"epoch": 0.28427264416331577,
"grad_norm": 0.6640625,
"learning_rate": 9.992088027731146e-06,
"loss": 0.5654,
"step": 15120
},
{
"epoch": 0.2846486661794048,
"grad_norm": 0.75,
"learning_rate": 9.992030855023201e-06,
"loss": 0.5666,
"step": 15140
},
{
"epoch": 0.28502468819549387,
"grad_norm": 0.6484375,
"learning_rate": 9.991973476655116e-06,
"loss": 0.5561,
"step": 15160
},
{
"epoch": 0.28540071021158286,
"grad_norm": 0.68359375,
"learning_rate": 9.991915892629255e-06,
"loss": 0.5668,
"step": 15180
},
{
"epoch": 0.2857767322276719,
"grad_norm": 0.59375,
"learning_rate": 9.991858102947991e-06,
"loss": 0.5642,
"step": 15200
},
{
"epoch": 0.28615275424376097,
"grad_norm": 0.64453125,
"learning_rate": 9.991800107613704e-06,
"loss": 0.5549,
"step": 15220
},
{
"epoch": 0.28652877625985,
"grad_norm": 0.6484375,
"learning_rate": 9.991741906628784e-06,
"loss": 0.5641,
"step": 15240
},
{
"epoch": 0.28690479827593907,
"grad_norm": 0.69921875,
"learning_rate": 9.991683499995629e-06,
"loss": 0.5585,
"step": 15260
},
{
"epoch": 0.2872808202920281,
"grad_norm": 0.64453125,
"learning_rate": 9.991624887716644e-06,
"loss": 0.5639,
"step": 15280
},
{
"epoch": 0.28765684230811717,
"grad_norm": 0.74609375,
"learning_rate": 9.991566069794244e-06,
"loss": 0.5636,
"step": 15300
},
{
"epoch": 0.28803286432420616,
"grad_norm": 0.84765625,
"learning_rate": 9.991507046230853e-06,
"loss": 0.5625,
"step": 15320
},
{
"epoch": 0.2884088863402952,
"grad_norm": 0.67578125,
"learning_rate": 9.991447817028903e-06,
"loss": 0.5631,
"step": 15340
},
{
"epoch": 0.28878490835638426,
"grad_norm": 0.7265625,
"learning_rate": 9.991388382190832e-06,
"loss": 0.5494,
"step": 15360
},
{
"epoch": 0.2891609303724733,
"grad_norm": 0.6953125,
"learning_rate": 9.991328741719092e-06,
"loss": 0.5587,
"step": 15380
},
{
"epoch": 0.28953695238856236,
"grad_norm": 0.703125,
"learning_rate": 9.991268895616136e-06,
"loss": 0.5561,
"step": 15400
},
{
"epoch": 0.2899129744046514,
"grad_norm": 0.70703125,
"learning_rate": 9.99120884388443e-06,
"loss": 0.5552,
"step": 15420
},
{
"epoch": 0.2902889964207404,
"grad_norm": 0.70703125,
"learning_rate": 9.991148586526451e-06,
"loss": 0.5637,
"step": 15440
},
{
"epoch": 0.29066501843682946,
"grad_norm": 0.82421875,
"learning_rate": 9.99108812354468e-06,
"loss": 0.5659,
"step": 15460
},
{
"epoch": 0.2910410404529185,
"grad_norm": 0.765625,
"learning_rate": 9.991027454941608e-06,
"loss": 0.5664,
"step": 15480
},
{
"epoch": 0.29141706246900756,
"grad_norm": 0.640625,
"learning_rate": 9.990966580719734e-06,
"loss": 0.5653,
"step": 15500
},
{
"epoch": 0.2917930844850966,
"grad_norm": 0.82421875,
"learning_rate": 9.990905500881568e-06,
"loss": 0.5576,
"step": 15520
},
{
"epoch": 0.29216910650118566,
"grad_norm": 0.9140625,
"learning_rate": 9.990844215429621e-06,
"loss": 0.5644,
"step": 15540
},
{
"epoch": 0.2925451285172747,
"grad_norm": 0.7734375,
"learning_rate": 9.990782724366424e-06,
"loss": 0.5569,
"step": 15560
},
{
"epoch": 0.2929211505333637,
"grad_norm": 0.7109375,
"learning_rate": 9.990721027694506e-06,
"loss": 0.5541,
"step": 15580
},
{
"epoch": 0.29329717254945276,
"grad_norm": 0.7578125,
"learning_rate": 9.990659125416411e-06,
"loss": 0.5702,
"step": 15600
},
{
"epoch": 0.2936731945655418,
"grad_norm": 0.6875,
"learning_rate": 9.990597017534689e-06,
"loss": 0.5716,
"step": 15620
},
{
"epoch": 0.29404921658163086,
"grad_norm": 0.59375,
"learning_rate": 9.990534704051897e-06,
"loss": 0.5562,
"step": 15640
},
{
"epoch": 0.2944252385977199,
"grad_norm": 0.7265625,
"learning_rate": 9.990472184970603e-06,
"loss": 0.5657,
"step": 15660
},
{
"epoch": 0.29480126061380896,
"grad_norm": 0.66015625,
"learning_rate": 9.990409460293385e-06,
"loss": 0.5693,
"step": 15680
},
{
"epoch": 0.295177282629898,
"grad_norm": 0.6171875,
"learning_rate": 9.990346530022826e-06,
"loss": 0.5766,
"step": 15700
},
{
"epoch": 0.295553304645987,
"grad_norm": 0.66015625,
"learning_rate": 9.990283394161515e-06,
"loss": 0.5619,
"step": 15720
},
{
"epoch": 0.29592932666207605,
"grad_norm": 0.69140625,
"learning_rate": 9.990220052712056e-06,
"loss": 0.5574,
"step": 15740
},
{
"epoch": 0.2963053486781651,
"grad_norm": 0.74609375,
"learning_rate": 9.99015650567706e-06,
"loss": 0.558,
"step": 15760
},
{
"epoch": 0.29668137069425415,
"grad_norm": 0.6640625,
"learning_rate": 9.990092753059142e-06,
"loss": 0.5712,
"step": 15780
},
{
"epoch": 0.2970573927103432,
"grad_norm": 0.55859375,
"learning_rate": 9.990028794860931e-06,
"loss": 0.565,
"step": 15800
},
{
"epoch": 0.29743341472643225,
"grad_norm": 0.7109375,
"learning_rate": 9.98996463108506e-06,
"loss": 0.5626,
"step": 15820
},
{
"epoch": 0.29780943674252125,
"grad_norm": 0.8046875,
"learning_rate": 9.989900261734174e-06,
"loss": 0.5693,
"step": 15840
},
{
"epoch": 0.2981854587586103,
"grad_norm": 0.6640625,
"learning_rate": 9.989835686810922e-06,
"loss": 0.553,
"step": 15860
},
{
"epoch": 0.29856148077469935,
"grad_norm": 0.69140625,
"learning_rate": 9.989770906317967e-06,
"loss": 0.5728,
"step": 15880
},
{
"epoch": 0.2989375027907884,
"grad_norm": 0.7109375,
"learning_rate": 9.989705920257977e-06,
"loss": 0.5645,
"step": 15900
},
{
"epoch": 0.29931352480687745,
"grad_norm": 0.6015625,
"learning_rate": 9.989640728633631e-06,
"loss": 0.5573,
"step": 15920
},
{
"epoch": 0.2996895468229665,
"grad_norm": 0.640625,
"learning_rate": 9.989575331447612e-06,
"loss": 0.5529,
"step": 15940
},
{
"epoch": 0.30006556883905555,
"grad_norm": 0.6171875,
"learning_rate": 9.989509728702615e-06,
"loss": 0.5619,
"step": 15960
},
{
"epoch": 0.30044159085514455,
"grad_norm": 0.8359375,
"learning_rate": 9.989443920401344e-06,
"loss": 0.5711,
"step": 15980
},
{
"epoch": 0.3008176128712336,
"grad_norm": 0.6875,
"learning_rate": 9.989377906546509e-06,
"loss": 0.5515,
"step": 16000
},
{
"epoch": 0.30119363488732265,
"grad_norm": 0.72265625,
"learning_rate": 9.989311687140831e-06,
"loss": 0.5689,
"step": 16020
},
{
"epoch": 0.3015696569034117,
"grad_norm": 0.69140625,
"learning_rate": 9.989245262187033e-06,
"loss": 0.5511,
"step": 16040
},
{
"epoch": 0.30194567891950075,
"grad_norm": 0.70703125,
"learning_rate": 9.989178631687859e-06,
"loss": 0.574,
"step": 16060
},
{
"epoch": 0.3023217009355898,
"grad_norm": 0.6796875,
"learning_rate": 9.98911179564605e-06,
"loss": 0.5555,
"step": 16080
},
{
"epoch": 0.3026977229516788,
"grad_norm": 0.63671875,
"learning_rate": 9.989044754064358e-06,
"loss": 0.5591,
"step": 16100
},
{
"epoch": 0.30307374496776784,
"grad_norm": 0.7890625,
"learning_rate": 9.988977506945549e-06,
"loss": 0.5591,
"step": 16120
},
{
"epoch": 0.3034497669838569,
"grad_norm": 0.6796875,
"learning_rate": 9.98891005429239e-06,
"loss": 0.5722,
"step": 16140
},
{
"epoch": 0.30382578899994594,
"grad_norm": 0.68359375,
"learning_rate": 9.988842396107663e-06,
"loss": 0.5603,
"step": 16160
},
{
"epoch": 0.304201811016035,
"grad_norm": 0.6484375,
"learning_rate": 9.988774532394152e-06,
"loss": 0.5677,
"step": 16180
},
{
"epoch": 0.30457783303212405,
"grad_norm": 0.7734375,
"learning_rate": 9.988706463154656e-06,
"loss": 0.5559,
"step": 16200
},
{
"epoch": 0.3049538550482131,
"grad_norm": 0.62109375,
"learning_rate": 9.988638188391974e-06,
"loss": 0.557,
"step": 16220
},
{
"epoch": 0.3053298770643021,
"grad_norm": 0.7734375,
"learning_rate": 9.988569708108927e-06,
"loss": 0.5588,
"step": 16240
},
{
"epoch": 0.30570589908039114,
"grad_norm": 0.6640625,
"learning_rate": 9.988501022308331e-06,
"loss": 0.5643,
"step": 16260
},
{
"epoch": 0.3060819210964802,
"grad_norm": 0.609375,
"learning_rate": 9.988432130993013e-06,
"loss": 0.5608,
"step": 16280
},
{
"epoch": 0.30645794311256924,
"grad_norm": 0.69140625,
"learning_rate": 9.988363034165817e-06,
"loss": 0.5647,
"step": 16300
},
{
"epoch": 0.3068339651286583,
"grad_norm": 0.76953125,
"learning_rate": 9.988293731829588e-06,
"loss": 0.5569,
"step": 16320
},
{
"epoch": 0.30720998714474734,
"grad_norm": 0.66796875,
"learning_rate": 9.98822422398718e-06,
"loss": 0.5603,
"step": 16340
},
{
"epoch": 0.3075860091608364,
"grad_norm": 0.69921875,
"learning_rate": 9.988154510641455e-06,
"loss": 0.5614,
"step": 16360
},
{
"epoch": 0.3079620311769254,
"grad_norm": 0.69921875,
"learning_rate": 9.98808459179529e-06,
"loss": 0.543,
"step": 16380
},
{
"epoch": 0.30833805319301444,
"grad_norm": 0.6796875,
"learning_rate": 9.98801446745156e-06,
"loss": 0.5666,
"step": 16400
},
{
"epoch": 0.3087140752091035,
"grad_norm": 0.68359375,
"learning_rate": 9.987944137613155e-06,
"loss": 0.5627,
"step": 16420
},
{
"epoch": 0.30909009722519254,
"grad_norm": 0.7421875,
"learning_rate": 9.987873602282976e-06,
"loss": 0.5492,
"step": 16440
},
{
"epoch": 0.3094661192412816,
"grad_norm": 0.59375,
"learning_rate": 9.987802861463927e-06,
"loss": 0.5667,
"step": 16460
},
{
"epoch": 0.30984214125737064,
"grad_norm": 0.62109375,
"learning_rate": 9.98773191515892e-06,
"loss": 0.5597,
"step": 16480
},
{
"epoch": 0.31021816327345964,
"grad_norm": 0.64453125,
"learning_rate": 9.987660763370883e-06,
"loss": 0.5579,
"step": 16500
},
{
"epoch": 0.3105941852895487,
"grad_norm": 0.734375,
"learning_rate": 9.98758940610274e-06,
"loss": 0.5672,
"step": 16520
},
{
"epoch": 0.31097020730563774,
"grad_norm": 0.7421875,
"learning_rate": 9.98751784335744e-06,
"loss": 0.5598,
"step": 16540
},
{
"epoch": 0.3113462293217268,
"grad_norm": 0.69921875,
"learning_rate": 9.987446075137922e-06,
"loss": 0.5531,
"step": 16560
},
{
"epoch": 0.31172225133781584,
"grad_norm": 0.765625,
"learning_rate": 9.987374101447148e-06,
"loss": 0.5565,
"step": 16580
},
{
"epoch": 0.3120982733539049,
"grad_norm": 0.83984375,
"learning_rate": 9.987301922288082e-06,
"loss": 0.5607,
"step": 16600
},
{
"epoch": 0.31247429536999394,
"grad_norm": 0.60546875,
"learning_rate": 9.987229537663698e-06,
"loss": 0.5497,
"step": 16620
},
{
"epoch": 0.31285031738608293,
"grad_norm": 0.63671875,
"learning_rate": 9.987156947576977e-06,
"loss": 0.5613,
"step": 16640
},
{
"epoch": 0.313226339402172,
"grad_norm": 0.64453125,
"learning_rate": 9.98708415203091e-06,
"loss": 0.5598,
"step": 16660
},
{
"epoch": 0.31360236141826103,
"grad_norm": 0.83984375,
"learning_rate": 9.987011151028496e-06,
"loss": 0.5646,
"step": 16680
},
{
"epoch": 0.3139783834343501,
"grad_norm": 0.6171875,
"learning_rate": 9.986937944572746e-06,
"loss": 0.5668,
"step": 16700
},
{
"epoch": 0.31435440545043913,
"grad_norm": 0.61328125,
"learning_rate": 9.98686453266667e-06,
"loss": 0.5572,
"step": 16720
},
{
"epoch": 0.3147304274665282,
"grad_norm": 0.71484375,
"learning_rate": 9.986790915313293e-06,
"loss": 0.5523,
"step": 16740
},
{
"epoch": 0.31510644948261723,
"grad_norm": 0.703125,
"learning_rate": 9.986717092515653e-06,
"loss": 0.5625,
"step": 16760
},
{
"epoch": 0.31548247149870623,
"grad_norm": 0.625,
"learning_rate": 9.986643064276786e-06,
"loss": 0.5636,
"step": 16780
},
{
"epoch": 0.3158584935147953,
"grad_norm": 0.71875,
"learning_rate": 9.986568830599745e-06,
"loss": 0.5582,
"step": 16800
},
{
"epoch": 0.31623451553088433,
"grad_norm": 0.6171875,
"learning_rate": 9.986494391487588e-06,
"loss": 0.5688,
"step": 16820
},
{
"epoch": 0.3166105375469734,
"grad_norm": 0.58984375,
"learning_rate": 9.98641974694338e-06,
"loss": 0.5604,
"step": 16840
},
{
"epoch": 0.31698655956306243,
"grad_norm": 0.76953125,
"learning_rate": 9.986344896970198e-06,
"loss": 0.5631,
"step": 16860
},
{
"epoch": 0.3173625815791515,
"grad_norm": 0.71875,
"learning_rate": 9.986269841571124e-06,
"loss": 0.5611,
"step": 16880
},
{
"epoch": 0.3177386035952405,
"grad_norm": 0.703125,
"learning_rate": 9.986194580749251e-06,
"loss": 0.5505,
"step": 16900
},
{
"epoch": 0.3181146256113295,
"grad_norm": 0.69921875,
"learning_rate": 9.98611911450768e-06,
"loss": 0.5597,
"step": 16920
},
{
"epoch": 0.3184906476274186,
"grad_norm": 0.6640625,
"learning_rate": 9.98604344284952e-06,
"loss": 0.5474,
"step": 16940
},
{
"epoch": 0.3188666696435076,
"grad_norm": 0.65234375,
"learning_rate": 9.985967565777887e-06,
"loss": 0.5493,
"step": 16960
},
{
"epoch": 0.3192426916595967,
"grad_norm": 0.60546875,
"learning_rate": 9.985891483295908e-06,
"loss": 0.5638,
"step": 16980
},
{
"epoch": 0.31961871367568573,
"grad_norm": 0.60546875,
"learning_rate": 9.985815195406718e-06,
"loss": 0.552,
"step": 17000
},
{
"epoch": 0.3199947356917748,
"grad_norm": 0.76171875,
"learning_rate": 9.985738702113457e-06,
"loss": 0.5554,
"step": 17020
},
{
"epoch": 0.3203707577078638,
"grad_norm": 0.91796875,
"learning_rate": 9.985662003419282e-06,
"loss": 0.5725,
"step": 17040
},
{
"epoch": 0.3207467797239528,
"grad_norm": 0.60546875,
"learning_rate": 9.985585099327348e-06,
"loss": 0.5567,
"step": 17060
},
{
"epoch": 0.3211228017400419,
"grad_norm": 0.6640625,
"learning_rate": 9.985507989840824e-06,
"loss": 0.5699,
"step": 17080
},
{
"epoch": 0.3214988237561309,
"grad_norm": 0.640625,
"learning_rate": 9.985430674962888e-06,
"loss": 0.5513,
"step": 17100
},
{
"epoch": 0.32187484577222,
"grad_norm": 0.76953125,
"learning_rate": 9.985353154696725e-06,
"loss": 0.5634,
"step": 17120
},
{
"epoch": 0.322250867788309,
"grad_norm": 0.6171875,
"learning_rate": 9.985275429045526e-06,
"loss": 0.5576,
"step": 17140
},
{
"epoch": 0.322626889804398,
"grad_norm": 0.6171875,
"learning_rate": 9.985197498012499e-06,
"loss": 0.5595,
"step": 17160
},
{
"epoch": 0.32300291182048707,
"grad_norm": 0.609375,
"learning_rate": 9.98511936160085e-06,
"loss": 0.5551,
"step": 17180
},
{
"epoch": 0.3233789338365761,
"grad_norm": 0.609375,
"learning_rate": 9.985041019813797e-06,
"loss": 0.5535,
"step": 17200
},
{
"epoch": 0.32375495585266517,
"grad_norm": 0.6875,
"learning_rate": 9.98496247265457e-06,
"loss": 0.5611,
"step": 17220
},
{
"epoch": 0.3241309778687542,
"grad_norm": 0.671875,
"learning_rate": 9.984883720126407e-06,
"loss": 0.5602,
"step": 17240
},
{
"epoch": 0.3245069998848433,
"grad_norm": 0.63671875,
"learning_rate": 9.984804762232548e-06,
"loss": 0.5559,
"step": 17260
},
{
"epoch": 0.3248830219009323,
"grad_norm": 0.80078125,
"learning_rate": 9.984725598976248e-06,
"loss": 0.5542,
"step": 17280
},
{
"epoch": 0.3252590439170213,
"grad_norm": 0.625,
"learning_rate": 9.984646230360768e-06,
"loss": 0.5537,
"step": 17300
},
{
"epoch": 0.32563506593311037,
"grad_norm": 0.76171875,
"learning_rate": 9.984566656389378e-06,
"loss": 0.559,
"step": 17320
},
{
"epoch": 0.3260110879491994,
"grad_norm": 0.69140625,
"learning_rate": 9.984486877065357e-06,
"loss": 0.5664,
"step": 17340
},
{
"epoch": 0.32638710996528847,
"grad_norm": 0.671875,
"learning_rate": 9.98440689239199e-06,
"loss": 0.5545,
"step": 17360
},
{
"epoch": 0.3267631319813775,
"grad_norm": 0.6015625,
"learning_rate": 9.984326702372572e-06,
"loss": 0.5492,
"step": 17380
},
{
"epoch": 0.32713915399746657,
"grad_norm": 0.62890625,
"learning_rate": 9.984246307010411e-06,
"loss": 0.5514,
"step": 17400
},
{
"epoch": 0.3275151760135556,
"grad_norm": 0.67578125,
"learning_rate": 9.984165706308815e-06,
"loss": 0.5548,
"step": 17420
},
{
"epoch": 0.3278911980296446,
"grad_norm": 0.78125,
"learning_rate": 9.984084900271104e-06,
"loss": 0.57,
"step": 17440
},
{
"epoch": 0.32826722004573367,
"grad_norm": 0.91015625,
"learning_rate": 9.984003888900608e-06,
"loss": 0.5594,
"step": 17460
},
{
"epoch": 0.3286432420618227,
"grad_norm": 0.734375,
"learning_rate": 9.983922672200666e-06,
"loss": 0.553,
"step": 17480
},
{
"epoch": 0.32901926407791177,
"grad_norm": 0.76953125,
"learning_rate": 9.983841250174623e-06,
"loss": 0.5664,
"step": 17500
},
{
"epoch": 0.3293952860940008,
"grad_norm": 0.65234375,
"learning_rate": 9.983759622825832e-06,
"loss": 0.5566,
"step": 17520
},
{
"epoch": 0.32977130811008987,
"grad_norm": 0.68359375,
"learning_rate": 9.983677790157659e-06,
"loss": 0.5628,
"step": 17540
},
{
"epoch": 0.33014733012617886,
"grad_norm": 0.640625,
"learning_rate": 9.983595752173472e-06,
"loss": 0.5613,
"step": 17560
},
{
"epoch": 0.3305233521422679,
"grad_norm": 0.69921875,
"learning_rate": 9.983513508876653e-06,
"loss": 0.5599,
"step": 17580
},
{
"epoch": 0.33089937415835696,
"grad_norm": 0.67578125,
"learning_rate": 9.98343106027059e-06,
"loss": 0.5596,
"step": 17600
},
{
"epoch": 0.331275396174446,
"grad_norm": 0.60546875,
"learning_rate": 9.983348406358677e-06,
"loss": 0.5596,
"step": 17620
},
{
"epoch": 0.33165141819053506,
"grad_norm": 0.64453125,
"learning_rate": 9.983265547144323e-06,
"loss": 0.561,
"step": 17640
},
{
"epoch": 0.3320274402066241,
"grad_norm": 0.61328125,
"learning_rate": 9.98318248263094e-06,
"loss": 0.5546,
"step": 17660
},
{
"epoch": 0.33240346222271316,
"grad_norm": 0.859375,
"learning_rate": 9.98309921282195e-06,
"loss": 0.5602,
"step": 17680
},
{
"epoch": 0.33277948423880216,
"grad_norm": 0.72265625,
"learning_rate": 9.983015737720782e-06,
"loss": 0.5521,
"step": 17700
},
{
"epoch": 0.3331555062548912,
"grad_norm": 0.62890625,
"learning_rate": 9.982932057330878e-06,
"loss": 0.5603,
"step": 17720
},
{
"epoch": 0.33353152827098026,
"grad_norm": 0.58203125,
"learning_rate": 9.982848171655684e-06,
"loss": 0.5622,
"step": 17740
},
{
"epoch": 0.3339075502870693,
"grad_norm": 0.65625,
"learning_rate": 9.982764080698655e-06,
"loss": 0.5577,
"step": 17760
},
{
"epoch": 0.33428357230315836,
"grad_norm": 0.71484375,
"learning_rate": 9.982679784463256e-06,
"loss": 0.5608,
"step": 17780
},
{
"epoch": 0.3346595943192474,
"grad_norm": 0.578125,
"learning_rate": 9.98259528295296e-06,
"loss": 0.5618,
"step": 17800
},
{
"epoch": 0.33503561633533646,
"grad_norm": 0.671875,
"learning_rate": 9.982510576171249e-06,
"loss": 0.562,
"step": 17820
},
{
"epoch": 0.33541163835142546,
"grad_norm": 0.6015625,
"learning_rate": 9.982425664121611e-06,
"loss": 0.5591,
"step": 17840
},
{
"epoch": 0.3357876603675145,
"grad_norm": 0.671875,
"learning_rate": 9.982340546807546e-06,
"loss": 0.5604,
"step": 17860
},
{
"epoch": 0.33616368238360356,
"grad_norm": 0.63671875,
"learning_rate": 9.982255224232558e-06,
"loss": 0.5541,
"step": 17880
},
{
"epoch": 0.3365397043996926,
"grad_norm": 0.640625,
"learning_rate": 9.982169696400166e-06,
"loss": 0.5544,
"step": 17900
},
{
"epoch": 0.33691572641578166,
"grad_norm": 0.60546875,
"learning_rate": 9.982083963313892e-06,
"loss": 0.5567,
"step": 17920
},
{
"epoch": 0.3372917484318707,
"grad_norm": 0.72265625,
"learning_rate": 9.981998024977263e-06,
"loss": 0.5536,
"step": 17940
},
{
"epoch": 0.3376677704479597,
"grad_norm": 0.6953125,
"learning_rate": 9.981911881393828e-06,
"loss": 0.5518,
"step": 17960
},
{
"epoch": 0.33804379246404875,
"grad_norm": 0.61328125,
"learning_rate": 9.98182553256713e-06,
"loss": 0.5594,
"step": 17980
},
{
"epoch": 0.3384198144801378,
"grad_norm": 0.67578125,
"learning_rate": 9.98173897850073e-06,
"loss": 0.561,
"step": 18000
},
{
"epoch": 0.33879583649622685,
"grad_norm": 0.72265625,
"learning_rate": 9.981652219198191e-06,
"loss": 0.5478,
"step": 18020
},
{
"epoch": 0.3391718585123159,
"grad_norm": 0.6640625,
"learning_rate": 9.981565254663089e-06,
"loss": 0.5547,
"step": 18040
},
{
"epoch": 0.33954788052840496,
"grad_norm": 0.62109375,
"learning_rate": 9.981478084899004e-06,
"loss": 0.5496,
"step": 18060
},
{
"epoch": 0.339923902544494,
"grad_norm": 0.59765625,
"learning_rate": 9.981390709909531e-06,
"loss": 0.5502,
"step": 18080
},
{
"epoch": 0.340299924560583,
"grad_norm": 1.84375,
"learning_rate": 9.981303129698268e-06,
"loss": 0.5611,
"step": 18100
},
{
"epoch": 0.34067594657667205,
"grad_norm": 0.7578125,
"learning_rate": 9.981215344268821e-06,
"loss": 0.5581,
"step": 18120
},
{
"epoch": 0.3410519685927611,
"grad_norm": 0.765625,
"learning_rate": 9.981127353624811e-06,
"loss": 0.5598,
"step": 18140
},
{
"epoch": 0.34142799060885015,
"grad_norm": 0.69921875,
"learning_rate": 9.98103915776986e-06,
"loss": 0.5529,
"step": 18160
},
{
"epoch": 0.3418040126249392,
"grad_norm": 0.625,
"learning_rate": 9.9809507567076e-06,
"loss": 0.5582,
"step": 18180
},
{
"epoch": 0.34218003464102825,
"grad_norm": 0.890625,
"learning_rate": 9.980862150441677e-06,
"loss": 0.5514,
"step": 18200
},
{
"epoch": 0.34255605665711725,
"grad_norm": 0.671875,
"learning_rate": 9.980773338975737e-06,
"loss": 0.5448,
"step": 18220
},
{
"epoch": 0.3429320786732063,
"grad_norm": 0.6875,
"learning_rate": 9.980684322313443e-06,
"loss": 0.55,
"step": 18240
},
{
"epoch": 0.34330810068929535,
"grad_norm": 0.859375,
"learning_rate": 9.980595100458462e-06,
"loss": 0.563,
"step": 18260
},
{
"epoch": 0.3436841227053844,
"grad_norm": 0.8125,
"learning_rate": 9.980505673414465e-06,
"loss": 0.5649,
"step": 18280
},
{
"epoch": 0.34406014472147345,
"grad_norm": 0.6953125,
"learning_rate": 9.980416041185141e-06,
"loss": 0.556,
"step": 18300
},
{
"epoch": 0.3444361667375625,
"grad_norm": 0.640625,
"learning_rate": 9.98032620377418e-06,
"loss": 0.5583,
"step": 18320
},
{
"epoch": 0.34481218875365155,
"grad_norm": 0.609375,
"learning_rate": 9.980236161185284e-06,
"loss": 0.5583,
"step": 18340
},
{
"epoch": 0.34518821076974054,
"grad_norm": 0.6484375,
"learning_rate": 9.980145913422164e-06,
"loss": 0.5648,
"step": 18360
},
{
"epoch": 0.3455642327858296,
"grad_norm": 0.62109375,
"learning_rate": 9.980055460488537e-06,
"loss": 0.5585,
"step": 18380
},
{
"epoch": 0.34594025480191865,
"grad_norm": 0.71484375,
"learning_rate": 9.979964802388127e-06,
"loss": 0.5584,
"step": 18400
},
{
"epoch": 0.3463162768180077,
"grad_norm": 0.69921875,
"learning_rate": 9.979873939124672e-06,
"loss": 0.5623,
"step": 18420
},
{
"epoch": 0.34669229883409675,
"grad_norm": 0.59375,
"learning_rate": 9.979782870701912e-06,
"loss": 0.56,
"step": 18440
},
{
"epoch": 0.3470683208501858,
"grad_norm": 0.671875,
"learning_rate": 9.979691597123604e-06,
"loss": 0.5547,
"step": 18460
},
{
"epoch": 0.34744434286627485,
"grad_norm": 0.7421875,
"learning_rate": 9.979600118393503e-06,
"loss": 0.5532,
"step": 18480
},
{
"epoch": 0.34782036488236384,
"grad_norm": 0.5859375,
"learning_rate": 9.979508434515383e-06,
"loss": 0.5563,
"step": 18500
},
{
"epoch": 0.3481963868984529,
"grad_norm": 0.703125,
"learning_rate": 9.979416545493016e-06,
"loss": 0.5605,
"step": 18520
},
{
"epoch": 0.34857240891454194,
"grad_norm": 0.81640625,
"learning_rate": 9.979324451330193e-06,
"loss": 0.5574,
"step": 18540
},
{
"epoch": 0.348948430930631,
"grad_norm": 0.66796875,
"learning_rate": 9.979232152030703e-06,
"loss": 0.5638,
"step": 18560
},
{
"epoch": 0.34932445294672004,
"grad_norm": 0.61328125,
"learning_rate": 9.97913964759835e-06,
"loss": 0.57,
"step": 18580
},
{
"epoch": 0.3497004749628091,
"grad_norm": 0.6796875,
"learning_rate": 9.979046938036945e-06,
"loss": 0.5544,
"step": 18600
},
{
"epoch": 0.3500764969788981,
"grad_norm": 0.6171875,
"learning_rate": 9.97895402335031e-06,
"loss": 0.5502,
"step": 18620
},
{
"epoch": 0.35045251899498714,
"grad_norm": 0.64453125,
"learning_rate": 9.978860903542268e-06,
"loss": 0.5529,
"step": 18640
},
{
"epoch": 0.3508285410110762,
"grad_norm": 0.66796875,
"learning_rate": 9.97876757861666e-06,
"loss": 0.5576,
"step": 18660
},
{
"epoch": 0.35120456302716524,
"grad_norm": 0.78515625,
"learning_rate": 9.978674048577326e-06,
"loss": 0.5509,
"step": 18680
},
{
"epoch": 0.3515805850432543,
"grad_norm": 0.7265625,
"learning_rate": 9.978580313428125e-06,
"loss": 0.5566,
"step": 18700
},
{
"epoch": 0.35195660705934334,
"grad_norm": 0.69140625,
"learning_rate": 9.978486373172916e-06,
"loss": 0.5527,
"step": 18720
},
{
"epoch": 0.3523326290754324,
"grad_norm": 0.75390625,
"learning_rate": 9.978392227815568e-06,
"loss": 0.5494,
"step": 18740
},
{
"epoch": 0.3527086510915214,
"grad_norm": 0.66015625,
"learning_rate": 9.97829787735996e-06,
"loss": 0.5529,
"step": 18760
},
{
"epoch": 0.35308467310761044,
"grad_norm": 0.6640625,
"learning_rate": 9.978203321809979e-06,
"loss": 0.5507,
"step": 18780
},
{
"epoch": 0.3534606951236995,
"grad_norm": 0.70703125,
"learning_rate": 9.978108561169521e-06,
"loss": 0.5541,
"step": 18800
},
{
"epoch": 0.35383671713978854,
"grad_norm": 0.66015625,
"learning_rate": 9.97801359544249e-06,
"loss": 0.5559,
"step": 18820
},
{
"epoch": 0.3542127391558776,
"grad_norm": 0.63671875,
"learning_rate": 9.977918424632798e-06,
"loss": 0.5471,
"step": 18840
},
{
"epoch": 0.35458876117196664,
"grad_norm": 0.6328125,
"learning_rate": 9.977823048744367e-06,
"loss": 0.5535,
"step": 18860
},
{
"epoch": 0.3549647831880557,
"grad_norm": 0.83203125,
"learning_rate": 9.977727467781124e-06,
"loss": 0.5645,
"step": 18880
},
{
"epoch": 0.3553408052041447,
"grad_norm": 0.63671875,
"learning_rate": 9.97763168174701e-06,
"loss": 0.55,
"step": 18900
},
{
"epoch": 0.35571682722023373,
"grad_norm": 0.69140625,
"learning_rate": 9.977535690645967e-06,
"loss": 0.5513,
"step": 18920
},
{
"epoch": 0.3560928492363228,
"grad_norm": 0.73828125,
"learning_rate": 9.977439494481951e-06,
"loss": 0.5507,
"step": 18940
},
{
"epoch": 0.35646887125241183,
"grad_norm": 0.6875,
"learning_rate": 9.977343093258928e-06,
"loss": 0.5519,
"step": 18960
},
{
"epoch": 0.3568448932685009,
"grad_norm": 0.73828125,
"learning_rate": 9.977246486980867e-06,
"loss": 0.5514,
"step": 18980
},
{
"epoch": 0.35722091528458993,
"grad_norm": 0.65625,
"learning_rate": 9.977149675651747e-06,
"loss": 0.5593,
"step": 19000
},
{
"epoch": 0.35759693730067893,
"grad_norm": 0.9921875,
"learning_rate": 9.977052659275559e-06,
"loss": 0.5538,
"step": 19020
},
{
"epoch": 0.357972959316768,
"grad_norm": 0.8046875,
"learning_rate": 9.976955437856299e-06,
"loss": 0.5653,
"step": 19040
},
{
"epoch": 0.35834898133285703,
"grad_norm": 0.6484375,
"learning_rate": 9.97685801139797e-06,
"loss": 0.5466,
"step": 19060
},
{
"epoch": 0.3587250033489461,
"grad_norm": 0.65234375,
"learning_rate": 9.976760379904588e-06,
"loss": 0.5499,
"step": 19080
},
{
"epoch": 0.35910102536503513,
"grad_norm": 0.58984375,
"learning_rate": 9.976662543380175e-06,
"loss": 0.5516,
"step": 19100
},
{
"epoch": 0.3594770473811242,
"grad_norm": 0.68359375,
"learning_rate": 9.976564501828763e-06,
"loss": 0.5512,
"step": 19120
},
{
"epoch": 0.35985306939721323,
"grad_norm": 0.5625,
"learning_rate": 9.976466255254387e-06,
"loss": 0.5686,
"step": 19140
},
{
"epoch": 0.3602290914133022,
"grad_norm": 0.68359375,
"learning_rate": 9.976367803661097e-06,
"loss": 0.5557,
"step": 19160
},
{
"epoch": 0.3606051134293913,
"grad_norm": 0.5859375,
"learning_rate": 9.976269147052951e-06,
"loss": 0.5677,
"step": 19180
},
{
"epoch": 0.36098113544548033,
"grad_norm": 0.609375,
"learning_rate": 9.976170285434012e-06,
"loss": 0.5557,
"step": 19200
},
{
"epoch": 0.3613571574615694,
"grad_norm": 0.609375,
"learning_rate": 9.97607121880835e-06,
"loss": 0.547,
"step": 19220
},
{
"epoch": 0.36173317947765843,
"grad_norm": 0.63671875,
"learning_rate": 9.97597194718005e-06,
"loss": 0.5564,
"step": 19240
},
{
"epoch": 0.3621092014937475,
"grad_norm": 0.640625,
"learning_rate": 9.9758724705532e-06,
"loss": 0.546,
"step": 19260
},
{
"epoch": 0.3624852235098365,
"grad_norm": 0.64453125,
"learning_rate": 9.975772788931898e-06,
"loss": 0.5529,
"step": 19280
},
{
"epoch": 0.3628612455259255,
"grad_norm": 0.703125,
"learning_rate": 9.975672902320252e-06,
"loss": 0.5538,
"step": 19300
},
{
"epoch": 0.3632372675420146,
"grad_norm": 0.60546875,
"learning_rate": 9.975572810722376e-06,
"loss": 0.5516,
"step": 19320
},
{
"epoch": 0.3636132895581036,
"grad_norm": 0.609375,
"learning_rate": 9.975472514142392e-06,
"loss": 0.5601,
"step": 19340
},
{
"epoch": 0.3639893115741927,
"grad_norm": 0.71875,
"learning_rate": 9.975372012584437e-06,
"loss": 0.5593,
"step": 19360
},
{
"epoch": 0.3643653335902817,
"grad_norm": 0.5703125,
"learning_rate": 9.975271306052648e-06,
"loss": 0.551,
"step": 19380
},
{
"epoch": 0.3647413556063708,
"grad_norm": 0.58203125,
"learning_rate": 9.975170394551173e-06,
"loss": 0.5599,
"step": 19400
},
{
"epoch": 0.36511737762245977,
"grad_norm": 0.77734375,
"learning_rate": 9.97506927808417e-06,
"loss": 0.5618,
"step": 19420
},
{
"epoch": 0.3654933996385488,
"grad_norm": 0.703125,
"learning_rate": 9.974967956655806e-06,
"loss": 0.549,
"step": 19440
},
{
"epoch": 0.36586942165463787,
"grad_norm": 0.58984375,
"learning_rate": 9.974866430270254e-06,
"loss": 0.5692,
"step": 19460
},
{
"epoch": 0.3662454436707269,
"grad_norm": 0.64453125,
"learning_rate": 9.974764698931698e-06,
"loss": 0.5503,
"step": 19480
},
{
"epoch": 0.366621465686816,
"grad_norm": 0.625,
"learning_rate": 9.974662762644328e-06,
"loss": 0.5578,
"step": 19500
},
{
"epoch": 0.366997487702905,
"grad_norm": 0.62890625,
"learning_rate": 9.974560621412342e-06,
"loss": 0.5566,
"step": 19520
},
{
"epoch": 0.3673735097189941,
"grad_norm": 0.6015625,
"learning_rate": 9.97445827523995e-06,
"loss": 0.5625,
"step": 19540
},
{
"epoch": 0.36774953173508307,
"grad_norm": 0.8359375,
"learning_rate": 9.974355724131371e-06,
"loss": 0.5508,
"step": 19560
},
{
"epoch": 0.3681255537511721,
"grad_norm": 0.61328125,
"learning_rate": 9.974252968090826e-06,
"loss": 0.5551,
"step": 19580
},
{
"epoch": 0.36850157576726117,
"grad_norm": 0.75390625,
"learning_rate": 9.974150007122548e-06,
"loss": 0.5508,
"step": 19600
},
{
"epoch": 0.3688775977833502,
"grad_norm": 0.625,
"learning_rate": 9.97404684123078e-06,
"loss": 0.5576,
"step": 19620
},
{
"epoch": 0.36925361979943927,
"grad_norm": 0.71875,
"learning_rate": 9.973943470419773e-06,
"loss": 0.5678,
"step": 19640
},
{
"epoch": 0.3696296418155283,
"grad_norm": 0.6484375,
"learning_rate": 9.973839894693785e-06,
"loss": 0.5518,
"step": 19660
},
{
"epoch": 0.3700056638316173,
"grad_norm": 0.61328125,
"learning_rate": 9.973736114057083e-06,
"loss": 0.5599,
"step": 19680
},
{
"epoch": 0.37038168584770637,
"grad_norm": 0.625,
"learning_rate": 9.973632128513943e-06,
"loss": 0.5481,
"step": 19700
},
{
"epoch": 0.3707577078637954,
"grad_norm": 0.625,
"learning_rate": 9.973527938068648e-06,
"loss": 0.5442,
"step": 19720
},
{
"epoch": 0.37113372987988447,
"grad_norm": 0.62109375,
"learning_rate": 9.973423542725491e-06,
"loss": 0.556,
"step": 19740
},
{
"epoch": 0.3715097518959735,
"grad_norm": 0.62890625,
"learning_rate": 9.973318942488772e-06,
"loss": 0.5486,
"step": 19760
},
{
"epoch": 0.37188577391206257,
"grad_norm": 0.734375,
"learning_rate": 9.9732141373628e-06,
"loss": 0.555,
"step": 19780
},
{
"epoch": 0.3722617959281516,
"grad_norm": 0.6015625,
"learning_rate": 9.973109127351897e-06,
"loss": 0.5592,
"step": 19800
},
{
"epoch": 0.3726378179442406,
"grad_norm": 0.63671875,
"learning_rate": 9.973003912460383e-06,
"loss": 0.5519,
"step": 19820
},
{
"epoch": 0.37301383996032966,
"grad_norm": 0.62109375,
"learning_rate": 9.972898492692598e-06,
"loss": 0.5588,
"step": 19840
},
{
"epoch": 0.3733898619764187,
"grad_norm": 0.86328125,
"learning_rate": 9.972792868052882e-06,
"loss": 0.5564,
"step": 19860
},
{
"epoch": 0.37376588399250776,
"grad_norm": 0.61328125,
"learning_rate": 9.972687038545586e-06,
"loss": 0.5545,
"step": 19880
},
{
"epoch": 0.3741419060085968,
"grad_norm": 0.58984375,
"learning_rate": 9.972581004175073e-06,
"loss": 0.556,
"step": 19900
},
{
"epoch": 0.37451792802468586,
"grad_norm": 0.70703125,
"learning_rate": 9.972474764945707e-06,
"loss": 0.5468,
"step": 19920
},
{
"epoch": 0.3748939500407749,
"grad_norm": 0.66796875,
"learning_rate": 9.972368320861868e-06,
"loss": 0.5608,
"step": 19940
},
{
"epoch": 0.3752699720568639,
"grad_norm": 0.640625,
"learning_rate": 9.972261671927941e-06,
"loss": 0.5489,
"step": 19960
},
{
"epoch": 0.37564599407295296,
"grad_norm": 0.625,
"learning_rate": 9.972154818148319e-06,
"loss": 0.5548,
"step": 19980
},
{
"epoch": 0.376022016089042,
"grad_norm": 0.62890625,
"learning_rate": 9.972047759527404e-06,
"loss": 0.5457,
"step": 20000
},
{
"epoch": 0.37639803810513106,
"grad_norm": 0.70703125,
"learning_rate": 9.971940496069607e-06,
"loss": 0.5574,
"step": 20020
},
{
"epoch": 0.3767740601212201,
"grad_norm": 0.609375,
"learning_rate": 9.971833027779347e-06,
"loss": 0.5542,
"step": 20040
},
{
"epoch": 0.37715008213730916,
"grad_norm": 0.609375,
"learning_rate": 9.97172535466105e-06,
"loss": 0.5445,
"step": 20060
},
{
"epoch": 0.37752610415339816,
"grad_norm": 0.61328125,
"learning_rate": 9.971617476719155e-06,
"loss": 0.5484,
"step": 20080
},
{
"epoch": 0.3779021261694872,
"grad_norm": 0.62109375,
"learning_rate": 9.971509393958103e-06,
"loss": 0.544,
"step": 20100
},
{
"epoch": 0.37827814818557626,
"grad_norm": 0.6171875,
"learning_rate": 9.97140110638235e-06,
"loss": 0.5425,
"step": 20120
},
{
"epoch": 0.3786541702016653,
"grad_norm": 0.66796875,
"learning_rate": 9.971292613996352e-06,
"loss": 0.5417,
"step": 20140
},
{
"epoch": 0.37903019221775436,
"grad_norm": 0.65625,
"learning_rate": 9.971183916804585e-06,
"loss": 0.554,
"step": 20160
},
{
"epoch": 0.3794062142338434,
"grad_norm": 0.62890625,
"learning_rate": 9.971075014811525e-06,
"loss": 0.561,
"step": 20180
},
{
"epoch": 0.37978223624993246,
"grad_norm": 0.64453125,
"learning_rate": 9.970965908021656e-06,
"loss": 0.5485,
"step": 20200
},
{
"epoch": 0.38015825826602145,
"grad_norm": 0.58984375,
"learning_rate": 9.970856596439474e-06,
"loss": 0.5519,
"step": 20220
},
{
"epoch": 0.3805342802821105,
"grad_norm": 0.625,
"learning_rate": 9.970747080069485e-06,
"loss": 0.547,
"step": 20240
},
{
"epoch": 0.38091030229819955,
"grad_norm": 0.64453125,
"learning_rate": 9.970637358916198e-06,
"loss": 0.5524,
"step": 20260
},
{
"epoch": 0.3812863243142886,
"grad_norm": 0.58203125,
"learning_rate": 9.970527432984133e-06,
"loss": 0.5468,
"step": 20280
},
{
"epoch": 0.38166234633037766,
"grad_norm": 0.70703125,
"learning_rate": 9.97041730227782e-06,
"loss": 0.5562,
"step": 20300
},
{
"epoch": 0.3820383683464667,
"grad_norm": 0.6171875,
"learning_rate": 9.970306966801796e-06,
"loss": 0.5454,
"step": 20320
},
{
"epoch": 0.3824143903625557,
"grad_norm": 0.546875,
"learning_rate": 9.970196426560607e-06,
"loss": 0.5521,
"step": 20340
},
{
"epoch": 0.38279041237864475,
"grad_norm": 0.734375,
"learning_rate": 9.970085681558807e-06,
"loss": 0.5545,
"step": 20360
},
{
"epoch": 0.3831664343947338,
"grad_norm": 0.5703125,
"learning_rate": 9.969974731800957e-06,
"loss": 0.5489,
"step": 20380
},
{
"epoch": 0.38354245641082285,
"grad_norm": 0.8671875,
"learning_rate": 9.96986357729163e-06,
"loss": 0.5501,
"step": 20400
},
{
"epoch": 0.3839184784269119,
"grad_norm": 0.59375,
"learning_rate": 9.969752218035404e-06,
"loss": 0.5461,
"step": 20420
},
{
"epoch": 0.38429450044300095,
"grad_norm": 0.76953125,
"learning_rate": 9.969640654036864e-06,
"loss": 0.5637,
"step": 20440
},
{
"epoch": 0.38467052245909,
"grad_norm": 0.64453125,
"learning_rate": 9.969528885300612e-06,
"loss": 0.5475,
"step": 20460
},
{
"epoch": 0.385046544475179,
"grad_norm": 0.60546875,
"learning_rate": 9.96941691183125e-06,
"loss": 0.5571,
"step": 20480
},
{
"epoch": 0.38542256649126805,
"grad_norm": 0.65234375,
"learning_rate": 9.96930473363339e-06,
"loss": 0.5496,
"step": 20500
},
{
"epoch": 0.3857985885073571,
"grad_norm": 0.6484375,
"learning_rate": 9.969192350711651e-06,
"loss": 0.5493,
"step": 20520
},
{
"epoch": 0.38617461052344615,
"grad_norm": 0.6015625,
"learning_rate": 9.969079763070671e-06,
"loss": 0.5576,
"step": 20540
},
{
"epoch": 0.3865506325395352,
"grad_norm": 0.640625,
"learning_rate": 9.96896697071508e-06,
"loss": 0.5492,
"step": 20560
},
{
"epoch": 0.38692665455562425,
"grad_norm": 0.61328125,
"learning_rate": 9.96885397364953e-06,
"loss": 0.5553,
"step": 20580
},
{
"epoch": 0.3873026765717133,
"grad_norm": 0.84765625,
"learning_rate": 9.968740771878673e-06,
"loss": 0.5351,
"step": 20600
},
{
"epoch": 0.3876786985878023,
"grad_norm": 0.61328125,
"learning_rate": 9.968627365407174e-06,
"loss": 0.5607,
"step": 20620
},
{
"epoch": 0.38805472060389135,
"grad_norm": 0.65625,
"learning_rate": 9.968513754239707e-06,
"loss": 0.5564,
"step": 20640
},
{
"epoch": 0.3884307426199804,
"grad_norm": 0.6171875,
"learning_rate": 9.968399938380951e-06,
"loss": 0.5522,
"step": 20660
},
{
"epoch": 0.38880676463606945,
"grad_norm": 0.63671875,
"learning_rate": 9.968285917835592e-06,
"loss": 0.5573,
"step": 20680
},
{
"epoch": 0.3891827866521585,
"grad_norm": 0.64453125,
"learning_rate": 9.968171692608332e-06,
"loss": 0.55,
"step": 20700
},
{
"epoch": 0.38955880866824755,
"grad_norm": 0.60546875,
"learning_rate": 9.968057262703875e-06,
"loss": 0.5542,
"step": 20720
},
{
"epoch": 0.38993483068433654,
"grad_norm": 0.6953125,
"learning_rate": 9.967942628126933e-06,
"loss": 0.5527,
"step": 20740
},
{
"epoch": 0.3903108527004256,
"grad_norm": 0.56640625,
"learning_rate": 9.967827788882231e-06,
"loss": 0.5462,
"step": 20760
},
{
"epoch": 0.39068687471651464,
"grad_norm": 0.7421875,
"learning_rate": 9.967712744974502e-06,
"loss": 0.5582,
"step": 20780
},
{
"epoch": 0.3910628967326037,
"grad_norm": 0.6328125,
"learning_rate": 9.967597496408483e-06,
"loss": 0.5548,
"step": 20800
},
{
"epoch": 0.39143891874869274,
"grad_norm": 0.6640625,
"learning_rate": 9.96748204318892e-06,
"loss": 0.5484,
"step": 20820
},
{
"epoch": 0.3918149407647818,
"grad_norm": 0.63671875,
"learning_rate": 9.967366385320576e-06,
"loss": 0.5537,
"step": 20840
},
{
"epoch": 0.39219096278087084,
"grad_norm": 0.71875,
"learning_rate": 9.967250522808208e-06,
"loss": 0.5436,
"step": 20860
},
{
"epoch": 0.39256698479695984,
"grad_norm": 0.72265625,
"learning_rate": 9.967134455656595e-06,
"loss": 0.561,
"step": 20880
},
{
"epoch": 0.3929430068130489,
"grad_norm": 0.65234375,
"learning_rate": 9.967018183870514e-06,
"loss": 0.5531,
"step": 20900
},
{
"epoch": 0.39331902882913794,
"grad_norm": 0.59765625,
"learning_rate": 9.96690170745476e-06,
"loss": 0.5543,
"step": 20920
},
{
"epoch": 0.393695050845227,
"grad_norm": 0.5703125,
"learning_rate": 9.96678502641413e-06,
"loss": 0.5509,
"step": 20940
},
{
"epoch": 0.39407107286131604,
"grad_norm": 0.73828125,
"learning_rate": 9.966668140753428e-06,
"loss": 0.5571,
"step": 20960
},
{
"epoch": 0.3944470948774051,
"grad_norm": 0.66015625,
"learning_rate": 9.966551050477473e-06,
"loss": 0.558,
"step": 20980
},
{
"epoch": 0.39482311689349414,
"grad_norm": 0.6796875,
"learning_rate": 9.966433755591087e-06,
"loss": 0.5545,
"step": 21000
},
{
"epoch": 0.39519913890958314,
"grad_norm": 0.59375,
"learning_rate": 9.966316256099104e-06,
"loss": 0.5453,
"step": 21020
},
{
"epoch": 0.3955751609256722,
"grad_norm": 0.5859375,
"learning_rate": 9.966198552006361e-06,
"loss": 0.5585,
"step": 21040
},
{
"epoch": 0.39595118294176124,
"grad_norm": 0.54296875,
"learning_rate": 9.966080643317713e-06,
"loss": 0.536,
"step": 21060
},
{
"epoch": 0.3963272049578503,
"grad_norm": 0.671875,
"learning_rate": 9.96596253003801e-06,
"loss": 0.5536,
"step": 21080
},
{
"epoch": 0.39670322697393934,
"grad_norm": 0.69921875,
"learning_rate": 9.965844212172127e-06,
"loss": 0.5517,
"step": 21100
},
{
"epoch": 0.3970792489900284,
"grad_norm": 0.6328125,
"learning_rate": 9.965725689724931e-06,
"loss": 0.5481,
"step": 21120
},
{
"epoch": 0.3974552710061174,
"grad_norm": 0.6328125,
"learning_rate": 9.965606962701308e-06,
"loss": 0.557,
"step": 21140
},
{
"epoch": 0.39783129302220643,
"grad_norm": 0.62890625,
"learning_rate": 9.96548803110615e-06,
"loss": 0.5557,
"step": 21160
},
{
"epoch": 0.3982073150382955,
"grad_norm": 0.625,
"learning_rate": 9.965368894944353e-06,
"loss": 0.5471,
"step": 21180
},
{
"epoch": 0.39858333705438453,
"grad_norm": 0.61328125,
"learning_rate": 9.965249554220828e-06,
"loss": 0.5658,
"step": 21200
},
{
"epoch": 0.3989593590704736,
"grad_norm": 0.6484375,
"learning_rate": 9.965130008940493e-06,
"loss": 0.5486,
"step": 21220
},
{
"epoch": 0.39933538108656264,
"grad_norm": 0.55078125,
"learning_rate": 9.965010259108269e-06,
"loss": 0.5458,
"step": 21240
},
{
"epoch": 0.3997114031026517,
"grad_norm": 0.60546875,
"learning_rate": 9.964890304729094e-06,
"loss": 0.5525,
"step": 21260
},
{
"epoch": 0.4000874251187407,
"grad_norm": 0.59765625,
"learning_rate": 9.964770145807907e-06,
"loss": 0.5471,
"step": 21280
},
{
"epoch": 0.40046344713482973,
"grad_norm": 0.6640625,
"learning_rate": 9.964649782349658e-06,
"loss": 0.5457,
"step": 21300
},
{
"epoch": 0.4008394691509188,
"grad_norm": 0.58203125,
"learning_rate": 9.964529214359306e-06,
"loss": 0.5438,
"step": 21320
},
{
"epoch": 0.40121549116700783,
"grad_norm": 0.640625,
"learning_rate": 9.964408441841819e-06,
"loss": 0.5474,
"step": 21340
},
{
"epoch": 0.4015915131830969,
"grad_norm": 0.55859375,
"learning_rate": 9.964287464802172e-06,
"loss": 0.5474,
"step": 21360
},
{
"epoch": 0.40196753519918593,
"grad_norm": 0.65234375,
"learning_rate": 9.964166283245348e-06,
"loss": 0.5563,
"step": 21380
},
{
"epoch": 0.4023435572152749,
"grad_norm": 0.6328125,
"learning_rate": 9.964044897176342e-06,
"loss": 0.5552,
"step": 21400
},
{
"epoch": 0.402719579231364,
"grad_norm": 0.64453125,
"learning_rate": 9.963923306600154e-06,
"loss": 0.5473,
"step": 21420
},
{
"epoch": 0.40309560124745303,
"grad_norm": 0.59375,
"learning_rate": 9.963801511521791e-06,
"loss": 0.5507,
"step": 21440
},
{
"epoch": 0.4034716232635421,
"grad_norm": 0.60546875,
"learning_rate": 9.963679511946271e-06,
"loss": 0.5531,
"step": 21460
},
{
"epoch": 0.40384764527963113,
"grad_norm": 0.63671875,
"learning_rate": 9.963557307878624e-06,
"loss": 0.5497,
"step": 21480
},
{
"epoch": 0.4042236672957202,
"grad_norm": 0.62109375,
"learning_rate": 9.96343489932388e-06,
"loss": 0.5551,
"step": 21500
},
{
"epoch": 0.40459968931180923,
"grad_norm": 0.6171875,
"learning_rate": 9.963312286287086e-06,
"loss": 0.5465,
"step": 21520
},
{
"epoch": 0.4049757113278982,
"grad_norm": 0.60546875,
"learning_rate": 9.96318946877329e-06,
"loss": 0.5419,
"step": 21540
},
{
"epoch": 0.4053517333439873,
"grad_norm": 0.66796875,
"learning_rate": 9.96306644678755e-06,
"loss": 0.5615,
"step": 21560
},
{
"epoch": 0.4057277553600763,
"grad_norm": 0.6328125,
"learning_rate": 9.96294322033494e-06,
"loss": 0.5619,
"step": 21580
},
{
"epoch": 0.4061037773761654,
"grad_norm": 0.62109375,
"learning_rate": 9.962819789420535e-06,
"loss": 0.5473,
"step": 21600
},
{
"epoch": 0.4064797993922544,
"grad_norm": 0.60546875,
"learning_rate": 9.962696154049416e-06,
"loss": 0.5478,
"step": 21620
},
{
"epoch": 0.4068558214083435,
"grad_norm": 0.58984375,
"learning_rate": 9.962572314226682e-06,
"loss": 0.5455,
"step": 21640
},
{
"epoch": 0.4072318434244325,
"grad_norm": 0.71484375,
"learning_rate": 9.96244826995743e-06,
"loss": 0.5557,
"step": 21660
},
{
"epoch": 0.4076078654405215,
"grad_norm": 0.62109375,
"learning_rate": 9.962324021246775e-06,
"loss": 0.5541,
"step": 21680
},
{
"epoch": 0.40798388745661057,
"grad_norm": 0.625,
"learning_rate": 9.96219956809983e-06,
"loss": 0.5512,
"step": 21700
},
{
"epoch": 0.4083599094726996,
"grad_norm": 0.82421875,
"learning_rate": 9.962074910521729e-06,
"loss": 0.5616,
"step": 21720
},
{
"epoch": 0.4087359314887887,
"grad_norm": 0.67578125,
"learning_rate": 9.961950048517604e-06,
"loss": 0.5375,
"step": 21740
},
{
"epoch": 0.4091119535048777,
"grad_norm": 0.671875,
"learning_rate": 9.961824982092597e-06,
"loss": 0.5522,
"step": 21760
},
{
"epoch": 0.4094879755209668,
"grad_norm": 0.66796875,
"learning_rate": 9.961699711251864e-06,
"loss": 0.5537,
"step": 21780
},
{
"epoch": 0.40986399753705577,
"grad_norm": 0.62109375,
"learning_rate": 9.961574236000564e-06,
"loss": 0.5558,
"step": 21800
},
{
"epoch": 0.4102400195531448,
"grad_norm": 0.56640625,
"learning_rate": 9.961448556343866e-06,
"loss": 0.5611,
"step": 21820
},
{
"epoch": 0.41061604156923387,
"grad_norm": 0.578125,
"learning_rate": 9.961322672286951e-06,
"loss": 0.5521,
"step": 21840
},
{
"epoch": 0.4109920635853229,
"grad_norm": 0.75,
"learning_rate": 9.961196583835e-06,
"loss": 0.554,
"step": 21860
},
{
"epoch": 0.41136808560141197,
"grad_norm": 0.6015625,
"learning_rate": 9.961070290993212e-06,
"loss": 0.5543,
"step": 21880
},
{
"epoch": 0.411744107617501,
"grad_norm": 0.58984375,
"learning_rate": 9.960943793766788e-06,
"loss": 0.5555,
"step": 21900
},
{
"epoch": 0.41212012963359007,
"grad_norm": 0.65625,
"learning_rate": 9.96081709216094e-06,
"loss": 0.5535,
"step": 21920
},
{
"epoch": 0.41249615164967907,
"grad_norm": 0.7421875,
"learning_rate": 9.960690186180886e-06,
"loss": 0.5591,
"step": 21940
},
{
"epoch": 0.4128721736657681,
"grad_norm": 0.6796875,
"learning_rate": 9.960563075831856e-06,
"loss": 0.5559,
"step": 21960
},
{
"epoch": 0.41324819568185717,
"grad_norm": 0.65625,
"learning_rate": 9.960435761119088e-06,
"loss": 0.5616,
"step": 21980
},
{
"epoch": 0.4136242176979462,
"grad_norm": 0.6875,
"learning_rate": 9.960308242047822e-06,
"loss": 0.558,
"step": 22000
},
{
"epoch": 0.41400023971403527,
"grad_norm": 0.6875,
"learning_rate": 9.960180518623317e-06,
"loss": 0.5523,
"step": 22020
},
{
"epoch": 0.4143762617301243,
"grad_norm": 0.6171875,
"learning_rate": 9.960052590850833e-06,
"loss": 0.5468,
"step": 22040
},
{
"epoch": 0.41475228374621337,
"grad_norm": 0.5625,
"learning_rate": 9.95992445873564e-06,
"loss": 0.5365,
"step": 22060
},
{
"epoch": 0.41512830576230236,
"grad_norm": 0.62109375,
"learning_rate": 9.959796122283016e-06,
"loss": 0.5475,
"step": 22080
},
{
"epoch": 0.4155043277783914,
"grad_norm": 0.74609375,
"learning_rate": 9.959667581498249e-06,
"loss": 0.5538,
"step": 22100
},
{
"epoch": 0.41588034979448046,
"grad_norm": 0.490234375,
"learning_rate": 9.959538836386635e-06,
"loss": 0.5504,
"step": 22120
},
{
"epoch": 0.4162563718105695,
"grad_norm": 0.54296875,
"learning_rate": 9.959409886953477e-06,
"loss": 0.5493,
"step": 22140
},
{
"epoch": 0.41663239382665856,
"grad_norm": 0.59375,
"learning_rate": 9.95928073320409e-06,
"loss": 0.554,
"step": 22160
},
{
"epoch": 0.4170084158427476,
"grad_norm": 0.5859375,
"learning_rate": 9.95915137514379e-06,
"loss": 0.5457,
"step": 22180
},
{
"epoch": 0.4173844378588366,
"grad_norm": 0.640625,
"learning_rate": 9.95902181277791e-06,
"loss": 0.5512,
"step": 22200
},
{
"epoch": 0.41776045987492566,
"grad_norm": 0.61328125,
"learning_rate": 9.958892046111786e-06,
"loss": 0.5477,
"step": 22220
},
{
"epoch": 0.4181364818910147,
"grad_norm": 0.63671875,
"learning_rate": 9.958762075150767e-06,
"loss": 0.5559,
"step": 22240
},
{
"epoch": 0.41851250390710376,
"grad_norm": 0.6484375,
"learning_rate": 9.958631899900203e-06,
"loss": 0.5452,
"step": 22260
},
{
"epoch": 0.4188885259231928,
"grad_norm": 0.61328125,
"learning_rate": 9.958501520365463e-06,
"loss": 0.5511,
"step": 22280
},
{
"epoch": 0.41926454793928186,
"grad_norm": 0.578125,
"learning_rate": 9.958370936551911e-06,
"loss": 0.5511,
"step": 22300
},
{
"epoch": 0.4196405699553709,
"grad_norm": 0.6875,
"learning_rate": 9.95824014846493e-06,
"loss": 0.5567,
"step": 22320
},
{
"epoch": 0.4200165919714599,
"grad_norm": 0.66796875,
"learning_rate": 9.958109156109912e-06,
"loss": 0.5426,
"step": 22340
},
{
"epoch": 0.42039261398754896,
"grad_norm": 0.58984375,
"learning_rate": 9.957977959492247e-06,
"loss": 0.5453,
"step": 22360
},
{
"epoch": 0.420768636003638,
"grad_norm": 0.66796875,
"learning_rate": 9.957846558617345e-06,
"loss": 0.5563,
"step": 22380
},
{
"epoch": 0.42114465801972706,
"grad_norm": 0.60546875,
"learning_rate": 9.957714953490616e-06,
"loss": 0.5543,
"step": 22400
},
{
"epoch": 0.4215206800358161,
"grad_norm": 0.59375,
"learning_rate": 9.957583144117483e-06,
"loss": 0.5434,
"step": 22420
},
{
"epoch": 0.42189670205190516,
"grad_norm": 0.625,
"learning_rate": 9.95745113050338e-06,
"loss": 0.5512,
"step": 22440
},
{
"epoch": 0.42227272406799415,
"grad_norm": 0.5859375,
"learning_rate": 9.957318912653738e-06,
"loss": 0.5338,
"step": 22460
},
{
"epoch": 0.4226487460840832,
"grad_norm": 0.62109375,
"learning_rate": 9.95718649057401e-06,
"loss": 0.5483,
"step": 22480
},
{
"epoch": 0.42302476810017225,
"grad_norm": 0.578125,
"learning_rate": 9.95705386426965e-06,
"loss": 0.5552,
"step": 22500
},
{
"epoch": 0.4234007901162613,
"grad_norm": 0.609375,
"learning_rate": 9.956921033746123e-06,
"loss": 0.5355,
"step": 22520
},
{
"epoch": 0.42377681213235036,
"grad_norm": 0.5703125,
"learning_rate": 9.956787999008898e-06,
"loss": 0.5556,
"step": 22540
},
{
"epoch": 0.4241528341484394,
"grad_norm": 0.765625,
"learning_rate": 9.956654760063458e-06,
"loss": 0.543,
"step": 22560
},
{
"epoch": 0.42452885616452846,
"grad_norm": 0.73828125,
"learning_rate": 9.956521316915293e-06,
"loss": 0.5503,
"step": 22580
},
{
"epoch": 0.42490487818061745,
"grad_norm": 0.6796875,
"learning_rate": 9.956387669569898e-06,
"loss": 0.5502,
"step": 22600
},
{
"epoch": 0.4252809001967065,
"grad_norm": 0.62890625,
"learning_rate": 9.956253818032782e-06,
"loss": 0.5624,
"step": 22620
},
{
"epoch": 0.42565692221279555,
"grad_norm": 0.58203125,
"learning_rate": 9.956119762309456e-06,
"loss": 0.5492,
"step": 22640
},
{
"epoch": 0.4260329442288846,
"grad_norm": 0.75,
"learning_rate": 9.955985502405446e-06,
"loss": 0.5575,
"step": 22660
},
{
"epoch": 0.42640896624497365,
"grad_norm": 0.59375,
"learning_rate": 9.955851038326279e-06,
"loss": 0.5539,
"step": 22680
},
{
"epoch": 0.4267849882610627,
"grad_norm": 0.7109375,
"learning_rate": 9.9557163700775e-06,
"loss": 0.5512,
"step": 22700
},
{
"epoch": 0.42716101027715175,
"grad_norm": 0.75,
"learning_rate": 9.955581497664653e-06,
"loss": 0.5484,
"step": 22720
},
{
"epoch": 0.42753703229324075,
"grad_norm": 0.625,
"learning_rate": 9.955446421093297e-06,
"loss": 0.5482,
"step": 22740
},
{
"epoch": 0.4279130543093298,
"grad_norm": 0.609375,
"learning_rate": 9.955311140368995e-06,
"loss": 0.5572,
"step": 22760
},
{
"epoch": 0.42828907632541885,
"grad_norm": 0.5859375,
"learning_rate": 9.955175655497321e-06,
"loss": 0.5442,
"step": 22780
},
{
"epoch": 0.4286650983415079,
"grad_norm": 0.6171875,
"learning_rate": 9.955039966483856e-06,
"loss": 0.5471,
"step": 22800
},
{
"epoch": 0.42904112035759695,
"grad_norm": 0.59375,
"learning_rate": 9.954904073334191e-06,
"loss": 0.5565,
"step": 22820
},
{
"epoch": 0.429417142373686,
"grad_norm": 0.59765625,
"learning_rate": 9.954767976053925e-06,
"loss": 0.5479,
"step": 22840
},
{
"epoch": 0.429793164389775,
"grad_norm": 0.53125,
"learning_rate": 9.954631674648662e-06,
"loss": 0.5467,
"step": 22860
},
{
"epoch": 0.43016918640586405,
"grad_norm": 0.59375,
"learning_rate": 9.954495169124022e-06,
"loss": 0.5525,
"step": 22880
},
{
"epoch": 0.4305452084219531,
"grad_norm": 0.62890625,
"learning_rate": 9.954358459485625e-06,
"loss": 0.5517,
"step": 22900
},
{
"epoch": 0.43092123043804215,
"grad_norm": 0.63671875,
"learning_rate": 9.954221545739102e-06,
"loss": 0.5473,
"step": 22920
},
{
"epoch": 0.4312972524541312,
"grad_norm": 0.65234375,
"learning_rate": 9.954084427890099e-06,
"loss": 0.558,
"step": 22940
},
{
"epoch": 0.43167327447022025,
"grad_norm": 0.57421875,
"learning_rate": 9.953947105944259e-06,
"loss": 0.5496,
"step": 22960
},
{
"epoch": 0.4320492964863093,
"grad_norm": 0.5625,
"learning_rate": 9.953809579907244e-06,
"loss": 0.5563,
"step": 22980
},
{
"epoch": 0.4324253185023983,
"grad_norm": 0.5625,
"learning_rate": 9.953671849784717e-06,
"loss": 0.5504,
"step": 23000
},
{
"epoch": 0.43280134051848734,
"grad_norm": 0.58203125,
"learning_rate": 9.95353391558235e-06,
"loss": 0.5524,
"step": 23020
},
{
"epoch": 0.4331773625345764,
"grad_norm": 0.64453125,
"learning_rate": 9.953395777305832e-06,
"loss": 0.5537,
"step": 23040
},
{
"epoch": 0.43355338455066544,
"grad_norm": 0.6953125,
"learning_rate": 9.953257434960848e-06,
"loss": 0.5453,
"step": 23060
},
{
"epoch": 0.4339294065667545,
"grad_norm": 0.609375,
"learning_rate": 9.953118888553102e-06,
"loss": 0.5496,
"step": 23080
},
{
"epoch": 0.43430542858284354,
"grad_norm": 0.55859375,
"learning_rate": 9.9529801380883e-06,
"loss": 0.5505,
"step": 23100
},
{
"epoch": 0.4346814505989326,
"grad_norm": 0.55078125,
"learning_rate": 9.952841183572154e-06,
"loss": 0.5462,
"step": 23120
},
{
"epoch": 0.4350574726150216,
"grad_norm": 0.69140625,
"learning_rate": 9.952702025010397e-06,
"loss": 0.5504,
"step": 23140
},
{
"epoch": 0.43543349463111064,
"grad_norm": 0.71875,
"learning_rate": 9.952562662408755e-06,
"loss": 0.5546,
"step": 23160
},
{
"epoch": 0.4358095166471997,
"grad_norm": 0.6953125,
"learning_rate": 9.952423095772971e-06,
"loss": 0.5504,
"step": 23180
},
{
"epoch": 0.43618553866328874,
"grad_norm": 0.6484375,
"learning_rate": 9.952283325108799e-06,
"loss": 0.5514,
"step": 23200
},
{
"epoch": 0.4365615606793778,
"grad_norm": 0.62109375,
"learning_rate": 9.95214335042199e-06,
"loss": 0.5484,
"step": 23220
},
{
"epoch": 0.43693758269546684,
"grad_norm": 0.6484375,
"learning_rate": 9.952003171718316e-06,
"loss": 0.5519,
"step": 23240
},
{
"epoch": 0.43731360471155584,
"grad_norm": 0.62109375,
"learning_rate": 9.951862789003552e-06,
"loss": 0.5436,
"step": 23260
},
{
"epoch": 0.4376896267276449,
"grad_norm": 0.578125,
"learning_rate": 9.951722202283479e-06,
"loss": 0.5415,
"step": 23280
},
{
"epoch": 0.43806564874373394,
"grad_norm": 0.58984375,
"learning_rate": 9.95158141156389e-06,
"loss": 0.5525,
"step": 23300
},
{
"epoch": 0.438441670759823,
"grad_norm": 0.66796875,
"learning_rate": 9.951440416850582e-06,
"loss": 0.5553,
"step": 23320
},
{
"epoch": 0.43881769277591204,
"grad_norm": 0.609375,
"learning_rate": 9.951299218149371e-06,
"loss": 0.5476,
"step": 23340
},
{
"epoch": 0.4391937147920011,
"grad_norm": 0.5625,
"learning_rate": 9.951157815466069e-06,
"loss": 0.5412,
"step": 23360
},
{
"epoch": 0.43956973680809014,
"grad_norm": 0.55078125,
"learning_rate": 9.9510162088065e-06,
"loss": 0.5443,
"step": 23380
},
{
"epoch": 0.43994575882417913,
"grad_norm": 0.58984375,
"learning_rate": 9.950874398176503e-06,
"loss": 0.5535,
"step": 23400
},
{
"epoch": 0.4403217808402682,
"grad_norm": 0.6171875,
"learning_rate": 9.950732383581915e-06,
"loss": 0.5426,
"step": 23420
},
{
"epoch": 0.44069780285635723,
"grad_norm": 0.5625,
"learning_rate": 9.95059016502859e-06,
"loss": 0.5538,
"step": 23440
},
{
"epoch": 0.4410738248724463,
"grad_norm": 0.59765625,
"learning_rate": 9.950447742522387e-06,
"loss": 0.5513,
"step": 23460
},
{
"epoch": 0.44144984688853534,
"grad_norm": 0.6328125,
"learning_rate": 9.950305116069171e-06,
"loss": 0.5618,
"step": 23480
},
{
"epoch": 0.4418258689046244,
"grad_norm": 0.61328125,
"learning_rate": 9.95016228567482e-06,
"loss": 0.5504,
"step": 23500
},
{
"epoch": 0.4422018909207134,
"grad_norm": 0.5859375,
"learning_rate": 9.95001925134522e-06,
"loss": 0.5455,
"step": 23520
},
{
"epoch": 0.44257791293680243,
"grad_norm": 0.56640625,
"learning_rate": 9.949876013086258e-06,
"loss": 0.5425,
"step": 23540
},
{
"epoch": 0.4429539349528915,
"grad_norm": 0.69140625,
"learning_rate": 9.94973257090384e-06,
"loss": 0.5531,
"step": 23560
},
{
"epoch": 0.44332995696898053,
"grad_norm": 0.625,
"learning_rate": 9.949588924803875e-06,
"loss": 0.5569,
"step": 23580
},
{
"epoch": 0.4437059789850696,
"grad_norm": 0.62109375,
"learning_rate": 9.949445074792279e-06,
"loss": 0.5535,
"step": 23600
},
{
"epoch": 0.44408200100115863,
"grad_norm": 0.578125,
"learning_rate": 9.94930102087498e-06,
"loss": 0.5458,
"step": 23620
},
{
"epoch": 0.4444580230172477,
"grad_norm": 0.56640625,
"learning_rate": 9.949156763057912e-06,
"loss": 0.5536,
"step": 23640
},
{
"epoch": 0.4448340450333367,
"grad_norm": 0.6171875,
"learning_rate": 9.949012301347016e-06,
"loss": 0.554,
"step": 23660
},
{
"epoch": 0.44521006704942573,
"grad_norm": 0.67578125,
"learning_rate": 9.948867635748248e-06,
"loss": 0.5641,
"step": 23680
},
{
"epoch": 0.4455860890655148,
"grad_norm": 0.59765625,
"learning_rate": 9.948722766267565e-06,
"loss": 0.5517,
"step": 23700
},
{
"epoch": 0.44596211108160383,
"grad_norm": 0.58203125,
"learning_rate": 9.948577692910934e-06,
"loss": 0.5528,
"step": 23720
},
{
"epoch": 0.4463381330976929,
"grad_norm": 0.78125,
"learning_rate": 9.948432415684335e-06,
"loss": 0.5397,
"step": 23740
},
{
"epoch": 0.44671415511378193,
"grad_norm": 0.6328125,
"learning_rate": 9.948286934593751e-06,
"loss": 0.5533,
"step": 23760
},
{
"epoch": 0.447090177129871,
"grad_norm": 0.61328125,
"learning_rate": 9.948141249645176e-06,
"loss": 0.5564,
"step": 23780
},
{
"epoch": 0.44746619914596,
"grad_norm": 0.57421875,
"learning_rate": 9.94799536084461e-06,
"loss": 0.5378,
"step": 23800
},
{
"epoch": 0.447842221162049,
"grad_norm": 0.578125,
"learning_rate": 9.947849268198067e-06,
"loss": 0.5495,
"step": 23820
},
{
"epoch": 0.4482182431781381,
"grad_norm": 0.58984375,
"learning_rate": 9.947702971711564e-06,
"loss": 0.5491,
"step": 23840
},
{
"epoch": 0.4485942651942271,
"grad_norm": 0.62890625,
"learning_rate": 9.947556471391127e-06,
"loss": 0.5517,
"step": 23860
},
{
"epoch": 0.4489702872103162,
"grad_norm": 0.5703125,
"learning_rate": 9.947409767242793e-06,
"loss": 0.5504,
"step": 23880
},
{
"epoch": 0.4493463092264052,
"grad_norm": 0.61328125,
"learning_rate": 9.947262859272605e-06,
"loss": 0.5478,
"step": 23900
},
{
"epoch": 0.4497223312424942,
"grad_norm": 0.5234375,
"learning_rate": 9.947115747486616e-06,
"loss": 0.5491,
"step": 23920
},
{
"epoch": 0.4500983532585833,
"grad_norm": 0.6328125,
"learning_rate": 9.946968431890884e-06,
"loss": 0.5444,
"step": 23940
},
{
"epoch": 0.4504743752746723,
"grad_norm": 0.65625,
"learning_rate": 9.946820912491483e-06,
"loss": 0.5486,
"step": 23960
},
{
"epoch": 0.4508503972907614,
"grad_norm": 0.703125,
"learning_rate": 9.946673189294486e-06,
"loss": 0.5474,
"step": 23980
},
{
"epoch": 0.4512264193068504,
"grad_norm": 0.58984375,
"learning_rate": 9.94652526230598e-06,
"loss": 0.5411,
"step": 24000
},
{
"epoch": 0.4516024413229395,
"grad_norm": 0.66015625,
"learning_rate": 9.94637713153206e-06,
"loss": 0.5557,
"step": 24020
},
{
"epoch": 0.4519784633390285,
"grad_norm": 0.609375,
"learning_rate": 9.94622879697883e-06,
"loss": 0.5471,
"step": 24040
},
{
"epoch": 0.4523544853551175,
"grad_norm": 0.6640625,
"learning_rate": 9.9460802586524e-06,
"loss": 0.5487,
"step": 24060
},
{
"epoch": 0.45273050737120657,
"grad_norm": 0.66015625,
"learning_rate": 9.945931516558886e-06,
"loss": 0.5558,
"step": 24080
},
{
"epoch": 0.4531065293872956,
"grad_norm": 0.625,
"learning_rate": 9.945782570704421e-06,
"loss": 0.5548,
"step": 24100
},
{
"epoch": 0.45348255140338467,
"grad_norm": 0.640625,
"learning_rate": 9.945633421095137e-06,
"loss": 0.5387,
"step": 24120
},
{
"epoch": 0.4538585734194737,
"grad_norm": 0.58984375,
"learning_rate": 9.945484067737182e-06,
"loss": 0.5526,
"step": 24140
},
{
"epoch": 0.45423459543556277,
"grad_norm": 0.64453125,
"learning_rate": 9.945334510636707e-06,
"loss": 0.5455,
"step": 24160
},
{
"epoch": 0.4546106174516518,
"grad_norm": 0.6640625,
"learning_rate": 9.945184749799874e-06,
"loss": 0.5458,
"step": 24180
},
{
"epoch": 0.4549866394677408,
"grad_norm": 0.671875,
"learning_rate": 9.945034785232853e-06,
"loss": 0.5505,
"step": 24200
},
{
"epoch": 0.45536266148382987,
"grad_norm": 0.6640625,
"learning_rate": 9.944884616941822e-06,
"loss": 0.5433,
"step": 24220
},
{
"epoch": 0.4557386834999189,
"grad_norm": 0.54296875,
"learning_rate": 9.944734244932968e-06,
"loss": 0.5481,
"step": 24240
},
{
"epoch": 0.45611470551600797,
"grad_norm": 0.65625,
"learning_rate": 9.944583669212485e-06,
"loss": 0.5555,
"step": 24260
},
{
"epoch": 0.456490727532097,
"grad_norm": 0.6484375,
"learning_rate": 9.944432889786578e-06,
"loss": 0.5525,
"step": 24280
},
{
"epoch": 0.45686674954818607,
"grad_norm": 0.5546875,
"learning_rate": 9.944281906661455e-06,
"loss": 0.5402,
"step": 24300
},
{
"epoch": 0.45724277156427506,
"grad_norm": 0.7265625,
"learning_rate": 9.944130719843341e-06,
"loss": 0.5559,
"step": 24320
},
{
"epoch": 0.4576187935803641,
"grad_norm": 0.67578125,
"learning_rate": 9.94397932933846e-06,
"loss": 0.5475,
"step": 24340
},
{
"epoch": 0.45799481559645316,
"grad_norm": 0.5625,
"learning_rate": 9.943827735153055e-06,
"loss": 0.5473,
"step": 24360
},
{
"epoch": 0.4583708376125422,
"grad_norm": 0.640625,
"learning_rate": 9.943675937293365e-06,
"loss": 0.5471,
"step": 24380
},
{
"epoch": 0.45874685962863126,
"grad_norm": 0.609375,
"learning_rate": 9.943523935765647e-06,
"loss": 0.5487,
"step": 24400
},
{
"epoch": 0.4591228816447203,
"grad_norm": 0.6015625,
"learning_rate": 9.943371730576164e-06,
"loss": 0.5439,
"step": 24420
},
{
"epoch": 0.45949890366080937,
"grad_norm": 0.6171875,
"learning_rate": 9.943219321731183e-06,
"loss": 0.5513,
"step": 24440
},
{
"epoch": 0.45987492567689836,
"grad_norm": 0.62890625,
"learning_rate": 9.943066709236985e-06,
"loss": 0.5473,
"step": 24460
},
{
"epoch": 0.4602509476929874,
"grad_norm": 0.6640625,
"learning_rate": 9.942913893099859e-06,
"loss": 0.553,
"step": 24480
},
{
"epoch": 0.46062696970907646,
"grad_norm": 0.6015625,
"learning_rate": 9.942760873326096e-06,
"loss": 0.5543,
"step": 24500
},
{
"epoch": 0.4610029917251655,
"grad_norm": 0.6328125,
"learning_rate": 9.942607649922005e-06,
"loss": 0.5534,
"step": 24520
},
{
"epoch": 0.46137901374125456,
"grad_norm": 0.67578125,
"learning_rate": 9.942454222893895e-06,
"loss": 0.5498,
"step": 24540
},
{
"epoch": 0.4617550357573436,
"grad_norm": 0.67578125,
"learning_rate": 9.94230059224809e-06,
"loss": 0.5488,
"step": 24560
},
{
"epoch": 0.4621310577734326,
"grad_norm": 0.6328125,
"learning_rate": 9.942146757990916e-06,
"loss": 0.5436,
"step": 24580
},
{
"epoch": 0.46250707978952166,
"grad_norm": 0.6640625,
"learning_rate": 9.941992720128713e-06,
"loss": 0.5396,
"step": 24600
},
{
"epoch": 0.4628831018056107,
"grad_norm": 0.578125,
"learning_rate": 9.941838478667825e-06,
"loss": 0.5523,
"step": 24620
},
{
"epoch": 0.46325912382169976,
"grad_norm": 0.62890625,
"learning_rate": 9.941684033614607e-06,
"loss": 0.5429,
"step": 24640
},
{
"epoch": 0.4636351458377888,
"grad_norm": 0.51953125,
"learning_rate": 9.941529384975423e-06,
"loss": 0.549,
"step": 24660
},
{
"epoch": 0.46401116785387786,
"grad_norm": 0.5859375,
"learning_rate": 9.941374532756644e-06,
"loss": 0.5485,
"step": 24680
},
{
"epoch": 0.4643871898699669,
"grad_norm": 0.64453125,
"learning_rate": 9.941219476964648e-06,
"loss": 0.5436,
"step": 24700
},
{
"epoch": 0.4647632118860559,
"grad_norm": 0.55859375,
"learning_rate": 9.941064217605824e-06,
"loss": 0.5441,
"step": 24720
},
{
"epoch": 0.46513923390214496,
"grad_norm": 0.6015625,
"learning_rate": 9.94090875468657e-06,
"loss": 0.5515,
"step": 24740
},
{
"epoch": 0.465515255918234,
"grad_norm": 0.6484375,
"learning_rate": 9.940753088213287e-06,
"loss": 0.5519,
"step": 24760
},
{
"epoch": 0.46589127793432306,
"grad_norm": 0.62109375,
"learning_rate": 9.94059721819239e-06,
"loss": 0.5534,
"step": 24780
},
{
"epoch": 0.4662672999504121,
"grad_norm": 0.6796875,
"learning_rate": 9.940441144630299e-06,
"loss": 0.5367,
"step": 24800
},
{
"epoch": 0.46664332196650116,
"grad_norm": 0.609375,
"learning_rate": 9.940284867533447e-06,
"loss": 0.5532,
"step": 24820
},
{
"epoch": 0.4670193439825902,
"grad_norm": 0.59765625,
"learning_rate": 9.940128386908272e-06,
"loss": 0.5458,
"step": 24840
},
{
"epoch": 0.4673953659986792,
"grad_norm": 0.60546875,
"learning_rate": 9.939971702761217e-06,
"loss": 0.5565,
"step": 24860
},
{
"epoch": 0.46777138801476825,
"grad_norm": 0.68359375,
"learning_rate": 9.93981481509874e-06,
"loss": 0.5472,
"step": 24880
},
{
"epoch": 0.4681474100308573,
"grad_norm": 0.625,
"learning_rate": 9.939657723927305e-06,
"loss": 0.5492,
"step": 24900
},
{
"epoch": 0.46852343204694635,
"grad_norm": 0.65234375,
"learning_rate": 9.93950042925338e-06,
"loss": 0.5467,
"step": 24920
},
{
"epoch": 0.4688994540630354,
"grad_norm": 0.6328125,
"learning_rate": 9.93934293108345e-06,
"loss": 0.5484,
"step": 24940
},
{
"epoch": 0.46927547607912445,
"grad_norm": 0.703125,
"learning_rate": 9.939185229424e-06,
"loss": 0.5557,
"step": 24960
},
{
"epoch": 0.46965149809521345,
"grad_norm": 0.73046875,
"learning_rate": 9.939027324281529e-06,
"loss": 0.5579,
"step": 24980
},
{
"epoch": 0.4700275201113025,
"grad_norm": 0.6015625,
"learning_rate": 9.938869215662541e-06,
"loss": 0.5543,
"step": 25000
},
{
"epoch": 0.47040354212739155,
"grad_norm": 0.55859375,
"learning_rate": 9.93871090357355e-06,
"loss": 0.5489,
"step": 25020
},
{
"epoch": 0.4707795641434806,
"grad_norm": 0.61328125,
"learning_rate": 9.938552388021079e-06,
"loss": 0.5477,
"step": 25040
},
{
"epoch": 0.47115558615956965,
"grad_norm": 0.59765625,
"learning_rate": 9.938393669011657e-06,
"loss": 0.5491,
"step": 25060
},
{
"epoch": 0.4715316081756587,
"grad_norm": 0.65234375,
"learning_rate": 9.938234746551825e-06,
"loss": 0.5503,
"step": 25080
},
{
"epoch": 0.47190763019174775,
"grad_norm": 0.58984375,
"learning_rate": 9.938075620648127e-06,
"loss": 0.5569,
"step": 25100
},
{
"epoch": 0.47228365220783675,
"grad_norm": 0.59375,
"learning_rate": 9.937916291307122e-06,
"loss": 0.5441,
"step": 25120
},
{
"epoch": 0.4726596742239258,
"grad_norm": 0.82421875,
"learning_rate": 9.937756758535371e-06,
"loss": 0.555,
"step": 25140
},
{
"epoch": 0.47303569624001485,
"grad_norm": 0.52734375,
"learning_rate": 9.937597022339448e-06,
"loss": 0.5498,
"step": 25160
},
{
"epoch": 0.4734117182561039,
"grad_norm": 0.62109375,
"learning_rate": 9.937437082725934e-06,
"loss": 0.5497,
"step": 25180
},
{
"epoch": 0.47378774027219295,
"grad_norm": 0.640625,
"learning_rate": 9.937276939701418e-06,
"loss": 0.5402,
"step": 25200
},
{
"epoch": 0.474163762288282,
"grad_norm": 0.625,
"learning_rate": 9.937116593272499e-06,
"loss": 0.5427,
"step": 25220
},
{
"epoch": 0.47453978430437105,
"grad_norm": 0.609375,
"learning_rate": 9.936956043445778e-06,
"loss": 0.5426,
"step": 25240
},
{
"epoch": 0.47491580632046004,
"grad_norm": 0.66015625,
"learning_rate": 9.936795290227875e-06,
"loss": 0.5425,
"step": 25260
},
{
"epoch": 0.4752918283365491,
"grad_norm": 0.7109375,
"learning_rate": 9.936634333625407e-06,
"loss": 0.5603,
"step": 25280
},
{
"epoch": 0.47566785035263814,
"grad_norm": 0.58203125,
"learning_rate": 9.936473173645012e-06,
"loss": 0.5467,
"step": 25300
},
{
"epoch": 0.4760438723687272,
"grad_norm": 0.578125,
"learning_rate": 9.936311810293322e-06,
"loss": 0.5531,
"step": 25320
},
{
"epoch": 0.47641989438481624,
"grad_norm": 0.55859375,
"learning_rate": 9.93615024357699e-06,
"loss": 0.5522,
"step": 25340
},
{
"epoch": 0.4767959164009053,
"grad_norm": 0.5546875,
"learning_rate": 9.935988473502671e-06,
"loss": 0.5442,
"step": 25360
},
{
"epoch": 0.4771719384169943,
"grad_norm": 0.5859375,
"learning_rate": 9.935826500077029e-06,
"loss": 0.5484,
"step": 25380
},
{
"epoch": 0.47754796043308334,
"grad_norm": 0.71484375,
"learning_rate": 9.935664323306737e-06,
"loss": 0.5355,
"step": 25400
},
{
"epoch": 0.4779239824491724,
"grad_norm": 0.6484375,
"learning_rate": 9.935501943198478e-06,
"loss": 0.5461,
"step": 25420
},
{
"epoch": 0.47830000446526144,
"grad_norm": 0.66796875,
"learning_rate": 9.935339359758938e-06,
"loss": 0.5574,
"step": 25440
},
{
"epoch": 0.4786760264813505,
"grad_norm": 0.66796875,
"learning_rate": 9.935176572994816e-06,
"loss": 0.5446,
"step": 25460
},
{
"epoch": 0.47905204849743954,
"grad_norm": 0.578125,
"learning_rate": 9.935013582912822e-06,
"loss": 0.5464,
"step": 25480
},
{
"epoch": 0.4794280705135286,
"grad_norm": 0.58984375,
"learning_rate": 9.934850389519666e-06,
"loss": 0.556,
"step": 25500
},
{
"epoch": 0.4798040925296176,
"grad_norm": 0.66796875,
"learning_rate": 9.934686992822076e-06,
"loss": 0.5436,
"step": 25520
},
{
"epoch": 0.48018011454570664,
"grad_norm": 0.6171875,
"learning_rate": 9.93452339282678e-06,
"loss": 0.5428,
"step": 25540
},
{
"epoch": 0.4805561365617957,
"grad_norm": 0.5859375,
"learning_rate": 9.934359589540519e-06,
"loss": 0.5511,
"step": 25560
},
{
"epoch": 0.48093215857788474,
"grad_norm": 0.6171875,
"learning_rate": 9.934195582970042e-06,
"loss": 0.5386,
"step": 25580
},
{
"epoch": 0.4813081805939738,
"grad_norm": 0.67578125,
"learning_rate": 9.934031373122104e-06,
"loss": 0.5477,
"step": 25600
},
{
"epoch": 0.48168420261006284,
"grad_norm": 0.62109375,
"learning_rate": 9.933866960003471e-06,
"loss": 0.5394,
"step": 25620
},
{
"epoch": 0.48206022462615183,
"grad_norm": 0.515625,
"learning_rate": 9.933702343620917e-06,
"loss": 0.5408,
"step": 25640
},
{
"epoch": 0.4824362466422409,
"grad_norm": 0.58203125,
"learning_rate": 9.933537523981226e-06,
"loss": 0.5506,
"step": 25660
},
{
"epoch": 0.48281226865832994,
"grad_norm": 0.5546875,
"learning_rate": 9.933372501091182e-06,
"loss": 0.5436,
"step": 25680
},
{
"epoch": 0.483188290674419,
"grad_norm": 0.5390625,
"learning_rate": 9.933207274957588e-06,
"loss": 0.5479,
"step": 25700
},
{
"epoch": 0.48356431269050804,
"grad_norm": 0.81640625,
"learning_rate": 9.93304184558725e-06,
"loss": 0.5432,
"step": 25720
},
{
"epoch": 0.4839403347065971,
"grad_norm": 0.640625,
"learning_rate": 9.932876212986984e-06,
"loss": 0.5365,
"step": 25740
},
{
"epoch": 0.48431635672268614,
"grad_norm": 0.546875,
"learning_rate": 9.932710377163612e-06,
"loss": 0.558,
"step": 25760
},
{
"epoch": 0.48469237873877513,
"grad_norm": 0.76953125,
"learning_rate": 9.932544338123969e-06,
"loss": 0.5381,
"step": 25780
},
{
"epoch": 0.4850684007548642,
"grad_norm": 0.6015625,
"learning_rate": 9.932378095874893e-06,
"loss": 0.5481,
"step": 25800
},
{
"epoch": 0.48544442277095323,
"grad_norm": 0.61328125,
"learning_rate": 9.932211650423234e-06,
"loss": 0.5428,
"step": 25820
},
{
"epoch": 0.4858204447870423,
"grad_norm": 0.59765625,
"learning_rate": 9.932045001775846e-06,
"loss": 0.5462,
"step": 25840
},
{
"epoch": 0.48619646680313133,
"grad_norm": 0.5703125,
"learning_rate": 9.9318781499396e-06,
"loss": 0.5425,
"step": 25860
},
{
"epoch": 0.4865724888192204,
"grad_norm": 0.65625,
"learning_rate": 9.931711094921363e-06,
"loss": 0.5506,
"step": 25880
},
{
"epoch": 0.48694851083530943,
"grad_norm": 0.73046875,
"learning_rate": 9.931543836728025e-06,
"loss": 0.5545,
"step": 25900
},
{
"epoch": 0.48732453285139843,
"grad_norm": 0.66796875,
"learning_rate": 9.931376375366471e-06,
"loss": 0.5557,
"step": 25920
},
{
"epoch": 0.4877005548674875,
"grad_norm": 0.5234375,
"learning_rate": 9.931208710843603e-06,
"loss": 0.5373,
"step": 25940
},
{
"epoch": 0.48807657688357653,
"grad_norm": 0.59375,
"learning_rate": 9.931040843166326e-06,
"loss": 0.5354,
"step": 25960
},
{
"epoch": 0.4884525988996656,
"grad_norm": 0.62890625,
"learning_rate": 9.930872772341558e-06,
"loss": 0.553,
"step": 25980
},
{
"epoch": 0.48882862091575463,
"grad_norm": 0.609375,
"learning_rate": 9.930704498376223e-06,
"loss": 0.5476,
"step": 26000
},
{
"epoch": 0.4892046429318437,
"grad_norm": 0.6328125,
"learning_rate": 9.93053602127725e-06,
"loss": 0.5514,
"step": 26020
},
{
"epoch": 0.4895806649479327,
"grad_norm": 0.59765625,
"learning_rate": 9.930367341051586e-06,
"loss": 0.5403,
"step": 26040
},
{
"epoch": 0.4899566869640217,
"grad_norm": 0.6328125,
"learning_rate": 9.930198457706176e-06,
"loss": 0.5484,
"step": 26060
},
{
"epoch": 0.4903327089801108,
"grad_norm": 0.64453125,
"learning_rate": 9.930029371247975e-06,
"loss": 0.5646,
"step": 26080
},
{
"epoch": 0.4907087309961998,
"grad_norm": 0.67578125,
"learning_rate": 9.929860081683954e-06,
"loss": 0.5528,
"step": 26100
},
{
"epoch": 0.4910847530122889,
"grad_norm": 0.55078125,
"learning_rate": 9.929690589021087e-06,
"loss": 0.5439,
"step": 26120
},
{
"epoch": 0.4914607750283779,
"grad_norm": 0.64453125,
"learning_rate": 9.929520893266355e-06,
"loss": 0.5472,
"step": 26140
},
{
"epoch": 0.491836797044467,
"grad_norm": 0.70703125,
"learning_rate": 9.929350994426751e-06,
"loss": 0.5466,
"step": 26160
},
{
"epoch": 0.492212819060556,
"grad_norm": 0.66796875,
"learning_rate": 9.929180892509272e-06,
"loss": 0.541,
"step": 26180
},
{
"epoch": 0.492588841076645,
"grad_norm": 0.62109375,
"learning_rate": 9.929010587520926e-06,
"loss": 0.5494,
"step": 26200
},
{
"epoch": 0.4929648630927341,
"grad_norm": 0.60546875,
"learning_rate": 9.92884007946873e-06,
"loss": 0.5538,
"step": 26220
},
{
"epoch": 0.4933408851088231,
"grad_norm": 0.76953125,
"learning_rate": 9.928669368359706e-06,
"loss": 0.5601,
"step": 26240
},
{
"epoch": 0.4937169071249122,
"grad_norm": 0.5859375,
"learning_rate": 9.928498454200894e-06,
"loss": 0.5486,
"step": 26260
},
{
"epoch": 0.4940929291410012,
"grad_norm": 0.5859375,
"learning_rate": 9.928327336999329e-06,
"loss": 0.5432,
"step": 26280
},
{
"epoch": 0.4944689511570903,
"grad_norm": 0.6171875,
"learning_rate": 9.928156016762061e-06,
"loss": 0.5413,
"step": 26300
},
{
"epoch": 0.49484497317317927,
"grad_norm": 0.609375,
"learning_rate": 9.92798449349615e-06,
"loss": 0.551,
"step": 26320
},
{
"epoch": 0.4952209951892683,
"grad_norm": 0.68359375,
"learning_rate": 9.927812767208662e-06,
"loss": 0.5532,
"step": 26340
},
{
"epoch": 0.49559701720535737,
"grad_norm": 0.5859375,
"learning_rate": 9.92764083790667e-06,
"loss": 0.5397,
"step": 26360
},
{
"epoch": 0.4959730392214464,
"grad_norm": 0.6328125,
"learning_rate": 9.927468705597258e-06,
"loss": 0.548,
"step": 26380
},
{
"epoch": 0.49634906123753547,
"grad_norm": 0.60546875,
"learning_rate": 9.92729637028752e-06,
"loss": 0.5562,
"step": 26400
},
{
"epoch": 0.4967250832536245,
"grad_norm": 0.60546875,
"learning_rate": 9.927123831984553e-06,
"loss": 0.538,
"step": 26420
},
{
"epoch": 0.4971011052697135,
"grad_norm": 0.66796875,
"learning_rate": 9.926951090695466e-06,
"loss": 0.553,
"step": 26440
},
{
"epoch": 0.49747712728580257,
"grad_norm": 0.578125,
"learning_rate": 9.926778146427374e-06,
"loss": 0.552,
"step": 26460
},
{
"epoch": 0.4978531493018916,
"grad_norm": 0.59765625,
"learning_rate": 9.926604999187405e-06,
"loss": 0.5408,
"step": 26480
},
{
"epoch": 0.49822917131798067,
"grad_norm": 0.671875,
"learning_rate": 9.92643164898269e-06,
"loss": 0.549,
"step": 26500
},
{
"epoch": 0.4986051933340697,
"grad_norm": 0.58984375,
"learning_rate": 9.926258095820372e-06,
"loss": 0.54,
"step": 26520
},
{
"epoch": 0.49898121535015877,
"grad_norm": 0.60546875,
"learning_rate": 9.9260843397076e-06,
"loss": 0.545,
"step": 26540
},
{
"epoch": 0.4993572373662478,
"grad_norm": 0.60546875,
"learning_rate": 9.925910380651531e-06,
"loss": 0.5392,
"step": 26560
},
{
"epoch": 0.4997332593823368,
"grad_norm": 0.625,
"learning_rate": 9.925736218659333e-06,
"loss": 0.5557,
"step": 26580
}
],
"logging_steps": 20,
"max_steps": 319134,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 13297,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.9989602005747545e+20,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}