{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.07228886138917137,
"eval_steps": 369,
"global_step": 1475,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 4.900939755198059e-05,
"grad_norm": 0.5761741399765015,
"learning_rate": 2e-05,
"loss": 1.0427,
"step": 1
},
{
"epoch": 4.900939755198059e-05,
"eval_loss": NaN,
"eval_runtime": 184.6812,
"eval_samples_per_second": 46.523,
"eval_steps_per_second": 23.262,
"step": 1
},
{
"epoch": 9.801879510396118e-05,
"grad_norm": 0.9498972296714783,
"learning_rate": 4e-05,
"loss": 1.2518,
"step": 2
},
{
"epoch": 0.00014702819265594177,
"grad_norm": 0.7824539542198181,
"learning_rate": 6e-05,
"loss": 1.1888,
"step": 3
},
{
"epoch": 0.00019603759020792236,
"grad_norm": 0.6423453688621521,
"learning_rate": 8e-05,
"loss": 1.0259,
"step": 4
},
{
"epoch": 0.00024504698775990296,
"grad_norm": 0.637169361114502,
"learning_rate": 0.0001,
"loss": 1.0737,
"step": 5
},
{
"epoch": 0.00029405638531188353,
"grad_norm": 0.8547672033309937,
"learning_rate": 0.00012,
"loss": 1.2056,
"step": 6
},
{
"epoch": 0.00034306578286386416,
"grad_norm": 0.8592035174369812,
"learning_rate": 0.00014,
"loss": 1.2962,
"step": 7
},
{
"epoch": 0.00039207518041584473,
"grad_norm": 0.8024699091911316,
"learning_rate": 0.00016,
"loss": 1.2049,
"step": 8
},
{
"epoch": 0.00044108457796782535,
"grad_norm": 0.7319013476371765,
"learning_rate": 0.00018,
"loss": 1.0241,
"step": 9
},
{
"epoch": 0.0004900939755198059,
"grad_norm": 0.7202540040016174,
"learning_rate": 0.0002,
"loss": 1.2143,
"step": 10
},
{
"epoch": 0.0005391033730717865,
"grad_norm": 0.7399929165840149,
"learning_rate": 0.00019999977007069113,
"loss": 1.0112,
"step": 11
},
{
"epoch": 0.0005881127706237671,
"grad_norm": 0.6700228452682495,
"learning_rate": 0.00019999908028382185,
"loss": 1.1158,
"step": 12
},
{
"epoch": 0.0006371221681757477,
"grad_norm": 0.7994217872619629,
"learning_rate": 0.0001999979306425642,
"loss": 1.1398,
"step": 13
},
{
"epoch": 0.0006861315657277283,
"grad_norm": 0.6176120042800903,
"learning_rate": 0.00019999632115220493,
"loss": 1.0548,
"step": 14
},
{
"epoch": 0.0007351409632797088,
"grad_norm": 0.7442915439605713,
"learning_rate": 0.0001999942518201454,
"loss": 1.0992,
"step": 15
},
{
"epoch": 0.0007841503608316895,
"grad_norm": 0.7436463832855225,
"learning_rate": 0.0001999917226559016,
"loss": 1.2715,
"step": 16
},
{
"epoch": 0.0008331597583836701,
"grad_norm": 0.654746949672699,
"learning_rate": 0.00019998873367110416,
"loss": 1.1128,
"step": 17
},
{
"epoch": 0.0008821691559356507,
"grad_norm": 0.6541568040847778,
"learning_rate": 0.00019998528487949813,
"loss": 1.0187,
"step": 18
},
{
"epoch": 0.0009311785534876312,
"grad_norm": 0.5537770986557007,
"learning_rate": 0.00019998137629694308,
"loss": 1.0205,
"step": 19
},
{
"epoch": 0.0009801879510396118,
"grad_norm": 0.7401204109191895,
"learning_rate": 0.000199977007941413,
"loss": 1.1563,
"step": 20
},
{
"epoch": 0.0010291973485915924,
"grad_norm": 0.6118784546852112,
"learning_rate": 0.0001999721798329961,
"loss": 1.0828,
"step": 21
},
{
"epoch": 0.001078206746143573,
"grad_norm": 0.8245735764503479,
"learning_rate": 0.0001999668919938949,
"loss": 1.2984,
"step": 22
},
{
"epoch": 0.0011272161436955536,
"grad_norm": 0.5586501955986023,
"learning_rate": 0.00019996114444842595,
"loss": 0.9791,
"step": 23
},
{
"epoch": 0.0011762255412475341,
"grad_norm": 0.6310902833938599,
"learning_rate": 0.00019995493722301989,
"loss": 1.0142,
"step": 24
},
{
"epoch": 0.0012252349387995149,
"grad_norm": 0.5360428094863892,
"learning_rate": 0.0001999482703462211,
"loss": 1.1159,
"step": 25
},
{
"epoch": 0.0012742443363514954,
"grad_norm": 0.616188108921051,
"learning_rate": 0.00019994114384868782,
"loss": 0.9876,
"step": 26
},
{
"epoch": 0.001323253733903476,
"grad_norm": 0.7839058041572571,
"learning_rate": 0.00019993355776319193,
"loss": 1.0909,
"step": 27
},
{
"epoch": 0.0013722631314554566,
"grad_norm": 0.5597085356712341,
"learning_rate": 0.00019992551212461856,
"loss": 1.0243,
"step": 28
},
{
"epoch": 0.0014212725290074371,
"grad_norm": 0.5495719313621521,
"learning_rate": 0.00019991700696996638,
"loss": 1.1049,
"step": 29
},
{
"epoch": 0.0014702819265594177,
"grad_norm": 0.5567327737808228,
"learning_rate": 0.00019990804233834705,
"loss": 1.0723,
"step": 30
},
{
"epoch": 0.0015192913241113984,
"grad_norm": 0.7199801206588745,
"learning_rate": 0.00019989861827098522,
"loss": 1.1713,
"step": 31
},
{
"epoch": 0.001568300721663379,
"grad_norm": 0.5128418207168579,
"learning_rate": 0.0001998887348112182,
"loss": 1.1,
"step": 32
},
{
"epoch": 0.0016173101192153596,
"grad_norm": 0.4268779456615448,
"learning_rate": 0.00019987839200449602,
"loss": 0.954,
"step": 33
},
{
"epoch": 0.0016663195167673402,
"grad_norm": 0.4875427186489105,
"learning_rate": 0.00019986758989838093,
"loss": 0.8048,
"step": 34
},
{
"epoch": 0.0017153289143193207,
"grad_norm": 0.5128729343414307,
"learning_rate": 0.00019985632854254735,
"loss": 1.0232,
"step": 35
},
{
"epoch": 0.0017643383118713014,
"grad_norm": 0.5712106823921204,
"learning_rate": 0.0001998446079887816,
"loss": 1.0229,
"step": 36
},
{
"epoch": 0.001813347709423282,
"grad_norm": 0.6023369431495667,
"learning_rate": 0.00019983242829098164,
"loss": 1.143,
"step": 37
},
{
"epoch": 0.0018623571069752624,
"grad_norm": 0.5086866617202759,
"learning_rate": 0.00019981978950515687,
"loss": 0.9654,
"step": 38
},
{
"epoch": 0.0019113665045272432,
"grad_norm": 0.49370044469833374,
"learning_rate": 0.00019980669168942784,
"loss": 1.1224,
"step": 39
},
{
"epoch": 0.0019603759020792237,
"grad_norm": 0.5668991804122925,
"learning_rate": 0.00019979313490402597,
"loss": 0.935,
"step": 40
},
{
"epoch": 0.002009385299631204,
"grad_norm": 0.44917380809783936,
"learning_rate": 0.00019977911921129332,
"loss": 1.0326,
"step": 41
},
{
"epoch": 0.0020583946971831847,
"grad_norm": 0.4713689386844635,
"learning_rate": 0.00019976464467568226,
"loss": 0.8148,
"step": 42
},
{
"epoch": 0.0021074040947351657,
"grad_norm": 0.8936280012130737,
"learning_rate": 0.0001997497113637552,
"loss": 1.0798,
"step": 43
},
{
"epoch": 0.002156413492287146,
"grad_norm": 0.5882841348648071,
"learning_rate": 0.0001997343193441842,
"loss": 1.0313,
"step": 44
},
{
"epoch": 0.0022054228898391267,
"grad_norm": 0.5324700474739075,
"learning_rate": 0.0001997184686877509,
"loss": 1.0233,
"step": 45
},
{
"epoch": 0.0022544322873911072,
"grad_norm": 0.43486493825912476,
"learning_rate": 0.00019970215946734583,
"loss": 1.0024,
"step": 46
},
{
"epoch": 0.0023034416849430877,
"grad_norm": 0.6185562014579773,
"learning_rate": 0.00019968539175796833,
"loss": 1.0328,
"step": 47
},
{
"epoch": 0.0023524510824950683,
"grad_norm": 0.5140308141708374,
"learning_rate": 0.00019966816563672622,
"loss": 1.0239,
"step": 48
},
{
"epoch": 0.002401460480047049,
"grad_norm": 0.5439414978027344,
"learning_rate": 0.00019965048118283525,
"loss": 0.9122,
"step": 49
},
{
"epoch": 0.0024504698775990297,
"grad_norm": 0.6618689298629761,
"learning_rate": 0.00019963233847761894,
"loss": 1.0345,
"step": 50
},
{
"epoch": 0.0024994792751510102,
"grad_norm": 0.5437299609184265,
"learning_rate": 0.00019961373760450806,
"loss": 1.0961,
"step": 51
},
{
"epoch": 0.0025484886727029908,
"grad_norm": 0.49915456771850586,
"learning_rate": 0.00019959467864904035,
"loss": 1.1166,
"step": 52
},
{
"epoch": 0.0025974980702549713,
"grad_norm": 0.6561310887336731,
"learning_rate": 0.00019957516169886007,
"loss": 1.0409,
"step": 53
},
{
"epoch": 0.002646507467806952,
"grad_norm": 0.4919995665550232,
"learning_rate": 0.00019955518684371752,
"loss": 1.0823,
"step": 54
},
{
"epoch": 0.0026955168653589327,
"grad_norm": 0.5455455780029297,
"learning_rate": 0.0001995347541754689,
"loss": 0.9638,
"step": 55
},
{
"epoch": 0.0027445262629109133,
"grad_norm": 0.5481426119804382,
"learning_rate": 0.0001995138637880755,
"loss": 0.9245,
"step": 56
},
{
"epoch": 0.0027935356604628938,
"grad_norm": 0.8154264092445374,
"learning_rate": 0.0001994925157776036,
"loss": 1.0783,
"step": 57
},
{
"epoch": 0.0028425450580148743,
"grad_norm": 0.8507540225982666,
"learning_rate": 0.00019947071024222393,
"loss": 0.9816,
"step": 58
},
{
"epoch": 0.002891554455566855,
"grad_norm": 0.5288252830505371,
"learning_rate": 0.00019944844728221104,
"loss": 0.9762,
"step": 59
},
{
"epoch": 0.0029405638531188353,
"grad_norm": 0.5518356561660767,
"learning_rate": 0.0001994257269999431,
"loss": 1.0246,
"step": 60
},
{
"epoch": 0.0029895732506708163,
"grad_norm": 0.562601625919342,
"learning_rate": 0.0001994025494999013,
"loss": 1.0615,
"step": 61
},
{
"epoch": 0.003038582648222797,
"grad_norm": 0.6437848806381226,
"learning_rate": 0.00019937891488866935,
"loss": 1.2619,
"step": 62
},
{
"epoch": 0.0030875920457747773,
"grad_norm": 0.4969969391822815,
"learning_rate": 0.00019935482327493306,
"loss": 1.0224,
"step": 63
},
{
"epoch": 0.003136601443326758,
"grad_norm": 0.4657133221626282,
"learning_rate": 0.00019933027476947977,
"loss": 0.9961,
"step": 64
},
{
"epoch": 0.0031856108408787383,
"grad_norm": 0.4991621673107147,
"learning_rate": 0.00019930526948519793,
"loss": 1.0202,
"step": 65
},
{
"epoch": 0.0032346202384307193,
"grad_norm": 0.5264037251472473,
"learning_rate": 0.00019927980753707645,
"loss": 0.9748,
"step": 66
},
{
"epoch": 0.0032836296359827,
"grad_norm": 0.6157549619674683,
"learning_rate": 0.00019925388904220434,
"loss": 0.861,
"step": 67
},
{
"epoch": 0.0033326390335346803,
"grad_norm": 0.5480204224586487,
"learning_rate": 0.00019922751411977,
"loss": 0.9667,
"step": 68
},
{
"epoch": 0.003381648431086661,
"grad_norm": 0.4597250521183014,
"learning_rate": 0.00019920068289106083,
"loss": 1.0202,
"step": 69
},
{
"epoch": 0.0034306578286386414,
"grad_norm": 0.6237716674804688,
"learning_rate": 0.00019917339547946246,
"loss": 1.1458,
"step": 70
},
{
"epoch": 0.003479667226190622,
"grad_norm": 0.4779433608055115,
"learning_rate": 0.00019914565201045853,
"loss": 0.967,
"step": 71
},
{
"epoch": 0.003528676623742603,
"grad_norm": 0.5889516472816467,
"learning_rate": 0.00019911745261162964,
"loss": 1.0123,
"step": 72
},
{
"epoch": 0.0035776860212945833,
"grad_norm": 0.427267849445343,
"learning_rate": 0.00019908879741265323,
"loss": 0.9696,
"step": 73
},
{
"epoch": 0.003626695418846564,
"grad_norm": 0.5856832265853882,
"learning_rate": 0.00019905968654530272,
"loss": 1.1239,
"step": 74
},
{
"epoch": 0.0036757048163985444,
"grad_norm": 0.5578678846359253,
"learning_rate": 0.00019903012014344686,
"loss": 0.9191,
"step": 75
},
{
"epoch": 0.003724714213950525,
"grad_norm": 0.4885029196739197,
"learning_rate": 0.00019900009834304937,
"loss": 0.9503,
"step": 76
},
{
"epoch": 0.0037737236115025054,
"grad_norm": 0.7779492735862732,
"learning_rate": 0.00019896962128216809,
"loss": 1.2377,
"step": 77
},
{
"epoch": 0.0038227330090544864,
"grad_norm": 0.4959696829319,
"learning_rate": 0.00019893868910095437,
"loss": 0.9475,
"step": 78
},
{
"epoch": 0.003871742406606467,
"grad_norm": 0.5338016152381897,
"learning_rate": 0.0001989073019416525,
"loss": 1.0817,
"step": 79
},
{
"epoch": 0.003920751804158447,
"grad_norm": 0.6674442887306213,
"learning_rate": 0.0001988754599485991,
"loss": 0.9813,
"step": 80
},
{
"epoch": 0.003969761201710428,
"grad_norm": 0.6323183178901672,
"learning_rate": 0.00019884316326822225,
"loss": 1.1776,
"step": 81
},
{
"epoch": 0.004018770599262408,
"grad_norm": 0.542637050151825,
"learning_rate": 0.00019881041204904105,
"loss": 1.0185,
"step": 82
},
{
"epoch": 0.004067779996814389,
"grad_norm": 0.5368381142616272,
"learning_rate": 0.0001987772064416648,
"loss": 1.0672,
"step": 83
},
{
"epoch": 0.0041167893943663695,
"grad_norm": 0.5859797596931458,
"learning_rate": 0.00019874354659879238,
"loss": 0.9016,
"step": 84
},
{
"epoch": 0.00416579879191835,
"grad_norm": 0.7060731053352356,
"learning_rate": 0.00019870943267521145,
"loss": 1.0842,
"step": 85
},
{
"epoch": 0.004214808189470331,
"grad_norm": 0.5956705212593079,
"learning_rate": 0.0001986748648277978,
"loss": 1.1822,
"step": 86
},
{
"epoch": 0.0042638175870223114,
"grad_norm": 0.7368344068527222,
"learning_rate": 0.0001986398432155147,
"loss": 1.0949,
"step": 87
},
{
"epoch": 0.004312826984574292,
"grad_norm": 0.6101769804954529,
"learning_rate": 0.00019860436799941201,
"loss": 0.9986,
"step": 88
},
{
"epoch": 0.0043618363821262725,
"grad_norm": 0.6834807991981506,
"learning_rate": 0.00019856843934262563,
"loss": 1.0497,
"step": 89
},
{
"epoch": 0.004410845779678253,
"grad_norm": 0.6897183060646057,
"learning_rate": 0.00019853205741037652,
"loss": 0.7885,
"step": 90
},
{
"epoch": 0.0044598551772302335,
"grad_norm": 0.5369214415550232,
"learning_rate": 0.00019849522236997018,
"loss": 1.0458,
"step": 91
},
{
"epoch": 0.0045088645747822145,
"grad_norm": 0.6021403670310974,
"learning_rate": 0.00019845793439079567,
"loss": 0.9317,
"step": 92
},
{
"epoch": 0.004557873972334195,
"grad_norm": 0.7959415316581726,
"learning_rate": 0.00019842019364432504,
"loss": 1.0009,
"step": 93
},
{
"epoch": 0.0046068833698861755,
"grad_norm": 0.4632684290409088,
"learning_rate": 0.00019838200030411227,
"loss": 1.0488,
"step": 94
},
{
"epoch": 0.0046558927674381564,
"grad_norm": 0.6845225095748901,
"learning_rate": 0.0001983433545457928,
"loss": 1.0982,
"step": 95
},
{
"epoch": 0.0047049021649901365,
"grad_norm": 0.5301809906959534,
"learning_rate": 0.00019830425654708246,
"loss": 1.0651,
"step": 96
},
{
"epoch": 0.0047539115625421175,
"grad_norm": 0.6478580832481384,
"learning_rate": 0.0001982647064877768,
"loss": 1.0767,
"step": 97
},
{
"epoch": 0.004802920960094098,
"grad_norm": 0.6522680521011353,
"learning_rate": 0.00019822470454975013,
"loss": 0.9553,
"step": 98
},
{
"epoch": 0.0048519303576460785,
"grad_norm": 0.7917822599411011,
"learning_rate": 0.00019818425091695481,
"loss": 1.0848,
"step": 99
},
{
"epoch": 0.0049009397551980595,
"grad_norm": 0.6298443078994751,
"learning_rate": 0.00019814334577542038,
"loss": 1.1411,
"step": 100
},
{
"epoch": 0.0049499491527500395,
"grad_norm": 0.5504825115203857,
"learning_rate": 0.00019810198931325266,
"loss": 1.1046,
"step": 101
},
{
"epoch": 0.0049989585503020205,
"grad_norm": 0.69349604845047,
"learning_rate": 0.00019806018172063288,
"loss": 1.0621,
"step": 102
},
{
"epoch": 0.005047967947854001,
"grad_norm": 0.6714924573898315,
"learning_rate": 0.00019801792318981687,
"loss": 1.1085,
"step": 103
},
{
"epoch": 0.0050969773454059815,
"grad_norm": 0.5743169784545898,
"learning_rate": 0.00019797521391513412,
"loss": 0.9405,
"step": 104
},
{
"epoch": 0.0051459867429579625,
"grad_norm": 0.5487734079360962,
"learning_rate": 0.00019793205409298693,
"loss": 1.0558,
"step": 105
},
{
"epoch": 0.0051949961405099426,
"grad_norm": 0.9174990057945251,
"learning_rate": 0.00019788844392184943,
"loss": 1.2962,
"step": 106
},
{
"epoch": 0.0052440055380619235,
"grad_norm": 0.6920627355575562,
"learning_rate": 0.0001978443836022668,
"loss": 1.0936,
"step": 107
},
{
"epoch": 0.005293014935613904,
"grad_norm": 0.5456178784370422,
"learning_rate": 0.0001977998733368542,
"loss": 1.0819,
"step": 108
},
{
"epoch": 0.0053420243331658845,
"grad_norm": 0.4566207528114319,
"learning_rate": 0.00019775491333029588,
"loss": 0.9396,
"step": 109
},
{
"epoch": 0.0053910337307178655,
"grad_norm": 0.48873913288116455,
"learning_rate": 0.00019770950378934435,
"loss": 1.0805,
"step": 110
},
{
"epoch": 0.005440043128269846,
"grad_norm": 0.5160651803016663,
"learning_rate": 0.00019766364492281924,
"loss": 1.078,
"step": 111
},
{
"epoch": 0.0054890525258218265,
"grad_norm": 0.4412589967250824,
"learning_rate": 0.00019761733694160656,
"loss": 1.0189,
"step": 112
},
{
"epoch": 0.005538061923373807,
"grad_norm": 0.5093545913696289,
"learning_rate": 0.00019757058005865754,
"loss": 1.0374,
"step": 113
},
{
"epoch": 0.0055870713209257876,
"grad_norm": 0.546847403049469,
"learning_rate": 0.0001975233744889877,
"loss": 1.0663,
"step": 114
},
{
"epoch": 0.0056360807184777685,
"grad_norm": 0.5874338746070862,
"learning_rate": 0.000197475720449676,
"loss": 1.0858,
"step": 115
},
{
"epoch": 0.005685090116029749,
"grad_norm": 0.478515625,
"learning_rate": 0.00019742761815986354,
"loss": 0.895,
"step": 116
},
{
"epoch": 0.0057340995135817295,
"grad_norm": 0.55860435962677,
"learning_rate": 0.00019737906784075292,
"loss": 0.986,
"step": 117
},
{
"epoch": 0.00578310891113371,
"grad_norm": 0.7575963735580444,
"learning_rate": 0.00019733006971560694,
"loss": 1.1348,
"step": 118
},
{
"epoch": 0.005832118308685691,
"grad_norm": 0.5981109738349915,
"learning_rate": 0.00019728062400974775,
"loss": 1.1434,
"step": 119
},
{
"epoch": 0.005881127706237671,
"grad_norm": 0.5200974941253662,
"learning_rate": 0.00019723073095055557,
"loss": 1.0091,
"step": 120
},
{
"epoch": 0.005930137103789652,
"grad_norm": 0.4704555571079254,
"learning_rate": 0.00019718039076746809,
"loss": 0.9529,
"step": 121
},
{
"epoch": 0.0059791465013416326,
"grad_norm": 0.5887392163276672,
"learning_rate": 0.00019712960369197883,
"loss": 1.0724,
"step": 122
},
{
"epoch": 0.006028155898893613,
"grad_norm": 0.6547783613204956,
"learning_rate": 0.00019707836995763663,
"loss": 1.0877,
"step": 123
},
{
"epoch": 0.006077165296445594,
"grad_norm": 0.5115367770195007,
"learning_rate": 0.00019702668980004423,
"loss": 0.8939,
"step": 124
},
{
"epoch": 0.006126174693997574,
"grad_norm": 0.652423620223999,
"learning_rate": 0.0001969745634568572,
"loss": 1.109,
"step": 125
},
{
"epoch": 0.006175184091549555,
"grad_norm": 0.48406100273132324,
"learning_rate": 0.00019692199116778315,
"loss": 1.0766,
"step": 126
},
{
"epoch": 0.006224193489101536,
"grad_norm": 0.6201152801513672,
"learning_rate": 0.00019686897317458015,
"loss": 1.0048,
"step": 127
},
{
"epoch": 0.006273202886653516,
"grad_norm": 0.5421327948570251,
"learning_rate": 0.0001968155097210561,
"loss": 1.1253,
"step": 128
},
{
"epoch": 0.006322212284205497,
"grad_norm": 0.4736417233943939,
"learning_rate": 0.00019676160105306728,
"loss": 0.9182,
"step": 129
},
{
"epoch": 0.006371221681757477,
"grad_norm": 0.5314813852310181,
"learning_rate": 0.00019670724741851735,
"loss": 1.0786,
"step": 130
},
{
"epoch": 0.006420231079309458,
"grad_norm": 0.7178956866264343,
"learning_rate": 0.00019665244906735618,
"loss": 1.1229,
"step": 131
},
{
"epoch": 0.006469240476861439,
"grad_norm": 0.7294998168945312,
"learning_rate": 0.00019659720625157868,
"loss": 0.9181,
"step": 132
},
{
"epoch": 0.006518249874413419,
"grad_norm": 0.49370306730270386,
"learning_rate": 0.00019654151922522374,
"loss": 0.9575,
"step": 133
},
{
"epoch": 0.0065672592719654,
"grad_norm": 0.6981483697891235,
"learning_rate": 0.00019648538824437292,
"loss": 1.039,
"step": 134
},
{
"epoch": 0.00661626866951738,
"grad_norm": 0.5041336417198181,
"learning_rate": 0.0001964288135671494,
"loss": 1.0359,
"step": 135
},
{
"epoch": 0.006665278067069361,
"grad_norm": 0.6436519026756287,
"learning_rate": 0.00019637179545371666,
"loss": 1.3036,
"step": 136
},
{
"epoch": 0.006714287464621341,
"grad_norm": 0.7052600979804993,
"learning_rate": 0.00019631433416627747,
"loss": 1.1122,
"step": 137
},
{
"epoch": 0.006763296862173322,
"grad_norm": 0.7348969578742981,
"learning_rate": 0.00019625642996907248,
"loss": 0.9007,
"step": 138
},
{
"epoch": 0.006812306259725303,
"grad_norm": 0.46670839190483093,
"learning_rate": 0.00019619808312837912,
"loss": 0.9005,
"step": 139
},
{
"epoch": 0.006861315657277283,
"grad_norm": 0.5681964755058289,
"learning_rate": 0.0001961392939125104,
"loss": 0.9202,
"step": 140
},
{
"epoch": 0.006910325054829264,
"grad_norm": 0.5394182801246643,
"learning_rate": 0.00019608006259181355,
"loss": 1.0096,
"step": 141
},
{
"epoch": 0.006959334452381244,
"grad_norm": 0.48045870661735535,
"learning_rate": 0.00019602038943866896,
"loss": 0.9415,
"step": 142
},
{
"epoch": 0.007008343849933225,
"grad_norm": 0.5574181079864502,
"learning_rate": 0.0001959602747274887,
"loss": 0.9763,
"step": 143
},
{
"epoch": 0.007057353247485206,
"grad_norm": 0.6325384378433228,
"learning_rate": 0.00019589971873471552,
"loss": 0.8447,
"step": 144
},
{
"epoch": 0.007106362645037186,
"grad_norm": 0.5570932626724243,
"learning_rate": 0.00019583872173882129,
"loss": 1.0864,
"step": 145
},
{
"epoch": 0.007155372042589167,
"grad_norm": 0.5965400338172913,
"learning_rate": 0.00019577728402030603,
"loss": 1.0298,
"step": 146
},
{
"epoch": 0.007204381440141147,
"grad_norm": 0.6382545828819275,
"learning_rate": 0.00019571540586169633,
"loss": 1.1735,
"step": 147
},
{
"epoch": 0.007253390837693128,
"grad_norm": 0.5810325145721436,
"learning_rate": 0.0001956530875475443,
"loss": 1.0175,
"step": 148
},
{
"epoch": 0.007302400235245109,
"grad_norm": 0.5877084732055664,
"learning_rate": 0.000195590329364426,
"loss": 1.018,
"step": 149
},
{
"epoch": 0.007351409632797089,
"grad_norm": 0.6393083930015564,
"learning_rate": 0.00019552713160094038,
"loss": 1.0437,
"step": 150
},
{
"epoch": 0.00740041903034907,
"grad_norm": 0.551087498664856,
"learning_rate": 0.0001954634945477078,
"loss": 1.191,
"step": 151
},
{
"epoch": 0.00744942842790105,
"grad_norm": 0.5265794396400452,
"learning_rate": 0.00019539941849736875,
"loss": 1.0204,
"step": 152
},
{
"epoch": 0.007498437825453031,
"grad_norm": 0.537392795085907,
"learning_rate": 0.00019533490374458245,
"loss": 1.1164,
"step": 153
},
{
"epoch": 0.007547447223005011,
"grad_norm": 0.5640383958816528,
"learning_rate": 0.00019526995058602553,
"loss": 1.1857,
"step": 154
},
{
"epoch": 0.007596456620556992,
"grad_norm": 0.5242000818252563,
"learning_rate": 0.0001952045593203907,
"loss": 1.1145,
"step": 155
},
{
"epoch": 0.007645466018108973,
"grad_norm": 0.5312201976776123,
"learning_rate": 0.0001951387302483854,
"loss": 0.8864,
"step": 156
},
{
"epoch": 0.007694475415660953,
"grad_norm": 0.5541028380393982,
"learning_rate": 0.00019507246367273017,
"loss": 0.9166,
"step": 157
},
{
"epoch": 0.007743484813212934,
"grad_norm": 0.7059816122055054,
"learning_rate": 0.00019500575989815766,
"loss": 1.1794,
"step": 158
},
{
"epoch": 0.007792494210764914,
"grad_norm": 0.5818477272987366,
"learning_rate": 0.00019493861923141088,
"loss": 1.0268,
"step": 159
},
{
"epoch": 0.007841503608316895,
"grad_norm": 0.7610416412353516,
"learning_rate": 0.000194871041981242,
"loss": 1.1005,
"step": 160
},
{
"epoch": 0.007890513005868875,
"grad_norm": 0.5330606698989868,
"learning_rate": 0.0001948030284584108,
"loss": 1.017,
"step": 161
},
{
"epoch": 0.007939522403420857,
"grad_norm": 0.5154563188552856,
"learning_rate": 0.00019473457897568338,
"loss": 0.8678,
"step": 162
},
{
"epoch": 0.007988531800972837,
"grad_norm": 0.5452204346656799,
"learning_rate": 0.0001946656938478305,
"loss": 0.9008,
"step": 163
},
{
"epoch": 0.008037541198524817,
"grad_norm": 0.6915609836578369,
"learning_rate": 0.0001945963733916264,
"loss": 0.7523,
"step": 164
},
{
"epoch": 0.008086550596076797,
"grad_norm": 0.5153624415397644,
"learning_rate": 0.0001945266179258472,
"loss": 1.0295,
"step": 165
},
{
"epoch": 0.008135559993628779,
"grad_norm": 0.5068039298057556,
"learning_rate": 0.0001944564277712694,
"loss": 0.9964,
"step": 166
},
{
"epoch": 0.008184569391180759,
"grad_norm": 0.5240331888198853,
"learning_rate": 0.00019438580325066847,
"loss": 1.0435,
"step": 167
},
{
"epoch": 0.008233578788732739,
"grad_norm": 0.663153350353241,
"learning_rate": 0.00019431474468881735,
"loss": 1.0541,
"step": 168
},
{
"epoch": 0.00828258818628472,
"grad_norm": 0.5227854251861572,
"learning_rate": 0.00019424325241248496,
"loss": 1.095,
"step": 169
},
{
"epoch": 0.0083315975838367,
"grad_norm": 0.5788130164146423,
"learning_rate": 0.00019417132675043471,
"loss": 0.9356,
"step": 170
},
{
"epoch": 0.008380606981388681,
"grad_norm": 0.584823727607727,
"learning_rate": 0.00019409896803342292,
"loss": 0.949,
"step": 171
},
{
"epoch": 0.008429616378940663,
"grad_norm": 0.6930997371673584,
"learning_rate": 0.00019402617659419744,
"loss": 1.114,
"step": 172
},
{
"epoch": 0.008478625776492643,
"grad_norm": 0.6039778590202332,
"learning_rate": 0.00019395295276749592,
"loss": 0.9691,
"step": 173
},
{
"epoch": 0.008527635174044623,
"grad_norm": 0.786405086517334,
"learning_rate": 0.00019387929689004447,
"loss": 1.0854,
"step": 174
},
{
"epoch": 0.008576644571596603,
"grad_norm": 0.7498408555984497,
"learning_rate": 0.00019380520930055602,
"loss": 1.0526,
"step": 175
},
{
"epoch": 0.008625653969148585,
"grad_norm": 0.4368021488189697,
"learning_rate": 0.00019373069033972864,
"loss": 0.8813,
"step": 176
},
{
"epoch": 0.008674663366700565,
"grad_norm": 0.6928194165229797,
"learning_rate": 0.0001936557403502443,
"loss": 0.972,
"step": 177
},
{
"epoch": 0.008723672764252545,
"grad_norm": 0.6533752679824829,
"learning_rate": 0.0001935803596767669,
"loss": 0.9243,
"step": 178
},
{
"epoch": 0.008772682161804527,
"grad_norm": 0.6834339499473572,
"learning_rate": 0.00019350454866594105,
"loss": 0.9531,
"step": 179
},
{
"epoch": 0.008821691559356507,
"grad_norm": 0.665930986404419,
"learning_rate": 0.00019342830766639013,
"loss": 1.1487,
"step": 180
},
{
"epoch": 0.008870700956908487,
"grad_norm": 1.0465463399887085,
"learning_rate": 0.000193351637028715,
"loss": 1.4273,
"step": 181
},
{
"epoch": 0.008919710354460467,
"grad_norm": 0.4840773642063141,
"learning_rate": 0.00019327453710549214,
"loss": 0.9474,
"step": 182
},
{
"epoch": 0.008968719752012449,
"grad_norm": 0.674241304397583,
"learning_rate": 0.00019319700825127227,
"loss": 1.2863,
"step": 183
},
{
"epoch": 0.009017729149564429,
"grad_norm": 0.45641985535621643,
"learning_rate": 0.00019311905082257846,
"loss": 1.0501,
"step": 184
},
{
"epoch": 0.009066738547116409,
"grad_norm": 0.4727928042411804,
"learning_rate": 0.00019304066517790465,
"loss": 0.9775,
"step": 185
},
{
"epoch": 0.00911574794466839,
"grad_norm": 0.5751796960830688,
"learning_rate": 0.00019296185167771404,
"loss": 0.9788,
"step": 186
},
{
"epoch": 0.009164757342220371,
"grad_norm": 0.6331678628921509,
"learning_rate": 0.00019288261068443725,
"loss": 1.1389,
"step": 187
},
{
"epoch": 0.009213766739772351,
"grad_norm": 0.6849628686904907,
"learning_rate": 0.0001928029425624708,
"loss": 1.1108,
"step": 188
},
{
"epoch": 0.009262776137324333,
"grad_norm": 0.515921413898468,
"learning_rate": 0.0001927228476781755,
"loss": 1.1741,
"step": 189
},
{
"epoch": 0.009311785534876313,
"grad_norm": 0.7446777820587158,
"learning_rate": 0.0001926423263998745,
"loss": 1.2786,
"step": 190
},
{
"epoch": 0.009360794932428293,
"grad_norm": 0.5110065340995789,
"learning_rate": 0.00019256137909785185,
"loss": 1.142,
"step": 191
},
{
"epoch": 0.009409804329980273,
"grad_norm": 0.5118314623832703,
"learning_rate": 0.0001924800061443507,
"loss": 1.3642,
"step": 192
},
{
"epoch": 0.009458813727532255,
"grad_norm": 0.5356354117393494,
"learning_rate": 0.00019239820791357165,
"loss": 1.0135,
"step": 193
},
{
"epoch": 0.009507823125084235,
"grad_norm": 0.6616246700286865,
"learning_rate": 0.00019231598478167082,
"loss": 1.1266,
"step": 194
},
{
"epoch": 0.009556832522636215,
"grad_norm": 0.5914151072502136,
"learning_rate": 0.00019223333712675838,
"loss": 0.915,
"step": 195
},
{
"epoch": 0.009605841920188197,
"grad_norm": 0.745429277420044,
"learning_rate": 0.00019215026532889675,
"loss": 1.0138,
"step": 196
},
{
"epoch": 0.009654851317740177,
"grad_norm": 0.6249227523803711,
"learning_rate": 0.00019206676977009872,
"loss": 1.0224,
"step": 197
},
{
"epoch": 0.009703860715292157,
"grad_norm": 0.7981612086296082,
"learning_rate": 0.0001919828508343258,
"loss": 1.045,
"step": 198
},
{
"epoch": 0.009752870112844137,
"grad_norm": 0.5909057855606079,
"learning_rate": 0.00019189850890748652,
"loss": 1.0247,
"step": 199
},
{
"epoch": 0.009801879510396119,
"grad_norm": 0.7416546940803528,
"learning_rate": 0.00019181374437743438,
"loss": 1.0874,
"step": 200
},
{
"epoch": 0.009850888907948099,
"grad_norm": 0.5643423795700073,
"learning_rate": 0.00019172855763396643,
"loss": 1.1505,
"step": 201
},
{
"epoch": 0.009899898305500079,
"grad_norm": 0.5001385807991028,
"learning_rate": 0.0001916429490688213,
"loss": 0.8847,
"step": 202
},
{
"epoch": 0.009948907703052061,
"grad_norm": 0.7633288502693176,
"learning_rate": 0.00019155691907567728,
"loss": 0.9812,
"step": 203
},
{
"epoch": 0.009997917100604041,
"grad_norm": 0.5486236214637756,
"learning_rate": 0.00019147046805015076,
"loss": 0.8544,
"step": 204
},
{
"epoch": 0.010046926498156021,
"grad_norm": 0.6690341234207153,
"learning_rate": 0.0001913835963897942,
"loss": 1.0948,
"step": 205
},
{
"epoch": 0.010095935895708001,
"grad_norm": 0.7186251282691956,
"learning_rate": 0.00019129630449409444,
"loss": 0.9121,
"step": 206
},
{
"epoch": 0.010144945293259983,
"grad_norm": 0.5190759897232056,
"learning_rate": 0.00019120859276447076,
"loss": 1.0424,
"step": 207
},
{
"epoch": 0.010193954690811963,
"grad_norm": 0.5398345589637756,
"learning_rate": 0.00019112046160427312,
"loss": 1.1069,
"step": 208
},
{
"epoch": 0.010242964088363943,
"grad_norm": 0.6046240329742432,
"learning_rate": 0.00019103191141878027,
"loss": 1.1071,
"step": 209
},
{
"epoch": 0.010291973485915925,
"grad_norm": 0.6967079043388367,
"learning_rate": 0.00019094294261519785,
"loss": 1.055,
"step": 210
},
{
"epoch": 0.010340982883467905,
"grad_norm": 0.6013466715812683,
"learning_rate": 0.00019085355560265657,
"loss": 1.0539,
"step": 211
},
{
"epoch": 0.010389992281019885,
"grad_norm": 0.6052589416503906,
"learning_rate": 0.0001907637507922103,
"loss": 0.9343,
"step": 212
},
{
"epoch": 0.010439001678571867,
"grad_norm": 0.46449118852615356,
"learning_rate": 0.00019067352859683423,
"loss": 0.9858,
"step": 213
},
{
"epoch": 0.010488011076123847,
"grad_norm": 0.45442724227905273,
"learning_rate": 0.0001905828894314229,
"loss": 0.8839,
"step": 214
},
{
"epoch": 0.010537020473675827,
"grad_norm": 0.5393562316894531,
"learning_rate": 0.00019049183371278828,
"loss": 0.9945,
"step": 215
},
{
"epoch": 0.010586029871227807,
"grad_norm": 0.5419538021087646,
"learning_rate": 0.00019040036185965798,
"loss": 0.8671,
"step": 216
},
{
"epoch": 0.010635039268779789,
"grad_norm": 0.5931328535079956,
"learning_rate": 0.00019030847429267318,
"loss": 0.936,
"step": 217
},
{
"epoch": 0.010684048666331769,
"grad_norm": 0.6994450092315674,
"learning_rate": 0.00019021617143438678,
"loss": 1.1344,
"step": 218
},
{
"epoch": 0.01073305806388375,
"grad_norm": 0.5890368819236755,
"learning_rate": 0.00019012345370926145,
"loss": 1.0253,
"step": 219
},
{
"epoch": 0.010782067461435731,
"grad_norm": 0.71031254529953,
"learning_rate": 0.0001900303215436676,
"loss": 0.8868,
"step": 220
},
{
"epoch": 0.010831076858987711,
"grad_norm": 0.5714280009269714,
"learning_rate": 0.00018993677536588156,
"loss": 1.1322,
"step": 221
},
{
"epoch": 0.010880086256539691,
"grad_norm": 0.5989257097244263,
"learning_rate": 0.00018984281560608345,
"loss": 1.0701,
"step": 222
},
{
"epoch": 0.010929095654091671,
"grad_norm": 0.653944730758667,
"learning_rate": 0.00018974844269635535,
"loss": 0.908,
"step": 223
},
{
"epoch": 0.010978105051643653,
"grad_norm": 0.5854188203811646,
"learning_rate": 0.00018965365707067922,
"loss": 0.9684,
"step": 224
},
{
"epoch": 0.011027114449195633,
"grad_norm": 0.4724690616130829,
"learning_rate": 0.0001895584591649349,
"loss": 0.974,
"step": 225
},
{
"epoch": 0.011076123846747613,
"grad_norm": 0.5080939531326294,
"learning_rate": 0.00018946284941689817,
"loss": 1.0113,
"step": 226
},
{
"epoch": 0.011125133244299595,
"grad_norm": 0.5035115480422974,
"learning_rate": 0.00018936682826623875,
"loss": 1.033,
"step": 227
},
{
"epoch": 0.011174142641851575,
"grad_norm": 0.544927179813385,
"learning_rate": 0.0001892703961545181,
"loss": 1.0694,
"step": 228
},
{
"epoch": 0.011223152039403555,
"grad_norm": 0.5631483793258667,
"learning_rate": 0.00018917355352518765,
"loss": 1.1408,
"step": 229
},
{
"epoch": 0.011272161436955537,
"grad_norm": 0.5837711095809937,
"learning_rate": 0.00018907630082358657,
"loss": 0.9456,
"step": 230
},
{
"epoch": 0.011321170834507517,
"grad_norm": 0.5248781442642212,
"learning_rate": 0.00018897863849693972,
"loss": 0.7417,
"step": 231
},
{
"epoch": 0.011370180232059497,
"grad_norm": 0.7330105304718018,
"learning_rate": 0.00018888056699435584,
"loss": 1.1141,
"step": 232
},
{
"epoch": 0.011419189629611477,
"grad_norm": 0.7595493197441101,
"learning_rate": 0.00018878208676682508,
"loss": 1.1187,
"step": 233
},
{
"epoch": 0.011468199027163459,
"grad_norm": 0.47531285881996155,
"learning_rate": 0.00018868319826721735,
"loss": 0.9251,
"step": 234
},
{
"epoch": 0.01151720842471544,
"grad_norm": 0.4790467917919159,
"learning_rate": 0.00018858390195027985,
"loss": 1.0659,
"step": 235
},
{
"epoch": 0.01156621782226742,
"grad_norm": 0.49423718452453613,
"learning_rate": 0.0001884841982726353,
"loss": 0.9563,
"step": 236
},
{
"epoch": 0.011615227219819401,
"grad_norm": 0.5443271398544312,
"learning_rate": 0.00018838408769277965,
"loss": 1.0261,
"step": 237
},
{
"epoch": 0.011664236617371381,
"grad_norm": 0.44612443447113037,
"learning_rate": 0.00018828357067108,
"loss": 0.9187,
"step": 238
},
{
"epoch": 0.011713246014923361,
"grad_norm": 0.4964349865913391,
"learning_rate": 0.00018818264766977256,
"loss": 1.1576,
"step": 239
},
{
"epoch": 0.011762255412475341,
"grad_norm": 0.5051829218864441,
"learning_rate": 0.00018808131915296045,
"loss": 1.0866,
"step": 240
},
{
"epoch": 0.011811264810027323,
"grad_norm": 0.5532406568527222,
"learning_rate": 0.0001879795855866116,
"loss": 1.0958,
"step": 241
},
{
"epoch": 0.011860274207579303,
"grad_norm": 0.6946402788162231,
"learning_rate": 0.00018787744743855656,
"loss": 1.0638,
"step": 242
},
{
"epoch": 0.011909283605131283,
"grad_norm": 0.5688795447349548,
"learning_rate": 0.00018777490517848643,
"loss": 1.1,
"step": 243
},
{
"epoch": 0.011958293002683265,
"grad_norm": 0.7369052767753601,
"learning_rate": 0.00018767195927795057,
"loss": 0.9118,
"step": 244
},
{
"epoch": 0.012007302400235245,
"grad_norm": 0.5264215469360352,
"learning_rate": 0.00018756861021035462,
"loss": 0.9238,
"step": 245
},
{
"epoch": 0.012056311797787225,
"grad_norm": 0.6346014738082886,
"learning_rate": 0.0001874648584509582,
"loss": 0.9497,
"step": 246
},
{
"epoch": 0.012105321195339207,
"grad_norm": 0.48699167370796204,
"learning_rate": 0.00018736070447687267,
"loss": 1.0339,
"step": 247
},
{
"epoch": 0.012154330592891187,
"grad_norm": 0.5329305529594421,
"learning_rate": 0.00018725614876705907,
"loss": 1.0089,
"step": 248
},
{
"epoch": 0.012203339990443167,
"grad_norm": 0.5199721455574036,
"learning_rate": 0.00018715119180232582,
"loss": 0.9166,
"step": 249
},
{
"epoch": 0.012252349387995147,
"grad_norm": 0.6322909593582153,
"learning_rate": 0.00018704583406532662,
"loss": 1.0835,
"step": 250
},
{
"epoch": 0.01230135878554713,
"grad_norm": 0.5659762620925903,
"learning_rate": 0.00018694007604055807,
"loss": 1.0007,
"step": 251
},
{
"epoch": 0.01235036818309911,
"grad_norm": 0.6697028279304504,
"learning_rate": 0.00018683391821435757,
"loss": 1.0302,
"step": 252
},
{
"epoch": 0.01239937758065109,
"grad_norm": 0.5258873105049133,
"learning_rate": 0.00018672736107490102,
"loss": 0.9095,
"step": 253
},
{
"epoch": 0.012448386978203071,
"grad_norm": 0.718046247959137,
"learning_rate": 0.00018662040511220062,
"loss": 0.774,
"step": 254
},
{
"epoch": 0.012497396375755051,
"grad_norm": 0.5691109895706177,
"learning_rate": 0.0001865130508181026,
"loss": 0.8814,
"step": 255
},
{
"epoch": 0.012546405773307031,
"grad_norm": 0.5212081074714661,
"learning_rate": 0.00018640529868628488,
"loss": 0.9577,
"step": 256
},
{
"epoch": 0.012595415170859011,
"grad_norm": 0.4526687264442444,
"learning_rate": 0.00018629714921225495,
"loss": 1.1245,
"step": 257
},
{
"epoch": 0.012644424568410993,
"grad_norm": 0.5150011777877808,
"learning_rate": 0.0001861886028933475,
"loss": 0.9222,
"step": 258
},
{
"epoch": 0.012693433965962973,
"grad_norm": 0.5537204742431641,
"learning_rate": 0.0001860796602287221,
"loss": 1.0903,
"step": 259
},
{
"epoch": 0.012742443363514953,
"grad_norm": 0.5243268013000488,
"learning_rate": 0.00018597032171936104,
"loss": 1.0161,
"step": 260
},
{
"epoch": 0.012791452761066935,
"grad_norm": 0.6439827084541321,
"learning_rate": 0.00018586058786806685,
"loss": 1.0715,
"step": 261
},
{
"epoch": 0.012840462158618915,
"grad_norm": 0.9355524778366089,
"learning_rate": 0.00018575045917946007,
"loss": 1.129,
"step": 262
},
{
"epoch": 0.012889471556170895,
"grad_norm": 0.5358828902244568,
"learning_rate": 0.000185639936159977,
"loss": 0.997,
"step": 263
},
{
"epoch": 0.012938480953722877,
"grad_norm": 1.1923553943634033,
"learning_rate": 0.00018552901931786727,
"loss": 1.0837,
"step": 264
},
{
"epoch": 0.012987490351274857,
"grad_norm": 0.7703134417533875,
"learning_rate": 0.0001854177091631915,
"loss": 1.059,
"step": 265
},
{
"epoch": 0.013036499748826837,
"grad_norm": 0.46094661951065063,
"learning_rate": 0.00018530600620781903,
"loss": 0.9812,
"step": 266
},
{
"epoch": 0.013085509146378817,
"grad_norm": 0.5393503308296204,
"learning_rate": 0.00018519391096542563,
"loss": 0.9942,
"step": 267
},
{
"epoch": 0.0131345185439308,
"grad_norm": 0.5819559693336487,
"learning_rate": 0.00018508142395149077,
"loss": 0.7817,
"step": 268
},
{
"epoch": 0.01318352794148278,
"grad_norm": 0.5847815275192261,
"learning_rate": 0.0001849685456832958,
"loss": 0.9157,
"step": 269
},
{
"epoch": 0.01323253733903476,
"grad_norm": 0.5651043653488159,
"learning_rate": 0.00018485527667992115,
"loss": 0.9457,
"step": 270
},
{
"epoch": 0.013281546736586741,
"grad_norm": 0.6798825860023499,
"learning_rate": 0.00018474161746224403,
"loss": 1.0945,
"step": 271
},
{
"epoch": 0.013330556134138721,
"grad_norm": 0.48076069355010986,
"learning_rate": 0.0001846275685529362,
"loss": 0.9762,
"step": 272
},
{
"epoch": 0.013379565531690701,
"grad_norm": 0.48303014039993286,
"learning_rate": 0.0001845131304764614,
"loss": 0.7044,
"step": 273
},
{
"epoch": 0.013428574929242681,
"grad_norm": 0.5329292416572571,
"learning_rate": 0.00018439830375907294,
"loss": 0.9718,
"step": 274
},
{
"epoch": 0.013477584326794663,
"grad_norm": 0.7514523267745972,
"learning_rate": 0.0001842830889288114,
"loss": 1.0568,
"step": 275
},
{
"epoch": 0.013526593724346643,
"grad_norm": 0.7108585238456726,
"learning_rate": 0.00018416748651550212,
"loss": 1.0998,
"step": 276
},
{
"epoch": 0.013575603121898623,
"grad_norm": 0.49022603034973145,
"learning_rate": 0.00018405149705075276,
"loss": 0.8044,
"step": 277
},
{
"epoch": 0.013624612519450605,
"grad_norm": 0.5161099433898926,
"learning_rate": 0.00018393512106795086,
"loss": 1.011,
"step": 278
},
{
"epoch": 0.013673621917002585,
"grad_norm": 0.6260969042778015,
"learning_rate": 0.0001838183591022614,
"loss": 1.0089,
"step": 279
},
{
"epoch": 0.013722631314554565,
"grad_norm": 0.5831906795501709,
"learning_rate": 0.00018370121169062435,
"loss": 0.942,
"step": 280
},
{
"epoch": 0.013771640712106547,
"grad_norm": 0.4630817174911499,
"learning_rate": 0.0001835836793717522,
"loss": 1.0795,
"step": 281
},
{
"epoch": 0.013820650109658527,
"grad_norm": 0.6668508052825928,
"learning_rate": 0.0001834657626861274,
"loss": 1.0784,
"step": 282
},
{
"epoch": 0.013869659507210507,
"grad_norm": 0.523901104927063,
"learning_rate": 0.000183347462176,
"loss": 1.0338,
"step": 283
},
{
"epoch": 0.013918668904762488,
"grad_norm": 0.6257583498954773,
"learning_rate": 0.00018322877838538513,
"loss": 0.9845,
"step": 284
},
{
"epoch": 0.01396767830231447,
"grad_norm": 0.6482861638069153,
"learning_rate": 0.00018310971186006038,
"loss": 0.9081,
"step": 285
},
{
"epoch": 0.01401668769986645,
"grad_norm": 0.600287139415741,
"learning_rate": 0.00018299026314756348,
"loss": 1.0216,
"step": 286
},
{
"epoch": 0.01406569709741843,
"grad_norm": 0.732085108757019,
"learning_rate": 0.00018287043279718957,
"loss": 1.1922,
"step": 287
},
{
"epoch": 0.014114706494970411,
"grad_norm": 0.7779308557510376,
"learning_rate": 0.00018275022135998888,
"loss": 1.3221,
"step": 288
},
{
"epoch": 0.014163715892522391,
"grad_norm": 0.606619656085968,
"learning_rate": 0.00018262962938876408,
"loss": 0.9412,
"step": 289
},
{
"epoch": 0.014212725290074371,
"grad_norm": 0.7746778130531311,
"learning_rate": 0.00018250865743806767,
"loss": 1.1909,
"step": 290
},
{
"epoch": 0.014261734687626352,
"grad_norm": 0.7022519111633301,
"learning_rate": 0.00018238730606419965,
"loss": 0.8972,
"step": 291
},
{
"epoch": 0.014310744085178333,
"grad_norm": 0.5326123237609863,
"learning_rate": 0.00018226557582520477,
"loss": 0.8689,
"step": 292
},
{
"epoch": 0.014359753482730313,
"grad_norm": 0.5942233800888062,
"learning_rate": 0.00018214346728087,
"loss": 0.9346,
"step": 293
},
{
"epoch": 0.014408762880282294,
"grad_norm": 0.5361180901527405,
"learning_rate": 0.00018202098099272205,
"loss": 0.8712,
"step": 294
},
{
"epoch": 0.014457772277834275,
"grad_norm": 0.5793167948722839,
"learning_rate": 0.00018189811752402458,
"loss": 1.0495,
"step": 295
},
{
"epoch": 0.014506781675386255,
"grad_norm": 0.5728379487991333,
"learning_rate": 0.00018177487743977597,
"loss": 1.0623,
"step": 296
},
{
"epoch": 0.014555791072938236,
"grad_norm": 0.6366235613822937,
"learning_rate": 0.00018165126130670624,
"loss": 0.9413,
"step": 297
},
{
"epoch": 0.014604800470490217,
"grad_norm": 0.5405012965202332,
"learning_rate": 0.0001815272696932749,
"loss": 1.1222,
"step": 298
},
{
"epoch": 0.014653809868042197,
"grad_norm": 0.701434850692749,
"learning_rate": 0.00018140290316966806,
"loss": 1.185,
"step": 299
},
{
"epoch": 0.014702819265594178,
"grad_norm": 0.5572264790534973,
"learning_rate": 0.0001812781623077959,
"loss": 1.0778,
"step": 300
},
{
"epoch": 0.014751828663146158,
"grad_norm": 1.0823994874954224,
"learning_rate": 0.00018115304768129,
"loss": 1.2721,
"step": 301
},
{
"epoch": 0.01480083806069814,
"grad_norm": 0.7624396681785583,
"learning_rate": 0.00018102755986550078,
"loss": 1.103,
"step": 302
},
{
"epoch": 0.01484984745825012,
"grad_norm": 0.6541030406951904,
"learning_rate": 0.00018090169943749476,
"loss": 1.1571,
"step": 303
},
{
"epoch": 0.0148988568558021,
"grad_norm": 0.4585016071796417,
"learning_rate": 0.00018077546697605197,
"loss": 0.8446,
"step": 304
},
{
"epoch": 0.014947866253354081,
"grad_norm": 0.5118653774261475,
"learning_rate": 0.00018064886306166323,
"loss": 0.8785,
"step": 305
},
{
"epoch": 0.014996875650906061,
"grad_norm": 0.6636449098587036,
"learning_rate": 0.0001805218882765276,
"loss": 1.1065,
"step": 306
},
{
"epoch": 0.015045885048458042,
"grad_norm": 0.5869944095611572,
"learning_rate": 0.00018039454320454957,
"loss": 1.2293,
"step": 307
},
{
"epoch": 0.015094894446010022,
"grad_norm": 0.5056412816047668,
"learning_rate": 0.0001802668284313364,
"loss": 1.0039,
"step": 308
},
{
"epoch": 0.015143903843562003,
"grad_norm": 0.4390663504600525,
"learning_rate": 0.00018013874454419546,
"loss": 0.9446,
"step": 309
},
{
"epoch": 0.015192913241113984,
"grad_norm": 0.640178918838501,
"learning_rate": 0.00018001029213213162,
"loss": 1.0914,
"step": 310
},
{
"epoch": 0.015241922638665964,
"grad_norm": 0.6631582975387573,
"learning_rate": 0.00017988147178584426,
"loss": 1.0147,
"step": 311
},
{
"epoch": 0.015290932036217945,
"grad_norm": 0.4660460352897644,
"learning_rate": 0.00017975228409772496,
"loss": 0.94,
"step": 312
},
{
"epoch": 0.015339941433769926,
"grad_norm": 0.6155552864074707,
"learning_rate": 0.00017962272966185437,
"loss": 1.0236,
"step": 313
},
{
"epoch": 0.015388950831321906,
"grad_norm": 0.4859367609024048,
"learning_rate": 0.0001794928090739997,
"loss": 1.0036,
"step": 314
},
{
"epoch": 0.015437960228873887,
"grad_norm": 0.46601447463035583,
"learning_rate": 0.00017936252293161204,
"loss": 1.0945,
"step": 315
},
{
"epoch": 0.015486969626425868,
"grad_norm": 0.5885031819343567,
"learning_rate": 0.00017923187183382342,
"loss": 1.0307,
"step": 316
},
{
"epoch": 0.015535979023977848,
"grad_norm": 0.552923321723938,
"learning_rate": 0.00017910085638144416,
"loss": 0.9364,
"step": 317
},
{
"epoch": 0.015584988421529828,
"grad_norm": 0.7075880169868469,
"learning_rate": 0.00017896947717696012,
"loss": 1.0722,
"step": 318
},
{
"epoch": 0.01563399781908181,
"grad_norm": 0.47218647599220276,
"learning_rate": 0.00017883773482452986,
"loss": 0.7295,
"step": 319
},
{
"epoch": 0.01568300721663379,
"grad_norm": 0.6173276901245117,
"learning_rate": 0.00017870562992998193,
"loss": 1.019,
"step": 320
},
{
"epoch": 0.01573201661418577,
"grad_norm": 0.6104951500892639,
"learning_rate": 0.0001785731631008122,
"loss": 0.9296,
"step": 321
},
{
"epoch": 0.01578102601173775,
"grad_norm": 0.6612362861633301,
"learning_rate": 0.00017844033494618063,
"loss": 1.0921,
"step": 322
},
{
"epoch": 0.01583003540928973,
"grad_norm": 0.7498394250869751,
"learning_rate": 0.00017830714607690906,
"loss": 1.0273,
"step": 323
},
{
"epoch": 0.015879044806841713,
"grad_norm": 0.6212750673294067,
"learning_rate": 0.0001781735971054779,
"loss": 0.9132,
"step": 324
},
{
"epoch": 0.015928054204393693,
"grad_norm": 0.7612932920455933,
"learning_rate": 0.0001780396886460237,
"loss": 1.0407,
"step": 325
},
{
"epoch": 0.015977063601945674,
"grad_norm": 0.5707682371139526,
"learning_rate": 0.00017790542131433597,
"loss": 1.0211,
"step": 326
},
{
"epoch": 0.016026072999497654,
"grad_norm": 0.5141270756721497,
"learning_rate": 0.00017777079572785462,
"loss": 1.0815,
"step": 327
},
{
"epoch": 0.016075082397049634,
"grad_norm": 0.552099347114563,
"learning_rate": 0.000177635812505667,
"loss": 0.9927,
"step": 328
},
{
"epoch": 0.016124091794601614,
"grad_norm": 0.8106556534767151,
"learning_rate": 0.00017750047226850512,
"loss": 1.0921,
"step": 329
},
{
"epoch": 0.016173101192153594,
"grad_norm": 0.7649658918380737,
"learning_rate": 0.00017736477563874275,
"loss": 1.0336,
"step": 330
},
{
"epoch": 0.016222110589705577,
"grad_norm": 0.5398699045181274,
"learning_rate": 0.00017722872324039247,
"loss": 0.8543,
"step": 331
},
{
"epoch": 0.016271119987257558,
"grad_norm": 0.7642013430595398,
"learning_rate": 0.00017709231569910296,
"loss": 1.0422,
"step": 332
},
{
"epoch": 0.016320129384809538,
"grad_norm": 0.7440565824508667,
"learning_rate": 0.0001769555536421561,
"loss": 0.9295,
"step": 333
},
{
"epoch": 0.016369138782361518,
"grad_norm": 0.8193507194519043,
"learning_rate": 0.000176818437698464,
"loss": 1.2756,
"step": 334
},
{
"epoch": 0.016418148179913498,
"grad_norm": 0.8170818090438843,
"learning_rate": 0.0001766809684985661,
"loss": 1.1115,
"step": 335
},
{
"epoch": 0.016467157577465478,
"grad_norm": 0.6213825941085815,
"learning_rate": 0.0001765431466746264,
"loss": 1.0258,
"step": 336
},
{
"epoch": 0.016516166975017458,
"grad_norm": 0.5284627079963684,
"learning_rate": 0.00017640497286043036,
"loss": 0.8486,
"step": 337
},
{
"epoch": 0.01656517637256944,
"grad_norm": 0.6979788541793823,
"learning_rate": 0.00017626644769138227,
"loss": 1.0995,
"step": 338
},
{
"epoch": 0.01661418577012142,
"grad_norm": 0.43623411655426025,
"learning_rate": 0.00017612757180450204,
"loss": 0.8647,
"step": 339
},
{
"epoch": 0.0166631951676734,
"grad_norm": 0.6120411157608032,
"learning_rate": 0.00017598834583842235,
"loss": 0.7997,
"step": 340
},
{
"epoch": 0.01671220456522538,
"grad_norm": 0.5070543885231018,
"learning_rate": 0.00017584877043338583,
"loss": 0.8916,
"step": 341
},
{
"epoch": 0.016761213962777362,
"grad_norm": 0.7476792931556702,
"learning_rate": 0.00017570884623124201,
"loss": 1.0732,
"step": 342
},
{
"epoch": 0.016810223360329342,
"grad_norm": 0.5438335537910461,
"learning_rate": 0.00017556857387544442,
"loss": 0.9926,
"step": 343
},
{
"epoch": 0.016859232757881325,
"grad_norm": 0.5938275456428528,
"learning_rate": 0.00017542795401104751,
"loss": 1.2716,
"step": 344
},
{
"epoch": 0.016908242155433306,
"grad_norm": 0.6337954998016357,
"learning_rate": 0.00017528698728470392,
"loss": 0.851,
"step": 345
},
{
"epoch": 0.016957251552985286,
"grad_norm": 0.5400267243385315,
"learning_rate": 0.00017514567434466122,
"loss": 1.0499,
"step": 346
},
{
"epoch": 0.017006260950537266,
"grad_norm": 0.5979104042053223,
"learning_rate": 0.0001750040158407592,
"loss": 0.957,
"step": 347
},
{
"epoch": 0.017055270348089246,
"grad_norm": 0.649106502532959,
"learning_rate": 0.0001748620124244267,
"loss": 0.9333,
"step": 348
},
{
"epoch": 0.017104279745641226,
"grad_norm": 0.6345680356025696,
"learning_rate": 0.00017471966474867857,
"loss": 1.0754,
"step": 349
},
{
"epoch": 0.017153289143193206,
"grad_norm": 0.6540642380714417,
"learning_rate": 0.000174576973468113,
"loss": 1.0878,
"step": 350
},
{
"epoch": 0.01720229854074519,
"grad_norm": 0.633299708366394,
"learning_rate": 0.00017443393923890805,
"loss": 0.9907,
"step": 351
},
{
"epoch": 0.01725130793829717,
"grad_norm": 0.6478168368339539,
"learning_rate": 0.00017429056271881898,
"loss": 0.9081,
"step": 352
},
{
"epoch": 0.01730031733584915,
"grad_norm": 0.6225663423538208,
"learning_rate": 0.00017414684456717506,
"loss": 1.1538,
"step": 353
},
{
"epoch": 0.01734932673340113,
"grad_norm": 0.6177115440368652,
"learning_rate": 0.0001740027854448766,
"loss": 1.0673,
"step": 354
},
{
"epoch": 0.01739833613095311,
"grad_norm": 0.5825146436691284,
"learning_rate": 0.0001738583860143919,
"loss": 0.9452,
"step": 355
},
{
"epoch": 0.01744734552850509,
"grad_norm": 0.6394603848457336,
"learning_rate": 0.0001737136469397542,
"loss": 0.8666,
"step": 356
},
{
"epoch": 0.01749635492605707,
"grad_norm": 0.8003541827201843,
"learning_rate": 0.00017356856888655858,
"loss": 0.9749,
"step": 357
},
{
"epoch": 0.017545364323609054,
"grad_norm": 0.6348778009414673,
"learning_rate": 0.00017342315252195895,
"loss": 1.0315,
"step": 358
},
{
"epoch": 0.017594373721161034,
"grad_norm": 0.5486456751823425,
"learning_rate": 0.00017327739851466504,
"loss": 0.9171,
"step": 359
},
{
"epoch": 0.017643383118713014,
"grad_norm": 0.4903838038444519,
"learning_rate": 0.00017313130753493917,
"loss": 0.8439,
"step": 360
},
{
"epoch": 0.017692392516264994,
"grad_norm": 0.5837622284889221,
"learning_rate": 0.00017298488025459336,
"loss": 1.1164,
"step": 361
},
{
"epoch": 0.017741401913816974,
"grad_norm": 0.5179843306541443,
"learning_rate": 0.00017283811734698602,
"loss": 0.8872,
"step": 362
},
{
"epoch": 0.017790411311368954,
"grad_norm": 0.5837355852127075,
"learning_rate": 0.00017269101948701906,
"loss": 0.9488,
"step": 363
},
{
"epoch": 0.017839420708920934,
"grad_norm": 0.5812946557998657,
"learning_rate": 0.00017254358735113465,
"loss": 0.9592,
"step": 364
},
{
"epoch": 0.017888430106472918,
"grad_norm": 0.6350980401039124,
"learning_rate": 0.00017239582161731218,
"loss": 0.836,
"step": 365
},
{
"epoch": 0.017937439504024898,
"grad_norm": 0.5819278359413147,
"learning_rate": 0.0001722477229650651,
"loss": 1.1291,
"step": 366
},
{
"epoch": 0.017986448901576878,
"grad_norm": 0.5616326332092285,
"learning_rate": 0.00017209929207543786,
"loss": 1.1593,
"step": 367
},
{
"epoch": 0.018035458299128858,
"grad_norm": 0.6132553815841675,
"learning_rate": 0.00017195052963100266,
"loss": 0.978,
"step": 368
},
{
"epoch": 0.018084467696680838,
"grad_norm": 0.7652292251586914,
"learning_rate": 0.0001718014363158564,
"loss": 0.9381,
"step": 369
},
{
"epoch": 0.018084467696680838,
"eval_loss": null,
"eval_runtime": 183.4447,
"eval_samples_per_second": 46.837,
"eval_steps_per_second": 23.419,
"step": 369
},
{
"epoch": 0.018133477094232818,
"grad_norm": 0.5262779593467712,
"learning_rate": 0.0001716520128156176,
"loss": 1.0514,
"step": 370
},
{
"epoch": 0.018182486491784798,
"grad_norm": 0.5101955533027649,
"learning_rate": 0.00017150225981742308,
"loss": 0.9021,
"step": 371
},
{
"epoch": 0.01823149588933678,
"grad_norm": 0.6334530711174011,
"learning_rate": 0.00017135217800992488,
"loss": 0.9931,
"step": 372
},
{
"epoch": 0.01828050528688876,
"grad_norm": 0.7216410040855408,
"learning_rate": 0.00017120176808328713,
"loss": 0.8422,
"step": 373
},
{
"epoch": 0.018329514684440742,
"grad_norm": 0.7431232333183289,
"learning_rate": 0.00017105103072918292,
"loss": 0.9414,
"step": 374
},
{
"epoch": 0.018378524081992722,
"grad_norm": 0.7508997321128845,
"learning_rate": 0.00017089996664079084,
"loss": 1.1678,
"step": 375
},
{
"epoch": 0.018427533479544702,
"grad_norm": 0.6515668034553528,
"learning_rate": 0.0001707485765127922,
"loss": 0.9482,
"step": 376
},
{
"epoch": 0.018476542877096682,
"grad_norm": 0.5263504981994629,
"learning_rate": 0.0001705968610413675,
"loss": 1.055,
"step": 377
},
{
"epoch": 0.018525552274648666,
"grad_norm": 0.57945716381073,
"learning_rate": 0.00017044482092419346,
"loss": 0.9905,
"step": 378
},
{
"epoch": 0.018574561672200646,
"grad_norm": 0.5509532690048218,
"learning_rate": 0.00017029245686043965,
"loss": 0.8783,
"step": 379
},
{
"epoch": 0.018623571069752626,
"grad_norm": 0.7690268754959106,
"learning_rate": 0.00017013976955076535,
"loss": 1.0791,
"step": 380
},
{
"epoch": 0.018672580467304606,
"grad_norm": 0.7092053890228271,
"learning_rate": 0.00016998675969731624,
"loss": 1.0632,
"step": 381
},
{
"epoch": 0.018721589864856586,
"grad_norm": 0.6620251536369324,
"learning_rate": 0.0001698334280037214,
"loss": 0.9258,
"step": 382
},
{
"epoch": 0.018770599262408566,
"grad_norm": 0.7011927366256714,
"learning_rate": 0.0001696797751750898,
"loss": 0.9686,
"step": 383
},
{
"epoch": 0.018819608659960546,
"grad_norm": 0.5856828093528748,
"learning_rate": 0.00016952580191800727,
"loss": 1.0147,
"step": 384
},
{
"epoch": 0.01886861805751253,
"grad_norm": 0.5925838947296143,
"learning_rate": 0.00016937150894053303,
"loss": 1.017,
"step": 385
},
{
"epoch": 0.01891762745506451,
"grad_norm": 0.5735793709754944,
"learning_rate": 0.00016921689695219665,
"loss": 0.849,
"step": 386
},
{
"epoch": 0.01896663685261649,
"grad_norm": 0.6090975403785706,
"learning_rate": 0.0001690619666639947,
"loss": 0.8939,
"step": 387
},
{
"epoch": 0.01901564625016847,
"grad_norm": 0.589528501033783,
"learning_rate": 0.00016890671878838745,
"loss": 1.0517,
"step": 388
},
{
"epoch": 0.01906465564772045,
"grad_norm": 0.517760694026947,
"learning_rate": 0.00016875115403929564,
"loss": 1.0258,
"step": 389
},
{
"epoch": 0.01911366504527243,
"grad_norm": 0.5802696943283081,
"learning_rate": 0.00016859527313209722,
"loss": 1.0879,
"step": 390
},
{
"epoch": 0.01916267444282441,
"grad_norm": 0.5980463027954102,
"learning_rate": 0.00016843907678362388,
"loss": 0.9885,
"step": 391
},
{
"epoch": 0.019211683840376394,
"grad_norm": 0.5813844203948975,
"learning_rate": 0.00016828256571215804,
"loss": 0.9511,
"step": 392
},
{
"epoch": 0.019260693237928374,
"grad_norm": 0.588787853717804,
"learning_rate": 0.00016812574063742935,
"loss": 0.9681,
"step": 393
},
{
"epoch": 0.019309702635480354,
"grad_norm": 0.5007123351097107,
"learning_rate": 0.00016796860228061145,
"loss": 1.0178,
"step": 394
},
{
"epoch": 0.019358712033032334,
"grad_norm": 0.6212466359138489,
"learning_rate": 0.00016781115136431856,
"loss": 0.9542,
"step": 395
},
{
"epoch": 0.019407721430584314,
"grad_norm": 0.63362056016922,
"learning_rate": 0.0001676533886126024,
"loss": 0.9947,
"step": 396
},
{
"epoch": 0.019456730828136294,
"grad_norm": 0.5138733386993408,
"learning_rate": 0.00016749531475094843,
"loss": 0.9759,
"step": 397
},
{
"epoch": 0.019505740225688274,
"grad_norm": 0.5656266808509827,
"learning_rate": 0.000167336930506273,
"loss": 0.9864,
"step": 398
},
{
"epoch": 0.019554749623240258,
"grad_norm": 0.5729262828826904,
"learning_rate": 0.00016717823660691972,
"loss": 0.9961,
"step": 399
},
{
"epoch": 0.019603759020792238,
"grad_norm": 0.5822618007659912,
"learning_rate": 0.00016701923378265615,
"loss": 1.1899,
"step": 400
},
{
"epoch": 0.019652768418344218,
"grad_norm": 0.6505359411239624,
"learning_rate": 0.0001668599227646705,
"loss": 1.2617,
"step": 401
},
{
"epoch": 0.019701777815896198,
"grad_norm": 0.5997886061668396,
"learning_rate": 0.00016670030428556816,
"loss": 0.9047,
"step": 402
},
{
"epoch": 0.019750787213448178,
"grad_norm": 0.5251815915107727,
"learning_rate": 0.00016654037907936847,
"loss": 0.8927,
"step": 403
},
{
"epoch": 0.019799796611000158,
"grad_norm": 0.6396327614784241,
"learning_rate": 0.00016638014788150133,
"loss": 1.1262,
"step": 404
},
{
"epoch": 0.019848806008552138,
"grad_norm": 0.5167068839073181,
"learning_rate": 0.0001662196114288037,
"loss": 0.9205,
"step": 405
},
{
"epoch": 0.019897815406104122,
"grad_norm": 0.5277456045150757,
"learning_rate": 0.00016605877045951624,
"loss": 0.975,
"step": 406
},
{
"epoch": 0.019946824803656102,
"grad_norm": 0.5696888566017151,
"learning_rate": 0.00016589762571328005,
"loss": 0.9641,
"step": 407
},
{
"epoch": 0.019995834201208082,
"grad_norm": 0.6764147877693176,
"learning_rate": 0.00016573617793113308,
"loss": 0.9214,
"step": 408
},
{
"epoch": 0.020044843598760062,
"grad_norm": 0.5221783518791199,
"learning_rate": 0.000165574427855507,
"loss": 1.0529,
"step": 409
},
{
"epoch": 0.020093852996312042,
"grad_norm": 0.598206639289856,
"learning_rate": 0.0001654123762302233,
"loss": 1.1109,
"step": 410
},
{
"epoch": 0.020142862393864022,
"grad_norm": 0.7939401268959045,
"learning_rate": 0.0001652500238004905,
"loss": 1.0441,
"step": 411
},
{
"epoch": 0.020191871791416002,
"grad_norm": 0.7071179747581482,
"learning_rate": 0.00016508737131290012,
"loss": 0.9036,
"step": 412
},
{
"epoch": 0.020240881188967986,
"grad_norm": 0.502358615398407,
"learning_rate": 0.00016492441951542367,
"loss": 0.8945,
"step": 413
},
{
"epoch": 0.020289890586519966,
"grad_norm": 0.7627480626106262,
"learning_rate": 0.0001647611691574091,
"loss": 0.8688,
"step": 414
},
{
"epoch": 0.020338899984071946,
"grad_norm": 0.6446793675422668,
"learning_rate": 0.0001645976209895772,
"loss": 1.083,
"step": 415
},
{
"epoch": 0.020387909381623926,
"grad_norm": 0.5880749821662903,
"learning_rate": 0.00016443377576401828,
"loss": 1.0038,
"step": 416
},
{
"epoch": 0.020436918779175906,
"grad_norm": 0.7221845388412476,
"learning_rate": 0.00016426963423418878,
"loss": 1.1188,
"step": 417
},
{
"epoch": 0.020485928176727886,
"grad_norm": 0.6247958540916443,
"learning_rate": 0.00016410519715490764,
"loss": 0.9271,
"step": 418
},
{
"epoch": 0.02053493757427987,
"grad_norm": 0.47129571437835693,
"learning_rate": 0.000163940465282353,
"loss": 0.9078,
"step": 419
},
{
"epoch": 0.02058394697183185,
"grad_norm": 0.5665132999420166,
"learning_rate": 0.00016377543937405847,
"loss": 0.8963,
"step": 420
},
{
"epoch": 0.02063295636938383,
"grad_norm": 0.5928874611854553,
"learning_rate": 0.00016361012018890997,
"loss": 1.1101,
"step": 421
},
{
"epoch": 0.02068196576693581,
"grad_norm": 0.493681937456131,
"learning_rate": 0.00016344450848714204,
"loss": 0.9305,
"step": 422
},
{
"epoch": 0.02073097516448779,
"grad_norm": 0.5588467121124268,
"learning_rate": 0.00016327860503033436,
"loss": 0.7529,
"step": 423
},
{
"epoch": 0.02077998456203977,
"grad_norm": 0.8472508192062378,
"learning_rate": 0.00016311241058140823,
"loss": 1.0739,
"step": 424
},
{
"epoch": 0.02082899395959175,
"grad_norm": 0.5035362243652344,
"learning_rate": 0.00016294592590462316,
"loss": 0.9373,
"step": 425
},
{
"epoch": 0.020878003357143734,
"grad_norm": 0.494436651468277,
"learning_rate": 0.00016277915176557333,
"loss": 0.8846,
"step": 426
},
{
"epoch": 0.020927012754695714,
"grad_norm": 0.5663606524467468,
"learning_rate": 0.00016261208893118392,
"loss": 0.9322,
"step": 427
},
{
"epoch": 0.020976022152247694,
"grad_norm": 0.5859829783439636,
"learning_rate": 0.0001624447381697078,
"loss": 1.1366,
"step": 428
},
{
"epoch": 0.021025031549799674,
"grad_norm": 0.6672139763832092,
"learning_rate": 0.00016227710025072187,
"loss": 0.9509,
"step": 429
},
{
"epoch": 0.021074040947351654,
"grad_norm": 0.5644333958625793,
"learning_rate": 0.00016210917594512356,
"loss": 0.9532,
"step": 430
},
{
"epoch": 0.021123050344903634,
"grad_norm": 0.828715443611145,
"learning_rate": 0.00016194096602512725,
"loss": 1.1273,
"step": 431
},
{
"epoch": 0.021172059742455614,
"grad_norm": 0.6238652467727661,
"learning_rate": 0.00016177247126426076,
"loss": 1.0033,
"step": 432
},
{
"epoch": 0.021221069140007598,
"grad_norm": 0.5639209151268005,
"learning_rate": 0.00016160369243736175,
"loss": 0.9039,
"step": 433
},
{
"epoch": 0.021270078537559578,
"grad_norm": 0.6098002195358276,
"learning_rate": 0.00016143463032057423,
"loss": 1.0728,
"step": 434
},
{
"epoch": 0.021319087935111558,
"grad_norm": 0.5710477232933044,
"learning_rate": 0.00016126528569134488,
"loss": 1.0031,
"step": 435
},
{
"epoch": 0.021368097332663538,
"grad_norm": 0.6529645919799805,
"learning_rate": 0.0001610956593284196,
"loss": 0.9533,
"step": 436
},
{
"epoch": 0.021417106730215518,
"grad_norm": 0.6168728470802307,
"learning_rate": 0.0001609257520118398,
"loss": 0.9776,
"step": 437
},
{
"epoch": 0.0214661161277675,
"grad_norm": 0.6322219371795654,
"learning_rate": 0.00016075556452293895,
"loss": 1.1566,
"step": 438
},
{
"epoch": 0.02151512552531948,
"grad_norm": 0.7928242087364197,
"learning_rate": 0.00016058509764433886,
"loss": 1.0876,
"step": 439
},
{
"epoch": 0.021564134922871462,
"grad_norm": 0.5785205364227295,
"learning_rate": 0.00016041435215994622,
"loss": 0.9904,
"step": 440
},
{
"epoch": 0.021613144320423442,
"grad_norm": 0.6156221628189087,
"learning_rate": 0.0001602433288549488,
"loss": 0.9935,
"step": 441
},
{
"epoch": 0.021662153717975422,
"grad_norm": 0.6420785784721375,
"learning_rate": 0.000160072028515812,
"loss": 1.0979,
"step": 442
},
{
"epoch": 0.021711163115527402,
"grad_norm": 0.512675404548645,
"learning_rate": 0.00015990045193027522,
"loss": 0.8404,
"step": 443
},
{
"epoch": 0.021760172513079382,
"grad_norm": 0.4817085564136505,
"learning_rate": 0.00015972859988734817,
"loss": 0.86,
"step": 444
},
{
"epoch": 0.021809181910631362,
"grad_norm": 0.6226520538330078,
"learning_rate": 0.00015955647317730727,
"loss": 1.0432,
"step": 445
},
{
"epoch": 0.021858191308183342,
"grad_norm": 0.5906574130058289,
"learning_rate": 0.000159384072591692,
"loss": 0.9998,
"step": 446
},
{
"epoch": 0.021907200705735326,
"grad_norm": 0.5114994049072266,
"learning_rate": 0.00015921139892330138,
"loss": 0.971,
"step": 447
},
{
"epoch": 0.021956210103287306,
"grad_norm": 0.6080278158187866,
"learning_rate": 0.00015903845296619013,
"loss": 0.829,
"step": 448
},
{
"epoch": 0.022005219500839286,
"grad_norm": 0.8234065175056458,
"learning_rate": 0.0001588652355156651,
"loss": 0.9837,
"step": 449
},
{
"epoch": 0.022054228898391266,
"grad_norm": 0.6270102262496948,
"learning_rate": 0.00015869174736828168,
"loss": 0.9846,
"step": 450
},
{
"epoch": 0.022103238295943246,
"grad_norm": 0.9178394675254822,
"learning_rate": 0.0001585179893218401,
"loss": 0.8804,
"step": 451
},
{
"epoch": 0.022152247693495226,
"grad_norm": 0.7372391223907471,
"learning_rate": 0.0001583439621753817,
"loss": 1.0138,
"step": 452
},
{
"epoch": 0.02220125709104721,
"grad_norm": 0.620411217212677,
"learning_rate": 0.00015816966672918529,
"loss": 1.0152,
"step": 453
},
{
"epoch": 0.02225026648859919,
"grad_norm": 0.5102325677871704,
"learning_rate": 0.00015799510378476347,
"loss": 0.8829,
"step": 454
},
{
"epoch": 0.02229927588615117,
"grad_norm": 0.5099406242370605,
"learning_rate": 0.00015782027414485905,
"loss": 0.9327,
"step": 455
},
{
"epoch": 0.02234828528370315,
"grad_norm": 0.5905872583389282,
"learning_rate": 0.00015764517861344116,
"loss": 0.9433,
"step": 456
},
{
"epoch": 0.02239729468125513,
"grad_norm": 0.6617975831031799,
"learning_rate": 0.0001574698179957017,
"loss": 1.0295,
"step": 457
},
{
"epoch": 0.02244630407880711,
"grad_norm": 0.5137200355529785,
"learning_rate": 0.0001572941930980516,
"loss": 0.9562,
"step": 458
},
{
"epoch": 0.02249531347635909,
"grad_norm": 0.5464679598808289,
"learning_rate": 0.00015711830472811702,
"loss": 0.9995,
"step": 459
},
{
"epoch": 0.022544322873911074,
"grad_norm": 0.574379026889801,
"learning_rate": 0.00015694215369473584,
"loss": 1.0157,
"step": 460
},
{
"epoch": 0.022593332271463054,
"grad_norm": 0.5301153063774109,
"learning_rate": 0.00015676574080795378,
"loss": 0.9596,
"step": 461
},
{
"epoch": 0.022642341669015034,
"grad_norm": 0.6540731191635132,
"learning_rate": 0.0001565890668790207,
"loss": 0.9636,
"step": 462
},
{
"epoch": 0.022691351066567014,
"grad_norm": 0.9363406300544739,
"learning_rate": 0.00015641213272038682,
"loss": 1.1141,
"step": 463
},
{
"epoch": 0.022740360464118994,
"grad_norm": 0.6103693842887878,
"learning_rate": 0.0001562349391456992,
"loss": 0.8886,
"step": 464
},
{
"epoch": 0.022789369861670974,
"grad_norm": 0.6556861996650696,
"learning_rate": 0.0001560574869697977,
"loss": 1.0414,
"step": 465
},
{
"epoch": 0.022838379259222955,
"grad_norm": 0.5614734292030334,
"learning_rate": 0.0001558797770087115,
"loss": 0.8673,
"step": 466
},
{
"epoch": 0.022887388656774938,
"grad_norm": 0.6785566210746765,
"learning_rate": 0.00015570181007965514,
"loss": 0.9473,
"step": 467
},
{
"epoch": 0.022936398054326918,
"grad_norm": 0.5315887928009033,
"learning_rate": 0.0001555235870010249,
"loss": 0.9594,
"step": 468
},
{
"epoch": 0.022985407451878898,
"grad_norm": 0.567318320274353,
"learning_rate": 0.00015534510859239493,
"loss": 1.157,
"step": 469
},
{
"epoch": 0.02303441684943088,
"grad_norm": 0.5194117426872253,
"learning_rate": 0.00015516637567451356,
"loss": 0.9412,
"step": 470
},
{
"epoch": 0.02308342624698286,
"grad_norm": 0.6578914523124695,
"learning_rate": 0.0001549873890692996,
"loss": 1.1002,
"step": 471
},
{
"epoch": 0.02313243564453484,
"grad_norm": 0.7504667639732361,
"learning_rate": 0.0001548081495998383,
"loss": 0.9016,
"step": 472
},
{
"epoch": 0.02318144504208682,
"grad_norm": 0.6988145112991333,
"learning_rate": 0.00015462865809037784,
"loss": 1.0586,
"step": 473
},
{
"epoch": 0.023230454439638802,
"grad_norm": 0.6438858509063721,
"learning_rate": 0.0001544489153663254,
"loss": 0.8252,
"step": 474
},
{
"epoch": 0.023279463837190782,
"grad_norm": 0.5463119149208069,
"learning_rate": 0.00015426892225424337,
"loss": 0.9599,
"step": 475
},
{
"epoch": 0.023328473234742762,
"grad_norm": 0.6298267245292664,
"learning_rate": 0.00015408867958184556,
"loss": 0.9993,
"step": 476
},
{
"epoch": 0.023377482632294742,
"grad_norm": 0.5038778185844421,
"learning_rate": 0.0001539081881779935,
"loss": 0.9883,
"step": 477
},
{
"epoch": 0.023426492029846722,
"grad_norm": 0.7516053318977356,
"learning_rate": 0.00015372744887269242,
"loss": 1.0561,
"step": 478
},
{
"epoch": 0.023475501427398703,
"grad_norm": 0.5092015862464905,
"learning_rate": 0.00015354646249708757,
"loss": 0.9311,
"step": 479
},
{
"epoch": 0.023524510824950683,
"grad_norm": 0.6461193561553955,
"learning_rate": 0.00015336522988346047,
"loss": 0.9045,
"step": 480
},
{
"epoch": 0.023573520222502666,
"grad_norm": 0.8320329785346985,
"learning_rate": 0.00015318375186522485,
"loss": 1.0344,
"step": 481
},
{
"epoch": 0.023622529620054646,
"grad_norm": 0.5258587002754211,
"learning_rate": 0.00015300202927692302,
"loss": 0.9523,
"step": 482
},
{
"epoch": 0.023671539017606626,
"grad_norm": 0.7670847773551941,
"learning_rate": 0.00015282006295422199,
"loss": 1.005,
"step": 483
},
{
"epoch": 0.023720548415158606,
"grad_norm": 0.5239679217338562,
"learning_rate": 0.00015263785373390956,
"loss": 0.9929,
"step": 484
},
{
"epoch": 0.023769557812710587,
"grad_norm": 0.5952014923095703,
"learning_rate": 0.00015245540245389052,
"loss": 1.1896,
"step": 485
},
{
"epoch": 0.023818567210262567,
"grad_norm": 0.7766487002372742,
"learning_rate": 0.0001522727099531828,
"loss": 0.8564,
"step": 486
},
{
"epoch": 0.02386757660781455,
"grad_norm": 0.7068402171134949,
"learning_rate": 0.00015208977707191368,
"loss": 1.0832,
"step": 487
},
{
"epoch": 0.02391658600536653,
"grad_norm": 0.5393612384796143,
"learning_rate": 0.0001519066046513157,
"loss": 0.8455,
"step": 488
},
{
"epoch": 0.02396559540291851,
"grad_norm": 0.6815643906593323,
"learning_rate": 0.00015172319353372302,
"loss": 0.8441,
"step": 489
},
{
"epoch": 0.02401460480047049,
"grad_norm": 0.7284293174743652,
"learning_rate": 0.00015153954456256753,
"loss": 0.9822,
"step": 490
},
{
"epoch": 0.02406361419802247,
"grad_norm": 0.547886312007904,
"learning_rate": 0.00015135565858237482,
"loss": 0.8793,
"step": 491
},
{
"epoch": 0.02411262359557445,
"grad_norm": 0.5711491703987122,
"learning_rate": 0.00015117153643876038,
"loss": 0.7461,
"step": 492
},
{
"epoch": 0.02416163299312643,
"grad_norm": 0.5130283236503601,
"learning_rate": 0.00015098717897842585,
"loss": 0.8651,
"step": 493
},
{
"epoch": 0.024210642390678414,
"grad_norm": 0.7330240607261658,
"learning_rate": 0.00015080258704915477,
"loss": 1.1199,
"step": 494
},
{
"epoch": 0.024259651788230394,
"grad_norm": 0.6397481560707092,
"learning_rate": 0.00015061776149980914,
"loss": 0.8711,
"step": 495
},
{
"epoch": 0.024308661185782374,
"grad_norm": 0.5871744751930237,
"learning_rate": 0.00015043270318032512,
"loss": 1.0653,
"step": 496
},
{
"epoch": 0.024357670583334354,
"grad_norm": 0.560553789138794,
"learning_rate": 0.0001502474129417094,
"loss": 0.8683,
"step": 497
},
{
"epoch": 0.024406679980886335,
"grad_norm": 0.6320847272872925,
"learning_rate": 0.00015006189163603502,
"loss": 0.8956,
"step": 498
},
{
"epoch": 0.024455689378438315,
"grad_norm": 0.7588667869567871,
"learning_rate": 0.00014987614011643775,
"loss": 0.9022,
"step": 499
},
{
"epoch": 0.024504698775990295,
"grad_norm": 0.6102482676506042,
"learning_rate": 0.00014969015923711195,
"loss": 0.9273,
"step": 500
},
{
"epoch": 0.024553708173542278,
"grad_norm": 0.8161607384681702,
"learning_rate": 0.00014950394985330676,
"loss": 0.892,
"step": 501
},
{
"epoch": 0.02460271757109426,
"grad_norm": 0.6331537961959839,
"learning_rate": 0.000149317512821322,
"loss": 1.0033,
"step": 502
},
{
"epoch": 0.02465172696864624,
"grad_norm": 0.5715288519859314,
"learning_rate": 0.00014913084899850448,
"loss": 0.8952,
"step": 503
},
{
"epoch": 0.02470073636619822,
"grad_norm": 0.6018710136413574,
"learning_rate": 0.00014894395924324388,
"loss": 0.8296,
"step": 504
},
{
"epoch": 0.0247497457637502,
"grad_norm": 0.4526127576828003,
"learning_rate": 0.00014875684441496883,
"loss": 0.9113,
"step": 505
},
{
"epoch": 0.02479875516130218,
"grad_norm": 0.7399595975875854,
"learning_rate": 0.00014856950537414299,
"loss": 1.0297,
"step": 506
},
{
"epoch": 0.02484776455885416,
"grad_norm": 0.6361096501350403,
"learning_rate": 0.00014838194298226104,
"loss": 1.0017,
"step": 507
},
{
"epoch": 0.024896773956406142,
"grad_norm": 0.5069610476493835,
"learning_rate": 0.00014819415810184491,
"loss": 0.8191,
"step": 508
},
{
"epoch": 0.024945783353958122,
"grad_norm": 0.6111281514167786,
"learning_rate": 0.00014800615159643945,
"loss": 1.1032,
"step": 509
},
{
"epoch": 0.024994792751510102,
"grad_norm": 0.6328058242797852,
"learning_rate": 0.00014781792433060884,
"loss": 0.9169,
"step": 510
},
{
"epoch": 0.025043802149062083,
"grad_norm": 0.5302674174308777,
"learning_rate": 0.00014762947716993237,
"loss": 0.9313,
"step": 511
},
{
"epoch": 0.025092811546614063,
"grad_norm": 0.8632691502571106,
"learning_rate": 0.00014744081098100052,
"loss": 0.8959,
"step": 512
},
{
"epoch": 0.025141820944166043,
"grad_norm": 0.5266912579536438,
"learning_rate": 0.00014725192663141108,
"loss": 0.971,
"step": 513
},
{
"epoch": 0.025190830341718023,
"grad_norm": 0.6190224885940552,
"learning_rate": 0.00014706282498976495,
"loss": 1.0125,
"step": 514
},
{
"epoch": 0.025239839739270006,
"grad_norm": 0.56204754114151,
"learning_rate": 0.00014687350692566236,
"loss": 0.9021,
"step": 515
},
{
"epoch": 0.025288849136821986,
"grad_norm": 0.5350925326347351,
"learning_rate": 0.0001466839733096987,
"loss": 0.9583,
"step": 516
},
{
"epoch": 0.025337858534373967,
"grad_norm": 0.5631693005561829,
"learning_rate": 0.0001464942250134607,
"loss": 0.8102,
"step": 517
},
{
"epoch": 0.025386867931925947,
"grad_norm": 0.5800350308418274,
"learning_rate": 0.00014630426290952218,
"loss": 1.0154,
"step": 518
},
{
"epoch": 0.025435877329477927,
"grad_norm": 0.9159958362579346,
"learning_rate": 0.0001461140878714403,
"loss": 0.8732,
"step": 519
},
{
"epoch": 0.025484886727029907,
"grad_norm": 0.5498703122138977,
"learning_rate": 0.00014592370077375132,
"loss": 0.8598,
"step": 520
},
{
"epoch": 0.025533896124581887,
"grad_norm": 0.6629413962364197,
"learning_rate": 0.00014573310249196679,
"loss": 0.9835,
"step": 521
},
{
"epoch": 0.02558290552213387,
"grad_norm": 0.795328676700592,
"learning_rate": 0.00014554229390256924,
"loss": 0.9006,
"step": 522
},
{
"epoch": 0.02563191491968585,
"grad_norm": 0.5357116460800171,
"learning_rate": 0.00014535127588300847,
"loss": 1.0219,
"step": 523
},
{
"epoch": 0.02568092431723783,
"grad_norm": 0.6141706705093384,
"learning_rate": 0.00014516004931169728,
"loss": 0.9475,
"step": 524
},
{
"epoch": 0.02572993371478981,
"grad_norm": 0.6289846897125244,
"learning_rate": 0.00014496861506800758,
"loss": 1.2036,
"step": 525
},
{
"epoch": 0.02577894311234179,
"grad_norm": 0.639453113079071,
"learning_rate": 0.0001447769740322662,
"loss": 0.994,
"step": 526
},
{
"epoch": 0.02582795250989377,
"grad_norm": 0.6463608145713806,
"learning_rate": 0.00014458512708575094,
"loss": 1.0525,
"step": 527
},
{
"epoch": 0.025876961907445754,
"grad_norm": 1.537865400314331,
"learning_rate": 0.00014439307511068656,
"loss": 1.5053,
"step": 528
},
{
"epoch": 0.025925971304997734,
"grad_norm": 0.6124917268753052,
"learning_rate": 0.00014420081899024057,
"loss": 0.8965,
"step": 529
},
{
"epoch": 0.025974980702549715,
"grad_norm": 0.5339726209640503,
"learning_rate": 0.00014400835960851936,
"loss": 0.9666,
"step": 530
},
{
"epoch": 0.026023990100101695,
"grad_norm": 0.5528753399848938,
"learning_rate": 0.00014381569785056395,
"loss": 0.8453,
"step": 531
},
{
"epoch": 0.026072999497653675,
"grad_norm": 0.506305456161499,
"learning_rate": 0.00014362283460234604,
"loss": 0.9733,
"step": 532
},
{
"epoch": 0.026122008895205655,
"grad_norm": 0.6728483438491821,
"learning_rate": 0.0001434297707507639,
"loss": 1.0848,
"step": 533
},
{
"epoch": 0.026171018292757635,
"grad_norm": 0.7605665326118469,
"learning_rate": 0.0001432365071836383,
"loss": 0.8888,
"step": 534
},
{
"epoch": 0.02622002769030962,
"grad_norm": 0.5308957099914551,
"learning_rate": 0.00014304304478970838,
"loss": 0.9177,
"step": 535
},
{
"epoch": 0.0262690370878616,
"grad_norm": 0.6177141666412354,
"learning_rate": 0.00014284938445862768,
"loss": 0.7726,
"step": 536
},
{
"epoch": 0.02631804648541358,
"grad_norm": 1.5204870700836182,
"learning_rate": 0.00014265552708095987,
"loss": 0.9485,
"step": 537
},
{
"epoch": 0.02636705588296556,
"grad_norm": 0.7078920602798462,
"learning_rate": 0.00014246147354817485,
"loss": 1.0088,
"step": 538
},
{
"epoch": 0.02641606528051754,
"grad_norm": 0.6084794402122498,
"learning_rate": 0.00014226722475264449,
"loss": 0.9549,
"step": 539
},
{
"epoch": 0.02646507467806952,
"grad_norm": 0.6052922010421753,
"learning_rate": 0.0001420727815876386,
"loss": 1.0489,
"step": 540
},
{
"epoch": 0.0265140840756215,
"grad_norm": 0.6130591034889221,
"learning_rate": 0.00014187814494732087,
"loss": 1.0125,
"step": 541
},
{
"epoch": 0.026563093473173482,
"grad_norm": 0.5894243121147156,
"learning_rate": 0.00014168331572674464,
"loss": 0.9286,
"step": 542
},
{
"epoch": 0.026612102870725463,
"grad_norm": 0.5302923917770386,
"learning_rate": 0.00014148829482184887,
"loss": 1.0314,
"step": 543
},
{
"epoch": 0.026661112268277443,
"grad_norm": 0.7226726412773132,
"learning_rate": 0.000141293083129454,
"loss": 1.0828,
"step": 544
},
{
"epoch": 0.026710121665829423,
"grad_norm": 0.6821955442428589,
"learning_rate": 0.00014109768154725783,
"loss": 0.7577,
"step": 545
},
{
"epoch": 0.026759131063381403,
"grad_norm": 0.5746994018554688,
"learning_rate": 0.00014090209097383135,
"loss": 0.9804,
"step": 546
},
{
"epoch": 0.026808140460933383,
"grad_norm": 0.7999975085258484,
"learning_rate": 0.0001407063123086147,
"loss": 1.2291,
"step": 547
},
{
"epoch": 0.026857149858485363,
"grad_norm": 0.5952467918395996,
"learning_rate": 0.00014051034645191294,
"loss": 0.9799,
"step": 548
},
{
"epoch": 0.026906159256037347,
"grad_norm": 0.5163621306419373,
"learning_rate": 0.00014031419430489192,
"loss": 0.964,
"step": 549
},
{
"epoch": 0.026955168653589327,
"grad_norm": 0.5633589029312134,
"learning_rate": 0.00014011785676957422,
"loss": 1.1302,
"step": 550
},
{
"epoch": 0.027004178051141307,
"grad_norm": 0.9280727505683899,
"learning_rate": 0.0001399213347488349,
"loss": 1.1204,
"step": 551
},
{
"epoch": 0.027053187448693287,
"grad_norm": 0.47326645255088806,
"learning_rate": 0.00013972462914639745,
"loss": 0.9462,
"step": 552
},
{
"epoch": 0.027102196846245267,
"grad_norm": 0.646728515625,
"learning_rate": 0.00013952774086682944,
"loss": 1.098,
"step": 553
},
{
"epoch": 0.027151206243797247,
"grad_norm": 0.660001277923584,
"learning_rate": 0.00013933067081553868,
"loss": 0.905,
"step": 554
},
{
"epoch": 0.027200215641349227,
"grad_norm": 0.565215528011322,
"learning_rate": 0.00013913341989876875,
"loss": 0.8937,
"step": 555
},
{
"epoch": 0.02724922503890121,
"grad_norm": 0.5420377254486084,
"learning_rate": 0.00013893598902359498,
"loss": 0.8095,
"step": 556
},
{
"epoch": 0.02729823443645319,
"grad_norm": 0.5699804425239563,
"learning_rate": 0.0001387383790979203,
"loss": 0.8931,
"step": 557
},
{
"epoch": 0.02734724383400517,
"grad_norm": 0.6203681230545044,
"learning_rate": 0.00013854059103047094,
"loss": 0.9657,
"step": 558
},
{
"epoch": 0.02739625323155715,
"grad_norm": 0.5285091996192932,
"learning_rate": 0.00013834262573079241,
"loss": 0.9815,
"step": 559
},
{
"epoch": 0.02744526262910913,
"grad_norm": 0.5105606913566589,
"learning_rate": 0.0001381444841092452,
"loss": 0.7874,
"step": 560
},
{
"epoch": 0.02749427202666111,
"grad_norm": 0.5473595261573792,
"learning_rate": 0.00013794616707700057,
"loss": 0.8856,
"step": 561
},
{
"epoch": 0.027543281424213095,
"grad_norm": 0.5748807191848755,
"learning_rate": 0.00013774767554603659,
"loss": 0.9379,
"step": 562
},
{
"epoch": 0.027592290821765075,
"grad_norm": 0.6852862238883972,
"learning_rate": 0.00013754901042913357,
"loss": 0.9867,
"step": 563
},
{
"epoch": 0.027641300219317055,
"grad_norm": 0.5716254711151123,
"learning_rate": 0.0001373501726398702,
"loss": 1.0988,
"step": 564
},
{
"epoch": 0.027690309616869035,
"grad_norm": 0.5285595059394836,
"learning_rate": 0.0001371511630926192,
"loss": 0.8952,
"step": 565
},
{
"epoch": 0.027739319014421015,
"grad_norm": 0.6386389136314392,
"learning_rate": 0.00013695198270254316,
"loss": 0.9052,
"step": 566
},
{
"epoch": 0.027788328411972995,
"grad_norm": 0.6316874623298645,
"learning_rate": 0.0001367526323855902,
"loss": 0.8897,
"step": 567
},
{
"epoch": 0.027837337809524975,
"grad_norm": 0.5656607151031494,
"learning_rate": 0.00013655311305848996,
"loss": 1.0086,
"step": 568
},
{
"epoch": 0.02788634720707696,
"grad_norm": 0.5613694190979004,
"learning_rate": 0.00013635342563874926,
"loss": 1.0209,
"step": 569
},
{
"epoch": 0.02793535660462894,
"grad_norm": 0.53194659948349,
"learning_rate": 0.00013615357104464795,
"loss": 1.1905,
"step": 570
},
{
"epoch": 0.02798436600218092,
"grad_norm": 0.7656781077384949,
"learning_rate": 0.00013595355019523452,
"loss": 0.8432,
"step": 571
},
{
"epoch": 0.0280333753997329,
"grad_norm": 0.8043819665908813,
"learning_rate": 0.00013575336401032214,
"loss": 1.0537,
"step": 572
},
{
"epoch": 0.02808238479728488,
"grad_norm": 0.543787956237793,
"learning_rate": 0.00013555301341048424,
"loss": 1.0334,
"step": 573
},
{
"epoch": 0.02813139419483686,
"grad_norm": 0.5572656989097595,
"learning_rate": 0.00013535249931705029,
"loss": 1.0404,
"step": 574
},
{
"epoch": 0.02818040359238884,
"grad_norm": 0.6637741923332214,
"learning_rate": 0.00013515182265210165,
"loss": 0.9432,
"step": 575
},
{
"epoch": 0.028229412989940823,
"grad_norm": 0.5407254695892334,
"learning_rate": 0.00013495098433846724,
"loss": 0.8214,
"step": 576
},
{
"epoch": 0.028278422387492803,
"grad_norm": 0.7442420125007629,
"learning_rate": 0.00013474998529971934,
"loss": 0.8009,
"step": 577
},
{
"epoch": 0.028327431785044783,
"grad_norm": 0.8374704122543335,
"learning_rate": 0.00013454882646016938,
"loss": 0.8955,
"step": 578
},
{
"epoch": 0.028376441182596763,
"grad_norm": 0.54608553647995,
"learning_rate": 0.0001343475087448636,
"loss": 0.8587,
"step": 579
},
{
"epoch": 0.028425450580148743,
"grad_norm": 0.6886517405509949,
"learning_rate": 0.0001341460330795789,
"loss": 1.0729,
"step": 580
},
{
"epoch": 0.028474459977700723,
"grad_norm": 0.9504325985908508,
"learning_rate": 0.00013394440039081847,
"loss": 0.9301,
"step": 581
},
{
"epoch": 0.028523469375252703,
"grad_norm": 0.5252017974853516,
"learning_rate": 0.00013374261160580754,
"loss": 1.0221,
"step": 582
},
{
"epoch": 0.028572478772804687,
"grad_norm": 0.7967292070388794,
"learning_rate": 0.00013354066765248934,
"loss": 0.9371,
"step": 583
},
{
"epoch": 0.028621488170356667,
"grad_norm": 0.6831346750259399,
"learning_rate": 0.0001333385694595205,
"loss": 1.0537,
"step": 584
},
{
"epoch": 0.028670497567908647,
"grad_norm": 0.5146148204803467,
"learning_rate": 0.0001331363179562669,
"loss": 0.9417,
"step": 585
},
{
"epoch": 0.028719506965460627,
"grad_norm": 0.49967578053474426,
"learning_rate": 0.0001329339140727996,
"loss": 0.9316,
"step": 586
},
{
"epoch": 0.028768516363012607,
"grad_norm": 0.6498879790306091,
"learning_rate": 0.00013273135873989028,
"loss": 0.8361,
"step": 587
},
{
"epoch": 0.028817525760564587,
"grad_norm": 0.49114206433296204,
"learning_rate": 0.00013252865288900707,
"loss": 0.8651,
"step": 588
},
{
"epoch": 0.028866535158116567,
"grad_norm": 0.5896993279457092,
"learning_rate": 0.00013232579745231035,
"loss": 0.8044,
"step": 589
},
{
"epoch": 0.02891554455566855,
"grad_norm": 0.5868169069290161,
"learning_rate": 0.00013212279336264823,
"loss": 0.85,
"step": 590
},
{
"epoch": 0.02896455395322053,
"grad_norm": 0.5865799188613892,
"learning_rate": 0.00013191964155355264,
"loss": 1.1317,
"step": 591
},
{
"epoch": 0.02901356335077251,
"grad_norm": 0.5267806053161621,
"learning_rate": 0.00013171634295923456,
"loss": 1.0084,
"step": 592
},
{
"epoch": 0.02906257274832449,
"grad_norm": 0.5632867813110352,
"learning_rate": 0.00013151289851458015,
"loss": 0.9592,
"step": 593
},
{
"epoch": 0.02911158214587647,
"grad_norm": 0.5206688642501831,
"learning_rate": 0.00013130930915514623,
"loss": 0.9212,
"step": 594
},
{
"epoch": 0.02916059154342845,
"grad_norm": 0.6168157458305359,
"learning_rate": 0.000131105575817156,
"loss": 1.0057,
"step": 595
},
{
"epoch": 0.029209600940980435,
"grad_norm": 0.5929550528526306,
"learning_rate": 0.00013090169943749476,
"loss": 0.9311,
"step": 596
},
{
"epoch": 0.029258610338532415,
"grad_norm": 0.6218149065971375,
"learning_rate": 0.00013069768095370563,
"loss": 0.8691,
"step": 597
},
{
"epoch": 0.029307619736084395,
"grad_norm": 0.7326745390892029,
"learning_rate": 0.00013049352130398517,
"loss": 0.9513,
"step": 598
},
{
"epoch": 0.029356629133636375,
"grad_norm": 0.5618883371353149,
"learning_rate": 0.00013028922142717918,
"loss": 1.0182,
"step": 599
},
{
"epoch": 0.029405638531188355,
"grad_norm": 1.0137871503829956,
"learning_rate": 0.00013008478226277816,
"loss": 1.0928,
"step": 600
},
{
"epoch": 0.029454647928740335,
"grad_norm": 0.5762143135070801,
"learning_rate": 0.00012988020475091327,
"loss": 0.7268,
"step": 601
},
{
"epoch": 0.029503657326292315,
"grad_norm": 0.6828392148017883,
"learning_rate": 0.00012967548983235187,
"loss": 1.0076,
"step": 602
},
{
"epoch": 0.0295526667238443,
"grad_norm": 0.6265244483947754,
"learning_rate": 0.00012947063844849307,
"loss": 0.8805,
"step": 603
},
{
"epoch": 0.02960167612139628,
"grad_norm": 0.7893605828285217,
"learning_rate": 0.00012926565154136368,
"loss": 1.067,
"step": 604
},
{
"epoch": 0.02965068551894826,
"grad_norm": 0.6898416876792908,
"learning_rate": 0.00012906053005361365,
"loss": 1.187,
"step": 605
},
{
"epoch": 0.02969969491650024,
"grad_norm": 0.6458303332328796,
"learning_rate": 0.0001288552749285118,
"loss": 0.9442,
"step": 606
},
{
"epoch": 0.02974870431405222,
"grad_norm": 0.6307141184806824,
"learning_rate": 0.0001286498871099415,
"loss": 1.1363,
"step": 607
},
{
"epoch": 0.0297977137116042,
"grad_norm": 0.6049492955207825,
"learning_rate": 0.00012844436754239636,
"loss": 0.8748,
"step": 608
},
{
"epoch": 0.02984672310915618,
"grad_norm": 0.6229107975959778,
"learning_rate": 0.0001282387171709758,
"loss": 0.8046,
"step": 609
},
{
"epoch": 0.029895732506708163,
"grad_norm": 0.5819247364997864,
"learning_rate": 0.00012803293694138077,
"loss": 1.0562,
"step": 610
},
{
"epoch": 0.029944741904260143,
"grad_norm": 0.6565641164779663,
"learning_rate": 0.00012782702779990944,
"loss": 1.1314,
"step": 611
},
{
"epoch": 0.029993751301812123,
"grad_norm": 0.6735644340515137,
"learning_rate": 0.00012762099069345268,
"loss": 0.9843,
"step": 612
},
{
"epoch": 0.030042760699364103,
"grad_norm": 0.5780985355377197,
"learning_rate": 0.00012741482656948992,
"loss": 0.8717,
"step": 613
},
{
"epoch": 0.030091770096916083,
"grad_norm": 0.5610291957855225,
"learning_rate": 0.0001272085363760846,
"loss": 0.8648,
"step": 614
},
{
"epoch": 0.030140779494468063,
"grad_norm": 0.7192383408546448,
"learning_rate": 0.00012700212106188009,
"loss": 0.8972,
"step": 615
},
{
"epoch": 0.030189788892020043,
"grad_norm": 0.49262872338294983,
"learning_rate": 0.00012679558157609479,
"loss": 0.8287,
"step": 616
},
{
"epoch": 0.030238798289572027,
"grad_norm": 0.6038398146629333,
"learning_rate": 0.00012658891886851848,
"loss": 0.9688,
"step": 617
},
{
"epoch": 0.030287807687124007,
"grad_norm": 0.7166770100593567,
"learning_rate": 0.0001263821338895074,
"loss": 1.2706,
"step": 618
},
{
"epoch": 0.030336817084675987,
"grad_norm": 0.6760246753692627,
"learning_rate": 0.00012617522758998006,
"loss": 0.7879,
"step": 619
},
{
"epoch": 0.030385826482227967,
"grad_norm": 0.5924021005630493,
"learning_rate": 0.00012596820092141295,
"loss": 0.9037,
"step": 620
},
{
"epoch": 0.030434835879779947,
"grad_norm": 0.8994151949882507,
"learning_rate": 0.00012576105483583602,
"loss": 1.0,
"step": 621
},
{
"epoch": 0.030483845277331927,
"grad_norm": 0.6243171095848083,
"learning_rate": 0.00012555379028582838,
"loss": 1.1253,
"step": 622
},
{
"epoch": 0.030532854674883907,
"grad_norm": 0.5963743925094604,
"learning_rate": 0.00012534640822451398,
"loss": 0.9301,
"step": 623
},
{
"epoch": 0.03058186407243589,
"grad_norm": 0.8039215803146362,
"learning_rate": 0.00012513890960555706,
"loss": 0.9926,
"step": 624
},
{
"epoch": 0.03063087346998787,
"grad_norm": 0.5766738057136536,
"learning_rate": 0.00012493129538315788,
"loss": 0.8439,
"step": 625
},
{
"epoch": 0.03067988286753985,
"grad_norm": 0.4972213804721832,
"learning_rate": 0.0001247235665120484,
"loss": 0.7884,
"step": 626
},
{
"epoch": 0.03072889226509183,
"grad_norm": 0.6013949513435364,
"learning_rate": 0.00012451572394748766,
"loss": 1.0443,
"step": 627
},
{
"epoch": 0.03077790166264381,
"grad_norm": 0.6317854523658752,
"learning_rate": 0.0001243077686452577,
"loss": 0.8208,
"step": 628
},
{
"epoch": 0.03082691106019579,
"grad_norm": 0.5569573044776917,
"learning_rate": 0.00012409970156165878,
"loss": 0.8572,
"step": 629
},
{
"epoch": 0.030875920457747775,
"grad_norm": 0.5597726702690125,
"learning_rate": 0.0001238915236535054,
"loss": 1.1361,
"step": 630
},
{
"epoch": 0.030924929855299755,
"grad_norm": 0.5146288871765137,
"learning_rate": 0.00012368323587812162,
"loss": 0.9265,
"step": 631
},
{
"epoch": 0.030973939252851735,
"grad_norm": 0.6121995449066162,
"learning_rate": 0.00012347483919333664,
"loss": 0.9335,
"step": 632
},
{
"epoch": 0.031022948650403715,
"grad_norm": 0.5894641280174255,
"learning_rate": 0.00012326633455748065,
"loss": 0.8862,
"step": 633
},
{
"epoch": 0.031071958047955695,
"grad_norm": 0.6794971823692322,
"learning_rate": 0.00012305772292938016,
"loss": 1.0275,
"step": 634
},
{
"epoch": 0.031120967445507675,
"grad_norm": 0.5644852519035339,
"learning_rate": 0.0001228490052683537,
"loss": 0.9269,
"step": 635
},
{
"epoch": 0.031169976843059655,
"grad_norm": 0.6075661778450012,
"learning_rate": 0.00012264018253420748,
"loss": 1.1916,
"step": 636
},
{
"epoch": 0.03121898624061164,
"grad_norm": 0.7460759878158569,
"learning_rate": 0.00012243125568723077,
"loss": 1.1546,
"step": 637
},
{
"epoch": 0.03126799563816362,
"grad_norm": 0.6334986090660095,
"learning_rate": 0.00012222222568819172,
"loss": 0.9377,
"step": 638
},
{
"epoch": 0.0313170050357156,
"grad_norm": 0.6031709909439087,
"learning_rate": 0.00012201309349833279,
"loss": 0.9242,
"step": 639
},
{
"epoch": 0.03136601443326758,
"grad_norm": 0.6870954632759094,
"learning_rate": 0.00012180386007936637,
"loss": 1.0381,
"step": 640
},
{
"epoch": 0.03141502383081956,
"grad_norm": 0.5240617394447327,
"learning_rate": 0.0001215945263934704,
"loss": 1.0243,
"step": 641
},
{
"epoch": 0.03146403322837154,
"grad_norm": 0.5612547397613525,
"learning_rate": 0.00012138509340328381,
"loss": 0.7303,
"step": 642
},
{
"epoch": 0.03151304262592352,
"grad_norm": 0.7289676070213318,
"learning_rate": 0.0001211755620719023,
"loss": 0.8771,
"step": 643
},
{
"epoch": 0.0315620520234755,
"grad_norm": 0.5122924447059631,
"learning_rate": 0.00012096593336287376,
"loss": 0.8387,
"step": 644
},
{
"epoch": 0.03161106142102748,
"grad_norm": 0.6075708866119385,
"learning_rate": 0.00012075620824019384,
"loss": 0.9293,
"step": 645
},
{
"epoch": 0.03166007081857946,
"grad_norm": 0.6391776204109192,
"learning_rate": 0.00012054638766830162,
"loss": 1.0081,
"step": 646
},
{
"epoch": 0.03170908021613145,
"grad_norm": 0.6238724589347839,
"learning_rate": 0.00012033647261207505,
"loss": 0.9146,
"step": 647
},
{
"epoch": 0.03175808961368343,
"grad_norm": 0.9260525703430176,
"learning_rate": 0.00012012646403682663,
"loss": 0.7296,
"step": 648
},
{
"epoch": 0.03180709901123541,
"grad_norm": 0.6483685374259949,
"learning_rate": 0.00011991636290829893,
"loss": 1.0605,
"step": 649
},
{
"epoch": 0.03185610840878739,
"grad_norm": 0.6315485835075378,
"learning_rate": 0.00011970617019266,
"loss": 0.8309,
"step": 650
},
{
"epoch": 0.03190511780633937,
"grad_norm": 0.5987114310264587,
"learning_rate": 0.00011949588685649922,
"loss": 1.0255,
"step": 651
},
{
"epoch": 0.03195412720389135,
"grad_norm": 0.6101971864700317,
"learning_rate": 0.00011928551386682262,
"loss": 0.9697,
"step": 652
},
{
"epoch": 0.03200313660144333,
"grad_norm": 0.6032626032829285,
"learning_rate": 0.00011907505219104856,
"loss": 0.8307,
"step": 653
},
{
"epoch": 0.03205214599899531,
"grad_norm": 0.8691760301589966,
"learning_rate": 0.00011886450279700313,
"loss": 1.0574,
"step": 654
},
{
"epoch": 0.03210115539654729,
"grad_norm": 0.5532340407371521,
"learning_rate": 0.00011865386665291591,
"loss": 0.9853,
"step": 655
},
{
"epoch": 0.03215016479409927,
"grad_norm": 0.5070647597312927,
"learning_rate": 0.00011844314472741533,
"loss": 0.9257,
"step": 656
},
{
"epoch": 0.03219917419165125,
"grad_norm": 0.5597984790802002,
"learning_rate": 0.00011823233798952434,
"loss": 0.8883,
"step": 657
},
{
"epoch": 0.03224818358920323,
"grad_norm": 0.601868212223053,
"learning_rate": 0.00011802144740865589,
"loss": 1.0045,
"step": 658
},
{
"epoch": 0.03229719298675521,
"grad_norm": 0.5818256139755249,
"learning_rate": 0.00011781047395460847,
"loss": 0.9819,
"step": 659
},
{
"epoch": 0.03234620238430719,
"grad_norm": 0.5207995772361755,
"learning_rate": 0.00011759941859756172,
"loss": 0.9152,
"step": 660
},
{
"epoch": 0.032395211781859175,
"grad_norm": 0.5202385187149048,
"learning_rate": 0.00011738828230807184,
"loss": 0.9291,
"step": 661
},
{
"epoch": 0.032444221179411155,
"grad_norm": 0.5271541476249695,
"learning_rate": 0.00011717706605706735,
"loss": 1.1041,
"step": 662
},
{
"epoch": 0.032493230576963135,
"grad_norm": 0.5379035472869873,
"learning_rate": 0.00011696577081584426,
"loss": 1.0,
"step": 663
},
{
"epoch": 0.032542239974515115,
"grad_norm": 0.625950276851654,
"learning_rate": 0.00011675439755606203,
"loss": 0.9559,
"step": 664
},
{
"epoch": 0.032591249372067095,
"grad_norm": 0.7411119937896729,
"learning_rate": 0.0001165429472497388,
"loss": 0.8572,
"step": 665
},
{
"epoch": 0.032640258769619075,
"grad_norm": 0.734190046787262,
"learning_rate": 0.000116331420869247,
"loss": 0.8517,
"step": 666
},
{
"epoch": 0.032689268167171055,
"grad_norm": 0.6342369318008423,
"learning_rate": 0.00011611981938730892,
"loss": 0.9284,
"step": 667
},
{
"epoch": 0.032738277564723035,
"grad_norm": 0.6163157820701599,
"learning_rate": 0.00011590814377699224,
"loss": 0.9593,
"step": 668
},
{
"epoch": 0.032787286962275015,
"grad_norm": 0.7204130291938782,
"learning_rate": 0.00011569639501170545,
"loss": 0.8955,
"step": 669
},
{
"epoch": 0.032836296359826996,
"grad_norm": 0.8771257400512695,
"learning_rate": 0.00011548457406519356,
"loss": 1.0293,
"step": 670
},
{
"epoch": 0.032885305757378976,
"grad_norm": 0.6680341362953186,
"learning_rate": 0.00011527268191153337,
"loss": 0.9114,
"step": 671
},
{
"epoch": 0.032934315154930956,
"grad_norm": 0.5671672821044922,
"learning_rate": 0.0001150607195251293,
"loss": 0.9915,
"step": 672
},
{
"epoch": 0.032983324552482936,
"grad_norm": 0.5537201762199402,
"learning_rate": 0.00011484868788070855,
"loss": 1.1166,
"step": 673
},
{
"epoch": 0.033032333950034916,
"grad_norm": 0.7294965386390686,
"learning_rate": 0.00011463658795331695,
"loss": 0.8928,
"step": 674
},
{
"epoch": 0.0330813433475869,
"grad_norm": 0.5197209715843201,
"learning_rate": 0.00011442442071831434,
"loss": 0.8729,
"step": 675
},
{
"epoch": 0.03313035274513888,
"grad_norm": 0.6862895488739014,
"learning_rate": 0.00011421218715136996,
"loss": 1.2211,
"step": 676
},
{
"epoch": 0.03317936214269086,
"grad_norm": 0.7276074886322021,
"learning_rate": 0.00011399988822845822,
"loss": 0.925,
"step": 677
},
{
"epoch": 0.03322837154024284,
"grad_norm": 0.6285182237625122,
"learning_rate": 0.00011378752492585396,
"loss": 0.8794,
"step": 678
},
{
"epoch": 0.03327738093779482,
"grad_norm": 0.7362743020057678,
"learning_rate": 0.00011357509822012817,
"loss": 0.9594,
"step": 679
},
{
"epoch": 0.0333263903353468,
"grad_norm": 0.7037959098815918,
"learning_rate": 0.00011336260908814336,
"loss": 0.748,
"step": 680
},
{
"epoch": 0.03337539973289878,
"grad_norm": 0.6326875686645508,
"learning_rate": 0.00011315005850704907,
"loss": 1.0679,
"step": 681
},
{
"epoch": 0.03342440913045076,
"grad_norm": 0.5508519411087036,
"learning_rate": 0.0001129374474542775,
"loss": 1.0165,
"step": 682
},
{
"epoch": 0.033473418528002744,
"grad_norm": 0.8182931542396545,
"learning_rate": 0.00011272477690753893,
"loss": 1.0418,
"step": 683
},
{
"epoch": 0.033522427925554724,
"grad_norm": 0.5952259302139282,
"learning_rate": 0.00011251204784481712,
"loss": 0.8814,
"step": 684
},
{
"epoch": 0.033571437323106704,
"grad_norm": 0.8138315677642822,
"learning_rate": 0.00011229926124436505,
"loss": 0.9702,
"step": 685
},
{
"epoch": 0.033620446720658684,
"grad_norm": 0.6177354454994202,
"learning_rate": 0.00011208641808470024,
"loss": 1.0031,
"step": 686
},
{
"epoch": 0.033669456118210664,
"grad_norm": 0.696311891078949,
"learning_rate": 0.00011187351934460029,
"loss": 1.0397,
"step": 687
},
{
"epoch": 0.03371846551576265,
"grad_norm": 0.706696093082428,
"learning_rate": 0.0001116605660030984,
"loss": 1.0239,
"step": 688
},
{
"epoch": 0.03376747491331463,
"grad_norm": 0.6652610898017883,
"learning_rate": 0.00011144755903947886,
"loss": 0.878,
"step": 689
},
{
"epoch": 0.03381648431086661,
"grad_norm": 0.7847949862480164,
"learning_rate": 0.00011123449943327256,
"loss": 0.879,
"step": 690
},
{
"epoch": 0.03386549370841859,
"grad_norm": 0.6994475722312927,
"learning_rate": 0.00011102138816425244,
"loss": 1.0035,
"step": 691
},
{
"epoch": 0.03391450310597057,
"grad_norm": 0.6607378721237183,
"learning_rate": 0.00011080822621242905,
"loss": 0.9448,
"step": 692
},
{
"epoch": 0.03396351250352255,
"grad_norm": 0.7712897062301636,
"learning_rate": 0.00011059501455804602,
"loss": 0.9159,
"step": 693
},
{
"epoch": 0.03401252190107453,
"grad_norm": 0.749193012714386,
"learning_rate": 0.00011038175418157548,
"loss": 1.0213,
"step": 694
},
{
"epoch": 0.03406153129862651,
"grad_norm": 0.5379979610443115,
"learning_rate": 0.00011016844606371364,
"loss": 0.8125,
"step": 695
},
{
"epoch": 0.03411054069617849,
"grad_norm": 0.6513563990592957,
"learning_rate": 0.00010995509118537632,
"loss": 0.9148,
"step": 696
},
{
"epoch": 0.03415955009373047,
"grad_norm": 0.6969139575958252,
"learning_rate": 0.00010974169052769425,
"loss": 0.9821,
"step": 697
},
{
"epoch": 0.03420855949128245,
"grad_norm": 0.6268694400787354,
"learning_rate": 0.0001095282450720088,
"loss": 1.1762,
"step": 698
},
{
"epoch": 0.03425756888883443,
"grad_norm": 0.6678013205528259,
"learning_rate": 0.00010931475579986725,
"loss": 0.7744,
"step": 699
},
{
"epoch": 0.03430657828638641,
"grad_norm": 0.6000432968139648,
"learning_rate": 0.00010910122369301842,
"loss": 0.8299,
"step": 700
},
{
"epoch": 0.03435558768393839,
"grad_norm": 0.8176240921020508,
"learning_rate": 0.00010888764973340815,
"loss": 1.0136,
"step": 701
},
{
"epoch": 0.03440459708149038,
"grad_norm": 0.5953860282897949,
"learning_rate": 0.00010867403490317465,
"loss": 0.8912,
"step": 702
},
{
"epoch": 0.03445360647904236,
"grad_norm": 0.677713930606842,
"learning_rate": 0.00010846038018464413,
"loss": 0.9466,
"step": 703
},
{
"epoch": 0.03450261587659434,
"grad_norm": 0.6491566896438599,
"learning_rate": 0.00010824668656032624,
"loss": 0.752,
"step": 704
},
{
"epoch": 0.03455162527414632,
"grad_norm": 0.46719634532928467,
"learning_rate": 0.00010803295501290953,
"loss": 0.9382,
"step": 705
},
{
"epoch": 0.0346006346716983,
"grad_norm": 0.7482130527496338,
"learning_rate": 0.00010781918652525693,
"loss": 1.1858,
"step": 706
},
{
"epoch": 0.03464964406925028,
"grad_norm": 0.5864776968955994,
"learning_rate": 0.00010760538208040125,
"loss": 0.9708,
"step": 707
},
{
"epoch": 0.03469865346680226,
"grad_norm": 0.5870556831359863,
"learning_rate": 0.00010739154266154065,
"loss": 0.9644,
"step": 708
},
{
"epoch": 0.03474766286435424,
"grad_norm": 0.7998099327087402,
"learning_rate": 0.00010717766925203418,
"loss": 0.8556,
"step": 709
},
{
"epoch": 0.03479667226190622,
"grad_norm": 0.6456443667411804,
"learning_rate": 0.00010696376283539704,
"loss": 0.9475,
"step": 710
},
{
"epoch": 0.0348456816594582,
"grad_norm": 0.7642804384231567,
"learning_rate": 0.00010674982439529642,
"loss": 0.818,
"step": 711
},
{
"epoch": 0.03489469105701018,
"grad_norm": 0.763278067111969,
"learning_rate": 0.00010653585491554664,
"loss": 0.9955,
"step": 712
},
{
"epoch": 0.03494370045456216,
"grad_norm": 0.5111615061759949,
"learning_rate": 0.00010632185538010477,
"loss": 0.8955,
"step": 713
},
{
"epoch": 0.03499270985211414,
"grad_norm": 0.8736951947212219,
"learning_rate": 0.00010610782677306614,
"loss": 1.1456,
"step": 714
},
{
"epoch": 0.03504171924966612,
"grad_norm": 0.5861103534698486,
"learning_rate": 0.00010589377007865973,
"loss": 0.8175,
"step": 715
},
{
"epoch": 0.03509072864721811,
"grad_norm": 0.5677356123924255,
"learning_rate": 0.00010567968628124367,
"loss": 1.0074,
"step": 716
},
{
"epoch": 0.03513973804477009,
"grad_norm": 0.7105158567428589,
"learning_rate": 0.00010546557636530086,
"loss": 0.8538,
"step": 717
},
{
"epoch": 0.03518874744232207,
"grad_norm": 0.6045786738395691,
"learning_rate": 0.00010525144131543405,
"loss": 0.9659,
"step": 718
},
{
"epoch": 0.03523775683987405,
"grad_norm": 0.6696767807006836,
"learning_rate": 0.00010503728211636185,
"loss": 0.9815,
"step": 719
},
{
"epoch": 0.03528676623742603,
"grad_norm": 0.7084499001502991,
"learning_rate": 0.00010482309975291373,
"loss": 1.1448,
"step": 720
},
{
"epoch": 0.03533577563497801,
"grad_norm": 0.6141902804374695,
"learning_rate": 0.00010460889521002572,
"loss": 0.9899,
"step": 721
},
{
"epoch": 0.03538478503252999,
"grad_norm": 0.6070237755775452,
"learning_rate": 0.00010439466947273595,
"loss": 0.9804,
"step": 722
},
{
"epoch": 0.03543379443008197,
"grad_norm": 0.5481370091438293,
"learning_rate": 0.00010418042352617982,
"loss": 0.8835,
"step": 723
},
{
"epoch": 0.03548280382763395,
"grad_norm": 0.679847776889801,
"learning_rate": 0.0001039661583555859,
"loss": 1.1018,
"step": 724
},
{
"epoch": 0.03553181322518593,
"grad_norm": 0.5827143788337708,
"learning_rate": 0.00010375187494627098,
"loss": 0.7655,
"step": 725
},
{
"epoch": 0.03558082262273791,
"grad_norm": 0.6592413187026978,
"learning_rate": 0.00010353757428363579,
"loss": 0.9435,
"step": 726
},
{
"epoch": 0.03562983202028989,
"grad_norm": 0.5104247331619263,
"learning_rate": 0.0001033232573531604,
"loss": 0.741,
"step": 727
},
{
"epoch": 0.03567884141784187,
"grad_norm": 0.6800944805145264,
"learning_rate": 0.00010310892514039967,
"loss": 1.0278,
"step": 728
},
{
"epoch": 0.035727850815393855,
"grad_norm": 0.6092975735664368,
"learning_rate": 0.00010289457863097875,
"loss": 0.9647,
"step": 729
},
{
"epoch": 0.035776860212945835,
"grad_norm": 0.7586796879768372,
"learning_rate": 0.00010268021881058858,
"loss": 1.0619,
"step": 730
},
{
"epoch": 0.035825869610497815,
"grad_norm": 0.7803993821144104,
"learning_rate": 0.0001024658466649812,
"loss": 0.9005,
"step": 731
},
{
"epoch": 0.035874879008049795,
"grad_norm": 0.6602010726928711,
"learning_rate": 0.00010225146317996546,
"loss": 0.8085,
"step": 732
},
{
"epoch": 0.035923888405601775,
"grad_norm": 0.8171247839927673,
"learning_rate": 0.00010203706934140225,
"loss": 0.9024,
"step": 733
},
{
"epoch": 0.035972897803153756,
"grad_norm": 0.6471207737922668,
"learning_rate": 0.00010182266613520013,
"loss": 0.9098,
"step": 734
},
{
"epoch": 0.036021907200705736,
"grad_norm": 0.8064071536064148,
"learning_rate": 0.00010160825454731071,
"loss": 0.8098,
"step": 735
},
{
"epoch": 0.036070916598257716,
"grad_norm": 0.5769966840744019,
"learning_rate": 0.00010139383556372418,
"loss": 0.8571,
"step": 736
},
{
"epoch": 0.036119925995809696,
"grad_norm": 0.6435970664024353,
"learning_rate": 0.00010117941017046467,
"loss": 0.8363,
"step": 737
},
{
"epoch": 0.036168935393361676,
"grad_norm": 0.9202609062194824,
"learning_rate": 0.00010096497935358584,
"loss": 1.1239,
"step": 738
},
{
"epoch": 0.036168935393361676,
"eval_loss": NaN,
"eval_runtime": 183.5809,
"eval_samples_per_second": 46.802,
"eval_steps_per_second": 23.401,
"step": 738
},
{
"epoch": 0.036217944790913656,
"grad_norm": 0.7503764629364014,
"learning_rate": 0.00010075054409916631,
"loss": 1.0747,
"step": 739
},
{
"epoch": 0.036266954188465636,
"grad_norm": 0.8134047389030457,
"learning_rate": 0.00010053610539330507,
"loss": 0.9621,
"step": 740
},
{
"epoch": 0.036315963586017616,
"grad_norm": 0.5960626006126404,
"learning_rate": 0.00010032166422211697,
"loss": 0.9175,
"step": 741
},
{
"epoch": 0.036364972983569596,
"grad_norm": 0.6606927514076233,
"learning_rate": 0.00010010722157172818,
"loss": 1.087,
"step": 742
},
{
"epoch": 0.03641398238112158,
"grad_norm": 1.0997828245162964,
"learning_rate": 9.989277842827183e-05,
"loss": 1.2575,
"step": 743
},
{
"epoch": 0.03646299177867356,
"grad_norm": 0.6103606224060059,
"learning_rate": 9.967833577788308e-05,
"loss": 0.8763,
"step": 744
},
{
"epoch": 0.03651200117622554,
"grad_norm": 0.9485415816307068,
"learning_rate": 9.946389460669496e-05,
"loss": 1.1258,
"step": 745
},
{
"epoch": 0.03656101057377752,
"grad_norm": 0.5692161917686462,
"learning_rate": 9.924945590083371e-05,
"loss": 0.853,
"step": 746
},
{
"epoch": 0.036610019971329504,
"grad_norm": 0.5405739545822144,
"learning_rate": 9.90350206464142e-05,
"loss": 0.8583,
"step": 747
},
{
"epoch": 0.036659029368881484,
"grad_norm": 0.546116054058075,
"learning_rate": 9.882058982953536e-05,
"loss": 0.9951,
"step": 748
},
{
"epoch": 0.036708038766433464,
"grad_norm": 0.5827860832214355,
"learning_rate": 9.860616443627586e-05,
"loss": 1.0165,
"step": 749
},
{
"epoch": 0.036757048163985444,
"grad_norm": 0.5827939510345459,
"learning_rate": 9.839174545268931e-05,
"loss": 0.7871,
"step": 750
},
{
"epoch": 0.036806057561537424,
"grad_norm": 0.5125460028648376,
"learning_rate": 9.817733386479987e-05,
"loss": 1.0009,
"step": 751
},
{
"epoch": 0.036855066959089404,
"grad_norm": 0.6548879742622375,
"learning_rate": 9.796293065859776e-05,
"loss": 1.0134,
"step": 752
},
{
"epoch": 0.036904076356641384,
"grad_norm": 0.5788992643356323,
"learning_rate": 9.774853682003456e-05,
"loss": 1.0251,
"step": 753
},
{
"epoch": 0.036953085754193364,
"grad_norm": 0.5059471130371094,
"learning_rate": 9.75341533350188e-05,
"loss": 0.9197,
"step": 754
},
{
"epoch": 0.037002095151745344,
"grad_norm": 1.1150619983673096,
"learning_rate": 9.731978118941142e-05,
"loss": 0.9863,
"step": 755
},
{
"epoch": 0.03705110454929733,
"grad_norm": 0.8003139495849609,
"learning_rate": 9.710542136902127e-05,
"loss": 0.8551,
"step": 756
},
{
"epoch": 0.03710011394684931,
"grad_norm": 0.6236535310745239,
"learning_rate": 9.689107485960038e-05,
"loss": 0.8808,
"step": 757
},
{
"epoch": 0.03714912334440129,
"grad_norm": 0.7307648658752441,
"learning_rate": 9.667674264683962e-05,
"loss": 0.7793,
"step": 758
},
{
"epoch": 0.03719813274195327,
"grad_norm": 0.6053159236907959,
"learning_rate": 9.646242571636423e-05,
"loss": 0.8144,
"step": 759
},
{
"epoch": 0.03724714213950525,
"grad_norm": 0.731212854385376,
"learning_rate": 9.624812505372907e-05,
"loss": 1.0786,
"step": 760
},
{
"epoch": 0.03729615153705723,
"grad_norm": 0.8967050313949585,
"learning_rate": 9.60338416444141e-05,
"loss": 1.0071,
"step": 761
},
{
"epoch": 0.03734516093460921,
"grad_norm": 0.9089602828025818,
"learning_rate": 9.581957647382019e-05,
"loss": 1.0854,
"step": 762
},
{
"epoch": 0.03739417033216119,
"grad_norm": 0.6406798958778381,
"learning_rate": 9.56053305272641e-05,
"loss": 0.8949,
"step": 763
},
{
"epoch": 0.03744317972971317,
"grad_norm": 0.5702521204948425,
"learning_rate": 9.539110478997429e-05,
"loss": 0.9991,
"step": 764
},
{
"epoch": 0.03749218912726515,
"grad_norm": 0.6696991324424744,
"learning_rate": 9.517690024708628e-05,
"loss": 0.7774,
"step": 765
},
{
"epoch": 0.03754119852481713,
"grad_norm": 0.7377912998199463,
"learning_rate": 9.496271788363819e-05,
"loss": 1.0607,
"step": 766
},
{
"epoch": 0.03759020792236911,
"grad_norm": 0.651839554309845,
"learning_rate": 9.474855868456593e-05,
"loss": 0.8064,
"step": 767
},
{
"epoch": 0.03763921731992109,
"grad_norm": 0.6405967473983765,
"learning_rate": 9.453442363469917e-05,
"loss": 0.8916,
"step": 768
},
{
"epoch": 0.03768822671747307,
"grad_norm": 0.6190482974052429,
"learning_rate": 9.432031371875634e-05,
"loss": 0.8863,
"step": 769
},
{
"epoch": 0.03773723611502506,
"grad_norm": 0.7289479970932007,
"learning_rate": 9.410622992134032e-05,
"loss": 0.9322,
"step": 770
},
{
"epoch": 0.03778624551257704,
"grad_norm": 0.5301181077957153,
"learning_rate": 9.389217322693388e-05,
"loss": 0.8915,
"step": 771
},
{
"epoch": 0.03783525491012902,
"grad_norm": 0.5292813181877136,
"learning_rate": 9.367814461989526e-05,
"loss": 0.9377,
"step": 772
},
{
"epoch": 0.037884264307681,
"grad_norm": 0.5793006420135498,
"learning_rate": 9.346414508445341e-05,
"loss": 0.8897,
"step": 773
},
{
"epoch": 0.03793327370523298,
"grad_norm": 0.6730450987815857,
"learning_rate": 9.325017560470358e-05,
"loss": 0.8906,
"step": 774
},
{
"epoch": 0.03798228310278496,
"grad_norm": 0.7565414905548096,
"learning_rate": 9.303623716460297e-05,
"loss": 1.0982,
"step": 775
},
{
"epoch": 0.03803129250033694,
"grad_norm": 0.5824629664421082,
"learning_rate": 9.282233074796587e-05,
"loss": 0.7489,
"step": 776
},
{
"epoch": 0.03808030189788892,
"grad_norm": 0.6062259078025818,
"learning_rate": 9.260845733845934e-05,
"loss": 1.003,
"step": 777
},
{
"epoch": 0.0381293112954409,
"grad_norm": 0.6178932785987854,
"learning_rate": 9.239461791959876e-05,
"loss": 1.0034,
"step": 778
},
{
"epoch": 0.03817832069299288,
"grad_norm": 0.4914613962173462,
"learning_rate": 9.218081347474311e-05,
"loss": 0.9453,
"step": 779
},
{
"epoch": 0.03822733009054486,
"grad_norm": 0.5196013450622559,
"learning_rate": 9.196704498709049e-05,
"loss": 0.8815,
"step": 780
},
{
"epoch": 0.03827633948809684,
"grad_norm": 0.636272132396698,
"learning_rate": 9.175331343967377e-05,
"loss": 0.9161,
"step": 781
},
{
"epoch": 0.03832534888564882,
"grad_norm": 0.5790011286735535,
"learning_rate": 9.153961981535589e-05,
"loss": 0.882,
"step": 782
},
{
"epoch": 0.0383743582832008,
"grad_norm": 0.5262641310691833,
"learning_rate": 9.13259650968254e-05,
"loss": 0.8483,
"step": 783
},
{
"epoch": 0.03842336768075279,
"grad_norm": 0.9476785063743591,
"learning_rate": 9.111235026659187e-05,
"loss": 0.9465,
"step": 784
},
{
"epoch": 0.03847237707830477,
"grad_norm": 0.7593225836753845,
"learning_rate": 9.089877630698158e-05,
"loss": 0.9721,
"step": 785
},
{
"epoch": 0.03852138647585675,
"grad_norm": 0.7612167000770569,
"learning_rate": 9.068524420013277e-05,
"loss": 0.9404,
"step": 786
},
{
"epoch": 0.03857039587340873,
"grad_norm": 0.6220802664756775,
"learning_rate": 9.047175492799121e-05,
"loss": 0.8821,
"step": 787
},
{
"epoch": 0.03861940527096071,
"grad_norm": 0.5717998743057251,
"learning_rate": 9.025830947230578e-05,
"loss": 0.8925,
"step": 788
},
{
"epoch": 0.03866841466851269,
"grad_norm": 0.5665514469146729,
"learning_rate": 9.00449088146237e-05,
"loss": 0.902,
"step": 789
},
{
"epoch": 0.03871742406606467,
"grad_norm": 0.658800482749939,
"learning_rate": 8.983155393628635e-05,
"loss": 0.9597,
"step": 790
},
{
"epoch": 0.03876643346361665,
"grad_norm": 0.5453858375549316,
"learning_rate": 8.961824581842454e-05,
"loss": 0.8837,
"step": 791
},
{
"epoch": 0.03881544286116863,
"grad_norm": 0.6304888725280762,
"learning_rate": 8.940498544195402e-05,
"loss": 0.9236,
"step": 792
},
{
"epoch": 0.03886445225872061,
"grad_norm": 0.6249805688858032,
"learning_rate": 8.919177378757096e-05,
"loss": 0.8296,
"step": 793
},
{
"epoch": 0.03891346165627259,
"grad_norm": 0.5646822452545166,
"learning_rate": 8.897861183574758e-05,
"loss": 0.6453,
"step": 794
},
{
"epoch": 0.03896247105382457,
"grad_norm": 0.8026358485221863,
"learning_rate": 8.876550056672747e-05,
"loss": 0.925,
"step": 795
},
{
"epoch": 0.03901148045137655,
"grad_norm": 0.5161921381950378,
"learning_rate": 8.855244096052114e-05,
"loss": 0.9219,
"step": 796
},
{
"epoch": 0.039060489848928535,
"grad_norm": 0.6423723697662354,
"learning_rate": 8.833943399690163e-05,
"loss": 0.856,
"step": 797
},
{
"epoch": 0.039109499246480516,
"grad_norm": 0.5908942818641663,
"learning_rate": 8.812648065539973e-05,
"loss": 0.8297,
"step": 798
},
{
"epoch": 0.039158508644032496,
"grad_norm": 0.7309346199035645,
"learning_rate": 8.791358191529978e-05,
"loss": 1.057,
"step": 799
},
{
"epoch": 0.039207518041584476,
"grad_norm": 0.7637178301811218,
"learning_rate": 8.770073875563493e-05,
"loss": 1.0936,
"step": 800
},
{
"epoch": 0.039256527439136456,
"grad_norm": 0.6103758215904236,
"learning_rate": 8.74879521551829e-05,
"loss": 0.8872,
"step": 801
},
{
"epoch": 0.039305536836688436,
"grad_norm": 0.666135311126709,
"learning_rate": 8.727522309246111e-05,
"loss": 0.7733,
"step": 802
},
{
"epoch": 0.039354546234240416,
"grad_norm": 0.6191517114639282,
"learning_rate": 8.70625525457225e-05,
"loss": 0.9633,
"step": 803
},
{
"epoch": 0.039403555631792396,
"grad_norm": 0.6226015090942383,
"learning_rate": 8.684994149295094e-05,
"loss": 0.8724,
"step": 804
},
{
"epoch": 0.039452565029344376,
"grad_norm": 0.5788629055023193,
"learning_rate": 8.663739091185668e-05,
"loss": 0.9352,
"step": 805
},
{
"epoch": 0.039501574426896356,
"grad_norm": 0.9742415547370911,
"learning_rate": 8.642490177987183e-05,
"loss": 0.8849,
"step": 806
},
{
"epoch": 0.039550583824448336,
"grad_norm": 0.5254642367362976,
"learning_rate": 8.621247507414605e-05,
"loss": 0.7238,
"step": 807
},
{
"epoch": 0.039599593222000316,
"grad_norm": 0.7346206307411194,
"learning_rate": 8.600011177154181e-05,
"loss": 0.871,
"step": 808
},
{
"epoch": 0.039648602619552296,
"grad_norm": 0.6629201173782349,
"learning_rate": 8.578781284863005e-05,
"loss": 0.8112,
"step": 809
},
{
"epoch": 0.039697612017104277,
"grad_norm": 0.5637260675430298,
"learning_rate": 8.557557928168568e-05,
"loss": 0.9456,
"step": 810
},
{
"epoch": 0.039746621414656264,
"grad_norm": 0.8434960842132568,
"learning_rate": 8.536341204668307e-05,
"loss": 1.0455,
"step": 811
},
{
"epoch": 0.039795630812208244,
"grad_norm": 0.6098150014877319,
"learning_rate": 8.515131211929151e-05,
"loss": 0.9403,
"step": 812
},
{
"epoch": 0.039844640209760224,
"grad_norm": 0.6109006404876709,
"learning_rate": 8.493928047487074e-05,
"loss": 0.9758,
"step": 813
},
{
"epoch": 0.039893649607312204,
"grad_norm": 0.8072890043258667,
"learning_rate": 8.472731808846664e-05,
"loss": 1.007,
"step": 814
},
{
"epoch": 0.039942659004864184,
"grad_norm": 0.9491591453552246,
"learning_rate": 8.45154259348065e-05,
"loss": 0.9375,
"step": 815
},
{
"epoch": 0.039991668402416164,
"grad_norm": 0.5851994752883911,
"learning_rate": 8.430360498829456e-05,
"loss": 0.9222,
"step": 816
},
{
"epoch": 0.040040677799968144,
"grad_norm": 0.5684947967529297,
"learning_rate": 8.40918562230078e-05,
"loss": 0.9068,
"step": 817
},
{
"epoch": 0.040089687197520124,
"grad_norm": 0.540804386138916,
"learning_rate": 8.388018061269112e-05,
"loss": 0.8686,
"step": 818
},
{
"epoch": 0.040138696595072104,
"grad_norm": 0.5974246859550476,
"learning_rate": 8.366857913075301e-05,
"loss": 0.7919,
"step": 819
},
{
"epoch": 0.040187705992624084,
"grad_norm": 0.6588082909584045,
"learning_rate": 8.345705275026123e-05,
"loss": 0.9414,
"step": 820
},
{
"epoch": 0.040236715390176064,
"grad_norm": 0.6050723195075989,
"learning_rate": 8.324560244393799e-05,
"loss": 1.031,
"step": 821
},
{
"epoch": 0.040285724787728044,
"grad_norm": 0.9468902945518494,
"learning_rate": 8.303422918415575e-05,
"loss": 1.0062,
"step": 822
},
{
"epoch": 0.040334734185280025,
"grad_norm": 0.5976113677024841,
"learning_rate": 8.282293394293268e-05,
"loss": 0.9076,
"step": 823
},
{
"epoch": 0.040383743582832005,
"grad_norm": 0.651478111743927,
"learning_rate": 8.261171769192818e-05,
"loss": 1.1146,
"step": 824
},
{
"epoch": 0.04043275298038399,
"grad_norm": 0.5034336447715759,
"learning_rate": 8.240058140243834e-05,
"loss": 0.7507,
"step": 825
},
{
"epoch": 0.04048176237793597,
"grad_norm": 0.6404789090156555,
"learning_rate": 8.218952604539157e-05,
"loss": 1.0417,
"step": 826
},
{
"epoch": 0.04053077177548795,
"grad_norm": 0.5468174815177917,
"learning_rate": 8.197855259134415e-05,
"loss": 0.9414,
"step": 827
},
{
"epoch": 0.04057978117303993,
"grad_norm": 0.7501478791236877,
"learning_rate": 8.176766201047573e-05,
"loss": 0.8987,
"step": 828
},
{
"epoch": 0.04062879057059191,
"grad_norm": 0.8284885287284851,
"learning_rate": 8.155685527258469e-05,
"loss": 0.7266,
"step": 829
},
{
"epoch": 0.04067779996814389,
"grad_norm": 0.6814586520195007,
"learning_rate": 8.134613334708412e-05,
"loss": 0.9165,
"step": 830
},
{
"epoch": 0.04072680936569587,
"grad_norm": 0.9068307280540466,
"learning_rate": 8.113549720299689e-05,
"loss": 0.9455,
"step": 831
},
{
"epoch": 0.04077581876324785,
"grad_norm": 0.8567232489585876,
"learning_rate": 8.092494780895144e-05,
"loss": 0.9211,
"step": 832
},
{
"epoch": 0.04082482816079983,
"grad_norm": 0.5463480949401855,
"learning_rate": 8.071448613317739e-05,
"loss": 0.7966,
"step": 833
},
{
"epoch": 0.04087383755835181,
"grad_norm": 0.6157633662223816,
"learning_rate": 8.05041131435008e-05,
"loss": 0.8836,
"step": 834
},
{
"epoch": 0.04092284695590379,
"grad_norm": 0.81013423204422,
"learning_rate": 8.029382980734e-05,
"loss": 1.1235,
"step": 835
},
{
"epoch": 0.04097185635345577,
"grad_norm": 0.693514883518219,
"learning_rate": 8.00836370917011e-05,
"loss": 0.7711,
"step": 836
},
{
"epoch": 0.04102086575100775,
"grad_norm": 0.908645510673523,
"learning_rate": 7.987353596317339e-05,
"loss": 1.1738,
"step": 837
},
{
"epoch": 0.04106987514855974,
"grad_norm": 0.6220524311065674,
"learning_rate": 7.966352738792497e-05,
"loss": 1.0084,
"step": 838
},
{
"epoch": 0.04111888454611172,
"grad_norm": 0.5720627903938293,
"learning_rate": 7.945361233169841e-05,
"loss": 0.8428,
"step": 839
},
{
"epoch": 0.0411678939436637,
"grad_norm": 0.5578500628471375,
"learning_rate": 7.924379175980618e-05,
"loss": 0.9334,
"step": 840
},
{
"epoch": 0.04121690334121568,
"grad_norm": 0.6064552664756775,
"learning_rate": 7.90340666371263e-05,
"loss": 0.8455,
"step": 841
},
{
"epoch": 0.04126591273876766,
"grad_norm": 0.6311472654342651,
"learning_rate": 7.882443792809772e-05,
"loss": 0.9325,
"step": 842
},
{
"epoch": 0.04131492213631964,
"grad_norm": 0.5778111815452576,
"learning_rate": 7.861490659671621e-05,
"loss": 0.9305,
"step": 843
},
{
"epoch": 0.04136393153387162,
"grad_norm": 0.6144542098045349,
"learning_rate": 7.840547360652964e-05,
"loss": 0.7443,
"step": 844
},
{
"epoch": 0.0414129409314236,
"grad_norm": 0.6736694574356079,
"learning_rate": 7.819613992063361e-05,
"loss": 0.9912,
"step": 845
},
{
"epoch": 0.04146195032897558,
"grad_norm": 0.6833612322807312,
"learning_rate": 7.798690650166722e-05,
"loss": 0.8983,
"step": 846
},
{
"epoch": 0.04151095972652756,
"grad_norm": 0.6620497703552246,
"learning_rate": 7.77777743118083e-05,
"loss": 0.9986,
"step": 847
},
{
"epoch": 0.04155996912407954,
"grad_norm": 0.6063772439956665,
"learning_rate": 7.756874431276924e-05,
"loss": 1.0581,
"step": 848
},
{
"epoch": 0.04160897852163152,
"grad_norm": 0.6608020663261414,
"learning_rate": 7.735981746579254e-05,
"loss": 0.8809,
"step": 849
},
{
"epoch": 0.0416579879191835,
"grad_norm": 0.6642486453056335,
"learning_rate": 7.715099473164632e-05,
"loss": 0.9361,
"step": 850
},
{
"epoch": 0.04170699731673548,
"grad_norm": 0.6425474882125854,
"learning_rate": 7.694227707061989e-05,
"loss": 1.0101,
"step": 851
},
{
"epoch": 0.04175600671428747,
"grad_norm": 0.5888500809669495,
"learning_rate": 7.673366544251936e-05,
"loss": 0.9401,
"step": 852
},
{
"epoch": 0.04180501611183945,
"grad_norm": 0.7488614916801453,
"learning_rate": 7.652516080666337e-05,
"loss": 0.921,
"step": 853
},
{
"epoch": 0.04185402550939143,
"grad_norm": 0.6826761960983276,
"learning_rate": 7.631676412187845e-05,
"loss": 0.9312,
"step": 854
},
{
"epoch": 0.04190303490694341,
"grad_norm": 0.6342119574546814,
"learning_rate": 7.610847634649458e-05,
"loss": 0.9297,
"step": 855
},
{
"epoch": 0.04195204430449539,
"grad_norm": 0.6165019869804382,
"learning_rate": 7.590029843834123e-05,
"loss": 0.8475,
"step": 856
},
{
"epoch": 0.04200105370204737,
"grad_norm": 0.6887275576591492,
"learning_rate": 7.569223135474235e-05,
"loss": 0.8233,
"step": 857
},
{
"epoch": 0.04205006309959935,
"grad_norm": 0.5744852423667908,
"learning_rate": 7.548427605251234e-05,
"loss": 0.8901,
"step": 858
},
{
"epoch": 0.04209907249715133,
"grad_norm": 0.9375432133674622,
"learning_rate": 7.527643348795162e-05,
"loss": 1.059,
"step": 859
},
{
"epoch": 0.04214808189470331,
"grad_norm": 0.6605034470558167,
"learning_rate": 7.506870461684215e-05,
"loss": 0.8993,
"step": 860
},
{
"epoch": 0.04219709129225529,
"grad_norm": 0.6732087135314941,
"learning_rate": 7.486109039444296e-05,
"loss": 0.7159,
"step": 861
},
{
"epoch": 0.04224610068980727,
"grad_norm": 0.6579399704933167,
"learning_rate": 7.465359177548605e-05,
"loss": 0.8402,
"step": 862
},
{
"epoch": 0.04229511008735925,
"grad_norm": 0.756745457649231,
"learning_rate": 7.444620971417163e-05,
"loss": 1.1157,
"step": 863
},
{
"epoch": 0.04234411948491123,
"grad_norm": 0.6364356279373169,
"learning_rate": 7.423894516416403e-05,
"loss": 0.8478,
"step": 864
},
{
"epoch": 0.042393128882463216,
"grad_norm": 0.5769418478012085,
"learning_rate": 7.403179907858708e-05,
"loss": 0.8928,
"step": 865
},
{
"epoch": 0.042442138280015196,
"grad_norm": 0.5486810207366943,
"learning_rate": 7.382477241001995e-05,
"loss": 0.9149,
"step": 866
},
{
"epoch": 0.042491147677567176,
"grad_norm": 0.7786839604377747,
"learning_rate": 7.361786611049265e-05,
"loss": 0.9168,
"step": 867
},
{
"epoch": 0.042540157075119156,
"grad_norm": 0.7111212611198425,
"learning_rate": 7.341108113148152e-05,
"loss": 1.0262,
"step": 868
},
{
"epoch": 0.042589166472671136,
"grad_norm": 0.6772000193595886,
"learning_rate": 7.320441842390522e-05,
"loss": 0.6625,
"step": 869
},
{
"epoch": 0.042638175870223116,
"grad_norm": 0.5871595740318298,
"learning_rate": 7.299787893811998e-05,
"loss": 0.883,
"step": 870
},
{
"epoch": 0.042687185267775096,
"grad_norm": 0.7698076963424683,
"learning_rate": 7.279146362391538e-05,
"loss": 1.1053,
"step": 871
},
{
"epoch": 0.042736194665327076,
"grad_norm": 0.9533458352088928,
"learning_rate": 7.25851734305101e-05,
"loss": 1.1109,
"step": 872
},
{
"epoch": 0.042785204062879056,
"grad_norm": 0.5938100218772888,
"learning_rate": 7.237900930654735e-05,
"loss": 0.9771,
"step": 873
},
{
"epoch": 0.042834213460431037,
"grad_norm": 0.6919374465942383,
"learning_rate": 7.217297220009059e-05,
"loss": 0.8795,
"step": 874
},
{
"epoch": 0.04288322285798302,
"grad_norm": 1.0210683345794678,
"learning_rate": 7.196706305861925e-05,
"loss": 1.0654,
"step": 875
},
{
"epoch": 0.042932232255535,
"grad_norm": 0.6737746000289917,
"learning_rate": 7.176128282902423e-05,
"loss": 0.8689,
"step": 876
},
{
"epoch": 0.04298124165308698,
"grad_norm": 0.6844781041145325,
"learning_rate": 7.15556324576037e-05,
"loss": 0.848,
"step": 877
},
{
"epoch": 0.04303025105063896,
"grad_norm": 0.7788298726081848,
"learning_rate": 7.135011289005853e-05,
"loss": 0.9539,
"step": 878
},
{
"epoch": 0.043079260448190944,
"grad_norm": 0.6003298759460449,
"learning_rate": 7.114472507148824e-05,
"loss": 0.947,
"step": 879
},
{
"epoch": 0.043128269845742924,
"grad_norm": 0.6445959806442261,
"learning_rate": 7.093946994638638e-05,
"loss": 1.0555,
"step": 880
},
{
"epoch": 0.043177279243294904,
"grad_norm": 0.6316936612129211,
"learning_rate": 7.073434845863631e-05,
"loss": 0.8573,
"step": 881
},
{
"epoch": 0.043226288640846884,
"grad_norm": 0.7546505331993103,
"learning_rate": 7.052936155150694e-05,
"loss": 0.94,
"step": 882
},
{
"epoch": 0.043275298038398864,
"grad_norm": 0.6299151182174683,
"learning_rate": 7.032451016764817e-05,
"loss": 1.0281,
"step": 883
},
{
"epoch": 0.043324307435950844,
"grad_norm": 0.546668291091919,
"learning_rate": 7.011979524908674e-05,
"loss": 0.7062,
"step": 884
},
{
"epoch": 0.043373316833502824,
"grad_norm": 0.6820911765098572,
"learning_rate": 6.991521773722186e-05,
"loss": 0.7952,
"step": 885
},
{
"epoch": 0.043422326231054804,
"grad_norm": 0.5912885665893555,
"learning_rate": 6.971077857282087e-05,
"loss": 0.927,
"step": 886
},
{
"epoch": 0.043471335628606785,
"grad_norm": 0.5003869533538818,
"learning_rate": 6.950647869601484e-05,
"loss": 0.9082,
"step": 887
},
{
"epoch": 0.043520345026158765,
"grad_norm": 0.654216468334198,
"learning_rate": 6.93023190462944e-05,
"loss": 0.8691,
"step": 888
},
{
"epoch": 0.043569354423710745,
"grad_norm": 0.564075767993927,
"learning_rate": 6.909830056250527e-05,
"loss": 0.9184,
"step": 889
},
{
"epoch": 0.043618363821262725,
"grad_norm": 0.7271436452865601,
"learning_rate": 6.889442418284402e-05,
"loss": 0.9718,
"step": 890
},
{
"epoch": 0.043667373218814705,
"grad_norm": 0.628280520439148,
"learning_rate": 6.86906908448538e-05,
"loss": 0.8827,
"step": 891
},
{
"epoch": 0.043716382616366685,
"grad_norm": 0.5660605430603027,
"learning_rate": 6.848710148541988e-05,
"loss": 0.9325,
"step": 892
},
{
"epoch": 0.04376539201391867,
"grad_norm": 0.6978943943977356,
"learning_rate": 6.828365704076547e-05,
"loss": 0.8103,
"step": 893
},
{
"epoch": 0.04381440141147065,
"grad_norm": 0.7359114289283752,
"learning_rate": 6.808035844644738e-05,
"loss": 0.8615,
"step": 894
},
{
"epoch": 0.04386341080902263,
"grad_norm": 0.6953577399253845,
"learning_rate": 6.787720663735178e-05,
"loss": 0.9831,
"step": 895
},
{
"epoch": 0.04391242020657461,
"grad_norm": 0.8604477047920227,
"learning_rate": 6.76742025476897e-05,
"loss": 0.9752,
"step": 896
},
{
"epoch": 0.04396142960412659,
"grad_norm": 0.6907943487167358,
"learning_rate": 6.747134711099292e-05,
"loss": 0.9169,
"step": 897
},
{
"epoch": 0.04401043900167857,
"grad_norm": 0.644757091999054,
"learning_rate": 6.726864126010973e-05,
"loss": 0.8078,
"step": 898
},
{
"epoch": 0.04405944839923055,
"grad_norm": 0.566016435623169,
"learning_rate": 6.706608592720043e-05,
"loss": 0.8091,
"step": 899
},
{
"epoch": 0.04410845779678253,
"grad_norm": 0.7891855835914612,
"learning_rate": 6.68636820437331e-05,
"loss": 1.0295,
"step": 900
},
{
"epoch": 0.04415746719433451,
"grad_norm": 0.8049860596656799,
"learning_rate": 6.666143054047955e-05,
"loss": 0.9562,
"step": 901
},
{
"epoch": 0.04420647659188649,
"grad_norm": 0.7799884080886841,
"learning_rate": 6.64593323475107e-05,
"loss": 0.927,
"step": 902
},
{
"epoch": 0.04425548598943847,
"grad_norm": 1.0052690505981445,
"learning_rate": 6.625738839419244e-05,
"loss": 1.2332,
"step": 903
},
{
"epoch": 0.04430449538699045,
"grad_norm": 0.5868604183197021,
"learning_rate": 6.605559960918155e-05,
"loss": 0.8657,
"step": 904
},
{
"epoch": 0.04435350478454243,
"grad_norm": 0.8376001119613647,
"learning_rate": 6.585396692042113e-05,
"loss": 1.0565,
"step": 905
},
{
"epoch": 0.04440251418209442,
"grad_norm": 0.7444510459899902,
"learning_rate": 6.565249125513641e-05,
"loss": 0.8771,
"step": 906
},
{
"epoch": 0.0444515235796464,
"grad_norm": 1.0465072393417358,
"learning_rate": 6.545117353983064e-05,
"loss": 1.1834,
"step": 907
},
{
"epoch": 0.04450053297719838,
"grad_norm": 0.5080894231796265,
"learning_rate": 6.525001470028068e-05,
"loss": 0.785,
"step": 908
},
{
"epoch": 0.04454954237475036,
"grad_norm": 0.7592306137084961,
"learning_rate": 6.504901566153281e-05,
"loss": 0.9745,
"step": 909
},
{
"epoch": 0.04459855177230234,
"grad_norm": 0.6754018068313599,
"learning_rate": 6.484817734789838e-05,
"loss": 0.8665,
"step": 910
},
{
"epoch": 0.04464756116985432,
"grad_norm": 0.7137823700904846,
"learning_rate": 6.464750068294974e-05,
"loss": 0.8713,
"step": 911
},
{
"epoch": 0.0446965705674063,
"grad_norm": 0.6034334897994995,
"learning_rate": 6.44469865895158e-05,
"loss": 0.865,
"step": 912
},
{
"epoch": 0.04474557996495828,
"grad_norm": 0.9829195141792297,
"learning_rate": 6.424663598967785e-05,
"loss": 1.0658,
"step": 913
},
{
"epoch": 0.04479458936251026,
"grad_norm": 1.1569057703018188,
"learning_rate": 6.404644980476551e-05,
"loss": 0.6931,
"step": 914
},
{
"epoch": 0.04484359876006224,
"grad_norm": 0.913396954536438,
"learning_rate": 6.384642895535209e-05,
"loss": 1.1898,
"step": 915
},
{
"epoch": 0.04489260815761422,
"grad_norm": 0.5226083993911743,
"learning_rate": 6.364657436125073e-05,
"loss": 0.8841,
"step": 916
},
{
"epoch": 0.0449416175551662,
"grad_norm": 0.7840003371238708,
"learning_rate": 6.344688694151004e-05,
"loss": 0.6906,
"step": 917
},
{
"epoch": 0.04499062695271818,
"grad_norm": 0.7827737331390381,
"learning_rate": 6.324736761440983e-05,
"loss": 1.0184,
"step": 918
},
{
"epoch": 0.04503963635027016,
"grad_norm": 0.6751218438148499,
"learning_rate": 6.304801729745688e-05,
"loss": 0.9126,
"step": 919
},
{
"epoch": 0.04508864574782215,
"grad_norm": 0.6796963214874268,
"learning_rate": 6.28488369073808e-05,
"loss": 0.8393,
"step": 920
},
{
"epoch": 0.04513765514537413,
"grad_norm": 0.9316746592521667,
"learning_rate": 6.26498273601298e-05,
"loss": 1.1761,
"step": 921
},
{
"epoch": 0.04518666454292611,
"grad_norm": 0.704622209072113,
"learning_rate": 6.245098957086648e-05,
"loss": 1.0137,
"step": 922
},
{
"epoch": 0.04523567394047809,
"grad_norm": 0.6289026737213135,
"learning_rate": 6.225232445396345e-05,
"loss": 0.9149,
"step": 923
},
{
"epoch": 0.04528468333803007,
"grad_norm": 0.6918268203735352,
"learning_rate": 6.205383292299942e-05,
"loss": 1.0257,
"step": 924
},
{
"epoch": 0.04533369273558205,
"grad_norm": 0.5847110152244568,
"learning_rate": 6.185551589075482e-05,
"loss": 0.9003,
"step": 925
},
{
"epoch": 0.04538270213313403,
"grad_norm": 0.6945834159851074,
"learning_rate": 6.165737426920757e-05,
"loss": 0.7409,
"step": 926
},
{
"epoch": 0.04543171153068601,
"grad_norm": 0.6071839928627014,
"learning_rate": 6.145940896952907e-05,
"loss": 0.9796,
"step": 927
},
{
"epoch": 0.04548072092823799,
"grad_norm": 0.6884227395057678,
"learning_rate": 6.126162090207972e-05,
"loss": 0.9285,
"step": 928
},
{
"epoch": 0.04552973032578997,
"grad_norm": 0.7866727709770203,
"learning_rate": 6.106401097640502e-05,
"loss": 0.951,
"step": 929
},
{
"epoch": 0.04557873972334195,
"grad_norm": 0.7089359164237976,
"learning_rate": 6.0866580101231255e-05,
"loss": 1.0993,
"step": 930
},
{
"epoch": 0.04562774912089393,
"grad_norm": 0.7076542377471924,
"learning_rate": 6.066932918446135e-05,
"loss": 0.9934,
"step": 931
},
{
"epoch": 0.04567675851844591,
"grad_norm": 0.5357589721679688,
"learning_rate": 6.047225913317058e-05,
"loss": 0.8851,
"step": 932
},
{
"epoch": 0.04572576791599789,
"grad_norm": 0.5877269506454468,
"learning_rate": 6.0275370853602596e-05,
"loss": 0.9545,
"step": 933
},
{
"epoch": 0.045774777313549876,
"grad_norm": 0.6116016507148743,
"learning_rate": 6.007866525116511e-05,
"loss": 0.9497,
"step": 934
},
{
"epoch": 0.045823786711101856,
"grad_norm": 0.6139143705368042,
"learning_rate": 5.988214323042581e-05,
"loss": 0.9749,
"step": 935
},
{
"epoch": 0.045872796108653836,
"grad_norm": 0.5860751867294312,
"learning_rate": 5.9685805695108085e-05,
"loss": 0.9524,
"step": 936
},
{
"epoch": 0.045921805506205816,
"grad_norm": 0.817271888256073,
"learning_rate": 5.948965354808709e-05,
"loss": 0.9185,
"step": 937
},
{
"epoch": 0.045970814903757796,
"grad_norm": 0.6361889243125916,
"learning_rate": 5.929368769138531e-05,
"loss": 0.8063,
"step": 938
},
{
"epoch": 0.04601982430130978,
"grad_norm": 0.5513597726821899,
"learning_rate": 5.909790902616864e-05,
"loss": 0.974,
"step": 939
},
{
"epoch": 0.04606883369886176,
"grad_norm": 0.6770530343055725,
"learning_rate": 5.890231845274219e-05,
"loss": 0.8411,
"step": 940
},
{
"epoch": 0.04611784309641374,
"grad_norm": 0.7206839919090271,
"learning_rate": 5.870691687054602e-05,
"loss": 0.7419,
"step": 941
},
{
"epoch": 0.04616685249396572,
"grad_norm": 0.5544096231460571,
"learning_rate": 5.8511705178151145e-05,
"loss": 0.9463,
"step": 942
},
{
"epoch": 0.0462158618915177,
"grad_norm": 0.8671022653579712,
"learning_rate": 5.83166842732554e-05,
"loss": 0.8734,
"step": 943
},
{
"epoch": 0.04626487128906968,
"grad_norm": 0.7392458915710449,
"learning_rate": 5.812185505267914e-05,
"loss": 0.9899,
"step": 944
},
{
"epoch": 0.04631388068662166,
"grad_norm": 0.6004726886749268,
"learning_rate": 5.792721841236143e-05,
"loss": 0.9051,
"step": 945
},
{
"epoch": 0.04636289008417364,
"grad_norm": 0.7594625949859619,
"learning_rate": 5.773277524735553e-05,
"loss": 1.053,
"step": 946
},
{
"epoch": 0.046411899481725624,
"grad_norm": 0.7624562382698059,
"learning_rate": 5.753852645182518e-05,
"loss": 0.8534,
"step": 947
},
{
"epoch": 0.046460908879277604,
"grad_norm": 0.7300339341163635,
"learning_rate": 5.7344472919040136e-05,
"loss": 0.8207,
"step": 948
},
{
"epoch": 0.046509918276829584,
"grad_norm": 0.7849738597869873,
"learning_rate": 5.715061554137232e-05,
"loss": 1.1812,
"step": 949
},
{
"epoch": 0.046558927674381564,
"grad_norm": 1.06199049949646,
"learning_rate": 5.695695521029163e-05,
"loss": 1.0975,
"step": 950
},
{
"epoch": 0.046607937071933545,
"grad_norm": 0.7558932304382324,
"learning_rate": 5.676349281636175e-05,
"loss": 0.9038,
"step": 951
},
{
"epoch": 0.046656946469485525,
"grad_norm": 0.6437797546386719,
"learning_rate": 5.6570229249236126e-05,
"loss": 0.8132,
"step": 952
},
{
"epoch": 0.046705955867037505,
"grad_norm": 1.0840975046157837,
"learning_rate": 5.637716539765397e-05,
"loss": 1.0105,
"step": 953
},
{
"epoch": 0.046754965264589485,
"grad_norm": 0.7432597279548645,
"learning_rate": 5.618430214943608e-05,
"loss": 0.8406,
"step": 954
},
{
"epoch": 0.046803974662141465,
"grad_norm": 0.5832216143608093,
"learning_rate": 5.599164039148066e-05,
"loss": 0.8905,
"step": 955
},
{
"epoch": 0.046852984059693445,
"grad_norm": 0.8845385909080505,
"learning_rate": 5.5799181009759474e-05,
"loss": 0.9021,
"step": 956
},
{
"epoch": 0.046901993457245425,
"grad_norm": 0.6067280173301697,
"learning_rate": 5.5606924889313474e-05,
"loss": 0.8903,
"step": 957
},
{
"epoch": 0.046951002854797405,
"grad_norm": 0.9422512054443359,
"learning_rate": 5.541487291424909e-05,
"loss": 1.0439,
"step": 958
},
{
"epoch": 0.047000012252349385,
"grad_norm": 0.6058634519577026,
"learning_rate": 5.522302596773383e-05,
"loss": 0.9344,
"step": 959
},
{
"epoch": 0.047049021649901365,
"grad_norm": 0.5414987206459045,
"learning_rate": 5.503138493199247e-05,
"loss": 0.938,
"step": 960
},
{
"epoch": 0.04709803104745335,
"grad_norm": 0.648684561252594,
"learning_rate": 5.483995068830272e-05,
"loss": 0.9395,
"step": 961
},
{
"epoch": 0.04714704044500533,
"grad_norm": 0.6390448212623596,
"learning_rate": 5.4648724116991536e-05,
"loss": 0.8967,
"step": 962
},
{
"epoch": 0.04719604984255731,
"grad_norm": 0.496037095785141,
"learning_rate": 5.445770609743078e-05,
"loss": 0.8173,
"step": 963
},
{
"epoch": 0.04724505924010929,
"grad_norm": 0.583728551864624,
"learning_rate": 5.4266897508033264e-05,
"loss": 0.869,
"step": 964
},
{
"epoch": 0.04729406863766127,
"grad_norm": 0.6022070050239563,
"learning_rate": 5.407629922624866e-05,
"loss": 0.9478,
"step": 965
},
{
"epoch": 0.04734307803521325,
"grad_norm": 0.6118979454040527,
"learning_rate": 5.3885912128559725e-05,
"loss": 0.9297,
"step": 966
},
{
"epoch": 0.04739208743276523,
"grad_norm": 0.6469177007675171,
"learning_rate": 5.369573709047786e-05,
"loss": 0.8545,
"step": 967
},
{
"epoch": 0.04744109683031721,
"grad_norm": 0.9258518815040588,
"learning_rate": 5.350577498653935e-05,
"loss": 0.8147,
"step": 968
},
{
"epoch": 0.04749010622786919,
"grad_norm": 0.5451820492744446,
"learning_rate": 5.3316026690301305e-05,
"loss": 0.8505,
"step": 969
},
{
"epoch": 0.04753911562542117,
"grad_norm": 0.5648760199546814,
"learning_rate": 5.312649307433768e-05,
"loss": 0.811,
"step": 970
},
{
"epoch": 0.04758812502297315,
"grad_norm": 0.7802280783653259,
"learning_rate": 5.2937175010235096e-05,
"loss": 0.7369,
"step": 971
},
{
"epoch": 0.04763713442052513,
"grad_norm": 0.5710716843605042,
"learning_rate": 5.2748073368588945e-05,
"loss": 0.9393,
"step": 972
},
{
"epoch": 0.04768614381807711,
"grad_norm": 0.6245925426483154,
"learning_rate": 5.25591890189995e-05,
"loss": 0.9471,
"step": 973
},
{
"epoch": 0.0477351532156291,
"grad_norm": 0.8834426999092102,
"learning_rate": 5.2370522830067646e-05,
"loss": 1.0204,
"step": 974
},
{
"epoch": 0.04778416261318108,
"grad_norm": 0.6848143339157104,
"learning_rate": 5.218207566939116e-05,
"loss": 0.9117,
"step": 975
},
{
"epoch": 0.04783317201073306,
"grad_norm": 0.5751230120658875,
"learning_rate": 5.1993848403560575e-05,
"loss": 0.868,
"step": 976
},
{
"epoch": 0.04788218140828504,
"grad_norm": 0.6816519498825073,
"learning_rate": 5.180584189815515e-05,
"loss": 0.9373,
"step": 977
},
{
"epoch": 0.04793119080583702,
"grad_norm": 0.543278157711029,
"learning_rate": 5.161805701773894e-05,
"loss": 0.8105,
"step": 978
},
{
"epoch": 0.047980200203389,
"grad_norm": 0.7940229177474976,
"learning_rate": 5.143049462585705e-05,
"loss": 0.8127,
"step": 979
},
{
"epoch": 0.04802920960094098,
"grad_norm": 0.6375604867935181,
"learning_rate": 5.124315558503121e-05,
"loss": 0.9169,
"step": 980
},
{
"epoch": 0.04807821899849296,
"grad_norm": 0.6112627983093262,
"learning_rate": 5.105604075675614e-05,
"loss": 0.9554,
"step": 981
},
{
"epoch": 0.04812722839604494,
"grad_norm": 0.8625818490982056,
"learning_rate": 5.0869151001495504e-05,
"loss": 0.9638,
"step": 982
},
{
"epoch": 0.04817623779359692,
"grad_norm": 0.6942315101623535,
"learning_rate": 5.068248717867801e-05,
"loss": 0.9234,
"step": 983
},
{
"epoch": 0.0482252471911489,
"grad_norm": 0.6004964709281921,
"learning_rate": 5.0496050146693254e-05,
"loss": 1.0006,
"step": 984
},
{
"epoch": 0.04827425658870088,
"grad_norm": 0.6091214418411255,
"learning_rate": 5.030984076288805e-05,
"loss": 0.9975,
"step": 985
},
{
"epoch": 0.04832326598625286,
"grad_norm": 0.657734751701355,
"learning_rate": 5.0123859883562243e-05,
"loss": 1.0277,
"step": 986
},
{
"epoch": 0.04837227538380484,
"grad_norm": 0.6832321286201477,
"learning_rate": 4.9938108363965006e-05,
"loss": 0.9766,
"step": 987
},
{
"epoch": 0.04842128478135683,
"grad_norm": 0.6531272530555725,
"learning_rate": 4.9752587058290625e-05,
"loss": 0.8578,
"step": 988
},
{
"epoch": 0.04847029417890881,
"grad_norm": 0.7007570266723633,
"learning_rate": 4.956729681967489e-05,
"loss": 0.9515,
"step": 989
},
{
"epoch": 0.04851930357646079,
"grad_norm": 1.039275050163269,
"learning_rate": 4.938223850019087e-05,
"loss": 0.9176,
"step": 990
},
{
"epoch": 0.04856831297401277,
"grad_norm": 0.802375853061676,
"learning_rate": 4.9197412950845214e-05,
"loss": 1.0036,
"step": 991
},
{
"epoch": 0.04861732237156475,
"grad_norm": 0.5418473482131958,
"learning_rate": 4.9012821021574183e-05,
"loss": 0.8192,
"step": 992
},
{
"epoch": 0.04866633176911673,
"grad_norm": 0.7108655571937561,
"learning_rate": 4.882846356123965e-05,
"loss": 0.831,
"step": 993
},
{
"epoch": 0.04871534116666871,
"grad_norm": 0.676276683807373,
"learning_rate": 4.864434141762521e-05,
"loss": 0.9073,
"step": 994
},
{
"epoch": 0.04876435056422069,
"grad_norm": 0.6457884311676025,
"learning_rate": 4.846045543743247e-05,
"loss": 0.8802,
"step": 995
},
{
"epoch": 0.04881335996177267,
"grad_norm": 0.7171979546546936,
"learning_rate": 4.827680646627699e-05,
"loss": 1.1391,
"step": 996
},
{
"epoch": 0.04886236935932465,
"grad_norm": 0.8592037558555603,
"learning_rate": 4.809339534868432e-05,
"loss": 1.0055,
"step": 997
},
{
"epoch": 0.04891137875687663,
"grad_norm": 0.8035635948181152,
"learning_rate": 4.791022292808636e-05,
"loss": 0.8176,
"step": 998
},
{
"epoch": 0.04896038815442861,
"grad_norm": 0.7246670722961426,
"learning_rate": 4.77272900468172e-05,
"loss": 0.899,
"step": 999
},
{
"epoch": 0.04900939755198059,
"grad_norm": 0.8624154329299927,
"learning_rate": 4.7544597546109514e-05,
"loss": 0.9579,
"step": 1000
},
{
"epoch": 0.04905840694953257,
"grad_norm": 0.6191447377204895,
"learning_rate": 4.7362146266090465e-05,
"loss": 0.9171,
"step": 1001
},
{
"epoch": 0.049107416347084556,
"grad_norm": 0.664317786693573,
"learning_rate": 4.717993704577806e-05,
"loss": 0.671,
"step": 1002
},
{
"epoch": 0.04915642574463654,
"grad_norm": 0.5835380554199219,
"learning_rate": 4.6997970723077e-05,
"loss": 0.838,
"step": 1003
},
{
"epoch": 0.04920543514218852,
"grad_norm": 0.7254071831703186,
"learning_rate": 4.681624813477515e-05,
"loss": 0.9072,
"step": 1004
},
{
"epoch": 0.0492544445397405,
"grad_norm": 0.7444500923156738,
"learning_rate": 4.663477011653955e-05,
"loss": 0.932,
"step": 1005
},
{
"epoch": 0.04930345393729248,
"grad_norm": 0.6171773076057434,
"learning_rate": 4.645353750291245e-05,
"loss": 0.9917,
"step": 1006
},
{
"epoch": 0.04935246333484446,
"grad_norm": 0.6146338582038879,
"learning_rate": 4.627255112730761e-05,
"loss": 1.0638,
"step": 1007
},
{
"epoch": 0.04940147273239644,
"grad_norm": 0.5829159617424011,
"learning_rate": 4.6091811822006507e-05,
"loss": 0.7155,
"step": 1008
},
{
"epoch": 0.04945048212994842,
"grad_norm": 0.9201460480690002,
"learning_rate": 4.591132041815445e-05,
"loss": 0.9896,
"step": 1009
},
{
"epoch": 0.0494994915275004,
"grad_norm": 0.6689648032188416,
"learning_rate": 4.5731077745756644e-05,
"loss": 1.0704,
"step": 1010
},
{
"epoch": 0.04954850092505238,
"grad_norm": 0.6462419629096985,
"learning_rate": 4.555108463367463e-05,
"loss": 0.9133,
"step": 1011
},
{
"epoch": 0.04959751032260436,
"grad_norm": 0.7375591993331909,
"learning_rate": 4.537134190962216e-05,
"loss": 0.9788,
"step": 1012
},
{
"epoch": 0.04964651972015634,
"grad_norm": 0.6392641067504883,
"learning_rate": 4.5191850400161715e-05,
"loss": 0.8552,
"step": 1013
},
{
"epoch": 0.04969552911770832,
"grad_norm": 0.7200862169265747,
"learning_rate": 4.5012610930700406e-05,
"loss": 0.9416,
"step": 1014
},
{
"epoch": 0.049744538515260305,
"grad_norm": 0.6843019723892212,
"learning_rate": 4.4833624325486446e-05,
"loss": 0.7569,
"step": 1015
},
{
"epoch": 0.049793547912812285,
"grad_norm": 0.7355234026908875,
"learning_rate": 4.4654891407605096e-05,
"loss": 0.778,
"step": 1016
},
{
"epoch": 0.049842557310364265,
"grad_norm": 0.8321521878242493,
"learning_rate": 4.4476412998975106e-05,
"loss": 0.7831,
"step": 1017
},
{
"epoch": 0.049891566707916245,
"grad_norm": 0.8588862419128418,
"learning_rate": 4.429818992034487e-05,
"loss": 0.9741,
"step": 1018
},
{
"epoch": 0.049940576105468225,
"grad_norm": 0.7462893128395081,
"learning_rate": 4.412022299128853e-05,
"loss": 0.8281,
"step": 1019
},
{
"epoch": 0.049989585503020205,
"grad_norm": 0.6284858584403992,
"learning_rate": 4.3942513030202305e-05,
"loss": 0.905,
"step": 1020
},
{
"epoch": 0.050038594900572185,
"grad_norm": 0.7673025131225586,
"learning_rate": 4.376506085430081e-05,
"loss": 0.9064,
"step": 1021
},
{
"epoch": 0.050087604298124165,
"grad_norm": 0.718449056148529,
"learning_rate": 4.3587867279613206e-05,
"loss": 0.9743,
"step": 1022
},
{
"epoch": 0.050136613695676145,
"grad_norm": 0.7094829678535461,
"learning_rate": 4.341093312097932e-05,
"loss": 0.996,
"step": 1023
},
{
"epoch": 0.050185623093228125,
"grad_norm": 0.8285147547721863,
"learning_rate": 4.3234259192046244e-05,
"loss": 1.1247,
"step": 1024
},
{
"epoch": 0.050234632490780105,
"grad_norm": 0.8108729124069214,
"learning_rate": 4.305784630526416e-05,
"loss": 0.8781,
"step": 1025
},
{
"epoch": 0.050283641888332085,
"grad_norm": 0.6478090882301331,
"learning_rate": 4.288169527188301e-05,
"loss": 0.7416,
"step": 1026
},
{
"epoch": 0.050332651285884066,
"grad_norm": 0.7296517491340637,
"learning_rate": 4.270580690194844e-05,
"loss": 1.0396,
"step": 1027
},
{
"epoch": 0.050381660683436046,
"grad_norm": 0.7231200933456421,
"learning_rate": 4.253018200429834e-05,
"loss": 0.9782,
"step": 1028
},
{
"epoch": 0.05043067008098803,
"grad_norm": 0.9415457844734192,
"learning_rate": 4.2354821386558855e-05,
"loss": 1.1343,
"step": 1029
},
{
"epoch": 0.05047967947854001,
"grad_norm": 0.776109516620636,
"learning_rate": 4.217972585514095e-05,
"loss": 0.8988,
"step": 1030
},
{
"epoch": 0.05052868887609199,
"grad_norm": 0.6780912280082703,
"learning_rate": 4.2004896215236544e-05,
"loss": 0.9418,
"step": 1031
},
{
"epoch": 0.05057769827364397,
"grad_norm": 0.6523477435112,
"learning_rate": 4.183033327081476e-05,
"loss": 0.7962,
"step": 1032
},
{
"epoch": 0.05062670767119595,
"grad_norm": 0.7628219723701477,
"learning_rate": 4.1656037824618325e-05,
"loss": 1.0017,
"step": 1033
},
{
"epoch": 0.05067571706874793,
"grad_norm": 0.8618097305297852,
"learning_rate": 4.148201067815989e-05,
"loss": 0.8404,
"step": 1034
},
{
"epoch": 0.05072472646629991,
"grad_norm": 0.6040647029876709,
"learning_rate": 4.1308252631718325e-05,
"loss": 0.9687,
"step": 1035
},
{
"epoch": 0.05077373586385189,
"grad_norm": 0.7941866517066956,
"learning_rate": 4.113476448433491e-05,
"loss": 1.0444,
"step": 1036
},
{
"epoch": 0.05082274526140387,
"grad_norm": 0.6268519163131714,
"learning_rate": 4.09615470338099e-05,
"loss": 1.0379,
"step": 1037
},
{
"epoch": 0.05087175465895585,
"grad_norm": 0.7264936566352844,
"learning_rate": 4.078860107669862e-05,
"loss": 0.8959,
"step": 1038
},
{
"epoch": 0.05092076405650783,
"grad_norm": 0.6215760707855225,
"learning_rate": 4.061592740830801e-05,
"loss": 0.9007,
"step": 1039
},
{
"epoch": 0.050969773454059814,
"grad_norm": 0.5822103023529053,
"learning_rate": 4.0443526822692755e-05,
"loss": 0.8713,
"step": 1040
},
{
"epoch": 0.051018782851611794,
"grad_norm": 0.7218736410140991,
"learning_rate": 4.027140011265187e-05,
"loss": 0.9172,
"step": 1041
},
{
"epoch": 0.051067792249163774,
"grad_norm": 0.678709089756012,
"learning_rate": 4.00995480697248e-05,
"loss": 1.0434,
"step": 1042
},
{
"epoch": 0.05111680164671576,
"grad_norm": 0.7455527782440186,
"learning_rate": 3.9927971484187995e-05,
"loss": 0.8981,
"step": 1043
},
{
"epoch": 0.05116581104426774,
"grad_norm": 0.6330903768539429,
"learning_rate": 3.975667114505123e-05,
"loss": 0.8602,
"step": 1044
},
{
"epoch": 0.05121482044181972,
"grad_norm": 0.7926675081253052,
"learning_rate": 3.958564784005382e-05,
"loss": 0.8785,
"step": 1045
},
{
"epoch": 0.0512638298393717,
"grad_norm": 0.7453003525733948,
"learning_rate": 3.9414902355661145e-05,
"loss": 0.8781,
"step": 1046
},
{
"epoch": 0.05131283923692368,
"grad_norm": 0.5748602151870728,
"learning_rate": 3.924443547706106e-05,
"loss": 0.8991,
"step": 1047
},
{
"epoch": 0.05136184863447566,
"grad_norm": 0.787139356136322,
"learning_rate": 3.907424798816023e-05,
"loss": 0.8958,
"step": 1048
},
{
"epoch": 0.05141085803202764,
"grad_norm": 0.6263467669487,
"learning_rate": 3.890434067158043e-05,
"loss": 0.8452,
"step": 1049
},
{
"epoch": 0.05145986742957962,
"grad_norm": 0.7788869738578796,
"learning_rate": 3.873471430865515e-05,
"loss": 0.8231,
"step": 1050
},
{
"epoch": 0.0515088768271316,
"grad_norm": 1.0092405080795288,
"learning_rate": 3.856536967942579e-05,
"loss": 1.084,
"step": 1051
},
{
"epoch": 0.05155788622468358,
"grad_norm": 0.6644636392593384,
"learning_rate": 3.839630756263828e-05,
"loss": 1.0511,
"step": 1052
},
{
"epoch": 0.05160689562223556,
"grad_norm": 0.611219584941864,
"learning_rate": 3.822752873573926e-05,
"loss": 0.877,
"step": 1053
},
{
"epoch": 0.05165590501978754,
"grad_norm": 0.7147572636604309,
"learning_rate": 3.8059033974872784e-05,
"loss": 1.0783,
"step": 1054
},
{
"epoch": 0.05170491441733952,
"grad_norm": 0.7852771878242493,
"learning_rate": 3.789082405487645e-05,
"loss": 0.8292,
"step": 1055
},
{
"epoch": 0.05175392381489151,
"grad_norm": 0.5585947036743164,
"learning_rate": 3.772289974927813e-05,
"loss": 0.8234,
"step": 1056
},
{
"epoch": 0.05180293321244349,
"grad_norm": 0.6067047119140625,
"learning_rate": 3.755526183029223e-05,
"loss": 0.9943,
"step": 1057
},
{
"epoch": 0.05185194260999547,
"grad_norm": 0.6506572961807251,
"learning_rate": 3.738791106881614e-05,
"loss": 0.9735,
"step": 1058
},
{
"epoch": 0.05190095200754745,
"grad_norm": 0.7339874505996704,
"learning_rate": 3.722084823442669e-05,
"loss": 0.9753,
"step": 1059
},
{
"epoch": 0.05194996140509943,
"grad_norm": 0.6757487654685974,
"learning_rate": 3.705407409537684e-05,
"loss": 0.9961,
"step": 1060
},
{
"epoch": 0.05199897080265141,
"grad_norm": 0.6002933382987976,
"learning_rate": 3.68875894185918e-05,
"loss": 0.747,
"step": 1061
},
{
"epoch": 0.05204798020020339,
"grad_norm": 0.7781504392623901,
"learning_rate": 3.672139496966566e-05,
"loss": 0.9216,
"step": 1062
},
{
"epoch": 0.05209698959775537,
"grad_norm": 0.7153550386428833,
"learning_rate": 3.655549151285794e-05,
"loss": 0.9178,
"step": 1063
},
{
"epoch": 0.05214599899530735,
"grad_norm": 0.7319151759147644,
"learning_rate": 3.638987981109003e-05,
"loss": 0.9615,
"step": 1064
},
{
"epoch": 0.05219500839285933,
"grad_norm": 0.8046290278434753,
"learning_rate": 3.622456062594154e-05,
"loss": 0.9093,
"step": 1065
},
{
"epoch": 0.05224401779041131,
"grad_norm": 0.6472988128662109,
"learning_rate": 3.605953471764705e-05,
"loss": 0.7422,
"step": 1066
},
{
"epoch": 0.05229302718796329,
"grad_norm": 1.0517202615737915,
"learning_rate": 3.5894802845092354e-05,
"loss": 1.0699,
"step": 1067
},
{
"epoch": 0.05234203658551527,
"grad_norm": 0.6424945592880249,
"learning_rate": 3.573036576581126e-05,
"loss": 0.8699,
"step": 1068
},
{
"epoch": 0.05239104598306725,
"grad_norm": 0.684185802936554,
"learning_rate": 3.5566224235981737e-05,
"loss": 1.0873,
"step": 1069
},
{
"epoch": 0.05244005538061924,
"grad_norm": 0.680806040763855,
"learning_rate": 3.540237901042285e-05,
"loss": 0.9018,
"step": 1070
},
{
"epoch": 0.05248906477817122,
"grad_norm": 0.6284951567649841,
"learning_rate": 3.5238830842590945e-05,
"loss": 0.8599,
"step": 1071
},
{
"epoch": 0.0525380741757232,
"grad_norm": 0.7522977590560913,
"learning_rate": 3.50755804845763e-05,
"loss": 0.9505,
"step": 1072
},
{
"epoch": 0.05258708357327518,
"grad_norm": 0.6616835594177246,
"learning_rate": 3.491262868709989e-05,
"loss": 1.0088,
"step": 1073
},
{
"epoch": 0.05263609297082716,
"grad_norm": 0.8060148358345032,
"learning_rate": 3.474997619950955e-05,
"loss": 0.9374,
"step": 1074
},
{
"epoch": 0.05268510236837914,
"grad_norm": 0.8048257231712341,
"learning_rate": 3.458762376977669e-05,
"loss": 0.8275,
"step": 1075
},
{
"epoch": 0.05273411176593112,
"grad_norm": 0.7710302472114563,
"learning_rate": 3.4425572144493014e-05,
"loss": 0.8884,
"step": 1076
},
{
"epoch": 0.0527831211634831,
"grad_norm": 0.6914852857589722,
"learning_rate": 3.4263822068866905e-05,
"loss": 1.1101,
"step": 1077
},
{
"epoch": 0.05283213056103508,
"grad_norm": 0.874279260635376,
"learning_rate": 3.410237428671995e-05,
"loss": 0.9308,
"step": 1078
},
{
"epoch": 0.05288113995858706,
"grad_norm": 0.637709379196167,
"learning_rate": 3.3941229540483774e-05,
"loss": 0.9252,
"step": 1079
},
{
"epoch": 0.05293014935613904,
"grad_norm": 0.6194972395896912,
"learning_rate": 3.378038857119632e-05,
"loss": 0.8933,
"step": 1080
},
{
"epoch": 0.05297915875369102,
"grad_norm": 0.7647547125816345,
"learning_rate": 3.3619852118498685e-05,
"loss": 0.8601,
"step": 1081
},
{
"epoch": 0.053028168151243,
"grad_norm": 0.5452641248703003,
"learning_rate": 3.345962092063153e-05,
"loss": 0.8647,
"step": 1082
},
{
"epoch": 0.053077177548794985,
"grad_norm": 0.5692489147186279,
"learning_rate": 3.3299695714431886e-05,
"loss": 0.771,
"step": 1083
},
{
"epoch": 0.053126186946346965,
"grad_norm": 0.5245976448059082,
"learning_rate": 3.314007723532954e-05,
"loss": 0.9189,
"step": 1084
},
{
"epoch": 0.053175196343898945,
"grad_norm": 1.0263336896896362,
"learning_rate": 3.298076621734385e-05,
"loss": 0.7727,
"step": 1085
},
{
"epoch": 0.053224205741450925,
"grad_norm": 0.5948834419250488,
"learning_rate": 3.282176339308029e-05,
"loss": 0.9622,
"step": 1086
},
{
"epoch": 0.053273215139002905,
"grad_norm": 0.6458703875541687,
"learning_rate": 3.266306949372704e-05,
"loss": 0.9235,
"step": 1087
},
{
"epoch": 0.053322224536554885,
"grad_norm": 0.6071894764900208,
"learning_rate": 3.2504685249051606e-05,
"loss": 0.8635,
"step": 1088
},
{
"epoch": 0.053371233934106865,
"grad_norm": 0.6334041357040405,
"learning_rate": 3.234661138739764e-05,
"loss": 0.8896,
"step": 1089
},
{
"epoch": 0.053420243331658845,
"grad_norm": 0.6917246580123901,
"learning_rate": 3.2188848635681446e-05,
"loss": 0.9068,
"step": 1090
},
{
"epoch": 0.053469252729210826,
"grad_norm": 0.7009928226470947,
"learning_rate": 3.2031397719388556e-05,
"loss": 0.9179,
"step": 1091
},
{
"epoch": 0.053518262126762806,
"grad_norm": 0.5563487410545349,
"learning_rate": 3.1874259362570666e-05,
"loss": 0.9428,
"step": 1092
},
{
"epoch": 0.053567271524314786,
"grad_norm": 0.814445972442627,
"learning_rate": 3.171743428784198e-05,
"loss": 0.9235,
"step": 1093
},
{
"epoch": 0.053616280921866766,
"grad_norm": 0.6743494272232056,
"learning_rate": 3.156092321637616e-05,
"loss": 0.821,
"step": 1094
},
{
"epoch": 0.053665290319418746,
"grad_norm": 0.6412997245788574,
"learning_rate": 3.1404726867902815e-05,
"loss": 0.9212,
"step": 1095
},
{
"epoch": 0.053714299716970726,
"grad_norm": 0.8848609924316406,
"learning_rate": 3.124884596070438e-05,
"loss": 1.057,
"step": 1096
},
{
"epoch": 0.05376330911452271,
"grad_norm": 0.7107406258583069,
"learning_rate": 3.109328121161256e-05,
"loss": 0.9214,
"step": 1097
},
{
"epoch": 0.05381231851207469,
"grad_norm": 0.8470025658607483,
"learning_rate": 3.09380333360053e-05,
"loss": 0.9982,
"step": 1098
},
{
"epoch": 0.05386132790962667,
"grad_norm": 0.574743390083313,
"learning_rate": 3.078310304780336e-05,
"loss": 0.7368,
"step": 1099
},
{
"epoch": 0.05391033730717865,
"grad_norm": 0.5940549373626709,
"learning_rate": 3.0628491059467014e-05,
"loss": 0.8097,
"step": 1100
},
{
"epoch": 0.05395934670473063,
"grad_norm": 0.8564255237579346,
"learning_rate": 3.0474198081992754e-05,
"loss": 0.968,
"step": 1101
},
{
"epoch": 0.05400835610228261,
"grad_norm": 0.6052840352058411,
"learning_rate": 3.0320224824910182e-05,
"loss": 0.999,
"step": 1102
},
{
"epoch": 0.05405736549983459,
"grad_norm": 0.7785791158676147,
"learning_rate": 3.0166571996278615e-05,
"loss": 0.8447,
"step": 1103
},
{
"epoch": 0.054106374897386574,
"grad_norm": 0.7929477691650391,
"learning_rate": 3.0013240302683766e-05,
"loss": 0.9153,
"step": 1104
},
{
"epoch": 0.054155384294938554,
"grad_norm": 0.7250458002090454,
"learning_rate": 2.9860230449234706e-05,
"loss": 0.9096,
"step": 1105
},
{
"epoch": 0.054204393692490534,
"grad_norm": 0.8561221957206726,
"learning_rate": 2.9707543139560358e-05,
"loss": 0.8301,
"step": 1106
},
{
"epoch": 0.054253403090042514,
"grad_norm": 0.9120008945465088,
"learning_rate": 2.955517907580656e-05,
"loss": 1.0136,
"step": 1107
},
{
"epoch": 0.054253403090042514,
"eval_loss": NaN,
"eval_runtime": 185.5324,
"eval_samples_per_second": 46.31,
"eval_steps_per_second": 23.155,
"step": 1107
},
{
"epoch": 0.054302412487594494,
"grad_norm": 0.6959465146064758,
"learning_rate": 2.9403138958632503e-05,
"loss": 0.7701,
"step": 1108
},
{
"epoch": 0.054351421885146474,
"grad_norm": 0.5777866840362549,
"learning_rate": 2.925142348720784e-05,
"loss": 0.8953,
"step": 1109
},
{
"epoch": 0.054400431282698454,
"grad_norm": 0.703305184841156,
"learning_rate": 2.910003335920918e-05,
"loss": 1.0123,
"step": 1110
},
{
"epoch": 0.05444944068025044,
"grad_norm": 0.8165416717529297,
"learning_rate": 2.8948969270817096e-05,
"loss": 0.8779,
"step": 1111
},
{
"epoch": 0.05449845007780242,
"grad_norm": 0.6612878441810608,
"learning_rate": 2.879823191671286e-05,
"loss": 1.0614,
"step": 1112
},
{
"epoch": 0.0545474594753544,
"grad_norm": 0.6103464365005493,
"learning_rate": 2.8647821990075153e-05,
"loss": 0.7585,
"step": 1113
},
{
"epoch": 0.05459646887290638,
"grad_norm": 0.7475747466087341,
"learning_rate": 2.8497740182576948e-05,
"loss": 0.8482,
"step": 1114
},
{
"epoch": 0.05464547827045836,
"grad_norm": 0.6378394365310669,
"learning_rate": 2.8347987184382398e-05,
"loss": 0.9145,
"step": 1115
},
{
"epoch": 0.05469448766801034,
"grad_norm": 0.563686728477478,
"learning_rate": 2.819856368414361e-05,
"loss": 0.8355,
"step": 1116
},
{
"epoch": 0.05474349706556232,
"grad_norm": 0.6585412621498108,
"learning_rate": 2.8049470368997355e-05,
"loss": 0.7453,
"step": 1117
},
{
"epoch": 0.0547925064631143,
"grad_norm": 0.5961164236068726,
"learning_rate": 2.7900707924562166e-05,
"loss": 0.9076,
"step": 1118
},
{
"epoch": 0.05484151586066628,
"grad_norm": 0.5402793884277344,
"learning_rate": 2.7752277034934894e-05,
"loss": 0.7223,
"step": 1119
},
{
"epoch": 0.05489052525821826,
"grad_norm": 0.7519727945327759,
"learning_rate": 2.760417838268784e-05,
"loss": 1.0406,
"step": 1120
},
{
"epoch": 0.05493953465577024,
"grad_norm": 0.6112848520278931,
"learning_rate": 2.745641264886536e-05,
"loss": 0.7626,
"step": 1121
},
{
"epoch": 0.05498854405332222,
"grad_norm": 0.9848921895027161,
"learning_rate": 2.7308980512980965e-05,
"loss": 1.0251,
"step": 1122
},
{
"epoch": 0.0550375534508742,
"grad_norm": 0.5984755158424377,
"learning_rate": 2.716188265301398e-05,
"loss": 0.838,
"step": 1123
},
{
"epoch": 0.05508656284842619,
"grad_norm": 0.9945166707038879,
"learning_rate": 2.7015119745406636e-05,
"loss": 1.0268,
"step": 1124
},
{
"epoch": 0.05513557224597817,
"grad_norm": 0.8502665758132935,
"learning_rate": 2.6868692465060828e-05,
"loss": 0.9499,
"step": 1125
},
{
"epoch": 0.05518458164353015,
"grad_norm": 0.6849496960639954,
"learning_rate": 2.6722601485334998e-05,
"loss": 0.7584,
"step": 1126
},
{
"epoch": 0.05523359104108213,
"grad_norm": 0.6284264326095581,
"learning_rate": 2.6576847478041067e-05,
"loss": 0.9142,
"step": 1127
},
{
"epoch": 0.05528260043863411,
"grad_norm": 0.644045352935791,
"learning_rate": 2.643143111344144e-05,
"loss": 1.0832,
"step": 1128
},
{
"epoch": 0.05533160983618609,
"grad_norm": 0.690579891204834,
"learning_rate": 2.6286353060245826e-05,
"loss": 0.7999,
"step": 1129
},
{
"epoch": 0.05538061923373807,
"grad_norm": 0.6707479953765869,
"learning_rate": 2.6141613985608093e-05,
"loss": 1.1036,
"step": 1130
},
{
"epoch": 0.05542962863129005,
"grad_norm": 0.5971590280532837,
"learning_rate": 2.5997214555123416e-05,
"loss": 1.0718,
"step": 1131
},
{
"epoch": 0.05547863802884203,
"grad_norm": 0.616248369216919,
"learning_rate": 2.585315543282496e-05,
"loss": 0.9957,
"step": 1132
},
{
"epoch": 0.05552764742639401,
"grad_norm": 0.7989575266838074,
"learning_rate": 2.570943728118106e-05,
"loss": 0.8987,
"step": 1133
},
{
"epoch": 0.05557665682394599,
"grad_norm": 0.553438663482666,
"learning_rate": 2.556606076109198e-05,
"loss": 0.8544,
"step": 1134
},
{
"epoch": 0.05562566622149797,
"grad_norm": 0.6931489109992981,
"learning_rate": 2.542302653188704e-05,
"loss": 0.9196,
"step": 1135
},
{
"epoch": 0.05567467561904995,
"grad_norm": 0.851711630821228,
"learning_rate": 2.528033525132144e-05,
"loss": 1.0254,
"step": 1136
},
{
"epoch": 0.05572368501660193,
"grad_norm": 0.6611626148223877,
"learning_rate": 2.513798757557333e-05,
"loss": 1.0014,
"step": 1137
},
{
"epoch": 0.05577269441415392,
"grad_norm": 1.1130104064941406,
"learning_rate": 2.4995984159240814e-05,
"loss": 0.8845,
"step": 1138
},
{
"epoch": 0.0558217038117059,
"grad_norm": 0.6933576464653015,
"learning_rate": 2.4854325655338805e-05,
"loss": 1.0355,
"step": 1139
},
{
"epoch": 0.05587071320925788,
"grad_norm": 0.6513526439666748,
"learning_rate": 2.4713012715296113e-05,
"loss": 0.8088,
"step": 1140
},
{
"epoch": 0.05591972260680986,
"grad_norm": 0.8315907716751099,
"learning_rate": 2.4572045988952495e-05,
"loss": 0.973,
"step": 1141
},
{
"epoch": 0.05596873200436184,
"grad_norm": 0.5299463272094727,
"learning_rate": 2.4431426124555625e-05,
"loss": 0.8138,
"step": 1142
},
{
"epoch": 0.05601774140191382,
"grad_norm": 0.6596815586090088,
"learning_rate": 2.429115376875799e-05,
"loss": 0.7319,
"step": 1143
},
{
"epoch": 0.0560667507994658,
"grad_norm": 0.6985355019569397,
"learning_rate": 2.41512295666142e-05,
"loss": 0.9264,
"step": 1144
},
{
"epoch": 0.05611576019701778,
"grad_norm": 0.8074831366539001,
"learning_rate": 2.4011654161577667e-05,
"loss": 0.8036,
"step": 1145
},
{
"epoch": 0.05616476959456976,
"grad_norm": 0.6924799084663391,
"learning_rate": 2.3872428195497998e-05,
"loss": 1.2314,
"step": 1146
},
{
"epoch": 0.05621377899212174,
"grad_norm": 0.7358079552650452,
"learning_rate": 2.3733552308617736e-05,
"loss": 1.1863,
"step": 1147
},
{
"epoch": 0.05626278838967372,
"grad_norm": 0.6338258385658264,
"learning_rate": 2.3595027139569658e-05,
"loss": 0.9938,
"step": 1148
},
{
"epoch": 0.0563117977872257,
"grad_norm": 0.7516512870788574,
"learning_rate": 2.345685332537364e-05,
"loss": 0.8846,
"step": 1149
},
{
"epoch": 0.05636080718477768,
"grad_norm": 0.849188506603241,
"learning_rate": 2.331903150143391e-05,
"loss": 0.9154,
"step": 1150
},
{
"epoch": 0.056409816582329665,
"grad_norm": 0.7366519570350647,
"learning_rate": 2.318156230153603e-05,
"loss": 1.0099,
"step": 1151
},
{
"epoch": 0.056458825979881645,
"grad_norm": 0.8387792110443115,
"learning_rate": 2.304444635784393e-05,
"loss": 0.9518,
"step": 1152
},
{
"epoch": 0.056507835377433625,
"grad_norm": 0.6383851170539856,
"learning_rate": 2.2907684300897027e-05,
"loss": 1.0355,
"step": 1153
},
{
"epoch": 0.056556844774985605,
"grad_norm": 0.7643039226531982,
"learning_rate": 2.2771276759607564e-05,
"loss": 0.9693,
"step": 1154
},
{
"epoch": 0.056605854172537586,
"grad_norm": 0.6342633962631226,
"learning_rate": 2.263522436125729e-05,
"loss": 0.8413,
"step": 1155
},
{
"epoch": 0.056654863570089566,
"grad_norm": 0.7279918193817139,
"learning_rate": 2.2499527731494886e-05,
"loss": 1.1611,
"step": 1156
},
{
"epoch": 0.056703872967641546,
"grad_norm": 0.6141911149024963,
"learning_rate": 2.2364187494333e-05,
"loss": 0.6722,
"step": 1157
},
{
"epoch": 0.056752882365193526,
"grad_norm": 0.5212133526802063,
"learning_rate": 2.222920427214541e-05,
"loss": 0.8071,
"step": 1158
},
{
"epoch": 0.056801891762745506,
"grad_norm": 0.6113694906234741,
"learning_rate": 2.2094578685664047e-05,
"loss": 0.8748,
"step": 1159
},
{
"epoch": 0.056850901160297486,
"grad_norm": 0.718829333782196,
"learning_rate": 2.1960311353976316e-05,
"loss": 0.9405,
"step": 1160
},
{
"epoch": 0.056899910557849466,
"grad_norm": 0.8659762144088745,
"learning_rate": 2.182640289452207e-05,
"loss": 0.8237,
"step": 1161
},
{
"epoch": 0.056948919955401446,
"grad_norm": 0.6210221648216248,
"learning_rate": 2.169285392309095e-05,
"loss": 0.7797,
"step": 1162
},
{
"epoch": 0.056997929352953426,
"grad_norm": 0.6985928416252136,
"learning_rate": 2.1559665053819366e-05,
"loss": 0.8632,
"step": 1163
},
{
"epoch": 0.057046938750505406,
"grad_norm": 0.8535677790641785,
"learning_rate": 2.142683689918784e-05,
"loss": 0.987,
"step": 1164
},
{
"epoch": 0.05709594814805739,
"grad_norm": 0.6734438538551331,
"learning_rate": 2.1294370070018076e-05,
"loss": 0.9561,
"step": 1165
},
{
"epoch": 0.05714495754560937,
"grad_norm": 0.6575422883033752,
"learning_rate": 2.1162265175470153e-05,
"loss": 0.9701,
"step": 1166
},
{
"epoch": 0.05719396694316135,
"grad_norm": 1.0730395317077637,
"learning_rate": 2.103052282303992e-05,
"loss": 0.7416,
"step": 1167
},
{
"epoch": 0.057242976340713334,
"grad_norm": 0.5347709059715271,
"learning_rate": 2.089914361855588e-05,
"loss": 0.7869,
"step": 1168
},
{
"epoch": 0.057291985738265314,
"grad_norm": 1.0355056524276733,
"learning_rate": 2.0768128166176604e-05,
"loss": 0.954,
"step": 1169
},
{
"epoch": 0.057340995135817294,
"grad_norm": 0.6163832545280457,
"learning_rate": 2.0637477068387957e-05,
"loss": 1.0077,
"step": 1170
},
{
"epoch": 0.057390004533369274,
"grad_norm": 0.6667943596839905,
"learning_rate": 2.050719092600031e-05,
"loss": 0.8894,
"step": 1171
},
{
"epoch": 0.057439013930921254,
"grad_norm": 0.6550483107566833,
"learning_rate": 2.037727033814565e-05,
"loss": 0.9482,
"step": 1172
},
{
"epoch": 0.057488023328473234,
"grad_norm": 0.8213583827018738,
"learning_rate": 2.0247715902275068e-05,
"loss": 0.7194,
"step": 1173
},
{
"epoch": 0.057537032726025214,
"grad_norm": 0.6851637363433838,
"learning_rate": 2.011852821415573e-05,
"loss": 0.7352,
"step": 1174
},
{
"epoch": 0.057586042123577194,
"grad_norm": 0.682693600654602,
"learning_rate": 1.9989707867868425e-05,
"loss": 1.135,
"step": 1175
},
{
"epoch": 0.057635051521129174,
"grad_norm": 0.7706078290939331,
"learning_rate": 1.986125545580455e-05,
"loss": 0.7256,
"step": 1176
},
{
"epoch": 0.057684060918681154,
"grad_norm": 0.6869995594024658,
"learning_rate": 1.9733171568663643e-05,
"loss": 0.9948,
"step": 1177
},
{
"epoch": 0.057733070316233134,
"grad_norm": 0.8122284412384033,
"learning_rate": 1.960545679545045e-05,
"loss": 0.7801,
"step": 1178
},
{
"epoch": 0.05778207971378512,
"grad_norm": 0.8644903898239136,
"learning_rate": 1.947811172347239e-05,
"loss": 0.9036,
"step": 1179
},
{
"epoch": 0.0578310891113371,
"grad_norm": 0.9533315896987915,
"learning_rate": 1.9351136938336777e-05,
"loss": 1.0587,
"step": 1180
},
{
"epoch": 0.05788009850888908,
"grad_norm": 0.7774580717086792,
"learning_rate": 1.9224533023948077e-05,
"loss": 1.0339,
"step": 1181
},
{
"epoch": 0.05792910790644106,
"grad_norm": 0.6420020461082458,
"learning_rate": 1.9098300562505266e-05,
"loss": 0.884,
"step": 1182
},
{
"epoch": 0.05797811730399304,
"grad_norm": 0.6784622669219971,
"learning_rate": 1.8972440134499224e-05,
"loss": 0.7252,
"step": 1183
},
{
"epoch": 0.05802712670154502,
"grad_norm": 0.981188952922821,
"learning_rate": 1.884695231871001e-05,
"loss": 1.0207,
"step": 1184
},
{
"epoch": 0.058076136099097,
"grad_norm": 0.6668174862861633,
"learning_rate": 1.8721837692204115e-05,
"loss": 0.8547,
"step": 1185
},
{
"epoch": 0.05812514549664898,
"grad_norm": 0.6276751756668091,
"learning_rate": 1.859709683033195e-05,
"loss": 0.8833,
"step": 1186
},
{
"epoch": 0.05817415489420096,
"grad_norm": 0.6397258043289185,
"learning_rate": 1.8472730306725107e-05,
"loss": 0.8333,
"step": 1187
},
{
"epoch": 0.05822316429175294,
"grad_norm": 0.6282427310943604,
"learning_rate": 1.8348738693293777e-05,
"loss": 0.9277,
"step": 1188
},
{
"epoch": 0.05827217368930492,
"grad_norm": 0.742828905582428,
"learning_rate": 1.822512256022405e-05,
"loss": 0.9255,
"step": 1189
},
{
"epoch": 0.0583211830868569,
"grad_norm": 0.6836369037628174,
"learning_rate": 1.8101882475975417e-05,
"loss": 1.0912,
"step": 1190
},
{
"epoch": 0.05837019248440888,
"grad_norm": 0.6010401844978333,
"learning_rate": 1.7979019007277975e-05,
"loss": 0.9613,
"step": 1191
},
{
"epoch": 0.05841920188196087,
"grad_norm": 0.724628210067749,
"learning_rate": 1.785653271912999e-05,
"loss": 0.8902,
"step": 1192
},
{
"epoch": 0.05846821127951285,
"grad_norm": 0.6914888024330139,
"learning_rate": 1.773442417479525e-05,
"loss": 0.9536,
"step": 1193
},
{
"epoch": 0.05851722067706483,
"grad_norm": 0.65279620885849,
"learning_rate": 1.7612693935800373e-05,
"loss": 0.8499,
"step": 1194
},
{
"epoch": 0.05856623007461681,
"grad_norm": 0.617305338382721,
"learning_rate": 1.7491342561932355e-05,
"loss": 1.0133,
"step": 1195
},
{
"epoch": 0.05861523947216879,
"grad_norm": 0.7648184299468994,
"learning_rate": 1.7370370611235963e-05,
"loss": 0.994,
"step": 1196
},
{
"epoch": 0.05866424886972077,
"grad_norm": 0.7441121935844421,
"learning_rate": 1.7249778640011148e-05,
"loss": 0.8789,
"step": 1197
},
{
"epoch": 0.05871325826727275,
"grad_norm": 0.9436798691749573,
"learning_rate": 1.7129567202810448e-05,
"loss": 1.0732,
"step": 1198
},
{
"epoch": 0.05876226766482473,
"grad_norm": 0.7364475727081299,
"learning_rate": 1.7009736852436563e-05,
"loss": 0.7994,
"step": 1199
},
{
"epoch": 0.05881127706237671,
"grad_norm": 0.7213372588157654,
"learning_rate": 1.6890288139939625e-05,
"loss": 0.9831,
"step": 1200
},
{
"epoch": 0.05886028645992869,
"grad_norm": 0.7466797828674316,
"learning_rate": 1.6771221614614906e-05,
"loss": 0.938,
"step": 1201
},
{
"epoch": 0.05890929585748067,
"grad_norm": 0.613419771194458,
"learning_rate": 1.6652537824000015e-05,
"loss": 0.939,
"step": 1202
},
{
"epoch": 0.05895830525503265,
"grad_norm": 0.6488571763038635,
"learning_rate": 1.6534237313872636e-05,
"loss": 1.0457,
"step": 1203
},
{
"epoch": 0.05900731465258463,
"grad_norm": 0.6202888488769531,
"learning_rate": 1.641632062824783e-05,
"loss": 1.0043,
"step": 1204
},
{
"epoch": 0.05905632405013661,
"grad_norm": 0.6015529036521912,
"learning_rate": 1.6298788309375644e-05,
"loss": 0.9645,
"step": 1205
},
{
"epoch": 0.0591053334476886,
"grad_norm": 0.5945176482200623,
"learning_rate": 1.6181640897738603e-05,
"loss": 0.9764,
"step": 1206
},
{
"epoch": 0.05915434284524058,
"grad_norm": 0.657114565372467,
"learning_rate": 1.6064878932049166e-05,
"loss": 1.0262,
"step": 1207
},
{
"epoch": 0.05920335224279256,
"grad_norm": 0.5701066255569458,
"learning_rate": 1.5948502949247246e-05,
"loss": 0.8868,
"step": 1208
},
{
"epoch": 0.05925236164034454,
"grad_norm": 1.2002894878387451,
"learning_rate": 1.583251348449788e-05,
"loss": 1.1998,
"step": 1209
},
{
"epoch": 0.05930137103789652,
"grad_norm": 0.7580971121788025,
"learning_rate": 1.571691107118861e-05,
"loss": 0.7739,
"step": 1210
},
{
"epoch": 0.0593503804354485,
"grad_norm": 0.8998621106147766,
"learning_rate": 1.5601696240927076e-05,
"loss": 0.8264,
"step": 1211
},
{
"epoch": 0.05939938983300048,
"grad_norm": 0.7180477380752563,
"learning_rate": 1.5486869523538638e-05,
"loss": 1.1046,
"step": 1212
},
{
"epoch": 0.05944839923055246,
"grad_norm": 0.7890926599502563,
"learning_rate": 1.5372431447063807e-05,
"loss": 0.8471,
"step": 1213
},
{
"epoch": 0.05949740862810444,
"grad_norm": 0.879736065864563,
"learning_rate": 1.5258382537755989e-05,
"loss": 1.2106,
"step": 1214
},
{
"epoch": 0.05954641802565642,
"grad_norm": 0.7328320145606995,
"learning_rate": 1.5144723320078869e-05,
"loss": 0.7876,
"step": 1215
},
{
"epoch": 0.0595954274232084,
"grad_norm": 0.5559690594673157,
"learning_rate": 1.5031454316704207e-05,
"loss": 0.8941,
"step": 1216
},
{
"epoch": 0.05964443682076038,
"grad_norm": 0.9444091320037842,
"learning_rate": 1.4918576048509225e-05,
"loss": 0.9342,
"step": 1217
},
{
"epoch": 0.05969344621831236,
"grad_norm": 0.654117226600647,
"learning_rate": 1.48060890345744e-05,
"loss": 0.7734,
"step": 1218
},
{
"epoch": 0.05974245561586434,
"grad_norm": 0.7489770650863647,
"learning_rate": 1.469399379218096e-05,
"loss": 0.9926,
"step": 1219
},
{
"epoch": 0.059791465013416326,
"grad_norm": 0.6979839205741882,
"learning_rate": 1.4582290836808543e-05,
"loss": 0.9055,
"step": 1220
},
{
"epoch": 0.059840474410968306,
"grad_norm": 0.5855501294136047,
"learning_rate": 1.4470980682132762e-05,
"loss": 0.8791,
"step": 1221
},
{
"epoch": 0.059889483808520286,
"grad_norm": 0.8113967180252075,
"learning_rate": 1.4360063840023008e-05,
"loss": 0.9865,
"step": 1222
},
{
"epoch": 0.059938493206072266,
"grad_norm": 0.7481881380081177,
"learning_rate": 1.4249540820539941e-05,
"loss": 1.0255,
"step": 1223
},
{
"epoch": 0.059987502603624246,
"grad_norm": 0.636202335357666,
"learning_rate": 1.413941213193316e-05,
"loss": 0.8995,
"step": 1224
},
{
"epoch": 0.060036512001176226,
"grad_norm": 0.7428346872329712,
"learning_rate": 1.402967828063897e-05,
"loss": 1.106,
"step": 1225
},
{
"epoch": 0.060085521398728206,
"grad_norm": 1.2551703453063965,
"learning_rate": 1.3920339771277891e-05,
"loss": 0.8506,
"step": 1226
},
{
"epoch": 0.060134530796280186,
"grad_norm": 0.6625940203666687,
"learning_rate": 1.3811397106652524e-05,
"loss": 0.7749,
"step": 1227
},
{
"epoch": 0.060183540193832166,
"grad_norm": 0.8333196640014648,
"learning_rate": 1.370285078774507e-05,
"loss": 1.0665,
"step": 1228
},
{
"epoch": 0.060232549591384146,
"grad_norm": 0.6439178586006165,
"learning_rate": 1.3594701313715152e-05,
"loss": 0.95,
"step": 1229
},
{
"epoch": 0.060281558988936126,
"grad_norm": 0.7931140065193176,
"learning_rate": 1.3486949181897434e-05,
"loss": 1.0114,
"step": 1230
},
{
"epoch": 0.060330568386488106,
"grad_norm": 0.8823480606079102,
"learning_rate": 1.3379594887799384e-05,
"loss": 1.0644,
"step": 1231
},
{
"epoch": 0.06037957778404009,
"grad_norm": 0.9899863004684448,
"learning_rate": 1.327263892509899e-05,
"loss": 1.0687,
"step": 1232
},
{
"epoch": 0.060428587181592074,
"grad_norm": 0.6067537069320679,
"learning_rate": 1.316608178564246e-05,
"loss": 0.9695,
"step": 1233
},
{
"epoch": 0.060477596579144054,
"grad_norm": 0.5756759643554688,
"learning_rate": 1.3059923959441944e-05,
"loss": 0.9244,
"step": 1234
},
{
"epoch": 0.060526605976696034,
"grad_norm": 0.5859988927841187,
"learning_rate": 1.295416593467338e-05,
"loss": 0.7473,
"step": 1235
},
{
"epoch": 0.060575615374248014,
"grad_norm": 0.6280271410942078,
"learning_rate": 1.284880819767419e-05,
"loss": 0.8095,
"step": 1236
},
{
"epoch": 0.060624624771799994,
"grad_norm": 0.6644018292427063,
"learning_rate": 1.2743851232940951e-05,
"loss": 0.8925,
"step": 1237
},
{
"epoch": 0.060673634169351974,
"grad_norm": 0.6548195481300354,
"learning_rate": 1.263929552312737e-05,
"loss": 0.9308,
"step": 1238
},
{
"epoch": 0.060722643566903954,
"grad_norm": 0.594926118850708,
"learning_rate": 1.2535141549041829e-05,
"loss": 0.9031,
"step": 1239
},
{
"epoch": 0.060771652964455934,
"grad_norm": 0.7365604639053345,
"learning_rate": 1.2431389789645399e-05,
"loss": 0.9495,
"step": 1240
},
{
"epoch": 0.060820662362007914,
"grad_norm": 0.7257662415504456,
"learning_rate": 1.232804072204945e-05,
"loss": 0.755,
"step": 1241
},
{
"epoch": 0.060869671759559894,
"grad_norm": 0.7510412335395813,
"learning_rate": 1.2225094821513616e-05,
"loss": 0.8904,
"step": 1242
},
{
"epoch": 0.060918681157111874,
"grad_norm": 0.7142137885093689,
"learning_rate": 1.2122552561443456e-05,
"loss": 1.0054,
"step": 1243
},
{
"epoch": 0.060967690554663855,
"grad_norm": 1.0007776021957397,
"learning_rate": 1.2020414413388403e-05,
"loss": 1.0645,
"step": 1244
},
{
"epoch": 0.061016699952215835,
"grad_norm": 0.6386979222297668,
"learning_rate": 1.1918680847039554e-05,
"loss": 0.857,
"step": 1245
},
{
"epoch": 0.061065709349767815,
"grad_norm": 0.5809711813926697,
"learning_rate": 1.181735233022746e-05,
"loss": 0.817,
"step": 1246
},
{
"epoch": 0.0611147187473198,
"grad_norm": 0.7974873781204224,
"learning_rate": 1.1716429328919998e-05,
"loss": 0.9746,
"step": 1247
},
{
"epoch": 0.06116372814487178,
"grad_norm": 0.6156415939331055,
"learning_rate": 1.1615912307220378e-05,
"loss": 0.8998,
"step": 1248
},
{
"epoch": 0.06121273754242376,
"grad_norm": 1.158423900604248,
"learning_rate": 1.1515801727364727e-05,
"loss": 0.9229,
"step": 1249
},
{
"epoch": 0.06126174693997574,
"grad_norm": 0.5545446872711182,
"learning_rate": 1.141609804972017e-05,
"loss": 0.8738,
"step": 1250
},
{
"epoch": 0.06131075633752772,
"grad_norm": 0.7248711585998535,
"learning_rate": 1.1316801732782667e-05,
"loss": 1.0821,
"step": 1251
},
{
"epoch": 0.0613597657350797,
"grad_norm": 0.6983115077018738,
"learning_rate": 1.1217913233174915e-05,
"loss": 0.9231,
"step": 1252
},
{
"epoch": 0.06140877513263168,
"grad_norm": 0.5736908316612244,
"learning_rate": 1.1119433005644176e-05,
"loss": 0.8463,
"step": 1253
},
{
"epoch": 0.06145778453018366,
"grad_norm": 0.9075560569763184,
"learning_rate": 1.102136150306028e-05,
"loss": 0.7854,
"step": 1254
},
{
"epoch": 0.06150679392773564,
"grad_norm": 0.6925746202468872,
"learning_rate": 1.0923699176413448e-05,
"loss": 1.0055,
"step": 1255
},
{
"epoch": 0.06155580332528762,
"grad_norm": 0.6659478545188904,
"learning_rate": 1.0826446474812368e-05,
"loss": 0.8888,
"step": 1256
},
{
"epoch": 0.0616048127228396,
"grad_norm": 2.326383352279663,
"learning_rate": 1.0729603845481894e-05,
"loss": 1.1274,
"step": 1257
},
{
"epoch": 0.06165382212039158,
"grad_norm": 0.6801324486732483,
"learning_rate": 1.0633171733761272e-05,
"loss": 0.9899,
"step": 1258
},
{
"epoch": 0.06170283151794356,
"grad_norm": 0.8603371381759644,
"learning_rate": 1.0537150583101817e-05,
"loss": 1.0472,
"step": 1259
},
{
"epoch": 0.06175184091549555,
"grad_norm": 0.7354358434677124,
"learning_rate": 1.04415408350651e-05,
"loss": 1.0156,
"step": 1260
},
{
"epoch": 0.06180085031304753,
"grad_norm": 0.593607485294342,
"learning_rate": 1.0346342929320796e-05,
"loss": 0.8159,
"step": 1261
},
{
"epoch": 0.06184985971059951,
"grad_norm": 0.7112290263175964,
"learning_rate": 1.0251557303644665e-05,
"loss": 0.8867,
"step": 1262
},
{
"epoch": 0.06189886910815149,
"grad_norm": 0.7560910582542419,
"learning_rate": 1.0157184393916563e-05,
"loss": 0.8092,
"step": 1263
},
{
"epoch": 0.06194787850570347,
"grad_norm": 0.6165046095848083,
"learning_rate": 1.006322463411845e-05,
"loss": 0.9094,
"step": 1264
},
{
"epoch": 0.06199688790325545,
"grad_norm": 1.0106430053710938,
"learning_rate": 9.969678456332409e-06,
"loss": 0.8011,
"step": 1265
},
{
"epoch": 0.06204589730080743,
"grad_norm": 0.7880820035934448,
"learning_rate": 9.876546290738564e-06,
"loss": 1.0987,
"step": 1266
},
{
"epoch": 0.06209490669835941,
"grad_norm": 0.6300559043884277,
"learning_rate": 9.78382856561323e-06,
"loss": 0.7252,
"step": 1267
},
{
"epoch": 0.06214391609591139,
"grad_norm": 0.59274822473526,
"learning_rate": 9.691525707326832e-06,
"loss": 0.8166,
"step": 1268
},
{
"epoch": 0.06219292549346337,
"grad_norm": 0.6262571811676025,
"learning_rate": 9.599638140342049e-06,
"loss": 0.9126,
"step": 1269
},
{
"epoch": 0.06224193489101535,
"grad_norm": 0.6474040746688843,
"learning_rate": 9.508166287211739e-06,
"loss": 0.8498,
"step": 1270
},
{
"epoch": 0.06229094428856733,
"grad_norm": 0.769927978515625,
"learning_rate": 9.417110568577136e-06,
"loss": 0.9494,
"step": 1271
},
{
"epoch": 0.06233995368611931,
"grad_norm": 0.8656934499740601,
"learning_rate": 9.326471403165782e-06,
"loss": 0.9175,
"step": 1272
},
{
"epoch": 0.06238896308367129,
"grad_norm": 0.5913616418838501,
"learning_rate": 9.236249207789705e-06,
"loss": 0.7902,
"step": 1273
},
{
"epoch": 0.06243797248122328,
"grad_norm": 0.7363753318786621,
"learning_rate": 9.146444397343457e-06,
"loss": 0.9849,
"step": 1274
},
{
"epoch": 0.06248698187877526,
"grad_norm": 0.6947307586669922,
"learning_rate": 9.057057384802181e-06,
"loss": 0.8358,
"step": 1275
},
{
"epoch": 0.06253599127632724,
"grad_norm": 0.7444347739219666,
"learning_rate": 8.968088581219746e-06,
"loss": 0.9795,
"step": 1276
},
{
"epoch": 0.06258500067387922,
"grad_norm": 0.6162583827972412,
"learning_rate": 8.879538395726884e-06,
"loss": 0.8446,
"step": 1277
},
{
"epoch": 0.0626340100714312,
"grad_norm": 0.7605543732643127,
"learning_rate": 8.791407235529247e-06,
"loss": 0.9335,
"step": 1278
},
{
"epoch": 0.06268301946898318,
"grad_norm": 0.6320784091949463,
"learning_rate": 8.703695505905573e-06,
"loss": 0.999,
"step": 1279
},
{
"epoch": 0.06273202886653516,
"grad_norm": 1.867570400238037,
"learning_rate": 8.616403610205814e-06,
"loss": 1.066,
"step": 1280
},
{
"epoch": 0.06278103826408714,
"grad_norm": 0.744623601436615,
"learning_rate": 8.529531949849245e-06,
"loss": 0.8478,
"step": 1281
},
{
"epoch": 0.06283004766163912,
"grad_norm": 0.6711301207542419,
"learning_rate": 8.443080924322733e-06,
"loss": 0.7788,
"step": 1282
},
{
"epoch": 0.0628790570591911,
"grad_norm": 0.6933310031890869,
"learning_rate": 8.357050931178723e-06,
"loss": 0.8071,
"step": 1283
},
{
"epoch": 0.06292806645674308,
"grad_norm": 0.7776893973350525,
"learning_rate": 8.271442366033577e-06,
"loss": 1.0827,
"step": 1284
},
{
"epoch": 0.06297707585429506,
"grad_norm": 0.5825070738792419,
"learning_rate": 8.186255622565642e-06,
"loss": 0.9195,
"step": 1285
},
{
"epoch": 0.06302608525184704,
"grad_norm": 0.5672100782394409,
"learning_rate": 8.101491092513513e-06,
"loss": 0.8868,
"step": 1286
},
{
"epoch": 0.06307509464939902,
"grad_norm": 0.8170425891876221,
"learning_rate": 8.017149165674199e-06,
"loss": 0.9684,
"step": 1287
},
{
"epoch": 0.063124104046951,
"grad_norm": 0.6922415494918823,
"learning_rate": 7.9332302299013e-06,
"loss": 0.7434,
"step": 1288
},
{
"epoch": 0.06317311344450298,
"grad_norm": 0.7274526357650757,
"learning_rate": 7.849734671103259e-06,
"loss": 0.9571,
"step": 1289
},
{
"epoch": 0.06322212284205496,
"grad_norm": 0.7111207246780396,
"learning_rate": 7.766662873241614e-06,
"loss": 0.8063,
"step": 1290
},
{
"epoch": 0.06327113223960694,
"grad_norm": 0.8042239546775818,
"learning_rate": 7.684015218329221e-06,
"loss": 1.1872,
"step": 1291
},
{
"epoch": 0.06332014163715892,
"grad_norm": 0.6595145463943481,
"learning_rate": 7.601792086428383e-06,
"loss": 0.9201,
"step": 1292
},
{
"epoch": 0.0633691510347109,
"grad_norm": 1.050703763961792,
"learning_rate": 7.5199938556492984e-06,
"loss": 1.2614,
"step": 1293
},
{
"epoch": 0.0634181604322629,
"grad_norm": 0.5375563502311707,
"learning_rate": 7.438620902148163e-06,
"loss": 0.7987,
"step": 1294
},
{
"epoch": 0.06346716982981487,
"grad_norm": 0.7361612319946289,
"learning_rate": 7.357673600125525e-06,
"loss": 1.0086,
"step": 1295
},
{
"epoch": 0.06351617922736685,
"grad_norm": 0.854240357875824,
"learning_rate": 7.277152321824521e-06,
"loss": 0.9092,
"step": 1296
},
{
"epoch": 0.06356518862491883,
"grad_norm": 0.5932867527008057,
"learning_rate": 7.197057437529209e-06,
"loss": 0.6281,
"step": 1297
},
{
"epoch": 0.06361419802247081,
"grad_norm": 0.7344982624053955,
"learning_rate": 7.117389315562772e-06,
"loss": 0.7349,
"step": 1298
},
{
"epoch": 0.0636632074200228,
"grad_norm": 0.6009116172790527,
"learning_rate": 7.0381483222859754e-06,
"loss": 0.7879,
"step": 1299
},
{
"epoch": 0.06371221681757477,
"grad_norm": 0.7015933394432068,
"learning_rate": 6.959334822095354e-06,
"loss": 0.9432,
"step": 1300
},
{
"epoch": 0.06376122621512675,
"grad_norm": 0.6471702456474304,
"learning_rate": 6.88094917742157e-06,
"loss": 0.9026,
"step": 1301
},
{
"epoch": 0.06381023561267873,
"grad_norm": 0.7404037117958069,
"learning_rate": 6.80299174872775e-06,
"loss": 0.8975,
"step": 1302
},
{
"epoch": 0.06385924501023071,
"grad_norm": 0.7222086787223816,
"learning_rate": 6.725462894507861e-06,
"loss": 0.9482,
"step": 1303
},
{
"epoch": 0.0639082544077827,
"grad_norm": 0.8654917478561401,
"learning_rate": 6.648362971285038e-06,
"loss": 0.9204,
"step": 1304
},
{
"epoch": 0.06395726380533467,
"grad_norm": 0.6940625309944153,
"learning_rate": 6.571692333609891e-06,
"loss": 0.8663,
"step": 1305
},
{
"epoch": 0.06400627320288665,
"grad_norm": 0.6776160001754761,
"learning_rate": 6.495451334058989e-06,
"loss": 0.8543,
"step": 1306
},
{
"epoch": 0.06405528260043863,
"grad_norm": 1.0531834363937378,
"learning_rate": 6.4196403232331e-06,
"loss": 0.8712,
"step": 1307
},
{
"epoch": 0.06410429199799061,
"grad_norm": 0.7188198566436768,
"learning_rate": 6.344259649755724e-06,
"loss": 1.0192,
"step": 1308
},
{
"epoch": 0.0641533013955426,
"grad_norm": 0.6964803338050842,
"learning_rate": 6.269309660271361e-06,
"loss": 0.8468,
"step": 1309
},
{
"epoch": 0.06420231079309457,
"grad_norm": 0.7297788262367249,
"learning_rate": 6.1947906994440195e-06,
"loss": 0.8149,
"step": 1310
},
{
"epoch": 0.06425132019064655,
"grad_norm": 0.8535167574882507,
"learning_rate": 6.1207031099555276e-06,
"loss": 0.9986,
"step": 1311
},
{
"epoch": 0.06430032958819853,
"grad_norm": 0.6777470111846924,
"learning_rate": 6.047047232504077e-06,
"loss": 0.8337,
"step": 1312
},
{
"epoch": 0.06434933898575051,
"grad_norm": 0.6274713277816772,
"learning_rate": 5.973823405802581e-06,
"loss": 0.8513,
"step": 1313
},
{
"epoch": 0.0643983483833025,
"grad_norm": 0.8152085542678833,
"learning_rate": 5.901031966577097e-06,
"loss": 0.8394,
"step": 1314
},
{
"epoch": 0.06444735778085448,
"grad_norm": 0.7193106412887573,
"learning_rate": 5.8286732495653196e-06,
"loss": 0.8796,
"step": 1315
},
{
"epoch": 0.06449636717840646,
"grad_norm": 0.7336921095848083,
"learning_rate": 5.756747587515055e-06,
"loss": 0.8721,
"step": 1316
},
{
"epoch": 0.06454537657595844,
"grad_norm": 0.6193203926086426,
"learning_rate": 5.685255311182669e-06,
"loss": 1.0327,
"step": 1317
},
{
"epoch": 0.06459438597351042,
"grad_norm": 0.8311421275138855,
"learning_rate": 5.614196749331546e-06,
"loss": 1.0421,
"step": 1318
},
{
"epoch": 0.0646433953710624,
"grad_norm": 0.5709735751152039,
"learning_rate": 5.54357222873062e-06,
"loss": 0.8931,
"step": 1319
},
{
"epoch": 0.06469240476861438,
"grad_norm": 0.684586763381958,
"learning_rate": 5.4733820741528e-06,
"loss": 0.9502,
"step": 1320
},
{
"epoch": 0.06474141416616636,
"grad_norm": 0.7916139364242554,
"learning_rate": 5.403626608373602e-06,
"loss": 0.8714,
"step": 1321
},
{
"epoch": 0.06479042356371835,
"grad_norm": 0.6645509004592896,
"learning_rate": 5.334306152169521e-06,
"loss": 0.9467,
"step": 1322
},
{
"epoch": 0.06483943296127033,
"grad_norm": 0.6344390511512756,
"learning_rate": 5.265421024316664e-06,
"loss": 0.6801,
"step": 1323
},
{
"epoch": 0.06488844235882231,
"grad_norm": 0.7653583288192749,
"learning_rate": 5.196971541589213e-06,
"loss": 0.8782,
"step": 1324
},
{
"epoch": 0.06493745175637429,
"grad_norm": 0.6185832619667053,
"learning_rate": 5.128958018758012e-06,
"loss": 0.8937,
"step": 1325
},
{
"epoch": 0.06498646115392627,
"grad_norm": 0.5814158916473389,
"learning_rate": 5.06138076858913e-06,
"loss": 0.7603,
"step": 1326
},
{
"epoch": 0.06503547055147825,
"grad_norm": 0.649247944355011,
"learning_rate": 4.9942401018423625e-06,
"loss": 0.9353,
"step": 1327
},
{
"epoch": 0.06508447994903023,
"grad_norm": 0.6215224862098694,
"learning_rate": 4.9275363272698215e-06,
"loss": 0.8805,
"step": 1328
},
{
"epoch": 0.06513348934658221,
"grad_norm": 0.8012757301330566,
"learning_rate": 4.861269751614628e-06,
"loss": 0.8529,
"step": 1329
},
{
"epoch": 0.06518249874413419,
"grad_norm": 0.621527373790741,
"learning_rate": 4.795440679609298e-06,
"loss": 0.8713,
"step": 1330
},
{
"epoch": 0.06523150814168617,
"grad_norm": 0.7184974551200867,
"learning_rate": 4.73004941397448e-06,
"loss": 0.9106,
"step": 1331
},
{
"epoch": 0.06528051753923815,
"grad_norm": 0.7722237706184387,
"learning_rate": 4.665096255417578e-06,
"loss": 0.9184,
"step": 1332
},
{
"epoch": 0.06532952693679013,
"grad_norm": 0.9010992050170898,
"learning_rate": 4.600581502631263e-06,
"loss": 0.9899,
"step": 1333
},
{
"epoch": 0.06537853633434211,
"grad_norm": 0.762018620967865,
"learning_rate": 4.536505452292206e-06,
"loss": 0.8958,
"step": 1334
},
{
"epoch": 0.06542754573189409,
"grad_norm": 0.6331391930580139,
"learning_rate": 4.472868399059626e-06,
"loss": 0.8438,
"step": 1335
},
{
"epoch": 0.06547655512944607,
"grad_norm": 0.828467845916748,
"learning_rate": 4.4096706355740145e-06,
"loss": 0.8307,
"step": 1336
},
{
"epoch": 0.06552556452699805,
"grad_norm": 0.6935224533081055,
"learning_rate": 4.34691245245572e-06,
"loss": 0.9423,
"step": 1337
},
{
"epoch": 0.06557457392455003,
"grad_norm": 0.6905186176300049,
"learning_rate": 4.284594138303655e-06,
"loss": 0.9069,
"step": 1338
},
{
"epoch": 0.06562358332210201,
"grad_norm": 1.1875556707382202,
"learning_rate": 4.22271597969397e-06,
"loss": 0.906,
"step": 1339
},
{
"epoch": 0.06567259271965399,
"grad_norm": 0.5741354823112488,
"learning_rate": 4.161278261178714e-06,
"loss": 0.8514,
"step": 1340
},
{
"epoch": 0.06572160211720597,
"grad_norm": 0.6324502825737,
"learning_rate": 4.1002812652845e-06,
"loss": 1.0126,
"step": 1341
},
{
"epoch": 0.06577061151475795,
"grad_norm": 0.6402712464332581,
"learning_rate": 4.039725272511308e-06,
"loss": 0.7793,
"step": 1342
},
{
"epoch": 0.06581962091230993,
"grad_norm": 0.6293900012969971,
"learning_rate": 3.979610561331071e-06,
"loss": 0.8921,
"step": 1343
},
{
"epoch": 0.06586863030986191,
"grad_norm": 0.9233715534210205,
"learning_rate": 3.919937408186447e-06,
"loss": 0.9569,
"step": 1344
},
{
"epoch": 0.06591763970741389,
"grad_norm": 0.6657326221466064,
"learning_rate": 3.860706087489607e-06,
"loss": 0.9189,
"step": 1345
},
{
"epoch": 0.06596664910496587,
"grad_norm": 0.6525406837463379,
"learning_rate": 3.801916871620881e-06,
"loss": 0.884,
"step": 1346
},
{
"epoch": 0.06601565850251785,
"grad_norm": 0.8163220882415771,
"learning_rate": 3.7435700309275345e-06,
"loss": 1.1409,
"step": 1347
},
{
"epoch": 0.06606466790006983,
"grad_norm": 0.8402869701385498,
"learning_rate": 3.6856658337225405e-06,
"loss": 1.0371,
"step": 1348
},
{
"epoch": 0.06611367729762183,
"grad_norm": 0.8356976509094238,
"learning_rate": 3.6282045462833427e-06,
"loss": 0.7902,
"step": 1349
},
{
"epoch": 0.0661626866951738,
"grad_norm": 0.6106687188148499,
"learning_rate": 3.571186432850626e-06,
"loss": 0.8288,
"step": 1350
},
{
"epoch": 0.06621169609272579,
"grad_norm": 0.7321956157684326,
"learning_rate": 3.514611755627084e-06,
"loss": 1.0251,
"step": 1351
},
{
"epoch": 0.06626070549027777,
"grad_norm": 0.7708463072776794,
"learning_rate": 3.458480774776274e-06,
"loss": 1.0283,
"step": 1352
},
{
"epoch": 0.06630971488782975,
"grad_norm": 0.7667643427848816,
"learning_rate": 3.402793748421318e-06,
"loss": 0.9402,
"step": 1353
},
{
"epoch": 0.06635872428538173,
"grad_norm": 0.7186606526374817,
"learning_rate": 3.3475509326438283e-06,
"loss": 0.9398,
"step": 1354
},
{
"epoch": 0.0664077336829337,
"grad_norm": 0.9855289459228516,
"learning_rate": 3.29275258148265e-06,
"loss": 1.1716,
"step": 1355
},
{
"epoch": 0.06645674308048569,
"grad_norm": 0.7203282117843628,
"learning_rate": 3.238398946932719e-06,
"loss": 0.9221,
"step": 1356
},
{
"epoch": 0.06650575247803767,
"grad_norm": 0.6085039377212524,
"learning_rate": 3.184490278943897e-06,
"loss": 0.8337,
"step": 1357
},
{
"epoch": 0.06655476187558965,
"grad_norm": 0.8711732625961304,
"learning_rate": 3.131026825419858e-06,
"loss": 1.0989,
"step": 1358
},
{
"epoch": 0.06660377127314163,
"grad_norm": 0.5561559200286865,
"learning_rate": 3.078008832216894e-06,
"loss": 0.7172,
"step": 1359
},
{
"epoch": 0.0666527806706936,
"grad_norm": 1.0216760635375977,
"learning_rate": 3.0254365431428013e-06,
"loss": 1.0683,
"step": 1360
},
{
"epoch": 0.06670179006824559,
"grad_norm": 0.6487326622009277,
"learning_rate": 2.9733101999558142e-06,
"loss": 0.9112,
"step": 1361
},
{
"epoch": 0.06675079946579757,
"grad_norm": 0.643084704875946,
"learning_rate": 2.9216300423633767e-06,
"loss": 0.9172,
"step": 1362
},
{
"epoch": 0.06679980886334955,
"grad_norm": 0.8597632646560669,
"learning_rate": 2.8703963080211837e-06,
"loss": 1.1109,
"step": 1363
},
{
"epoch": 0.06684881826090153,
"grad_norm": 0.6921860575675964,
"learning_rate": 2.819609232531939e-06,
"loss": 0.8207,
"step": 1364
},
{
"epoch": 0.06689782765845351,
"grad_norm": 0.8340615034103394,
"learning_rate": 2.7692690494444227e-06,
"loss": 0.7822,
"step": 1365
},
{
"epoch": 0.06694683705600549,
"grad_norm": 0.8636243939399719,
"learning_rate": 2.719375990252282e-06,
"loss": 0.9578,
"step": 1366
},
{
"epoch": 0.06699584645355747,
"grad_norm": 0.8949651122093201,
"learning_rate": 2.669930284393052e-06,
"loss": 1.0168,
"step": 1367
},
{
"epoch": 0.06704485585110945,
"grad_norm": 0.6321136951446533,
"learning_rate": 2.6209321592470804e-06,
"loss": 0.7191,
"step": 1368
},
{
"epoch": 0.06709386524866143,
"grad_norm": 0.5103330612182617,
"learning_rate": 2.572381840136462e-06,
"loss": 0.728,
"step": 1369
},
{
"epoch": 0.06714287464621341,
"grad_norm": 0.6573939323425293,
"learning_rate": 2.524279550324027e-06,
"loss": 0.85,
"step": 1370
},
{
"epoch": 0.06719188404376539,
"grad_norm": 0.6426239609718323,
"learning_rate": 2.476625511012287e-06,
"loss": 0.8577,
"step": 1371
},
{
"epoch": 0.06724089344131737,
"grad_norm": 0.6823493838310242,
"learning_rate": 2.42941994134247e-06,
"loss": 0.9067,
"step": 1372
},
{
"epoch": 0.06728990283886935,
"grad_norm": 0.5893593430519104,
"learning_rate": 2.382663058393442e-06,
"loss": 0.657,
"step": 1373
},
{
"epoch": 0.06733891223642133,
"grad_norm": 0.7312384843826294,
"learning_rate": 2.336355077180774e-06,
"loss": 0.9672,
"step": 1374
},
{
"epoch": 0.06738792163397331,
"grad_norm": 0.7566931247711182,
"learning_rate": 2.2904962106556793e-06,
"loss": 1.0775,
"step": 1375
},
{
"epoch": 0.0674369310315253,
"grad_norm": 0.8246945738792419,
"learning_rate": 2.245086669704144e-06,
"loss": 0.9501,
"step": 1376
},
{
"epoch": 0.06748594042907728,
"grad_norm": 1.02047598361969,
"learning_rate": 2.2001266631458186e-06,
"loss": 1.0639,
"step": 1377
},
{
"epoch": 0.06753494982662926,
"grad_norm": 0.6248430013656616,
"learning_rate": 2.1556163977331958e-06,
"loss": 0.9738,
"step": 1378
},
{
"epoch": 0.06758395922418124,
"grad_norm": 0.5300363898277283,
"learning_rate": 2.1115560781505562e-06,
"loss": 0.8054,
"step": 1379
},
{
"epoch": 0.06763296862173322,
"grad_norm": 0.6251785755157471,
"learning_rate": 2.067945907013069e-06,
"loss": 0.9462,
"step": 1380
},
{
"epoch": 0.0676819780192852,
"grad_norm": 0.6517499089241028,
"learning_rate": 2.0247860848658815e-06,
"loss": 0.8846,
"step": 1381
},
{
"epoch": 0.06773098741683718,
"grad_norm": 0.703231930732727,
"learning_rate": 1.982076810183153e-06,
"loss": 0.8759,
"step": 1382
},
{
"epoch": 0.06777999681438916,
"grad_norm": 0.6347089409828186,
"learning_rate": 1.9398182793671447e-06,
"loss": 0.8239,
"step": 1383
},
{
"epoch": 0.06782900621194114,
"grad_norm": 0.7737944722175598,
"learning_rate": 1.8980106867473536e-06,
"loss": 0.9956,
"step": 1384
},
{
"epoch": 0.06787801560949312,
"grad_norm": 0.6915851831436157,
"learning_rate": 1.8566542245796347e-06,
"loss": 0.8786,
"step": 1385
},
{
"epoch": 0.0679270250070451,
"grad_norm": 0.6760699152946472,
"learning_rate": 1.815749083045193e-06,
"loss": 0.8288,
"step": 1386
},
{
"epoch": 0.06797603440459708,
"grad_norm": 0.5723273158073425,
"learning_rate": 1.775295450249892e-06,
"loss": 0.9204,
"step": 1387
},
{
"epoch": 0.06802504380214906,
"grad_norm": 0.880534291267395,
"learning_rate": 1.7352935122232128e-06,
"loss": 0.7802,
"step": 1388
},
{
"epoch": 0.06807405319970104,
"grad_norm": 0.8262280821800232,
"learning_rate": 1.6957434529175309e-06,
"loss": 0.8427,
"step": 1389
},
{
"epoch": 0.06812306259725302,
"grad_norm": 0.5815930366516113,
"learning_rate": 1.6566454542071951e-06,
"loss": 0.8462,
"step": 1390
},
{
"epoch": 0.068172071994805,
"grad_norm": 0.768941342830658,
"learning_rate": 1.6179996958877397e-06,
"loss": 0.9763,
"step": 1391
},
{
"epoch": 0.06822108139235698,
"grad_norm": 0.5803397297859192,
"learning_rate": 1.5798063556749954e-06,
"loss": 0.8994,
"step": 1392
},
{
"epoch": 0.06827009078990896,
"grad_norm": 0.745637834072113,
"learning_rate": 1.5420656092043352e-06,
"loss": 0.8625,
"step": 1393
},
{
"epoch": 0.06831910018746094,
"grad_norm": 1.1419793367385864,
"learning_rate": 1.5047776300298411e-06,
"loss": 0.9221,
"step": 1394
},
{
"epoch": 0.06836810958501292,
"grad_norm": 0.8074661493301392,
"learning_rate": 1.4679425896234833e-06,
"loss": 1.1161,
"step": 1395
},
{
"epoch": 0.0684171189825649,
"grad_norm": 0.5724306106567383,
"learning_rate": 1.4315606573743755e-06,
"loss": 0.9047,
"step": 1396
},
{
"epoch": 0.06846612838011688,
"grad_norm": 1.6764434576034546,
"learning_rate": 1.3956320005879765e-06,
"loss": 0.9669,
"step": 1397
},
{
"epoch": 0.06851513777766886,
"grad_norm": 0.7271180748939514,
"learning_rate": 1.3601567844853114e-06,
"loss": 0.9313,
"step": 1398
},
{
"epoch": 0.06856414717522084,
"grad_norm": 0.6413068175315857,
"learning_rate": 1.3251351722021964e-06,
"loss": 0.8178,
"step": 1399
},
{
"epoch": 0.06861315657277282,
"grad_norm": 0.6867620348930359,
"learning_rate": 1.2905673247885718e-06,
"loss": 0.9873,
"step": 1400
},
{
"epoch": 0.0686621659703248,
"grad_norm": 0.6367160677909851,
"learning_rate": 1.2564534012076245e-06,
"loss": 0.9447,
"step": 1401
},
{
"epoch": 0.06871117536787678,
"grad_norm": 0.6995184421539307,
"learning_rate": 1.222793558335189e-06,
"loss": 0.9478,
"step": 1402
},
{
"epoch": 0.06876018476542878,
"grad_norm": 0.6374393701553345,
"learning_rate": 1.1895879509589592e-06,
"loss": 0.8458,
"step": 1403
},
{
"epoch": 0.06880919416298076,
"grad_norm": 0.8276384472846985,
"learning_rate": 1.1568367317777662e-06,
"loss": 0.8084,
"step": 1404
},
{
"epoch": 0.06885820356053274,
"grad_norm": 1.0978977680206299,
"learning_rate": 1.1245400514009351e-06,
"loss": 1.1747,
"step": 1405
},
{
"epoch": 0.06890721295808472,
"grad_norm": 0.7997610569000244,
"learning_rate": 1.0926980583475076e-06,
"loss": 0.7353,
"step": 1406
},
{
"epoch": 0.0689562223556367,
"grad_norm": 0.902769923210144,
"learning_rate": 1.0613108990456643e-06,
"loss": 0.7245,
"step": 1407
},
{
"epoch": 0.06900523175318868,
"grad_norm": 0.5651756525039673,
"learning_rate": 1.0303787178319368e-06,
"loss": 0.9087,
"step": 1408
},
{
"epoch": 0.06905424115074066,
"grad_norm": 0.6706457138061523,
"learning_rate": 9.999016569506304e-07,
"loss": 1.0007,
"step": 1409
},
{
"epoch": 0.06910325054829264,
"grad_norm": 0.6027235984802246,
"learning_rate": 9.698798565531464e-07,
"loss": 0.8687,
"step": 1410
},
{
"epoch": 0.06915225994584462,
"grad_norm": 0.7728469371795654,
"learning_rate": 9.403134546973058e-07,
"loss": 0.6449,
"step": 1411
},
{
"epoch": 0.0692012693433966,
"grad_norm": 0.6937982439994812,
"learning_rate": 9.112025873467711e-07,
"loss": 0.8034,
"step": 1412
},
{
"epoch": 0.06925027874094858,
"grad_norm": 0.7616922855377197,
"learning_rate": 8.825473883703695e-07,
"loss": 1.0196,
"step": 1413
},
{
"epoch": 0.06929928813850056,
"grad_norm": 0.6613472104072571,
"learning_rate": 8.543479895415041e-07,
"loss": 0.9019,
"step": 1414
},
{
"epoch": 0.06934829753605254,
"grad_norm": 0.751873791217804,
"learning_rate": 8.266045205375328e-07,
"loss": 0.9623,
"step": 1415
},
{
"epoch": 0.06939730693360452,
"grad_norm": 0.6417289972305298,
"learning_rate": 7.993171089391905e-07,
"loss": 1.0231,
"step": 1416
},
{
"epoch": 0.0694463163311565,
"grad_norm": 0.7124348878860474,
"learning_rate": 7.724858802300006e-07,
"loss": 0.9813,
"step": 1417
},
{
"epoch": 0.06949532572870848,
"grad_norm": 0.7025142908096313,
"learning_rate": 7.461109577956648e-07,
"loss": 0.9227,
"step": 1418
},
{
"epoch": 0.06954433512626046,
"grad_norm": 0.9039905071258545,
"learning_rate": 7.201924629235524e-07,
"loss": 1.0741,
"step": 1419
},
{
"epoch": 0.06959334452381244,
"grad_norm": 0.6852249503135681,
"learning_rate": 6.947305148020889e-07,
"loss": 0.985,
"step": 1420
},
{
"epoch": 0.06964235392136442,
"grad_norm": 0.8314114809036255,
"learning_rate": 6.697252305202461e-07,
"loss": 0.8991,
"step": 1421
},
{
"epoch": 0.0696913633189164,
"grad_norm": 0.6999631524085999,
"learning_rate": 6.451767250669538e-07,
"loss": 0.9926,
"step": 1422
},
{
"epoch": 0.06974037271646838,
"grad_norm": 0.6860081553459167,
"learning_rate": 6.210851113306548e-07,
"loss": 0.9839,
"step": 1423
},
{
"epoch": 0.06978938211402036,
"grad_norm": 0.6958408355712891,
"learning_rate": 5.974505000987062e-07,
"loss": 0.9177,
"step": 1424
},
{
"epoch": 0.06983839151157234,
"grad_norm": 0.9045180082321167,
"learning_rate": 5.742730000568908e-07,
"loss": 1.0029,
"step": 1425
},
{
"epoch": 0.06988740090912432,
"grad_norm": 0.6793363690376282,
"learning_rate": 5.515527177889501e-07,
"loss": 0.9631,
"step": 1426
},
{
"epoch": 0.0699364103066763,
"grad_norm": 0.7411555647850037,
"learning_rate": 5.292897577760747e-07,
"loss": 1.0294,
"step": 1427
},
{
"epoch": 0.06998541970422828,
"grad_norm": 0.7518298625946045,
"learning_rate": 5.074842223963816e-07,
"loss": 0.8648,
"step": 1428
},
{
"epoch": 0.07003442910178026,
"grad_norm": 0.8642678260803223,
"learning_rate": 4.861362119245039e-07,
"loss": 1.0929,
"step": 1429
},
{
"epoch": 0.07008343849933224,
"grad_norm": 0.7154537439346313,
"learning_rate": 4.652458245311242e-07,
"loss": 0.8986,
"step": 1430
},
{
"epoch": 0.07013244789688423,
"grad_norm": 0.6140034794807434,
"learning_rate": 4.448131562824864e-07,
"loss": 1.0537,
"step": 1431
},
{
"epoch": 0.07018145729443621,
"grad_norm": 0.6018050909042358,
"learning_rate": 4.248383011399626e-07,
"loss": 0.8818,
"step": 1432
},
{
"epoch": 0.0702304666919882,
"grad_norm": 0.5931532979011536,
"learning_rate": 4.053213509596532e-07,
"loss": 0.8915,
"step": 1433
},
{
"epoch": 0.07027947608954017,
"grad_norm": 0.5891003012657166,
"learning_rate": 3.862623954919431e-07,
"loss": 0.9231,
"step": 1434
},
{
"epoch": 0.07032848548709215,
"grad_norm": 0.7145758867263794,
"learning_rate": 3.6766152238106865e-07,
"loss": 0.8431,
"step": 1435
},
{
"epoch": 0.07037749488464413,
"grad_norm": 0.7791634798049927,
"learning_rate": 3.495188171647512e-07,
"loss": 0.8727,
"step": 1436
},
{
"epoch": 0.07042650428219611,
"grad_norm": 0.6540659070014954,
"learning_rate": 3.3183436327379744e-07,
"loss": 0.8934,
"step": 1437
},
{
"epoch": 0.0704755136797481,
"grad_norm": 0.677821934223175,
"learning_rate": 3.146082420316776e-07,
"loss": 1.0389,
"step": 1438
},
{
"epoch": 0.07052452307730007,
"grad_norm": 0.5481241941452026,
"learning_rate": 2.978405326541922e-07,
"loss": 0.8784,
"step": 1439
},
{
"epoch": 0.07057353247485205,
"grad_norm": 0.6913087368011475,
"learning_rate": 2.81531312249117e-07,
"loss": 0.8306,
"step": 1440
},
{
"epoch": 0.07062254187240403,
"grad_norm": 0.7169864177703857,
"learning_rate": 2.6568065581579207e-07,
"loss": 0.6832,
"step": 1441
},
{
"epoch": 0.07067155126995601,
"grad_norm": 0.6380186080932617,
"learning_rate": 2.5028863624482204e-07,
"loss": 0.9776,
"step": 1442
},
{
"epoch": 0.070720560667508,
"grad_norm": 0.7633563280105591,
"learning_rate": 2.353553243177542e-07,
"loss": 0.9383,
"step": 1443
},
{
"epoch": 0.07076957006505998,
"grad_norm": 0.6005712747573853,
"learning_rate": 2.2088078870668994e-07,
"loss": 0.8413,
"step": 1444
},
{
"epoch": 0.07081857946261196,
"grad_norm": 1.0024440288543701,
"learning_rate": 2.0686509597404037e-07,
"loss": 0.8163,
"step": 1445
},
{
"epoch": 0.07086758886016394,
"grad_norm": 0.8160889148712158,
"learning_rate": 1.9330831057218223e-07,
"loss": 0.9675,
"step": 1446
},
{
"epoch": 0.07091659825771592,
"grad_norm": 0.77981036901474,
"learning_rate": 1.8021049484314712e-07,
"loss": 0.9325,
"step": 1447
},
{
"epoch": 0.0709656076552679,
"grad_norm": 0.6795889735221863,
"learning_rate": 1.6757170901837703e-07,
"loss": 0.9341,
"step": 1448
},
{
"epoch": 0.07101461705281988,
"grad_norm": 0.7234419584274292,
"learning_rate": 1.5539201121841373e-07,
"loss": 0.7137,
"step": 1449
},
{
"epoch": 0.07106362645037186,
"grad_norm": 0.6544097661972046,
"learning_rate": 1.436714574526543e-07,
"loss": 0.8777,
"step": 1450
},
{
"epoch": 0.07111263584792384,
"grad_norm": 0.7138416171073914,
"learning_rate": 1.3241010161907375e-07,
"loss": 0.8288,
"step": 1451
},
{
"epoch": 0.07116164524547582,
"grad_norm": 0.6424012184143066,
"learning_rate": 1.216079955039806e-07,
"loss": 1.0348,
"step": 1452
},
{
"epoch": 0.0712106546430278,
"grad_norm": 0.6267834901809692,
"learning_rate": 1.1126518878179504e-07,
"loss": 0.9566,
"step": 1453
},
{
"epoch": 0.07125966404057978,
"grad_norm": 0.846564531326294,
"learning_rate": 1.0138172901480447e-07,
"loss": 0.9682,
"step": 1454
},
{
"epoch": 0.07130867343813176,
"grad_norm": 1.1881537437438965,
"learning_rate": 9.195766165295272e-08,
"loss": 1.1735,
"step": 1455
},
{
"epoch": 0.07135768283568374,
"grad_norm": 0.5580865144729614,
"learning_rate": 8.299303003361791e-08,
"loss": 0.7716,
"step": 1456
},
{
"epoch": 0.07140669223323572,
"grad_norm": 0.7786732912063599,
"learning_rate": 7.448787538144597e-08,
"loss": 0.7723,
"step": 1457
},
{
"epoch": 0.07145570163078771,
"grad_norm": 0.7427374720573425,
"learning_rate": 6.644223680810635e-08,
"loss": 0.8426,
"step": 1458
},
{
"epoch": 0.07150471102833969,
"grad_norm": 0.7211986184120178,
"learning_rate": 5.885615131216993e-08,
"loss": 0.8432,
"step": 1459
},
{
"epoch": 0.07155372042589167,
"grad_norm": 0.7002434730529785,
"learning_rate": 5.172965377890915e-08,
"loss": 0.9946,
"step": 1460
},
{
"epoch": 0.07160272982344365,
"grad_norm": 0.5924804210662842,
"learning_rate": 4.50627769801315e-08,
"loss": 0.9195,
"step": 1461
},
{
"epoch": 0.07165173922099563,
"grad_norm": 0.7211676239967346,
"learning_rate": 3.88555515740463e-08,
"loss": 0.7864,
"step": 1462
},
{
"epoch": 0.07170074861854761,
"grad_norm": 0.5475707650184631,
"learning_rate": 3.310800610510922e-08,
"loss": 0.9374,
"step": 1463
},
{
"epoch": 0.07174975801609959,
"grad_norm": 0.8015599846839905,
"learning_rate": 2.7820167003911324e-08,
"loss": 0.8218,
"step": 1464
},
{
"epoch": 0.07179876741365157,
"grad_norm": 0.7530079483985901,
"learning_rate": 2.2992058587023578e-08,
"loss": 0.7691,
"step": 1465
},
{
"epoch": 0.07184777681120355,
"grad_norm": 0.6978147625923157,
"learning_rate": 1.862370305694139e-08,
"loss": 0.7911,
"step": 1466
},
{
"epoch": 0.07189678620875553,
"grad_norm": 1.2115317583084106,
"learning_rate": 1.4715120501895829e-08,
"loss": 0.9976,
"step": 1467
},
{
"epoch": 0.07194579560630751,
"grad_norm": 0.6246762275695801,
"learning_rate": 1.1266328895864764e-08,
"loss": 1.0398,
"step": 1468
},
{
"epoch": 0.07199480500385949,
"grad_norm": 0.86592036485672,
"learning_rate": 8.277344098406303e-09,
"loss": 1.0155,
"step": 1469
},
{
"epoch": 0.07204381440141147,
"grad_norm": 0.6934499740600586,
"learning_rate": 5.7481798546144e-09,
"loss": 0.8013,
"step": 1470
},
{
"epoch": 0.07209282379896345,
"grad_norm": 0.7887607216835022,
"learning_rate": 3.678847795085538e-09,
"loss": 0.9494,
"step": 1471
},
{
"epoch": 0.07214183319651543,
"grad_norm": 0.7494127154350281,
"learning_rate": 2.069357435796615e-09,
"loss": 0.8994,
"step": 1472
},
{
"epoch": 0.07219084259406741,
"grad_norm": 0.6598345637321472,
"learning_rate": 9.197161781604458e-10,
"loss": 0.8317,
"step": 1473
},
{
"epoch": 0.07223985199161939,
"grad_norm": 0.6405826807022095,
"learning_rate": 2.2992930888143804e-10,
"loss": 0.8109,
"step": 1474
},
{
"epoch": 0.07228886138917137,
"grad_norm": 0.7148447632789612,
"learning_rate": 0.0,
"loss": 0.9846,
"step": 1475
}
],
"logging_steps": 1,
"max_steps": 1475,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 369,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.063103052008653e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}