{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9719154307352477,
"eval_steps": 770,
"global_step": 770,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012622278321236984,
"grad_norm": 1.1810976266860962,
"learning_rate": 0.0,
"loss": 2.1786725521087646,
"step": 1
},
{
"epoch": 0.0025244556642473968,
"grad_norm": 1.1999785900115967,
"learning_rate": 4e-05,
"loss": 1.9390826225280762,
"step": 2
},
{
"epoch": 0.003786683496371095,
"grad_norm": 1.2012475728988647,
"learning_rate": 8e-05,
"loss": 1.841808795928955,
"step": 3
},
{
"epoch": 0.0050489113284947935,
"grad_norm": 1.4274017810821533,
"learning_rate": 0.00012,
"loss": 2.174586772918701,
"step": 4
},
{
"epoch": 0.006311139160618492,
"grad_norm": 0.5815935730934143,
"learning_rate": 0.00016,
"loss": 1.7276136875152588,
"step": 5
},
{
"epoch": 0.00757336699274219,
"grad_norm": 0.48476865887641907,
"learning_rate": 0.0002,
"loss": 1.6276743412017822,
"step": 6
},
{
"epoch": 0.008835594824865888,
"grad_norm": 0.5590611696243286,
"learning_rate": 0.0001999991567695732,
"loss": 1.6253315210342407,
"step": 7
},
{
"epoch": 0.010097822656989587,
"grad_norm": 0.5516509413719177,
"learning_rate": 0.00019999662709251355,
"loss": 1.457699179649353,
"step": 8
},
{
"epoch": 0.011360050489113285,
"grad_norm": 1.3951493501663208,
"learning_rate": 0.00019999241101148306,
"loss": 1.448043942451477,
"step": 9
},
{
"epoch": 0.012622278321236984,
"grad_norm": 0.7879750728607178,
"learning_rate": 0.0001999865085975843,
"loss": 1.127958059310913,
"step": 10
},
{
"epoch": 0.013884506153360681,
"grad_norm": 0.6136755347251892,
"learning_rate": 0.00019997891995035912,
"loss": 1.29304039478302,
"step": 11
},
{
"epoch": 0.01514673398548438,
"grad_norm": 0.8061326146125793,
"learning_rate": 0.0001999696451977872,
"loss": 0.9419246912002563,
"step": 12
},
{
"epoch": 0.016408961817608078,
"grad_norm": 0.6488391757011414,
"learning_rate": 0.00019995868449628346,
"loss": 0.8523351550102234,
"step": 13
},
{
"epoch": 0.017671189649731776,
"grad_norm": 0.9592429399490356,
"learning_rate": 0.00019994603803069594,
"loss": 0.7415441870689392,
"step": 14
},
{
"epoch": 0.018933417481855473,
"grad_norm": 0.6320379972457886,
"learning_rate": 0.0001999317060143023,
"loss": 0.9742417335510254,
"step": 15
},
{
"epoch": 0.020195645313979174,
"grad_norm": 0.6976192593574524,
"learning_rate": 0.0001999156886888064,
"loss": 1.0749256610870361,
"step": 16
},
{
"epoch": 0.02145787314610287,
"grad_norm": 0.6568692922592163,
"learning_rate": 0.00019989798632433415,
"loss": 0.7685850262641907,
"step": 17
},
{
"epoch": 0.02272010097822657,
"grad_norm": 0.48727890849113464,
"learning_rate": 0.00019987859921942903,
"loss": 0.5362906455993652,
"step": 18
},
{
"epoch": 0.023982328810350267,
"grad_norm": 0.42397183179855347,
"learning_rate": 0.0001998575277010469,
"loss": 0.6970788836479187,
"step": 19
},
{
"epoch": 0.025244556642473968,
"grad_norm": 0.4272933602333069,
"learning_rate": 0.00019983477212455074,
"loss": 0.8377600312232971,
"step": 20
},
{
"epoch": 0.026506784474597665,
"grad_norm": 0.3498779535293579,
"learning_rate": 0.00019981033287370443,
"loss": 0.7417164444923401,
"step": 21
},
{
"epoch": 0.027769012306721363,
"grad_norm": 0.45754557847976685,
"learning_rate": 0.00019978421036066633,
"loss": 0.7524069547653198,
"step": 22
},
{
"epoch": 0.02903124013884506,
"grad_norm": 0.406505823135376,
"learning_rate": 0.00019975640502598244,
"loss": 0.8919811248779297,
"step": 23
},
{
"epoch": 0.03029346797096876,
"grad_norm": 0.3776075839996338,
"learning_rate": 0.00019972691733857883,
"loss": 0.5425232648849487,
"step": 24
},
{
"epoch": 0.031555695803092455,
"grad_norm": 0.4487985670566559,
"learning_rate": 0.00019969574779575376,
"loss": 0.5764633417129517,
"step": 25
},
{
"epoch": 0.032817923635216156,
"grad_norm": 0.4203525483608246,
"learning_rate": 0.00019966289692316944,
"loss": 0.7679987549781799,
"step": 26
},
{
"epoch": 0.03408015146733986,
"grad_norm": 0.36741408705711365,
"learning_rate": 0.00019962836527484296,
"loss": 0.6128969192504883,
"step": 27
},
{
"epoch": 0.03534237929946355,
"grad_norm": 0.3909834325313568,
"learning_rate": 0.00019959215343313703,
"loss": 0.6979946494102478,
"step": 28
},
{
"epoch": 0.03660460713158725,
"grad_norm": 0.3810923099517822,
"learning_rate": 0.00019955426200875018,
"loss": 0.8191502690315247,
"step": 29
},
{
"epoch": 0.037866834963710946,
"grad_norm": 0.4916118085384369,
"learning_rate": 0.00019951469164070646,
"loss": 0.9299726486206055,
"step": 30
},
{
"epoch": 0.03912906279583465,
"grad_norm": 0.37555935978889465,
"learning_rate": 0.00019947344299634464,
"loss": 1.0361579656600952,
"step": 31
},
{
"epoch": 0.04039129062795835,
"grad_norm": 0.42949214577674866,
"learning_rate": 0.00019943051677130696,
"loss": 0.8678889274597168,
"step": 32
},
{
"epoch": 0.04165351846008204,
"grad_norm": 0.41855067014694214,
"learning_rate": 0.0001993859136895274,
"loss": 0.8316136002540588,
"step": 33
},
{
"epoch": 0.04291574629220574,
"grad_norm": 0.4109402894973755,
"learning_rate": 0.00019933963450321945,
"loss": 0.6912973523139954,
"step": 34
},
{
"epoch": 0.044177974124329444,
"grad_norm": 0.4073610007762909,
"learning_rate": 0.0001992916799928635,
"loss": 0.9194254875183105,
"step": 35
},
{
"epoch": 0.04544020195645314,
"grad_norm": 0.4720235764980316,
"learning_rate": 0.0001992420509671936,
"loss": 0.7957297563552856,
"step": 36
},
{
"epoch": 0.04670242978857684,
"grad_norm": 0.3987046182155609,
"learning_rate": 0.0001991907482631838,
"loss": 0.6258067488670349,
"step": 37
},
{
"epoch": 0.04796465762070053,
"grad_norm": 0.4448748528957367,
"learning_rate": 0.00019913777274603418,
"loss": 1.003873348236084,
"step": 38
},
{
"epoch": 0.049226885452824234,
"grad_norm": 0.4538639783859253,
"learning_rate": 0.00019908312530915603,
"loss": 0.8705529570579529,
"step": 39
},
{
"epoch": 0.050489113284947935,
"grad_norm": 3.1903927326202393,
"learning_rate": 0.00019902680687415705,
"loss": 0.5736751556396484,
"step": 40
},
{
"epoch": 0.05175134111707163,
"grad_norm": 0.34906044602394104,
"learning_rate": 0.00019896881839082556,
"loss": 0.6542955636978149,
"step": 41
},
{
"epoch": 0.05301356894919533,
"grad_norm": 3.0380051136016846,
"learning_rate": 0.0001989091608371146,
"loss": 0.9085805416107178,
"step": 42
},
{
"epoch": 0.05427579678131903,
"grad_norm": 0.3339233696460724,
"learning_rate": 0.00019884783521912554,
"loss": 0.4547462463378906,
"step": 43
},
{
"epoch": 0.055538024613442726,
"grad_norm": 0.38581445813179016,
"learning_rate": 0.00019878484257109083,
"loss": 0.5983158349990845,
"step": 44
},
{
"epoch": 0.056800252445566426,
"grad_norm": 0.3721480071544647,
"learning_rate": 0.0001987201839553569,
"loss": 0.8342102766036987,
"step": 45
},
{
"epoch": 0.05806248027769012,
"grad_norm": 0.4079038202762604,
"learning_rate": 0.00019865386046236596,
"loss": 0.854637861251831,
"step": 46
},
{
"epoch": 0.05932470810981382,
"grad_norm": 0.33452996611595154,
"learning_rate": 0.00019858587321063776,
"loss": 0.48024851083755493,
"step": 47
},
{
"epoch": 0.06058693594193752,
"grad_norm": 0.35006284713745117,
"learning_rate": 0.00019851622334675066,
"loss": 0.7163654565811157,
"step": 48
},
{
"epoch": 0.06184916377406122,
"grad_norm": 0.41123610734939575,
"learning_rate": 0.00019844491204532236,
"loss": 0.4998229742050171,
"step": 49
},
{
"epoch": 0.06311139160618491,
"grad_norm": 0.3749666213989258,
"learning_rate": 0.0001983719405089901,
"loss": 0.48700374364852905,
"step": 50
},
{
"epoch": 0.06437361943830862,
"grad_norm": 0.41837647557258606,
"learning_rate": 0.0001982973099683902,
"loss": 1.0134358406066895,
"step": 51
},
{
"epoch": 0.06563584727043231,
"grad_norm": 0.3964208960533142,
"learning_rate": 0.00019822102168213753,
"loss": 0.8818788528442383,
"step": 52
},
{
"epoch": 0.066898075102556,
"grad_norm": 0.4097653925418854,
"learning_rate": 0.0001981430769368042,
"loss": 0.6342326998710632,
"step": 53
},
{
"epoch": 0.06816030293467971,
"grad_norm": 0.3813578188419342,
"learning_rate": 0.00019806347704689778,
"loss": 0.6181271076202393,
"step": 54
},
{
"epoch": 0.06942253076680341,
"grad_norm": 0.36281293630599976,
"learning_rate": 0.00019798222335483932,
"loss": 0.9839555025100708,
"step": 55
},
{
"epoch": 0.0706847585989271,
"grad_norm": 0.4149906039237976,
"learning_rate": 0.00019789931723094046,
"loss": 0.6778839826583862,
"step": 56
},
{
"epoch": 0.07194698643105081,
"grad_norm": 0.3341962993144989,
"learning_rate": 0.00019781476007338058,
"loss": 0.47752535343170166,
"step": 57
},
{
"epoch": 0.0732092142631745,
"grad_norm": 0.3859621286392212,
"learning_rate": 0.000197728553308183,
"loss": 0.8040428161621094,
"step": 58
},
{
"epoch": 0.0744714420952982,
"grad_norm": 0.4537695348262787,
"learning_rate": 0.0001976406983891911,
"loss": 0.5346378684043884,
"step": 59
},
{
"epoch": 0.07573366992742189,
"grad_norm": 0.39911121129989624,
"learning_rate": 0.00019755119679804367,
"loss": 0.8945479989051819,
"step": 60
},
{
"epoch": 0.0769958977595456,
"grad_norm": 0.3326367437839508,
"learning_rate": 0.00019746005004415005,
"loss": 0.40628719329833984,
"step": 61
},
{
"epoch": 0.0782581255916693,
"grad_norm": 0.3570570945739746,
"learning_rate": 0.0001973672596646645,
"loss": 0.4461412727832794,
"step": 62
},
{
"epoch": 0.07952035342379299,
"grad_norm": 0.46154263615608215,
"learning_rate": 0.00019727282722446047,
"loss": 0.8460710048675537,
"step": 63
},
{
"epoch": 0.0807825812559167,
"grad_norm": 0.3912942111492157,
"learning_rate": 0.00019717675431610415,
"loss": 0.855891764163971,
"step": 64
},
{
"epoch": 0.08204480908804039,
"grad_norm": 0.39667049050331116,
"learning_rate": 0.00019707904255982745,
"loss": 0.7594934105873108,
"step": 65
},
{
"epoch": 0.08330703692016408,
"grad_norm": 0.37858495116233826,
"learning_rate": 0.00019697969360350098,
"loss": 0.8552739024162292,
"step": 66
},
{
"epoch": 0.08456926475228779,
"grad_norm": 0.3944226801395416,
"learning_rate": 0.0001968787091226059,
"loss": 0.6596317291259766,
"step": 67
},
{
"epoch": 0.08583149258441149,
"grad_norm": 0.4035973846912384,
"learning_rate": 0.00019677609082020597,
"loss": 0.7658134698867798,
"step": 68
},
{
"epoch": 0.08709372041653518,
"grad_norm": 0.3967765271663666,
"learning_rate": 0.00019667184042691875,
"loss": 0.768731951713562,
"step": 69
},
{
"epoch": 0.08835594824865889,
"grad_norm": 0.40382981300354004,
"learning_rate": 0.00019656595970088628,
"loss": 0.689699649810791,
"step": 70
},
{
"epoch": 0.08961817608078258,
"grad_norm": 0.3337244391441345,
"learning_rate": 0.00019645845042774553,
"loss": 0.33471691608428955,
"step": 71
},
{
"epoch": 0.09088040391290628,
"grad_norm": 0.32900235056877136,
"learning_rate": 0.00019634931442059832,
"loss": 0.8053317070007324,
"step": 72
},
{
"epoch": 0.09214263174502998,
"grad_norm": 0.33187833428382874,
"learning_rate": 0.00019623855351998072,
"loss": 0.4668503999710083,
"step": 73
},
{
"epoch": 0.09340485957715368,
"grad_norm": 0.4185413420200348,
"learning_rate": 0.0001961261695938319,
"loss": 0.7394185066223145,
"step": 74
},
{
"epoch": 0.09466708740927737,
"grad_norm": 0.3454440236091614,
"learning_rate": 0.00019601216453746283,
"loss": 0.5356079339981079,
"step": 75
},
{
"epoch": 0.09592931524140107,
"grad_norm": 0.36690330505371094,
"learning_rate": 0.00019589654027352414,
"loss": 0.496408611536026,
"step": 76
},
{
"epoch": 0.09719154307352477,
"grad_norm": 1.212344765663147,
"learning_rate": 0.00019577929875197377,
"loss": 1.0225098133087158,
"step": 77
},
{
"epoch": 0.09845377090564847,
"grad_norm": 0.43937745690345764,
"learning_rate": 0.0001956604419500441,
"loss": 0.7864935398101807,
"step": 78
},
{
"epoch": 0.09971599873777216,
"grad_norm": 0.37690651416778564,
"learning_rate": 0.00019553997187220855,
"loss": 0.4752700924873352,
"step": 79
},
{
"epoch": 0.10097822656989587,
"grad_norm": 0.34280529618263245,
"learning_rate": 0.00019541789055014784,
"loss": 0.5001055002212524,
"step": 80
},
{
"epoch": 0.10224045440201956,
"grad_norm": 0.37480127811431885,
"learning_rate": 0.00019529420004271567,
"loss": 0.6418332457542419,
"step": 81
},
{
"epoch": 0.10350268223414326,
"grad_norm": 0.3891831338405609,
"learning_rate": 0.000195168902435904,
"loss": 0.8710986375808716,
"step": 82
},
{
"epoch": 0.10476491006626697,
"grad_norm": 0.3586503565311432,
"learning_rate": 0.00019504199984280799,
"loss": 0.6337010860443115,
"step": 83
},
{
"epoch": 0.10602713789839066,
"grad_norm": 0.36571335792541504,
"learning_rate": 0.00019491349440359015,
"loss": 0.7422975301742554,
"step": 84
},
{
"epoch": 0.10728936573051435,
"grad_norm": 0.39639922976493835,
"learning_rate": 0.00019478338828544435,
"loss": 0.8967505097389221,
"step": 85
},
{
"epoch": 0.10855159356263806,
"grad_norm": 0.409046471118927,
"learning_rate": 0.00019465168368255946,
"loss": 0.6384124159812927,
"step": 86
},
{
"epoch": 0.10981382139476176,
"grad_norm": 0.40344712138175964,
"learning_rate": 0.00019451838281608197,
"loss": 0.8778766393661499,
"step": 87
},
{
"epoch": 0.11107604922688545,
"grad_norm": 0.32860085368156433,
"learning_rate": 0.00019438348793407881,
"loss": 0.4792889654636383,
"step": 88
},
{
"epoch": 0.11233827705900915,
"grad_norm": 0.39201056957244873,
"learning_rate": 0.0001942470013114994,
"loss": 0.7574765086174011,
"step": 89
},
{
"epoch": 0.11360050489113285,
"grad_norm": 0.3348289728164673,
"learning_rate": 0.0001941089252501372,
"loss": 0.9156350493431091,
"step": 90
},
{
"epoch": 0.11486273272325655,
"grad_norm": 0.40806034207344055,
"learning_rate": 0.00019396926207859084,
"loss": 0.5706713795661926,
"step": 91
},
{
"epoch": 0.11612496055538024,
"grad_norm": 0.4064014256000519,
"learning_rate": 0.00019382801415222516,
"loss": 0.697914719581604,
"step": 92
},
{
"epoch": 0.11738718838750395,
"grad_norm": 0.3701585829257965,
"learning_rate": 0.00019368518385313107,
"loss": 0.5228875279426575,
"step": 93
},
{
"epoch": 0.11864941621962764,
"grad_norm": 0.4085630476474762,
"learning_rate": 0.0001935407735900857,
"loss": 0.5461081266403198,
"step": 94
},
{
"epoch": 0.11991164405175134,
"grad_norm": 0.42529523372650146,
"learning_rate": 0.00019339478579851155,
"loss": 0.7004275918006897,
"step": 95
},
{
"epoch": 0.12117387188387505,
"grad_norm": 0.3296562731266022,
"learning_rate": 0.00019324722294043558,
"loss": 0.728748619556427,
"step": 96
},
{
"epoch": 0.12243609971599874,
"grad_norm": 0.35158950090408325,
"learning_rate": 0.0001930980875044477,
"loss": 0.4642578959465027,
"step": 97
},
{
"epoch": 0.12369832754812243,
"grad_norm": 0.3580923080444336,
"learning_rate": 0.00019294738200565856,
"loss": 0.6952727437019348,
"step": 98
},
{
"epoch": 0.12496055538024614,
"grad_norm": 0.3877851963043213,
"learning_rate": 0.0001927951089856575,
"loss": 0.9369809031486511,
"step": 99
},
{
"epoch": 0.12622278321236982,
"grad_norm": 0.35963308811187744,
"learning_rate": 0.0001926412710124693,
"loss": 0.8294747471809387,
"step": 100
},
{
"epoch": 0.12748501104449353,
"grad_norm": 0.3461640179157257,
"learning_rate": 0.0001924858706805112,
"loss": 0.5015355348587036,
"step": 101
},
{
"epoch": 0.12874723887661724,
"grad_norm": 0.41662901639938354,
"learning_rate": 0.00019232891061054895,
"loss": 0.613286018371582,
"step": 102
},
{
"epoch": 0.13000946670874092,
"grad_norm": 0.39659371972084045,
"learning_rate": 0.0001921703934496527,
"loss": 0.7263169884681702,
"step": 103
},
{
"epoch": 0.13127169454086463,
"grad_norm": 0.3626038134098053,
"learning_rate": 0.00019201032187115234,
"loss": 0.5920513272285461,
"step": 104
},
{
"epoch": 0.13253392237298833,
"grad_norm": 0.25446978211402893,
"learning_rate": 0.00019184869857459232,
"loss": 0.20390769839286804,
"step": 105
},
{
"epoch": 0.133796150205112,
"grad_norm": 0.3908882439136505,
"learning_rate": 0.00019168552628568631,
"loss": 0.911649763584137,
"step": 106
},
{
"epoch": 0.13505837803723572,
"grad_norm": 0.5168955326080322,
"learning_rate": 0.00019152080775627103,
"loss": 0.783044159412384,
"step": 107
},
{
"epoch": 0.13632060586935943,
"grad_norm": 0.32102423906326294,
"learning_rate": 0.0001913545457642601,
"loss": 0.284521222114563,
"step": 108
},
{
"epoch": 0.1375828337014831,
"grad_norm": 0.41527506709098816,
"learning_rate": 0.00019118674311359684,
"loss": 0.690119206905365,
"step": 109
},
{
"epoch": 0.13884506153360682,
"grad_norm": 0.3743795156478882,
"learning_rate": 0.0001910174026342073,
"loss": 0.8299716711044312,
"step": 110
},
{
"epoch": 0.14010728936573053,
"grad_norm": 0.4144361615180969,
"learning_rate": 0.00019084652718195238,
"loss": 0.7170496582984924,
"step": 111
},
{
"epoch": 0.1413695171978542,
"grad_norm": 0.3862667679786682,
"learning_rate": 0.00019067411963857967,
"loss": 0.6340428590774536,
"step": 112
},
{
"epoch": 0.1426317450299779,
"grad_norm": 0.41245025396347046,
"learning_rate": 0.0001905001829116749,
"loss": 0.644637405872345,
"step": 113
},
{
"epoch": 0.14389397286210162,
"grad_norm": 0.34236887097358704,
"learning_rate": 0.0001903247199346129,
"loss": 0.5065594911575317,
"step": 114
},
{
"epoch": 0.1451562006942253,
"grad_norm": 0.406076043844223,
"learning_rate": 0.00019014773366650807,
"loss": 0.8917930126190186,
"step": 115
},
{
"epoch": 0.146418428526349,
"grad_norm": 0.3787905275821686,
"learning_rate": 0.00018996922709216455,
"loss": 0.8648253083229065,
"step": 116
},
{
"epoch": 0.14768065635847272,
"grad_norm": 0.3749518096446991,
"learning_rate": 0.00018978920322202582,
"loss": 0.6751912832260132,
"step": 117
},
{
"epoch": 0.1489428841905964,
"grad_norm": 0.32289671897888184,
"learning_rate": 0.000189607665092124,
"loss": 0.5505026578903198,
"step": 118
},
{
"epoch": 0.1502051120227201,
"grad_norm": 0.3582629859447479,
"learning_rate": 0.00018942461576402857,
"loss": 0.6920587420463562,
"step": 119
},
{
"epoch": 0.15146733985484379,
"grad_norm": 0.3632330596446991,
"learning_rate": 0.00018924005832479478,
"loss": 0.6031773090362549,
"step": 120
},
{
"epoch": 0.1527295676869675,
"grad_norm": 0.40739816427230835,
"learning_rate": 0.00018905399588691163,
"loss": 0.8041491508483887,
"step": 121
},
{
"epoch": 0.1539917955190912,
"grad_norm": 0.35906773805618286,
"learning_rate": 0.0001888664315882493,
"loss": 0.851598858833313,
"step": 122
},
{
"epoch": 0.15525402335121488,
"grad_norm": 0.29666247963905334,
"learning_rate": 0.0001886773685920062,
"loss": 0.46212196350097656,
"step": 123
},
{
"epoch": 0.1565162511833386,
"grad_norm": 0.3250925540924072,
"learning_rate": 0.00018848681008665582,
"loss": 0.4569106101989746,
"step": 124
},
{
"epoch": 0.1577784790154623,
"grad_norm": 0.36993423104286194,
"learning_rate": 0.00018829475928589271,
"loss": 0.6663421988487244,
"step": 125
},
{
"epoch": 0.15904070684758598,
"grad_norm": 0.3611743152141571,
"learning_rate": 0.00018810121942857845,
"loss": 0.7817614674568176,
"step": 126
},
{
"epoch": 0.16030293467970969,
"grad_norm": 0.370026558637619,
"learning_rate": 0.00018790619377868703,
"loss": 0.47573864459991455,
"step": 127
},
{
"epoch": 0.1615651625118334,
"grad_norm": 0.32366666197776794,
"learning_rate": 0.0001877096856252496,
"loss": 0.5783149003982544,
"step": 128
},
{
"epoch": 0.16282739034395707,
"grad_norm": 0.3249809741973877,
"learning_rate": 0.00018751169828229927,
"loss": 0.46492838859558105,
"step": 129
},
{
"epoch": 0.16408961817608078,
"grad_norm": 0.41037416458129883,
"learning_rate": 0.0001873122350888151,
"loss": 0.796636164188385,
"step": 130
},
{
"epoch": 0.1653518460082045,
"grad_norm": 0.313863605260849,
"learning_rate": 0.00018711129940866575,
"loss": 0.38488903641700745,
"step": 131
},
{
"epoch": 0.16661407384032817,
"grad_norm": 0.36502766609191895,
"learning_rate": 0.00018690889463055283,
"loss": 0.7027624249458313,
"step": 132
},
{
"epoch": 0.16787630167245188,
"grad_norm": 0.348656564950943,
"learning_rate": 0.00018670502416795367,
"loss": 0.8470883369445801,
"step": 133
},
{
"epoch": 0.16913852950457559,
"grad_norm": 0.35909080505371094,
"learning_rate": 0.0001864996914590638,
"loss": 0.661641001701355,
"step": 134
},
{
"epoch": 0.17040075733669927,
"grad_norm": 0.38659459352493286,
"learning_rate": 0.00018629289996673897,
"loss": 0.694800853729248,
"step": 135
},
{
"epoch": 0.17166298516882297,
"grad_norm": 0.366533100605011,
"learning_rate": 0.00018608465317843678,
"loss": 0.9004327654838562,
"step": 136
},
{
"epoch": 0.17292521300094668,
"grad_norm": 0.42530369758605957,
"learning_rate": 0.00018587495460615778,
"loss": 0.9930410385131836,
"step": 137
},
{
"epoch": 0.17418744083307036,
"grad_norm": 0.38337844610214233,
"learning_rate": 0.00018566380778638628,
"loss": 0.621214747428894,
"step": 138
},
{
"epoch": 0.17544966866519407,
"grad_norm": 0.3821134567260742,
"learning_rate": 0.00018545121628003077,
"loss": 0.8524945974349976,
"step": 139
},
{
"epoch": 0.17671189649731778,
"grad_norm": 0.6962800621986389,
"learning_rate": 0.0001852371836723638,
"loss": 0.490077942609787,
"step": 140
},
{
"epoch": 0.17797412432944146,
"grad_norm": 0.40078434348106384,
"learning_rate": 0.00018502171357296144,
"loss": 0.7751069664955139,
"step": 141
},
{
"epoch": 0.17923635216156517,
"grad_norm": 0.3736267685890198,
"learning_rate": 0.0001848048096156426,
"loss": 0.5479488968849182,
"step": 142
},
{
"epoch": 0.18049857999368887,
"grad_norm": 0.3780677914619446,
"learning_rate": 0.00018458647545840763,
"loss": 0.6310573220252991,
"step": 143
},
{
"epoch": 0.18176080782581255,
"grad_norm": 0.3293318748474121,
"learning_rate": 0.00018436671478337666,
"loss": 0.4275631010532379,
"step": 144
},
{
"epoch": 0.18302303565793626,
"grad_norm": 0.3664384186267853,
"learning_rate": 0.00018414553129672732,
"loss": 0.4785746932029724,
"step": 145
},
{
"epoch": 0.18428526349005997,
"grad_norm": 0.3737381100654602,
"learning_rate": 0.00018392292872863267,
"loss": 0.5807976722717285,
"step": 146
},
{
"epoch": 0.18554749132218365,
"grad_norm": 0.40464866161346436,
"learning_rate": 0.00018369891083319778,
"loss": 0.673311710357666,
"step": 147
},
{
"epoch": 0.18680971915430736,
"grad_norm": 0.4158247411251068,
"learning_rate": 0.00018347348138839683,
"loss": 0.5220749974250793,
"step": 148
},
{
"epoch": 0.18807194698643104,
"grad_norm": 0.332676500082016,
"learning_rate": 0.0001832466441960091,
"loss": 0.42914730310440063,
"step": 149
},
{
"epoch": 0.18933417481855475,
"grad_norm": 0.3765426278114319,
"learning_rate": 0.00018301840308155507,
"loss": 0.5210474729537964,
"step": 150
},
{
"epoch": 0.19059640265067845,
"grad_norm": 0.3598466217517853,
"learning_rate": 0.00018278876189423179,
"loss": 1.0533007383346558,
"step": 151
},
{
"epoch": 0.19185863048280213,
"grad_norm": 0.5936484932899475,
"learning_rate": 0.00018255772450684798,
"loss": 0.8764799237251282,
"step": 152
},
{
"epoch": 0.19312085831492584,
"grad_norm": 0.37642624974250793,
"learning_rate": 0.00018232529481575872,
"loss": 0.46875783801078796,
"step": 153
},
{
"epoch": 0.19438308614704955,
"grad_norm": 0.36098363995552063,
"learning_rate": 0.00018209147674079983,
"loss": 0.6464822292327881,
"step": 154
},
{
"epoch": 0.19564531397917323,
"grad_norm": 0.39462804794311523,
"learning_rate": 0.00018185627422522148,
"loss": 0.7827063798904419,
"step": 155
},
{
"epoch": 0.19690754181129694,
"grad_norm": 0.36141112446784973,
"learning_rate": 0.0001816196912356222,
"loss": 0.9432686567306519,
"step": 156
},
{
"epoch": 0.19816976964342065,
"grad_norm": 0.3857667148113251,
"learning_rate": 0.00018138173176188133,
"loss": 0.8610580563545227,
"step": 157
},
{
"epoch": 0.19943199747554433,
"grad_norm": 0.35036033391952515,
"learning_rate": 0.00018114239981709232,
"loss": 0.7541987299919128,
"step": 158
},
{
"epoch": 0.20069422530766803,
"grad_norm": 0.3643214702606201,
"learning_rate": 0.00018090169943749476,
"loss": 0.5373222827911377,
"step": 159
},
{
"epoch": 0.20195645313979174,
"grad_norm": 0.3778736889362335,
"learning_rate": 0.00018065963468240625,
"loss": 0.5798829197883606,
"step": 160
},
{
"epoch": 0.20321868097191542,
"grad_norm": 0.3862821161746979,
"learning_rate": 0.00018041620963415417,
"loss": 0.8069719672203064,
"step": 161
},
{
"epoch": 0.20448090880403913,
"grad_norm": 0.36028918623924255,
"learning_rate": 0.00018017142839800668,
"loss": 0.7396454811096191,
"step": 162
},
{
"epoch": 0.20574313663616284,
"grad_norm": 0.3179962635040283,
"learning_rate": 0.00017992529510210348,
"loss": 0.4463472366333008,
"step": 163
},
{
"epoch": 0.20700536446828652,
"grad_norm": 0.3768749237060547,
"learning_rate": 0.00017967781389738625,
"loss": 0.6056400537490845,
"step": 164
},
{
"epoch": 0.20826759230041023,
"grad_norm": 0.3443696200847626,
"learning_rate": 0.0001794289889575286,
"loss": 0.6053676009178162,
"step": 165
},
{
"epoch": 0.20952982013253393,
"grad_norm": 0.40036582946777344,
"learning_rate": 0.00017917882447886582,
"loss": 0.669062077999115,
"step": 166
},
{
"epoch": 0.21079204796465761,
"grad_norm": 0.373081773519516,
"learning_rate": 0.00017892732468032386,
"loss": 0.6552575826644897,
"step": 167
},
{
"epoch": 0.21205427579678132,
"grad_norm": 0.3748333752155304,
"learning_rate": 0.00017867449380334834,
"loss": 0.7766703963279724,
"step": 168
},
{
"epoch": 0.21331650362890503,
"grad_norm": 0.3774300813674927,
"learning_rate": 0.00017842033611183307,
"loss": 0.425309419631958,
"step": 169
},
{
"epoch": 0.2145787314610287,
"grad_norm": 0.3346552848815918,
"learning_rate": 0.00017816485589204801,
"loss": 0.39386531710624695,
"step": 170
},
{
"epoch": 0.21584095929315242,
"grad_norm": 0.37330710887908936,
"learning_rate": 0.00017790805745256704,
"loss": 0.8232768774032593,
"step": 171
},
{
"epoch": 0.21710318712527613,
"grad_norm": 0.39691922068595886,
"learning_rate": 0.00017764994512419534,
"loss": 0.6968734264373779,
"step": 172
},
{
"epoch": 0.2183654149573998,
"grad_norm": 0.39556068181991577,
"learning_rate": 0.0001773905232598963,
"loss": 0.6288269758224487,
"step": 173
},
{
"epoch": 0.21962764278952351,
"grad_norm": 0.3653506338596344,
"learning_rate": 0.00017712979623471807,
"loss": 0.6284940838813782,
"step": 174
},
{
"epoch": 0.2208898706216472,
"grad_norm": 0.390316367149353,
"learning_rate": 0.00017686776844571988,
"loss": 0.7067583799362183,
"step": 175
},
{
"epoch": 0.2221520984537709,
"grad_norm": 0.3740655481815338,
"learning_rate": 0.0001766044443118978,
"loss": 0.5908397436141968,
"step": 176
},
{
"epoch": 0.2234143262858946,
"grad_norm": 0.3652481138706207,
"learning_rate": 0.00017633982827411032,
"loss": 0.5462816953659058,
"step": 177
},
{
"epoch": 0.2246765541180183,
"grad_norm": 0.32050153613090515,
"learning_rate": 0.00017607392479500325,
"loss": 0.46369433403015137,
"step": 178
},
{
"epoch": 0.225938781950142,
"grad_norm": 0.3392358720302582,
"learning_rate": 0.00017580673835893473,
"loss": 0.6735156774520874,
"step": 179
},
{
"epoch": 0.2272010097822657,
"grad_norm": 0.3717758059501648,
"learning_rate": 0.00017553827347189938,
"loss": 0.9343303442001343,
"step": 180
},
{
"epoch": 0.2284632376143894,
"grad_norm": 0.3827629089355469,
"learning_rate": 0.00017526853466145244,
"loss": 0.7392931580543518,
"step": 181
},
{
"epoch": 0.2297254654465131,
"grad_norm": 0.39305350184440613,
"learning_rate": 0.0001749975264766334,
"loss": 0.9212709665298462,
"step": 182
},
{
"epoch": 0.2309876932786368,
"grad_norm": 0.4486978352069855,
"learning_rate": 0.0001747252534878891,
"loss": 0.5881640315055847,
"step": 183
},
{
"epoch": 0.23224992111076048,
"grad_norm": 0.31108546257019043,
"learning_rate": 0.000174451720286997,
"loss": 0.3923819959163666,
"step": 184
},
{
"epoch": 0.2335121489428842,
"grad_norm": 0.3748640716075897,
"learning_rate": 0.00017417693148698743,
"loss": 0.7098450064659119,
"step": 185
},
{
"epoch": 0.2347743767750079,
"grad_norm": 0.3929251730442047,
"learning_rate": 0.00017390089172206592,
"loss": 0.6599665880203247,
"step": 186
},
{
"epoch": 0.23603660460713158,
"grad_norm": 0.3102874159812927,
"learning_rate": 0.00017362360564753505,
"loss": 0.48892730474472046,
"step": 187
},
{
"epoch": 0.2372988324392553,
"grad_norm": 0.3638162314891815,
"learning_rate": 0.00017334507793971592,
"loss": 0.6274378895759583,
"step": 188
},
{
"epoch": 0.238561060271379,
"grad_norm": 0.280404657125473,
"learning_rate": 0.00017306531329586933,
"loss": 0.2670789361000061,
"step": 189
},
{
"epoch": 0.23982328810350267,
"grad_norm": 0.3414492905139923,
"learning_rate": 0.00017278431643411642,
"loss": 0.854606568813324,
"step": 190
},
{
"epoch": 0.24108551593562638,
"grad_norm": 0.339760959148407,
"learning_rate": 0.00017250209209335927,
"loss": 0.4224780797958374,
"step": 191
},
{
"epoch": 0.2423477437677501,
"grad_norm": 0.3548067808151245,
"learning_rate": 0.00017221864503320092,
"loss": 0.6572182178497314,
"step": 192
},
{
"epoch": 0.24360997159987377,
"grad_norm": 0.3619638681411743,
"learning_rate": 0.0001719339800338651,
"loss": 0.4573401212692261,
"step": 193
},
{
"epoch": 0.24487219943199748,
"grad_norm": 0.36929795145988464,
"learning_rate": 0.0001716481018961156,
"loss": 0.6632043123245239,
"step": 194
},
{
"epoch": 0.24613442726412119,
"grad_norm": 0.37808045744895935,
"learning_rate": 0.00017136101544117525,
"loss": 0.7357593178749084,
"step": 195
},
{
"epoch": 0.24739665509624487,
"grad_norm": 0.38574209809303284,
"learning_rate": 0.00017107272551064473,
"loss": 0.4269335865974426,
"step": 196
},
{
"epoch": 0.24865888292836857,
"grad_norm": 0.3391668200492859,
"learning_rate": 0.0001707832369664209,
"loss": 0.8197081685066223,
"step": 197
},
{
"epoch": 0.24992111076049228,
"grad_norm": 0.40485379099845886,
"learning_rate": 0.00017049255469061474,
"loss": 0.7450565099716187,
"step": 198
},
{
"epoch": 0.251183338592616,
"grad_norm": 0.37861743569374084,
"learning_rate": 0.00017020068358546898,
"loss": 0.5399523973464966,
"step": 199
},
{
"epoch": 0.25244556642473964,
"grad_norm": 0.39403632283210754,
"learning_rate": 0.0001699076285732756,
"loss": 0.9128871560096741,
"step": 200
},
{
"epoch": 0.25370779425686335,
"grad_norm": 0.40291762351989746,
"learning_rate": 0.0001696133945962927,
"loss": 0.8255231976509094,
"step": 201
},
{
"epoch": 0.25497002208898706,
"grad_norm": 0.6885679364204407,
"learning_rate": 0.000169317986616661,
"loss": 0.40416646003723145,
"step": 202
},
{
"epoch": 0.25623224992111077,
"grad_norm": 0.37489989399909973,
"learning_rate": 0.00016902140961632054,
"loss": 0.688234269618988,
"step": 203
},
{
"epoch": 0.2574944777532345,
"grad_norm": 0.38479313254356384,
"learning_rate": 0.00016872366859692627,
"loss": 0.5331247448921204,
"step": 204
},
{
"epoch": 0.2587567055853582,
"grad_norm": 0.40287116169929504,
"learning_rate": 0.00016842476857976396,
"loss": 0.7545835971832275,
"step": 205
},
{
"epoch": 0.26001893341748183,
"grad_norm": 0.3530018627643585,
"learning_rate": 0.0001681247146056654,
"loss": 0.5984229445457458,
"step": 206
},
{
"epoch": 0.26128116124960554,
"grad_norm": 0.34704816341400146,
"learning_rate": 0.00016782351173492342,
"loss": 0.867906391620636,
"step": 207
},
{
"epoch": 0.26254338908172925,
"grad_norm": 0.3187376856803894,
"learning_rate": 0.00016752116504720644,
"loss": 0.3967270255088806,
"step": 208
},
{
"epoch": 0.26380561691385296,
"grad_norm": 0.4047222435474396,
"learning_rate": 0.00016721767964147306,
"loss": 0.7225915193557739,
"step": 209
},
{
"epoch": 0.26506784474597667,
"grad_norm": 0.3720124661922455,
"learning_rate": 0.00016691306063588583,
"loss": 0.414902001619339,
"step": 210
},
{
"epoch": 0.2663300725781004,
"grad_norm": 0.27026864886283875,
"learning_rate": 0.00016660731316772505,
"loss": 0.2642422616481781,
"step": 211
},
{
"epoch": 0.267592300410224,
"grad_norm": 0.28109508752822876,
"learning_rate": 0.00016630044239330204,
"loss": 0.3239024877548218,
"step": 212
},
{
"epoch": 0.26885452824234773,
"grad_norm": 0.4051285982131958,
"learning_rate": 0.0001659924534878723,
"loss": 0.5133159160614014,
"step": 213
},
{
"epoch": 0.27011675607447144,
"grad_norm": 0.389447420835495,
"learning_rate": 0.00016568335164554812,
"loss": 0.5882396101951599,
"step": 214
},
{
"epoch": 0.27137898390659515,
"grad_norm": 0.4064750075340271,
"learning_rate": 0.00016537314207921115,
"loss": 0.8135666847229004,
"step": 215
},
{
"epoch": 0.27264121173871886,
"grad_norm": 0.4201750159263611,
"learning_rate": 0.0001650618300204242,
"loss": 0.5702388286590576,
"step": 216
},
{
"epoch": 0.2739034395708425,
"grad_norm": 0.39069369435310364,
"learning_rate": 0.00016474942071934337,
"loss": 0.5717343688011169,
"step": 217
},
{
"epoch": 0.2751656674029662,
"grad_norm": 0.407742977142334,
"learning_rate": 0.00016443591944462915,
"loss": 0.7300087213516235,
"step": 218
},
{
"epoch": 0.2764278952350899,
"grad_norm": 0.3515043258666992,
"learning_rate": 0.00016412133148335784,
"loss": 0.3343101143836975,
"step": 219
},
{
"epoch": 0.27769012306721363,
"grad_norm": 0.391044557094574,
"learning_rate": 0.00016380566214093225,
"loss": 0.7425781488418579,
"step": 220
},
{
"epoch": 0.27895235089933734,
"grad_norm": 0.4042036831378937,
"learning_rate": 0.0001634889167409923,
"loss": 0.7461481690406799,
"step": 221
},
{
"epoch": 0.28021457873146105,
"grad_norm": 0.3601584732532501,
"learning_rate": 0.0001631711006253251,
"loss": 0.37352609634399414,
"step": 222
},
{
"epoch": 0.2814768065635847,
"grad_norm": 0.37277212738990784,
"learning_rate": 0.00016285221915377508,
"loss": 0.39840951561927795,
"step": 223
},
{
"epoch": 0.2827390343957084,
"grad_norm": 0.41219770908355713,
"learning_rate": 0.0001625322777041534,
"loss": 0.631761908531189,
"step": 224
},
{
"epoch": 0.2840012622278321,
"grad_norm": 0.3973751962184906,
"learning_rate": 0.0001622112816721474,
"loss": 0.905396580696106,
"step": 225
},
{
"epoch": 0.2852634900599558,
"grad_norm": 0.4199240505695343,
"learning_rate": 0.00016188923647122947,
"loss": 0.5509951710700989,
"step": 226
},
{
"epoch": 0.28652571789207953,
"grad_norm": 0.3599737882614136,
"learning_rate": 0.0001615661475325658,
"loss": 0.6364030838012695,
"step": 227
},
{
"epoch": 0.28778794572420324,
"grad_norm": 0.36739909648895264,
"learning_rate": 0.000161242020304925,
"loss": 0.6433310508728027,
"step": 228
},
{
"epoch": 0.2890501735563269,
"grad_norm": 0.3900837004184723,
"learning_rate": 0.00016091686025458576,
"loss": 0.965069055557251,
"step": 229
},
{
"epoch": 0.2903124013884506,
"grad_norm": 0.35347774624824524,
"learning_rate": 0.0001605906728652451,
"loss": 0.5886582136154175,
"step": 230
},
{
"epoch": 0.2915746292205743,
"grad_norm": 0.4109002649784088,
"learning_rate": 0.00016026346363792567,
"loss": 0.5591490268707275,
"step": 231
},
{
"epoch": 0.292836857052698,
"grad_norm": 0.3631947636604309,
"learning_rate": 0.0001599352380908829,
"loss": 0.544223427772522,
"step": 232
},
{
"epoch": 0.2940990848848217,
"grad_norm": 0.3431711196899414,
"learning_rate": 0.00015960600175951223,
"loss": 0.4162474274635315,
"step": 233
},
{
"epoch": 0.29536131271694543,
"grad_norm": 0.36346155405044556,
"learning_rate": 0.0001592757601962555,
"loss": 0.8591347932815552,
"step": 234
},
{
"epoch": 0.2966235405490691,
"grad_norm": 0.33583030104637146,
"learning_rate": 0.00015894451897050738,
"loss": 0.4463670551776886,
"step": 235
},
{
"epoch": 0.2978857683811928,
"grad_norm": 0.3296612799167633,
"learning_rate": 0.00015861228366852148,
"loss": 0.46573173999786377,
"step": 236
},
{
"epoch": 0.2991479962133165,
"grad_norm": 0.3123343288898468,
"learning_rate": 0.0001582790598933161,
"loss": 0.3503931164741516,
"step": 237
},
{
"epoch": 0.3004102240454402,
"grad_norm": 0.374508261680603,
"learning_rate": 0.0001579448532645798,
"loss": 0.5895912051200867,
"step": 238
},
{
"epoch": 0.3016724518775639,
"grad_norm": 0.3595065176486969,
"learning_rate": 0.00015760966941857647,
"loss": 0.565118670463562,
"step": 239
},
{
"epoch": 0.30293467970968757,
"grad_norm": 0.3403629660606384,
"learning_rate": 0.00015727351400805052,
"loss": 0.3920265734195709,
"step": 240
},
{
"epoch": 0.3041969075418113,
"grad_norm": 0.3979881703853607,
"learning_rate": 0.00015693639270213136,
"loss": 0.8573540449142456,
"step": 241
},
{
"epoch": 0.305459135373935,
"grad_norm": 0.39144444465637207,
"learning_rate": 0.0001565983111862378,
"loss": 0.6504969000816345,
"step": 242
},
{
"epoch": 0.3067213632060587,
"grad_norm": 0.37401193380355835,
"learning_rate": 0.00015625927516198232,
"loss": 0.5543976426124573,
"step": 243
},
{
"epoch": 0.3079835910381824,
"grad_norm": 0.37249916791915894,
"learning_rate": 0.0001559192903470747,
"loss": 0.781203031539917,
"step": 244
},
{
"epoch": 0.3092458188703061,
"grad_norm": 0.36005863547325134,
"learning_rate": 0.00015557836247522575,
"loss": 0.4812963306903839,
"step": 245
},
{
"epoch": 0.31050804670242976,
"grad_norm": 0.3561168611049652,
"learning_rate": 0.0001552364972960506,
"loss": 0.5244578719139099,
"step": 246
},
{
"epoch": 0.31177027453455347,
"grad_norm": 0.3064718544483185,
"learning_rate": 0.00015489370057497165,
"loss": 0.35441693663597107,
"step": 247
},
{
"epoch": 0.3130325023666772,
"grad_norm": 0.38345471024513245,
"learning_rate": 0.0001545499780931214,
"loss": 0.6824744343757629,
"step": 248
},
{
"epoch": 0.3142947301988009,
"grad_norm": 0.36782291531562805,
"learning_rate": 0.00015420533564724495,
"loss": 0.41213345527648926,
"step": 249
},
{
"epoch": 0.3155569580309246,
"grad_norm": 0.39493328332901,
"learning_rate": 0.00015385977904960226,
"loss": 0.5020935535430908,
"step": 250
},
{
"epoch": 0.3168191858630483,
"grad_norm": 0.3497244715690613,
"learning_rate": 0.00015351331412787004,
"loss": 0.5641796588897705,
"step": 251
},
{
"epoch": 0.31808141369517196,
"grad_norm": 0.3519827127456665,
"learning_rate": 0.0001531659467250436,
"loss": 0.8068366646766663,
"step": 252
},
{
"epoch": 0.31934364152729566,
"grad_norm": 0.3616220951080322,
"learning_rate": 0.0001528176826993382,
"loss": 0.8782303929328918,
"step": 253
},
{
"epoch": 0.32060586935941937,
"grad_norm": 0.4184557795524597,
"learning_rate": 0.00015246852792409033,
"loss": 0.7177759408950806,
"step": 254
},
{
"epoch": 0.3218680971915431,
"grad_norm": 0.4233710765838623,
"learning_rate": 0.0001521184882876585,
"loss": 0.7468725442886353,
"step": 255
},
{
"epoch": 0.3231303250236668,
"grad_norm": 0.358642578125,
"learning_rate": 0.00015176756969332425,
"loss": 0.4827675223350525,
"step": 256
},
{
"epoch": 0.3243925528557905,
"grad_norm": 0.33649536967277527,
"learning_rate": 0.00015141577805919226,
"loss": 0.3861742317676544,
"step": 257
},
{
"epoch": 0.32565478068791415,
"grad_norm": 0.3700178861618042,
"learning_rate": 0.0001510631193180907,
"loss": 0.7173401713371277,
"step": 258
},
{
"epoch": 0.32691700852003786,
"grad_norm": 0.3805610239505768,
"learning_rate": 0.00015070959941747124,
"loss": 0.8101674318313599,
"step": 259
},
{
"epoch": 0.32817923635216156,
"grad_norm": 0.38329991698265076,
"learning_rate": 0.00015035522431930856,
"loss": 0.8402124643325806,
"step": 260
},
{
"epoch": 0.32944146418428527,
"grad_norm": 0.361529678106308,
"learning_rate": 0.00015000000000000001,
"loss": 0.6627713441848755,
"step": 261
},
{
"epoch": 0.330703692016409,
"grad_norm": 0.3611642122268677,
"learning_rate": 0.00014964393245026466,
"loss": 0.3878118693828583,
"step": 262
},
{
"epoch": 0.3319659198485327,
"grad_norm": 0.41715049743652344,
"learning_rate": 0.00014928702767504233,
"loss": 0.5380449295043945,
"step": 263
},
{
"epoch": 0.33322814768065634,
"grad_norm": 0.39908990263938904,
"learning_rate": 0.00014892929169339235,
"loss": 0.5558310151100159,
"step": 264
},
{
"epoch": 0.33449037551278005,
"grad_norm": 0.39582890272140503,
"learning_rate": 0.00014857073053839206,
"loss": 0.7881603837013245,
"step": 265
},
{
"epoch": 0.33575260334490376,
"grad_norm": 0.3694429397583008,
"learning_rate": 0.0001482113502570349,
"loss": 0.6454510688781738,
"step": 266
},
{
"epoch": 0.33701483117702746,
"grad_norm": 0.25048568844795227,
"learning_rate": 0.00014785115691012864,
"loss": 0.23232965171337128,
"step": 267
},
{
"epoch": 0.33827705900915117,
"grad_norm": 0.34138715267181396,
"learning_rate": 0.00014749015657219313,
"loss": 0.4494091868400574,
"step": 268
},
{
"epoch": 0.3395392868412748,
"grad_norm": 0.34587278962135315,
"learning_rate": 0.00014712835533135774,
"loss": 0.6932641863822937,
"step": 269
},
{
"epoch": 0.34080151467339853,
"grad_norm": 0.39235740900039673,
"learning_rate": 0.00014676575928925867,
"loss": 0.6115721464157104,
"step": 270
},
{
"epoch": 0.34206374250552224,
"grad_norm": 0.372470498085022,
"learning_rate": 0.00014640237456093634,
"loss": 0.5936945676803589,
"step": 271
},
{
"epoch": 0.34332597033764595,
"grad_norm": 0.3751293122768402,
"learning_rate": 0.0001460382072747319,
"loss": 0.6361874341964722,
"step": 272
},
{
"epoch": 0.34458819816976965,
"grad_norm": 0.3495366871356964,
"learning_rate": 0.00014567326357218407,
"loss": 0.27429258823394775,
"step": 273
},
{
"epoch": 0.34585042600189336,
"grad_norm": 0.40388405323028564,
"learning_rate": 0.00014530754960792553,
"loss": 0.46181124448776245,
"step": 274
},
{
"epoch": 0.347112653834017,
"grad_norm": 0.319353312253952,
"learning_rate": 0.0001449410715495791,
"loss": 0.3895929455757141,
"step": 275
},
{
"epoch": 0.3483748816661407,
"grad_norm": 0.3918631970882416,
"learning_rate": 0.00014457383557765386,
"loss": 0.7136199474334717,
"step": 276
},
{
"epoch": 0.34963710949826443,
"grad_norm": 0.36512160301208496,
"learning_rate": 0.00014420584788544057,
"loss": 0.6242626905441284,
"step": 277
},
{
"epoch": 0.35089933733038814,
"grad_norm": 0.4133952558040619,
"learning_rate": 0.00014383711467890774,
"loss": 0.5601866245269775,
"step": 278
},
{
"epoch": 0.35216156516251185,
"grad_norm": 0.4711982011795044,
"learning_rate": 0.00014346764217659653,
"loss": 0.3125555217266083,
"step": 279
},
{
"epoch": 0.35342379299463555,
"grad_norm": 0.3581778109073639,
"learning_rate": 0.00014309743660951595,
"loss": 0.715130090713501,
"step": 280
},
{
"epoch": 0.3546860208267592,
"grad_norm": 0.34894779324531555,
"learning_rate": 0.0001427265042210381,
"loss": 0.5023713111877441,
"step": 281
},
{
"epoch": 0.3559482486588829,
"grad_norm": 0.3577764332294464,
"learning_rate": 0.00014235485126679243,
"loss": 0.6359988451004028,
"step": 282
},
{
"epoch": 0.3572104764910066,
"grad_norm": 0.44540712237358093,
"learning_rate": 0.00014198248401456055,
"loss": 0.8171525597572327,
"step": 283
},
{
"epoch": 0.35847270432313033,
"grad_norm": 0.3892884850502014,
"learning_rate": 0.0001416094087441704,
"loss": 0.5745326280593872,
"step": 284
},
{
"epoch": 0.35973493215525404,
"grad_norm": 0.36921554803848267,
"learning_rate": 0.00014123563174739037,
"loss": 0.4776252210140228,
"step": 285
},
{
"epoch": 0.36099715998737775,
"grad_norm": 0.38392379879951477,
"learning_rate": 0.00014086115932782314,
"loss": 0.5178923606872559,
"step": 286
},
{
"epoch": 0.3622593878195014,
"grad_norm": 0.2495623081922531,
"learning_rate": 0.00014048599780079957,
"loss": 0.25248217582702637,
"step": 287
},
{
"epoch": 0.3635216156516251,
"grad_norm": 0.4058895409107208,
"learning_rate": 0.00014011015349327187,
"loss": 0.6448837518692017,
"step": 288
},
{
"epoch": 0.3647838434837488,
"grad_norm": 0.38654524087905884,
"learning_rate": 0.00013973363274370721,
"loss": 0.5187302827835083,
"step": 289
},
{
"epoch": 0.3660460713158725,
"grad_norm": 0.3716411292552948,
"learning_rate": 0.0001393564419019806,
"loss": 0.7247863411903381,
"step": 290
},
{
"epoch": 0.36730829914799623,
"grad_norm": 0.36923542618751526,
"learning_rate": 0.00013897858732926793,
"loss": 0.44380512833595276,
"step": 291
},
{
"epoch": 0.36857052698011994,
"grad_norm": 0.38871094584465027,
"learning_rate": 0.00013860007539793871,
"loss": 0.8842666149139404,
"step": 292
},
{
"epoch": 0.3698327548122436,
"grad_norm": 0.35937783122062683,
"learning_rate": 0.00013822091249144838,
"loss": 0.489496111869812,
"step": 293
},
{
"epoch": 0.3710949826443673,
"grad_norm": 0.3654249310493469,
"learning_rate": 0.00013784110500423104,
"loss": 0.5621508955955505,
"step": 294
},
{
"epoch": 0.372357210476491,
"grad_norm": 0.4184640049934387,
"learning_rate": 0.00013746065934159123,
"loss": 0.4694799780845642,
"step": 295
},
{
"epoch": 0.3736194383086147,
"grad_norm": 0.40087419748306274,
"learning_rate": 0.00013707958191959608,
"loss": 0.7347521781921387,
"step": 296
},
{
"epoch": 0.3748816661407384,
"grad_norm": 0.43245846033096313,
"learning_rate": 0.00013669787916496722,
"loss": 0.6806380152702332,
"step": 297
},
{
"epoch": 0.3761438939728621,
"grad_norm": 0.36302655935287476,
"learning_rate": 0.00013631555751497215,
"loss": 0.8191426992416382,
"step": 298
},
{
"epoch": 0.3774061218049858,
"grad_norm": 0.3232358396053314,
"learning_rate": 0.00013593262341731578,
"loss": 0.3671002984046936,
"step": 299
},
{
"epoch": 0.3786683496371095,
"grad_norm": 0.3223403990268707,
"learning_rate": 0.0001355490833300318,
"loss": 0.3676319718360901,
"step": 300
},
{
"epoch": 0.3799305774692332,
"grad_norm": 0.3848235309123993,
"learning_rate": 0.00013516494372137368,
"loss": 0.7041884660720825,
"step": 301
},
{
"epoch": 0.3811928053013569,
"grad_norm": 0.39564049243927,
"learning_rate": 0.0001347802110697055,
"loss": 0.7267032861709595,
"step": 302
},
{
"epoch": 0.3824550331334806,
"grad_norm": 0.3752077519893646,
"learning_rate": 0.00013439489186339282,
"loss": 0.44746118783950806,
"step": 303
},
{
"epoch": 0.38371726096560427,
"grad_norm": 0.3596220016479492,
"learning_rate": 0.00013400899260069323,
"loss": 0.42425066232681274,
"step": 304
},
{
"epoch": 0.384979488797728,
"grad_norm": 0.36152541637420654,
"learning_rate": 0.00013362251978964675,
"loss": 0.457078754901886,
"step": 305
},
{
"epoch": 0.3862417166298517,
"grad_norm": 0.3770156502723694,
"learning_rate": 0.00013323547994796597,
"loss": 0.5810063481330872,
"step": 306
},
{
"epoch": 0.3875039444619754,
"grad_norm": 0.42228955030441284,
"learning_rate": 0.0001328478796029264,
"loss": 0.8851193189620972,
"step": 307
},
{
"epoch": 0.3887661722940991,
"grad_norm": 0.4153822660446167,
"learning_rate": 0.00013245972529125606,
"loss": 0.6357755661010742,
"step": 308
},
{
"epoch": 0.3900284001262228,
"grad_norm": 0.3957383930683136,
"learning_rate": 0.00013207102355902552,
"loss": 0.7041004300117493,
"step": 309
},
{
"epoch": 0.39129062795834646,
"grad_norm": 0.37788495421409607,
"learning_rate": 0.0001316817809615373,
"loss": 0.5084975361824036,
"step": 310
},
{
"epoch": 0.39255285579047017,
"grad_norm": 0.3773125410079956,
"learning_rate": 0.00013129200406321545,
"loss": 0.7748256325721741,
"step": 311
},
{
"epoch": 0.3938150836225939,
"grad_norm": 0.36805328726768494,
"learning_rate": 0.00013090169943749476,
"loss": 0.5911955833435059,
"step": 312
},
{
"epoch": 0.3950773114547176,
"grad_norm": 0.4318149983882904,
"learning_rate": 0.00013051087366670994,
"loss": 0.6285633444786072,
"step": 313
},
{
"epoch": 0.3963395392868413,
"grad_norm": 0.27865713834762573,
"learning_rate": 0.00013011953334198466,
"loss": 0.2808951139450073,
"step": 314
},
{
"epoch": 0.397601767118965,
"grad_norm": 0.38748934864997864,
"learning_rate": 0.00012972768506312027,
"loss": 0.7810741662979126,
"step": 315
},
{
"epoch": 0.39886399495108865,
"grad_norm": 0.39623865485191345,
"learning_rate": 0.00012933533543848461,
"loss": 0.8346691727638245,
"step": 316
},
{
"epoch": 0.40012622278321236,
"grad_norm": 0.3087095022201538,
"learning_rate": 0.0001289424910849005,
"loss": 0.35411983728408813,
"step": 317
},
{
"epoch": 0.40138845061533607,
"grad_norm": 0.37265872955322266,
"learning_rate": 0.00012854915862753422,
"loss": 0.7961377501487732,
"step": 318
},
{
"epoch": 0.4026506784474598,
"grad_norm": 0.3931768536567688,
"learning_rate": 0.00012815534469978363,
"loss": 0.5816214084625244,
"step": 319
},
{
"epoch": 0.4039129062795835,
"grad_norm": 0.35481584072113037,
"learning_rate": 0.00012776105594316647,
"loss": 0.7527205944061279,
"step": 320
},
{
"epoch": 0.40517513411170714,
"grad_norm": 0.3482368290424347,
"learning_rate": 0.0001273662990072083,
"loss": 0.4816396236419678,
"step": 321
},
{
"epoch": 0.40643736194383084,
"grad_norm": 0.35917821526527405,
"learning_rate": 0.00012697108054933025,
"loss": 0.358943372964859,
"step": 322
},
{
"epoch": 0.40769958977595455,
"grad_norm": 0.35279327630996704,
"learning_rate": 0.000126575407234737,
"loss": 0.6909571290016174,
"step": 323
},
{
"epoch": 0.40896181760807826,
"grad_norm": 0.3735545575618744,
"learning_rate": 0.00012617928573630406,
"loss": 0.7668647170066833,
"step": 324
},
{
"epoch": 0.41022404544020197,
"grad_norm": 0.3791963458061218,
"learning_rate": 0.00012578272273446536,
"loss": 0.4582277238368988,
"step": 325
},
{
"epoch": 0.4114862732723257,
"grad_norm": 0.3846660852432251,
"learning_rate": 0.0001253857249171008,
"loss": 0.5816541910171509,
"step": 326
},
{
"epoch": 0.41274850110444933,
"grad_norm": 0.2960149049758911,
"learning_rate": 0.0001249882989794231,
"loss": 0.33520427346229553,
"step": 327
},
{
"epoch": 0.41401072893657304,
"grad_norm": 0.5094306468963623,
"learning_rate": 0.00012459045162386512,
"loss": 0.901237964630127,
"step": 328
},
{
"epoch": 0.41527295676869674,
"grad_norm": 0.4056321680545807,
"learning_rate": 0.00012419218955996676,
"loss": 0.37850597500801086,
"step": 329
},
{
"epoch": 0.41653518460082045,
"grad_norm": 0.4399261772632599,
"learning_rate": 0.00012379351950426187,
"loss": 0.7433345913887024,
"step": 330
},
{
"epoch": 0.41779741243294416,
"grad_norm": 0.38947823643684387,
"learning_rate": 0.0001233944481801649,
"loss": 0.7301508784294128,
"step": 331
},
{
"epoch": 0.41905964026506787,
"grad_norm": 0.4117131531238556,
"learning_rate": 0.00012299498231785737,
"loss": 0.5769900679588318,
"step": 332
},
{
"epoch": 0.4203218680971915,
"grad_norm": 0.3559359312057495,
"learning_rate": 0.00012259512865417477,
"loss": 0.5584972500801086,
"step": 333
},
{
"epoch": 0.42158409592931523,
"grad_norm": 0.4073047637939453,
"learning_rate": 0.00012219489393249262,
"loss": 0.4495258927345276,
"step": 334
},
{
"epoch": 0.42284632376143894,
"grad_norm": 0.36505264043807983,
"learning_rate": 0.00012179428490261278,
"loss": 0.749606192111969,
"step": 335
},
{
"epoch": 0.42410855159356264,
"grad_norm": 0.3678975999355316,
"learning_rate": 0.00012139330832064974,
"loss": 0.32790112495422363,
"step": 336
},
{
"epoch": 0.42537077942568635,
"grad_norm": 0.37156620621681213,
"learning_rate": 0.00012099197094891659,
"loss": 0.43149426579475403,
"step": 337
},
{
"epoch": 0.42663300725781006,
"grad_norm": 0.3237273395061493,
"learning_rate": 0.00012059027955581099,
"loss": 0.3703850209712982,
"step": 338
},
{
"epoch": 0.4278952350899337,
"grad_norm": 0.3485283851623535,
"learning_rate": 0.00012018824091570103,
"loss": 0.569449782371521,
"step": 339
},
{
"epoch": 0.4291574629220574,
"grad_norm": 0.378540962934494,
"learning_rate": 0.00011978586180881099,
"loss": 0.48175811767578125,
"step": 340
},
{
"epoch": 0.43041969075418113,
"grad_norm": 0.3947147727012634,
"learning_rate": 0.00011938314902110701,
"loss": 0.4960615634918213,
"step": 341
},
{
"epoch": 0.43168191858630484,
"grad_norm": 0.34757497906684875,
"learning_rate": 0.0001189801093441826,
"loss": 0.34023621678352356,
"step": 342
},
{
"epoch": 0.43294414641842854,
"grad_norm": 0.3692375719547272,
"learning_rate": 0.00011857674957514411,
"loss": 0.760047197341919,
"step": 343
},
{
"epoch": 0.43420637425055225,
"grad_norm": 0.38019847869873047,
"learning_rate": 0.00011817307651649616,
"loss": 0.8378443717956543,
"step": 344
},
{
"epoch": 0.4354686020826759,
"grad_norm": 0.3751029074192047,
"learning_rate": 0.00011776909697602689,
"loss": 0.4766428470611572,
"step": 345
},
{
"epoch": 0.4367308299147996,
"grad_norm": 0.5471876263618469,
"learning_rate": 0.00011736481776669306,
"loss": 0.41353490948677063,
"step": 346
},
{
"epoch": 0.4379930577469233,
"grad_norm": 0.3773936629295349,
"learning_rate": 0.00011696024570650528,
"loss": 0.5652437210083008,
"step": 347
},
{
"epoch": 0.43925528557904703,
"grad_norm": 0.3828847110271454,
"learning_rate": 0.000116555387618413,
"loss": 0.6103649139404297,
"step": 348
},
{
"epoch": 0.44051751341117074,
"grad_norm": 0.35921478271484375,
"learning_rate": 0.00011615025033018936,
"loss": 0.609113872051239,
"step": 349
},
{
"epoch": 0.4417797412432944,
"grad_norm": 0.3687792420387268,
"learning_rate": 0.00011574484067431617,
"loss": 0.8462064266204834,
"step": 350
},
{
"epoch": 0.4430419690754181,
"grad_norm": 0.3686203956604004,
"learning_rate": 0.00011533916548786857,
"loss": 0.656709611415863,
"step": 351
},
{
"epoch": 0.4443041969075418,
"grad_norm": 0.39589008688926697,
"learning_rate": 0.0001149332316123997,
"loss": 0.7393782734870911,
"step": 352
},
{
"epoch": 0.4455664247396655,
"grad_norm": 0.38354629278182983,
"learning_rate": 0.0001145270458938255,
"loss": 0.6119332909584045,
"step": 353
},
{
"epoch": 0.4468286525717892,
"grad_norm": 0.3615580201148987,
"learning_rate": 0.00011412061518230914,
"loss": 0.5982248783111572,
"step": 354
},
{
"epoch": 0.44809088040391293,
"grad_norm": 0.35184618830680847,
"learning_rate": 0.00011371394633214547,
"loss": 0.7312008142471313,
"step": 355
},
{
"epoch": 0.4493531082360366,
"grad_norm": 0.37319618463516235,
"learning_rate": 0.00011330704620164538,
"loss": 0.4518621265888214,
"step": 356
},
{
"epoch": 0.4506153360681603,
"grad_norm": 0.38271263241767883,
"learning_rate": 0.00011289992165302035,
"loss": 0.684691309928894,
"step": 357
},
{
"epoch": 0.451877563900284,
"grad_norm": 0.3614532947540283,
"learning_rate": 0.00011249257955226648,
"loss": 0.7593181729316711,
"step": 358
},
{
"epoch": 0.4531397917324077,
"grad_norm": 0.42146942019462585,
"learning_rate": 0.00011208502676904886,
"loss": 0.6286287307739258,
"step": 359
},
{
"epoch": 0.4544020195645314,
"grad_norm": 0.36411377787590027,
"learning_rate": 0.00011167727017658562,
"loss": 0.7084791660308838,
"step": 360
},
{
"epoch": 0.4556642473966551,
"grad_norm": 0.3926357328891754,
"learning_rate": 0.00011126931665153212,
"loss": 0.7415444254875183,
"step": 361
},
{
"epoch": 0.4569264752287788,
"grad_norm": 0.3722608685493469,
"learning_rate": 0.0001108611730738648,
"loss": 0.5457031726837158,
"step": 362
},
{
"epoch": 0.4581887030609025,
"grad_norm": 0.34348252415657043,
"learning_rate": 0.00011045284632676536,
"loss": 0.3467724919319153,
"step": 363
},
{
"epoch": 0.4594509308930262,
"grad_norm": 0.38620299100875854,
"learning_rate": 0.00011004434329650452,
"loss": 0.6784603595733643,
"step": 364
},
{
"epoch": 0.4607131587251499,
"grad_norm": 0.412806898355484,
"learning_rate": 0.000109635670872326,
"loss": 0.541936993598938,
"step": 365
},
{
"epoch": 0.4619753865572736,
"grad_norm": 0.37946563959121704,
"learning_rate": 0.00010922683594633021,
"loss": 0.7005019187927246,
"step": 366
},
{
"epoch": 0.4632376143893973,
"grad_norm": 0.36721378564834595,
"learning_rate": 0.00010881784541335817,
"loss": 0.5035321712493896,
"step": 367
},
{
"epoch": 0.46449984222152096,
"grad_norm": 0.41076555848121643,
"learning_rate": 0.00010840870617087514,
"loss": 0.7746437191963196,
"step": 368
},
{
"epoch": 0.4657620700536447,
"grad_norm": 0.3742596209049225,
"learning_rate": 0.00010799942511885418,
"loss": 0.5171118974685669,
"step": 369
},
{
"epoch": 0.4670242978857684,
"grad_norm": 0.3880580961704254,
"learning_rate": 0.00010759000915966011,
"loss": 0.7049781680107117,
"step": 370
},
{
"epoch": 0.4682865257178921,
"grad_norm": 0.3612365424633026,
"learning_rate": 0.00010718046519793276,
"loss": 0.43177270889282227,
"step": 371
},
{
"epoch": 0.4695487535500158,
"grad_norm": 0.4223220944404602,
"learning_rate": 0.00010677080014047076,
"loss": 0.6074368357658386,
"step": 372
},
{
"epoch": 0.47081098138213945,
"grad_norm": 0.3780396282672882,
"learning_rate": 0.00010636102089611491,
"loss": 0.5008561015129089,
"step": 373
},
{
"epoch": 0.47207320921426316,
"grad_norm": 0.3705812096595764,
"learning_rate": 0.00010595113437563176,
"loss": 0.6822476983070374,
"step": 374
},
{
"epoch": 0.47333543704638686,
"grad_norm": 0.4130505919456482,
"learning_rate": 0.000105541147491597,
"loss": 0.5583031177520752,
"step": 375
},
{
"epoch": 0.4745976648785106,
"grad_norm": 0.3589628040790558,
"learning_rate": 0.00010513106715827896,
"loss": 0.801206111907959,
"step": 376
},
{
"epoch": 0.4758598927106343,
"grad_norm": 0.3859142065048218,
"learning_rate": 0.00010472090029152196,
"loss": 0.5001563429832458,
"step": 377
},
{
"epoch": 0.477122120542758,
"grad_norm": 0.5252732038497925,
"learning_rate": 0.00010431065380862959,
"loss": 0.6630918383598328,
"step": 378
},
{
"epoch": 0.47838434837488164,
"grad_norm": 0.37909185886383057,
"learning_rate": 0.00010390033462824817,
"loss": 0.7034825682640076,
"step": 379
},
{
"epoch": 0.47964657620700535,
"grad_norm": 0.3590451776981354,
"learning_rate": 0.00010348994967025012,
"loss": 0.36768239736557007,
"step": 380
},
{
"epoch": 0.48090880403912906,
"grad_norm": 0.3347563147544861,
"learning_rate": 0.00010307950585561706,
"loss": 0.35689371824264526,
"step": 381
},
{
"epoch": 0.48217103187125276,
"grad_norm": 0.3807820975780487,
"learning_rate": 0.00010266901010632324,
"loss": 0.4797685742378235,
"step": 382
},
{
"epoch": 0.48343325970337647,
"grad_norm": 0.35765600204467773,
"learning_rate": 0.00010225846934521881,
"loss": 0.5064284205436707,
"step": 383
},
{
"epoch": 0.4846954875355002,
"grad_norm": 0.39294371008872986,
"learning_rate": 0.00010184789049591299,
"loss": 0.6024259924888611,
"step": 384
},
{
"epoch": 0.48595771536762383,
"grad_norm": 0.3386979401111603,
"learning_rate": 0.00010143728048265735,
"loss": 0.4336264133453369,
"step": 385
},
{
"epoch": 0.48721994319974754,
"grad_norm": 0.38877370953559875,
"learning_rate": 0.00010102664623022899,
"loss": 0.5891298055648804,
"step": 386
},
{
"epoch": 0.48848217103187125,
"grad_norm": 0.3828097879886627,
"learning_rate": 0.00010061599466381389,
"loss": 0.608544111251831,
"step": 387
},
{
"epoch": 0.48974439886399496,
"grad_norm": 0.3743601441383362,
"learning_rate": 0.0001002053327088899,
"loss": 0.6880306601524353,
"step": 388
},
{
"epoch": 0.49100662669611866,
"grad_norm": 0.39663559198379517,
"learning_rate": 9.979466729111013e-05,
"loss": 0.587350070476532,
"step": 389
},
{
"epoch": 0.49226885452824237,
"grad_norm": 0.4369630813598633,
"learning_rate": 9.938400533618615e-05,
"loss": 0.6706233024597168,
"step": 390
},
{
"epoch": 0.493531082360366,
"grad_norm": 0.41926079988479614,
"learning_rate": 9.897335376977102e-05,
"loss": 0.6896798610687256,
"step": 391
},
{
"epoch": 0.49479331019248973,
"grad_norm": 0.4132974147796631,
"learning_rate": 9.856271951734268e-05,
"loss": 0.49843940138816833,
"step": 392
},
{
"epoch": 0.49605553802461344,
"grad_norm": 0.2707560956478119,
"learning_rate": 9.815210950408704e-05,
"loss": 0.2632002830505371,
"step": 393
},
{
"epoch": 0.49731776585673715,
"grad_norm": 0.38526275753974915,
"learning_rate": 9.774153065478121e-05,
"loss": 0.40896376967430115,
"step": 394
},
{
"epoch": 0.49857999368886086,
"grad_norm": 0.38434556126594543,
"learning_rate": 9.733098989367677e-05,
"loss": 0.5658249855041504,
"step": 395
},
{
"epoch": 0.49984222152098456,
"grad_norm": 0.37741097807884216,
"learning_rate": 9.692049414438299e-05,
"loss": 0.6638325452804565,
"step": 396
},
{
"epoch": 0.5011044493531083,
"grad_norm": 0.38284313678741455,
"learning_rate": 9.651005032974994e-05,
"loss": 0.822309672832489,
"step": 397
},
{
"epoch": 0.502366677185232,
"grad_norm": 0.39180007576942444,
"learning_rate": 9.609966537175185e-05,
"loss": 0.6988601684570312,
"step": 398
},
{
"epoch": 0.5036289050173557,
"grad_norm": 0.37315770983695984,
"learning_rate": 9.568934619137046e-05,
"loss": 0.3722432851791382,
"step": 399
},
{
"epoch": 0.5048911328494793,
"grad_norm": 0.3731346130371094,
"learning_rate": 9.52790997084781e-05,
"loss": 0.6665936708450317,
"step": 400
},
{
"epoch": 0.506153360681603,
"grad_norm": 0.39265018701553345,
"learning_rate": 9.486893284172102e-05,
"loss": 0.4295370578765869,
"step": 401
},
{
"epoch": 0.5074155885137267,
"grad_norm": 0.22621490061283112,
"learning_rate": 9.4458852508403e-05,
"loss": 0.1555391401052475,
"step": 402
},
{
"epoch": 0.5086778163458504,
"grad_norm": 0.39791470766067505,
"learning_rate": 9.404886562436825e-05,
"loss": 0.7941228151321411,
"step": 403
},
{
"epoch": 0.5099400441779741,
"grad_norm": 0.39022767543792725,
"learning_rate": 9.36389791038851e-05,
"loss": 0.6743201613426208,
"step": 404
},
{
"epoch": 0.5112022720100978,
"grad_norm": 0.3959182798862457,
"learning_rate": 9.322919985952926e-05,
"loss": 0.6928982138633728,
"step": 405
},
{
"epoch": 0.5124644998422215,
"grad_norm": 0.35128676891326904,
"learning_rate": 9.281953480206725e-05,
"loss": 0.4283405840396881,
"step": 406
},
{
"epoch": 0.5137267276743452,
"grad_norm": 0.38393881916999817,
"learning_rate": 9.240999084033991e-05,
"loss": 0.48866939544677734,
"step": 407
},
{
"epoch": 0.514988955506469,
"grad_norm": 0.3746855556964874,
"learning_rate": 9.200057488114585e-05,
"loss": 0.7293848395347595,
"step": 408
},
{
"epoch": 0.5162511833385927,
"grad_norm": 0.3574482500553131,
"learning_rate": 9.15912938291249e-05,
"loss": 0.7160978317260742,
"step": 409
},
{
"epoch": 0.5175134111707164,
"grad_norm": 0.31795260310173035,
"learning_rate": 9.118215458664185e-05,
"loss": 0.3059941828250885,
"step": 410
},
{
"epoch": 0.51877563900284,
"grad_norm": 0.37041789293289185,
"learning_rate": 9.077316405366981e-05,
"loss": 0.40029266476631165,
"step": 411
},
{
"epoch": 0.5200378668349637,
"grad_norm": 0.3135358989238739,
"learning_rate": 9.036432912767403e-05,
"loss": 0.34788432717323303,
"step": 412
},
{
"epoch": 0.5213000946670874,
"grad_norm": 0.3632740080356598,
"learning_rate": 8.99556567034955e-05,
"loss": 0.47788649797439575,
"step": 413
},
{
"epoch": 0.5225623224992111,
"grad_norm": 0.39943233132362366,
"learning_rate": 8.954715367323468e-05,
"loss": 0.7340242862701416,
"step": 414
},
{
"epoch": 0.5238245503313348,
"grad_norm": 0.35586607456207275,
"learning_rate": 8.91388269261352e-05,
"loss": 0.416128933429718,
"step": 415
},
{
"epoch": 0.5250867781634585,
"grad_norm": 0.38117703795433044,
"learning_rate": 8.87306833484679e-05,
"loss": 0.5627406239509583,
"step": 416
},
{
"epoch": 0.5263490059955822,
"grad_norm": 0.4389495253562927,
"learning_rate": 8.832272982341439e-05,
"loss": 0.41440343856811523,
"step": 417
},
{
"epoch": 0.5276112338277059,
"grad_norm": 0.4085499942302704,
"learning_rate": 8.791497323095116e-05,
"loss": 0.48129522800445557,
"step": 418
},
{
"epoch": 0.5288734616598296,
"grad_norm": 0.4046858549118042,
"learning_rate": 8.750742044773354e-05,
"loss": 0.6476734280586243,
"step": 419
},
{
"epoch": 0.5301356894919533,
"grad_norm": 0.4076245427131653,
"learning_rate": 8.710007834697969e-05,
"loss": 0.6386293768882751,
"step": 420
},
{
"epoch": 0.531397917324077,
"grad_norm": 0.4085608124732971,
"learning_rate": 8.669295379835467e-05,
"loss": 0.6650468707084656,
"step": 421
},
{
"epoch": 0.5326601451562007,
"grad_norm": 0.4489421844482422,
"learning_rate": 8.628605366785458e-05,
"loss": 0.5000302195549011,
"step": 422
},
{
"epoch": 0.5339223729883243,
"grad_norm": 0.3692164123058319,
"learning_rate": 8.587938481769089e-05,
"loss": 0.6816071271896362,
"step": 423
},
{
"epoch": 0.535184600820448,
"grad_norm": 0.40202704071998596,
"learning_rate": 8.547295410617453e-05,
"loss": 0.7187950611114502,
"step": 424
},
{
"epoch": 0.5364468286525718,
"grad_norm": 0.3954196870326996,
"learning_rate": 8.506676838760032e-05,
"loss": 0.47280117869377136,
"step": 425
},
{
"epoch": 0.5377090564846955,
"grad_norm": 0.4074536859989166,
"learning_rate": 8.466083451213144e-05,
"loss": 0.5304967761039734,
"step": 426
},
{
"epoch": 0.5389712843168192,
"grad_norm": 0.4292575418949127,
"learning_rate": 8.425515932568382e-05,
"loss": 0.5013709664344788,
"step": 427
},
{
"epoch": 0.5402335121489429,
"grad_norm": 0.3722835183143616,
"learning_rate": 8.384974966981063e-05,
"loss": 0.5023803114891052,
"step": 428
},
{
"epoch": 0.5414957399810666,
"grad_norm": 0.39425259828567505,
"learning_rate": 8.344461238158699e-05,
"loss": 0.5070059299468994,
"step": 429
},
{
"epoch": 0.5427579678131903,
"grad_norm": 0.3532828688621521,
"learning_rate": 8.303975429349473e-05,
"loss": 0.4102450907230377,
"step": 430
},
{
"epoch": 0.544020195645314,
"grad_norm": 0.41622671484947205,
"learning_rate": 8.263518223330697e-05,
"loss": 0.7629631757736206,
"step": 431
},
{
"epoch": 0.5452824234774377,
"grad_norm": 0.410709947347641,
"learning_rate": 8.223090302397313e-05,
"loss": 0.7080658078193665,
"step": 432
},
{
"epoch": 0.5465446513095614,
"grad_norm": 0.3647861182689667,
"learning_rate": 8.182692348350385e-05,
"loss": 0.48096179962158203,
"step": 433
},
{
"epoch": 0.547806879141685,
"grad_norm": 0.39459702372550964,
"learning_rate": 8.142325042485592e-05,
"loss": 0.8301153779029846,
"step": 434
},
{
"epoch": 0.5490691069738087,
"grad_norm": 0.3667653799057007,
"learning_rate": 8.101989065581743e-05,
"loss": 0.44432565569877625,
"step": 435
},
{
"epoch": 0.5503313348059324,
"grad_norm": 0.4047844707965851,
"learning_rate": 8.0616850978893e-05,
"loss": 0.5940053462982178,
"step": 436
},
{
"epoch": 0.5515935626380561,
"grad_norm": 0.4128320515155792,
"learning_rate": 8.021413819118903e-05,
"loss": 0.512177050113678,
"step": 437
},
{
"epoch": 0.5528557904701799,
"grad_norm": 0.37576359510421753,
"learning_rate": 7.9811759084299e-05,
"loss": 0.5231778025627136,
"step": 438
},
{
"epoch": 0.5541180183023036,
"grad_norm": 0.3246806263923645,
"learning_rate": 7.940972044418902e-05,
"loss": 0.31796854734420776,
"step": 439
},
{
"epoch": 0.5553802461344273,
"grad_norm": 0.35433802008628845,
"learning_rate": 7.900802905108342e-05,
"loss": 0.42495012283325195,
"step": 440
},
{
"epoch": 0.556642473966551,
"grad_norm": 0.4064764380455017,
"learning_rate": 7.860669167935028e-05,
"loss": 0.6670479774475098,
"step": 441
},
{
"epoch": 0.5579047017986747,
"grad_norm": 0.3848694860935211,
"learning_rate": 7.820571509738723e-05,
"loss": 0.9129263162612915,
"step": 442
},
{
"epoch": 0.5591669296307984,
"grad_norm": 0.33378908038139343,
"learning_rate": 7.780510606750742e-05,
"loss": 0.3959806561470032,
"step": 443
},
{
"epoch": 0.5604291574629221,
"grad_norm": 0.4084720313549042,
"learning_rate": 7.740487134582525e-05,
"loss": 0.5052785873413086,
"step": 444
},
{
"epoch": 0.5616913852950458,
"grad_norm": 0.4099523425102234,
"learning_rate": 7.700501768214267e-05,
"loss": 0.6453187465667725,
"step": 445
},
{
"epoch": 0.5629536131271694,
"grad_norm": 0.3560808002948761,
"learning_rate": 7.660555181983518e-05,
"loss": 0.4158024787902832,
"step": 446
},
{
"epoch": 0.5642158409592931,
"grad_norm": 0.39216476678848267,
"learning_rate": 7.620648049573815e-05,
"loss": 0.5767735242843628,
"step": 447
},
{
"epoch": 0.5654780687914168,
"grad_norm": 0.3903045356273651,
"learning_rate": 7.580781044003324e-05,
"loss": 0.44133317470550537,
"step": 448
},
{
"epoch": 0.5667402966235405,
"grad_norm": 0.37804114818573,
"learning_rate": 7.540954837613488e-05,
"loss": 0.3772793710231781,
"step": 449
},
{
"epoch": 0.5680025244556642,
"grad_norm": 0.40392929315567017,
"learning_rate": 7.50117010205769e-05,
"loss": 0.6205388307571411,
"step": 450
},
{
"epoch": 0.569264752287788,
"grad_norm": 0.414870023727417,
"learning_rate": 7.461427508289922e-05,
"loss": 0.58516925573349,
"step": 451
},
{
"epoch": 0.5705269801199117,
"grad_norm": 0.3570805490016937,
"learning_rate": 7.421727726553463e-05,
"loss": 0.4138091802597046,
"step": 452
},
{
"epoch": 0.5717892079520354,
"grad_norm": 0.3515688478946686,
"learning_rate": 7.382071426369597e-05,
"loss": 0.3913613557815552,
"step": 453
},
{
"epoch": 0.5730514357841591,
"grad_norm": 0.3770284056663513,
"learning_rate": 7.342459276526302e-05,
"loss": 0.6880075335502625,
"step": 454
},
{
"epoch": 0.5743136636162828,
"grad_norm": 0.3983762264251709,
"learning_rate": 7.302891945066974e-05,
"loss": 0.6962027549743652,
"step": 455
},
{
"epoch": 0.5755758914484065,
"grad_norm": 0.3529524505138397,
"learning_rate": 7.263370099279172e-05,
"loss": 0.4161332845687866,
"step": 456
},
{
"epoch": 0.5768381192805301,
"grad_norm": 0.3377407193183899,
"learning_rate": 7.223894405683354e-05,
"loss": 0.39849692583084106,
"step": 457
},
{
"epoch": 0.5781003471126538,
"grad_norm": 0.4013289511203766,
"learning_rate": 7.18446553002164e-05,
"loss": 0.5468084812164307,
"step": 458
},
{
"epoch": 0.5793625749447775,
"grad_norm": 0.39508214592933655,
"learning_rate": 7.14508413724658e-05,
"loss": 0.8175787329673767,
"step": 459
},
{
"epoch": 0.5806248027769012,
"grad_norm": 0.4191129803657532,
"learning_rate": 7.10575089150995e-05,
"loss": 0.5919452905654907,
"step": 460
},
{
"epoch": 0.5818870306090249,
"grad_norm": 0.40128064155578613,
"learning_rate": 7.066466456151541e-05,
"loss": 0.8323053121566772,
"step": 461
},
{
"epoch": 0.5831492584411486,
"grad_norm": 0.3903089761734009,
"learning_rate": 7.027231493687974e-05,
"loss": 0.4888315796852112,
"step": 462
},
{
"epoch": 0.5844114862732723,
"grad_norm": 0.3628254532814026,
"learning_rate": 6.988046665801536e-05,
"loss": 0.33037495613098145,
"step": 463
},
{
"epoch": 0.585673714105396,
"grad_norm": 0.3754008710384369,
"learning_rate": 6.948912633329007e-05,
"loss": 0.5007816553115845,
"step": 464
},
{
"epoch": 0.5869359419375197,
"grad_norm": 0.376667320728302,
"learning_rate": 6.909830056250527e-05,
"loss": 0.757786750793457,
"step": 465
},
{
"epoch": 0.5881981697696435,
"grad_norm": 0.29717469215393066,
"learning_rate": 6.870799593678459e-05,
"loss": 0.2943430244922638,
"step": 466
},
{
"epoch": 0.5894603976017672,
"grad_norm": 0.38486912846565247,
"learning_rate": 6.831821903846273e-05,
"loss": 0.44896000623703003,
"step": 467
},
{
"epoch": 0.5907226254338909,
"grad_norm": 0.34192511439323425,
"learning_rate": 6.792897644097451e-05,
"loss": 0.29370012879371643,
"step": 468
},
{
"epoch": 0.5919848532660145,
"grad_norm": 0.4050130248069763,
"learning_rate": 6.754027470874396e-05,
"loss": 0.6608400344848633,
"step": 469
},
{
"epoch": 0.5932470810981382,
"grad_norm": 0.3004320561885834,
"learning_rate": 6.715212039707364e-05,
"loss": 0.23013579845428467,
"step": 470
},
{
"epoch": 0.5945093089302619,
"grad_norm": 0.36933329701423645,
"learning_rate": 6.676452005203406e-05,
"loss": 0.6952561140060425,
"step": 471
},
{
"epoch": 0.5957715367623856,
"grad_norm": 0.42043766379356384,
"learning_rate": 6.63774802103533e-05,
"loss": 0.7303497195243835,
"step": 472
},
{
"epoch": 0.5970337645945093,
"grad_norm": 0.3762672543525696,
"learning_rate": 6.599100739930677e-05,
"loss": 0.7378503084182739,
"step": 473
},
{
"epoch": 0.598295992426633,
"grad_norm": 0.36484387516975403,
"learning_rate": 6.560510813660719e-05,
"loss": 0.4264744818210602,
"step": 474
},
{
"epoch": 0.5995582202587567,
"grad_norm": 0.4137173295021057,
"learning_rate": 6.521978893029452e-05,
"loss": 0.6754275560379028,
"step": 475
},
{
"epoch": 0.6008204480908804,
"grad_norm": 0.4293482303619385,
"learning_rate": 6.483505627862632e-05,
"loss": 0.7817292809486389,
"step": 476
},
{
"epoch": 0.6020826759230041,
"grad_norm": 0.4162338376045227,
"learning_rate": 6.44509166699682e-05,
"loss": 0.6910249590873718,
"step": 477
},
{
"epoch": 0.6033449037551278,
"grad_norm": 0.4081710875034332,
"learning_rate": 6.406737658268425e-05,
"loss": 0.68759685754776,
"step": 478
},
{
"epoch": 0.6046071315872515,
"grad_norm": 0.37592121958732605,
"learning_rate": 6.368444248502789e-05,
"loss": 0.6178593635559082,
"step": 479
},
{
"epoch": 0.6058693594193751,
"grad_norm": 0.43066924810409546,
"learning_rate": 6.33021208350328e-05,
"loss": 0.5456580519676208,
"step": 480
},
{
"epoch": 0.6071315872514988,
"grad_norm": 0.3334132730960846,
"learning_rate": 6.292041808040393e-05,
"loss": 0.36408746242523193,
"step": 481
},
{
"epoch": 0.6083938150836226,
"grad_norm": 0.42052480578422546,
"learning_rate": 6.25393406584088e-05,
"loss": 0.6775397062301636,
"step": 482
},
{
"epoch": 0.6096560429157463,
"grad_norm": 0.3473283648490906,
"learning_rate": 6.215889499576898e-05,
"loss": 0.4786512851715088,
"step": 483
},
{
"epoch": 0.61091827074787,
"grad_norm": 0.35813814401626587,
"learning_rate": 6.177908750855164e-05,
"loss": 0.35457998514175415,
"step": 484
},
{
"epoch": 0.6121804985799937,
"grad_norm": 0.33015450835227966,
"learning_rate": 6.139992460206132e-05,
"loss": 0.314817875623703,
"step": 485
},
{
"epoch": 0.6134427264121174,
"grad_norm": 0.3904082179069519,
"learning_rate": 6.102141267073207e-05,
"loss": 0.5199745893478394,
"step": 486
},
{
"epoch": 0.6147049542442411,
"grad_norm": 0.3974827229976654,
"learning_rate": 6.064355809801943e-05,
"loss": 0.6768912672996521,
"step": 487
},
{
"epoch": 0.6159671820763648,
"grad_norm": 0.3908008635044098,
"learning_rate": 6.02663672562928e-05,
"loss": 0.5883216261863708,
"step": 488
},
{
"epoch": 0.6172294099084885,
"grad_norm": 0.3862961232662201,
"learning_rate": 5.988984650672813e-05,
"loss": 0.7970855236053467,
"step": 489
},
{
"epoch": 0.6184916377406122,
"grad_norm": 0.3746252655982971,
"learning_rate": 5.951400219920046e-05,
"loss": 0.4062190651893616,
"step": 490
},
{
"epoch": 0.6197538655727359,
"grad_norm": 0.36359089612960815,
"learning_rate": 5.913884067217685e-05,
"loss": 0.4925137758255005,
"step": 491
},
{
"epoch": 0.6210160934048595,
"grad_norm": 0.3990168273448944,
"learning_rate": 5.876436825260967e-05,
"loss": 0.7016726732254028,
"step": 492
},
{
"epoch": 0.6222783212369832,
"grad_norm": 0.3235120475292206,
"learning_rate": 5.8390591255829644e-05,
"loss": 0.31492355465888977,
"step": 493
},
{
"epoch": 0.6235405490691069,
"grad_norm": 0.41507890820503235,
"learning_rate": 5.8017515985439465e-05,
"loss": 0.647290825843811,
"step": 494
},
{
"epoch": 0.6248027769012306,
"grad_norm": 0.27676281332969666,
"learning_rate": 5.764514873320761e-05,
"loss": 0.2870396375656128,
"step": 495
},
{
"epoch": 0.6260650047333544,
"grad_norm": 0.3965661823749542,
"learning_rate": 5.727349577896194e-05,
"loss": 0.4853188693523407,
"step": 496
},
{
"epoch": 0.6273272325654781,
"grad_norm": 0.4400973916053772,
"learning_rate": 5.6902563390484023e-05,
"loss": 0.6750615239143372,
"step": 497
},
{
"epoch": 0.6285894603976018,
"grad_norm": 0.3927224576473236,
"learning_rate": 5.6532357823403517e-05,
"loss": 0.4222678542137146,
"step": 498
},
{
"epoch": 0.6298516882297255,
"grad_norm": 0.3898910880088806,
"learning_rate": 5.616288532109225e-05,
"loss": 0.6995186805725098,
"step": 499
},
{
"epoch": 0.6311139160618492,
"grad_norm": 0.38628652691841125,
"learning_rate": 5.579415211455941e-05,
"loss": 0.44969233870506287,
"step": 500
},
{
"epoch": 0.6323761438939729,
"grad_norm": 0.42243316769599915,
"learning_rate": 5.542616442234618e-05,
"loss": 0.6847352981567383,
"step": 501
},
{
"epoch": 0.6336383717260966,
"grad_norm": 0.394643098115921,
"learning_rate": 5.505892845042089e-05,
"loss": 0.5232677459716797,
"step": 502
},
{
"epoch": 0.6349005995582203,
"grad_norm": 0.3849993050098419,
"learning_rate": 5.469245039207451e-05,
"loss": 0.45429885387420654,
"step": 503
},
{
"epoch": 0.6361628273903439,
"grad_norm": 0.39264214038848877,
"learning_rate": 5.4326736427815946e-05,
"loss": 0.7198891639709473,
"step": 504
},
{
"epoch": 0.6374250552224676,
"grad_norm": 0.3624120056629181,
"learning_rate": 5.39617927252681e-05,
"loss": 0.6535207033157349,
"step": 505
},
{
"epoch": 0.6386872830545913,
"grad_norm": 0.41762086749076843,
"learning_rate": 5.359762543906368e-05,
"loss": 0.5117899775505066,
"step": 506
},
{
"epoch": 0.639949510886715,
"grad_norm": 0.3560762405395508,
"learning_rate": 5.3234240710741337e-05,
"loss": 0.3488892912864685,
"step": 507
},
{
"epoch": 0.6412117387188387,
"grad_norm": 0.3697710633277893,
"learning_rate": 5.28716446686423e-05,
"loss": 0.5296636819839478,
"step": 508
},
{
"epoch": 0.6424739665509624,
"grad_norm": 0.3891625702381134,
"learning_rate": 5.250984342780689e-05,
"loss": 0.4500022530555725,
"step": 509
},
{
"epoch": 0.6437361943830862,
"grad_norm": 0.4205571115016937,
"learning_rate": 5.214884308987136e-05,
"loss": 0.4895755648612976,
"step": 510
},
{
"epoch": 0.6449984222152099,
"grad_norm": 0.41864123940467834,
"learning_rate": 5.178864974296511e-05,
"loss": 0.7258821725845337,
"step": 511
},
{
"epoch": 0.6462606500473336,
"grad_norm": 0.3590496778488159,
"learning_rate": 5.142926946160799e-05,
"loss": 0.3575442135334015,
"step": 512
},
{
"epoch": 0.6475228778794573,
"grad_norm": 0.41997307538986206,
"learning_rate": 5.107070830660765e-05,
"loss": 0.6464291214942932,
"step": 513
},
{
"epoch": 0.648785105711581,
"grad_norm": 0.40842562913894653,
"learning_rate": 5.071297232495769e-05,
"loss": 0.693924069404602,
"step": 514
},
{
"epoch": 0.6500473335437046,
"grad_norm": 0.4067709445953369,
"learning_rate": 5.035606754973539e-05,
"loss": 0.7233395576477051,
"step": 515
},
{
"epoch": 0.6513095613758283,
"grad_norm": 0.4231897294521332,
"learning_rate": 5.000000000000002e-05,
"loss": 0.5112624764442444,
"step": 516
},
{
"epoch": 0.652571789207952,
"grad_norm": 0.33488285541534424,
"learning_rate": 4.964477568069146e-05,
"loss": 0.335151731967926,
"step": 517
},
{
"epoch": 0.6538340170400757,
"grad_norm": 0.39816269278526306,
"learning_rate": 4.9290400582528815e-05,
"loss": 0.47427669167518616,
"step": 518
},
{
"epoch": 0.6550962448721994,
"grad_norm": 0.3252885341644287,
"learning_rate": 4.893688068190932e-05,
"loss": 0.26451653242111206,
"step": 519
},
{
"epoch": 0.6563584727043231,
"grad_norm": 0.3190288543701172,
"learning_rate": 4.8584221940807774e-05,
"loss": 0.29336637258529663,
"step": 520
},
{
"epoch": 0.6576207005364468,
"grad_norm": 0.3690161108970642,
"learning_rate": 4.823243030667576e-05,
"loss": 0.4153848886489868,
"step": 521
},
{
"epoch": 0.6588829283685705,
"grad_norm": 0.38851308822631836,
"learning_rate": 4.7881511712341484e-05,
"loss": 0.8248839974403381,
"step": 522
},
{
"epoch": 0.6601451562006942,
"grad_norm": 0.3935796618461609,
"learning_rate": 4.753147207590971e-05,
"loss": 0.8026013970375061,
"step": 523
},
{
"epoch": 0.661407384032818,
"grad_norm": 0.39873406291007996,
"learning_rate": 4.7182317300661796e-05,
"loss": 0.7289063930511475,
"step": 524
},
{
"epoch": 0.6626696118649417,
"grad_norm": 0.3880118429660797,
"learning_rate": 4.683405327495638e-05,
"loss": 0.5413039922714233,
"step": 525
},
{
"epoch": 0.6639318396970654,
"grad_norm": 0.41318458318710327,
"learning_rate": 4.648668587212997e-05,
"loss": 0.6406034827232361,
"step": 526
},
{
"epoch": 0.665194067529189,
"grad_norm": 0.3890816271305084,
"learning_rate": 4.6140220950397764e-05,
"loss": 0.7736164927482605,
"step": 527
},
{
"epoch": 0.6664562953613127,
"grad_norm": 0.3265458047389984,
"learning_rate": 4.5794664352755055e-05,
"loss": 0.3139330744743347,
"step": 528
},
{
"epoch": 0.6677185231934364,
"grad_norm": 0.3433822691440582,
"learning_rate": 4.545002190687865e-05,
"loss": 0.35356977581977844,
"step": 529
},
{
"epoch": 0.6689807510255601,
"grad_norm": 0.3755057156085968,
"learning_rate": 4.510629942502839e-05,
"loss": 0.8373801708221436,
"step": 530
},
{
"epoch": 0.6702429788576838,
"grad_norm": 0.31386467814445496,
"learning_rate": 4.476350270394942e-05,
"loss": 0.2859068214893341,
"step": 531
},
{
"epoch": 0.6715052066898075,
"grad_norm": 0.3479110598564148,
"learning_rate": 4.4421637524774285e-05,
"loss": 0.4022149443626404,
"step": 532
},
{
"epoch": 0.6727674345219312,
"grad_norm": 0.3931775689125061,
"learning_rate": 4.4080709652925336e-05,
"loss": 0.4654971957206726,
"step": 533
},
{
"epoch": 0.6740296623540549,
"grad_norm": 0.41888129711151123,
"learning_rate": 4.374072483801769e-05,
"loss": 0.6287370920181274,
"step": 534
},
{
"epoch": 0.6752918901861786,
"grad_norm": 0.3527485430240631,
"learning_rate": 4.340168881376222e-05,
"loss": 0.424509197473526,
"step": 535
},
{
"epoch": 0.6765541180183023,
"grad_norm": 0.3850213289260864,
"learning_rate": 4.306360729786867e-05,
"loss": 0.6349387764930725,
"step": 536
},
{
"epoch": 0.677816345850426,
"grad_norm": 0.39798423647880554,
"learning_rate": 4.272648599194948e-05,
"loss": 0.4587141275405884,
"step": 537
},
{
"epoch": 0.6790785736825496,
"grad_norm": 0.4049997925758362,
"learning_rate": 4.239033058142356e-05,
"loss": 0.6317430138587952,
"step": 538
},
{
"epoch": 0.6803408015146734,
"grad_norm": 0.3872447609901428,
"learning_rate": 4.2055146735420245e-05,
"loss": 0.511966347694397,
"step": 539
},
{
"epoch": 0.6816030293467971,
"grad_norm": 0.34591948986053467,
"learning_rate": 4.172094010668391e-05,
"loss": 0.34035632014274597,
"step": 540
},
{
"epoch": 0.6828652571789208,
"grad_norm": 0.35914257168769836,
"learning_rate": 4.1387716331478565e-05,
"loss": 0.4750257134437561,
"step": 541
},
{
"epoch": 0.6841274850110445,
"grad_norm": 0.37576189637184143,
"learning_rate": 4.1055481029492645e-05,
"loss": 0.44672656059265137,
"step": 542
},
{
"epoch": 0.6853897128431682,
"grad_norm": 0.38701605796813965,
"learning_rate": 4.072423980374452e-05,
"loss": 0.45069319009780884,
"step": 543
},
{
"epoch": 0.6866519406752919,
"grad_norm": 0.3991917669773102,
"learning_rate": 4.039399824048777e-05,
"loss": 0.4803800582885742,
"step": 544
},
{
"epoch": 0.6879141685074156,
"grad_norm": 0.3985093832015991,
"learning_rate": 4.00647619091171e-05,
"loss": 0.707385778427124,
"step": 545
},
{
"epoch": 0.6891763963395393,
"grad_norm": 0.34546467661857605,
"learning_rate": 3.973653636207437e-05,
"loss": 0.40447893738746643,
"step": 546
},
{
"epoch": 0.690438624171663,
"grad_norm": 0.3801027834415436,
"learning_rate": 3.9409327134754895e-05,
"loss": 0.4316953420639038,
"step": 547
},
{
"epoch": 0.6917008520037867,
"grad_norm": 0.39960116147994995,
"learning_rate": 3.908313974541422e-05,
"loss": 0.6661956906318665,
"step": 548
},
{
"epoch": 0.6929630798359104,
"grad_norm": 0.4249173402786255,
"learning_rate": 3.875797969507502e-05,
"loss": 0.6954900026321411,
"step": 549
},
{
"epoch": 0.694225307668034,
"grad_norm": 0.4491938650608063,
"learning_rate": 3.843385246743417e-05,
"loss": 0.694817066192627,
"step": 550
},
{
"epoch": 0.6954875355001577,
"grad_norm": 0.4053807556629181,
"learning_rate": 3.811076352877054e-05,
"loss": 0.677171528339386,
"step": 551
},
{
"epoch": 0.6967497633322814,
"grad_norm": 0.3556557893753052,
"learning_rate": 3.778871832785262e-05,
"loss": 0.31312018632888794,
"step": 552
},
{
"epoch": 0.6980119911644052,
"grad_norm": 0.37487420439720154,
"learning_rate": 3.74677222958466e-05,
"loss": 0.43329551815986633,
"step": 553
},
{
"epoch": 0.6992742189965289,
"grad_norm": 0.4070112407207489,
"learning_rate": 3.714778084622492e-05,
"loss": 0.6022857427597046,
"step": 554
},
{
"epoch": 0.7005364468286526,
"grad_norm": 0.3633062243461609,
"learning_rate": 3.682889937467493e-05,
"loss": 0.407479465007782,
"step": 555
},
{
"epoch": 0.7017986746607763,
"grad_norm": 0.38449397683143616,
"learning_rate": 3.651108325900773e-05,
"loss": 0.5523849725723267,
"step": 556
},
{
"epoch": 0.7030609024929,
"grad_norm": 0.3744942247867584,
"learning_rate": 3.619433785906775e-05,
"loss": 0.48631197214126587,
"step": 557
},
{
"epoch": 0.7043231303250237,
"grad_norm": 0.40868815779685974,
"learning_rate": 3.587866851664219e-05,
"loss": 0.6774845719337463,
"step": 558
},
{
"epoch": 0.7055853581571474,
"grad_norm": 0.35936489701271057,
"learning_rate": 3.556408055537087e-05,
"loss": 0.34799298644065857,
"step": 559
},
{
"epoch": 0.7068475859892711,
"grad_norm": 0.3731677234172821,
"learning_rate": 3.5250579280656636e-05,
"loss": 0.3729614317417145,
"step": 560
},
{
"epoch": 0.7081098138213947,
"grad_norm": 0.4450969398021698,
"learning_rate": 3.493816997957582e-05,
"loss": 0.39747729897499084,
"step": 561
},
{
"epoch": 0.7093720416535184,
"grad_norm": 0.3150026500225067,
"learning_rate": 3.462685792078888e-05,
"loss": 0.30238404870033264,
"step": 562
},
{
"epoch": 0.7106342694856421,
"grad_norm": 0.4264235496520996,
"learning_rate": 3.4316648354451895e-05,
"loss": 0.7084164023399353,
"step": 563
},
{
"epoch": 0.7118964973177658,
"grad_norm": 0.35976630449295044,
"learning_rate": 3.400754651212776e-05,
"loss": 0.35280704498291016,
"step": 564
},
{
"epoch": 0.7131587251498895,
"grad_norm": 0.3740016520023346,
"learning_rate": 3.3699557606698015e-05,
"loss": 0.487404465675354,
"step": 565
},
{
"epoch": 0.7144209529820132,
"grad_norm": 0.4432770013809204,
"learning_rate": 3.339268683227499e-05,
"loss": 0.6776658296585083,
"step": 566
},
{
"epoch": 0.715683180814137,
"grad_norm": 0.3524283766746521,
"learning_rate": 3.308693936411421e-05,
"loss": 0.3227110207080841,
"step": 567
},
{
"epoch": 0.7169454086462607,
"grad_norm": 0.39707088470458984,
"learning_rate": 3.278232035852693e-05,
"loss": 0.6849966645240784,
"step": 568
},
{
"epoch": 0.7182076364783844,
"grad_norm": 0.4202400743961334,
"learning_rate": 3.247883495279358e-05,
"loss": 0.6456137299537659,
"step": 569
},
{
"epoch": 0.7194698643105081,
"grad_norm": 0.4002569317817688,
"learning_rate": 3.2176488265076596e-05,
"loss": 0.7039542198181152,
"step": 570
},
{
"epoch": 0.7207320921426318,
"grad_norm": 0.40294668078422546,
"learning_rate": 3.187528539433458e-05,
"loss": 0.46439212560653687,
"step": 571
},
{
"epoch": 0.7219943199747555,
"grad_norm": 0.40857481956481934,
"learning_rate": 3.157523142023604e-05,
"loss": 0.5847267508506775,
"step": 572
},
{
"epoch": 0.7232565478068791,
"grad_norm": 0.43344590067863464,
"learning_rate": 3.1276331403073735e-05,
"loss": 0.5486865043640137,
"step": 573
},
{
"epoch": 0.7245187756390028,
"grad_norm": 0.4011099934577942,
"learning_rate": 3.097859038367947e-05,
"loss": 0.6386106014251709,
"step": 574
},
{
"epoch": 0.7257810034711265,
"grad_norm": 0.39212876558303833,
"learning_rate": 3.068201338333903e-05,
"loss": 0.6849637031555176,
"step": 575
},
{
"epoch": 0.7270432313032502,
"grad_norm": 0.3913683593273163,
"learning_rate": 3.0386605403707346e-05,
"loss": 0.9085783958435059,
"step": 576
},
{
"epoch": 0.7283054591353739,
"grad_norm": 0.4202577769756317,
"learning_rate": 3.0092371426724398e-05,
"loss": 0.692664623260498,
"step": 577
},
{
"epoch": 0.7295676869674976,
"grad_norm": 0.33715662360191345,
"learning_rate": 2.979931641453104e-05,
"loss": 0.3271544575691223,
"step": 578
},
{
"epoch": 0.7308299147996213,
"grad_norm": 0.34124237298965454,
"learning_rate": 2.9507445309385294e-05,
"loss": 0.34397092461586,
"step": 579
},
{
"epoch": 0.732092142631745,
"grad_norm": 0.40698572993278503,
"learning_rate": 2.9216763033579097e-05,
"loss": 0.4819522500038147,
"step": 580
},
{
"epoch": 0.7333543704638688,
"grad_norm": 0.37911415100097656,
"learning_rate": 2.8927274489355293e-05,
"loss": 0.4310797154903412,
"step": 581
},
{
"epoch": 0.7346165982959925,
"grad_norm": 0.36646318435668945,
"learning_rate": 2.8638984558824777e-05,
"loss": 0.5274304747581482,
"step": 582
},
{
"epoch": 0.7358788261281162,
"grad_norm": 0.3488803803920746,
"learning_rate": 2.835189810388441e-05,
"loss": 0.7499272227287292,
"step": 583
},
{
"epoch": 0.7371410539602399,
"grad_norm": 0.40415751934051514,
"learning_rate": 2.8066019966134904e-05,
"loss": 0.8633046746253967,
"step": 584
},
{
"epoch": 0.7384032817923635,
"grad_norm": 0.325978547334671,
"learning_rate": 2.7781354966799078e-05,
"loss": 0.3552260994911194,
"step": 585
},
{
"epoch": 0.7396655096244872,
"grad_norm": 0.37058016657829285,
"learning_rate": 2.7497907906640742e-05,
"loss": 0.913851261138916,
"step": 586
},
{
"epoch": 0.7409277374566109,
"grad_norm": 0.36124756932258606,
"learning_rate": 2.721568356588362e-05,
"loss": 0.5102133750915527,
"step": 587
},
{
"epoch": 0.7421899652887346,
"grad_norm": 0.41945722699165344,
"learning_rate": 2.6934686704130696e-05,
"loss": 0.5533009767532349,
"step": 588
},
{
"epoch": 0.7434521931208583,
"grad_norm": 0.40652337670326233,
"learning_rate": 2.665492206028407e-05,
"loss": 0.6261847019195557,
"step": 589
},
{
"epoch": 0.744714420952982,
"grad_norm": 0.36238163709640503,
"learning_rate": 2.6376394352464972e-05,
"loss": 0.5246446132659912,
"step": 590
},
{
"epoch": 0.7459766487851057,
"grad_norm": 0.3909083306789398,
"learning_rate": 2.6099108277934103e-05,
"loss": 0.5678606033325195,
"step": 591
},
{
"epoch": 0.7472388766172294,
"grad_norm": 0.3918708562850952,
"learning_rate": 2.5823068513012595e-05,
"loss": 0.4282546639442444,
"step": 592
},
{
"epoch": 0.7485011044493531,
"grad_norm": 0.3766772150993347,
"learning_rate": 2.5548279713002997e-05,
"loss": 0.43503549695014954,
"step": 593
},
{
"epoch": 0.7497633322814768,
"grad_norm": 0.43319037556648254,
"learning_rate": 2.527474651211089e-05,
"loss": 0.6522255539894104,
"step": 594
},
{
"epoch": 0.7510255601136006,
"grad_norm": 0.4107663035392761,
"learning_rate": 2.500247352336664e-05,
"loss": 0.3986871540546417,
"step": 595
},
{
"epoch": 0.7522877879457242,
"grad_norm": 0.4372679591178894,
"learning_rate": 2.4731465338547556e-05,
"loss": 0.681415855884552,
"step": 596
},
{
"epoch": 0.7535500157778479,
"grad_norm": 0.3968641459941864,
"learning_rate": 2.4461726528100615e-05,
"loss": 0.44046419858932495,
"step": 597
},
{
"epoch": 0.7548122436099716,
"grad_norm": 0.33103057742118835,
"learning_rate": 2.41932616410653e-05,
"loss": 0.37138405442237854,
"step": 598
},
{
"epoch": 0.7560744714420953,
"grad_norm": 0.36118385195732117,
"learning_rate": 2.392607520499677e-05,
"loss": 0.31369921565055847,
"step": 599
},
{
"epoch": 0.757336699274219,
"grad_norm": 0.35563066601753235,
"learning_rate": 2.36601717258897e-05,
"loss": 0.3743899464607239,
"step": 600
},
{
"epoch": 0.7585989271063427,
"grad_norm": 0.4097678065299988,
"learning_rate": 2.339555568810221e-05,
"loss": 0.418079674243927,
"step": 601
},
{
"epoch": 0.7598611549384664,
"grad_norm": 0.38674771785736084,
"learning_rate": 2.3132231554280136e-05,
"loss": 0.8224179744720459,
"step": 602
},
{
"epoch": 0.7611233827705901,
"grad_norm": 0.3854767084121704,
"learning_rate": 2.2870203765281926e-05,
"loss": 0.542049765586853,
"step": 603
},
{
"epoch": 0.7623856106027138,
"grad_norm": 0.35851332545280457,
"learning_rate": 2.260947674010372e-05,
"loss": 0.5342020988464355,
"step": 604
},
{
"epoch": 0.7636478384348375,
"grad_norm": 0.37478891015052795,
"learning_rate": 2.235005487580466e-05,
"loss": 0.8123199939727783,
"step": 605
},
{
"epoch": 0.7649100662669612,
"grad_norm": 0.451459676027298,
"learning_rate": 2.2091942547432955e-05,
"loss": 0.5622618198394775,
"step": 606
},
{
"epoch": 0.7661722940990849,
"grad_norm": 0.42055562138557434,
"learning_rate": 2.1835144107952022e-05,
"loss": 0.6805808544158936,
"step": 607
},
{
"epoch": 0.7674345219312085,
"grad_norm": 0.38752734661102295,
"learning_rate": 2.1579663888166956e-05,
"loss": 0.6346580982208252,
"step": 608
},
{
"epoch": 0.7686967497633322,
"grad_norm": 0.39068523049354553,
"learning_rate": 2.132550619665168e-05,
"loss": 0.5962034463882446,
"step": 609
},
{
"epoch": 0.769958977595456,
"grad_norm": 0.3247472643852234,
"learning_rate": 2.107267531967618e-05,
"loss": 0.25553497672080994,
"step": 610
},
{
"epoch": 0.7712212054275797,
"grad_norm": 0.4266479015350342,
"learning_rate": 2.0821175521134207e-05,
"loss": 0.5519466996192932,
"step": 611
},
{
"epoch": 0.7724834332597034,
"grad_norm": 0.4060700237751007,
"learning_rate": 2.05710110424714e-05,
"loss": 0.6059053540229797,
"step": 612
},
{
"epoch": 0.7737456610918271,
"grad_norm": 0.4174729585647583,
"learning_rate": 2.0322186102613795e-05,
"loss": 0.42115089297294617,
"step": 613
},
{
"epoch": 0.7750078889239508,
"grad_norm": 0.375446617603302,
"learning_rate": 2.0074704897896558e-05,
"loss": 0.368305504322052,
"step": 614
},
{
"epoch": 0.7762701167560745,
"grad_norm": 0.37311506271362305,
"learning_rate": 1.982857160199334e-05,
"loss": 0.3238658010959625,
"step": 615
},
{
"epoch": 0.7775323445881982,
"grad_norm": 0.41771042346954346,
"learning_rate": 1.9583790365845822e-05,
"loss": 0.6185348033905029,
"step": 616
},
{
"epoch": 0.7787945724203219,
"grad_norm": 0.39036667346954346,
"learning_rate": 1.9340365317593746e-05,
"loss": 0.7339574098587036,
"step": 617
},
{
"epoch": 0.7800568002524456,
"grad_norm": 0.40570926666259766,
"learning_rate": 1.9098300562505266e-05,
"loss": 0.46005457639694214,
"step": 618
},
{
"epoch": 0.7813190280845692,
"grad_norm": 0.36136454343795776,
"learning_rate": 1.8857600182907675e-05,
"loss": 0.3527463972568512,
"step": 619
},
{
"epoch": 0.7825812559166929,
"grad_norm": 0.38751932978630066,
"learning_rate": 1.8618268238118675e-05,
"loss": 0.7095609307289124,
"step": 620
},
{
"epoch": 0.7838434837488166,
"grad_norm": 0.4258861541748047,
"learning_rate": 1.8380308764377842e-05,
"loss": 0.6087920665740967,
"step": 621
},
{
"epoch": 0.7851057115809403,
"grad_norm": 0.3894071578979492,
"learning_rate": 1.8143725774778508e-05,
"loss": 0.5984947085380554,
"step": 622
},
{
"epoch": 0.786367939413064,
"grad_norm": 0.39034441113471985,
"learning_rate": 1.7908523259200192e-05,
"loss": 0.5467015504837036,
"step": 623
},
{
"epoch": 0.7876301672451878,
"grad_norm": 0.40297675132751465,
"learning_rate": 1.767470518424129e-05,
"loss": 0.6903741359710693,
"step": 624
},
{
"epoch": 0.7888923950773115,
"grad_norm": 0.3851509392261505,
"learning_rate": 1.7442275493152037e-05,
"loss": 0.486089825630188,
"step": 625
},
{
"epoch": 0.7901546229094352,
"grad_norm": 0.37658852338790894,
"learning_rate": 1.7211238105768214e-05,
"loss": 0.4333967864513397,
"step": 626
},
{
"epoch": 0.7914168507415589,
"grad_norm": 0.45156872272491455,
"learning_rate": 1.6981596918444953e-05,
"loss": 0.7170761823654175,
"step": 627
},
{
"epoch": 0.7926790785736826,
"grad_norm": 0.41625985503196716,
"learning_rate": 1.6753355803990912e-05,
"loss": 0.45374661684036255,
"step": 628
},
{
"epoch": 0.7939413064058063,
"grad_norm": 0.41271454095840454,
"learning_rate": 1.652651861160318e-05,
"loss": 0.49166661500930786,
"step": 629
},
{
"epoch": 0.79520353423793,
"grad_norm": 0.30450883507728577,
"learning_rate": 1.630108916680223e-05,
"loss": 0.26509180665016174,
"step": 630
},
{
"epoch": 0.7964657620700536,
"grad_norm": 0.41994258761405945,
"learning_rate": 1.607707127136734e-05,
"loss": 0.5564639568328857,
"step": 631
},
{
"epoch": 0.7977279899021773,
"grad_norm": 0.42379099130630493,
"learning_rate": 1.5854468703272663e-05,
"loss": 0.6809132695198059,
"step": 632
},
{
"epoch": 0.798990217734301,
"grad_norm": 0.3801705837249756,
"learning_rate": 1.5633285216623385e-05,
"loss": 0.4586731493473053,
"step": 633
},
{
"epoch": 0.8002524455664247,
"grad_norm": 0.3840394914150238,
"learning_rate": 1.541352454159237e-05,
"loss": 0.38096368312835693,
"step": 634
},
{
"epoch": 0.8015146733985484,
"grad_norm": 0.3911992311477661,
"learning_rate": 1.5195190384357404e-05,
"loss": 0.6233262419700623,
"step": 635
},
{
"epoch": 0.8027769012306721,
"grad_norm": 0.4130832254886627,
"learning_rate": 1.4978286427038601e-05,
"loss": 0.6100831031799316,
"step": 636
},
{
"epoch": 0.8040391290627958,
"grad_norm": 0.530238687992096,
"learning_rate": 1.4762816327636241e-05,
"loss": 0.6475313901901245,
"step": 637
},
{
"epoch": 0.8053013568949196,
"grad_norm": 0.43065938353538513,
"learning_rate": 1.4548783719969239e-05,
"loss": 0.6517763137817383,
"step": 638
},
{
"epoch": 0.8065635847270433,
"grad_norm": 0.39852434396743774,
"learning_rate": 1.4336192213613742e-05,
"loss": 0.762035608291626,
"step": 639
},
{
"epoch": 0.807825812559167,
"grad_norm": 0.4060841202735901,
"learning_rate": 1.4125045393842219e-05,
"loss": 0.5141922831535339,
"step": 640
},
{
"epoch": 0.8090880403912907,
"grad_norm": 0.42946869134902954,
"learning_rate": 1.3915346821563235e-05,
"loss": 0.4715317189693451,
"step": 641
},
{
"epoch": 0.8103502682234143,
"grad_norm": 0.4243875741958618,
"learning_rate": 1.3707100033261034e-05,
"loss": 0.5333652496337891,
"step": 642
},
{
"epoch": 0.811612496055538,
"grad_norm": 0.40289306640625,
"learning_rate": 1.3500308540936201e-05,
"loss": 0.8304973840713501,
"step": 643
},
{
"epoch": 0.8128747238876617,
"grad_norm": 0.43981650471687317,
"learning_rate": 1.3294975832046353e-05,
"loss": 0.7121323347091675,
"step": 644
},
{
"epoch": 0.8141369517197854,
"grad_norm": 0.3223661780357361,
"learning_rate": 1.3091105369447165e-05,
"loss": 0.2905374765396118,
"step": 645
},
{
"epoch": 0.8153991795519091,
"grad_norm": 0.4346272051334381,
"learning_rate": 1.2888700591334223e-05,
"loss": 0.537320613861084,
"step": 646
},
{
"epoch": 0.8166614073840328,
"grad_norm": 0.35340362787246704,
"learning_rate": 1.2687764911184907e-05,
"loss": 0.34484896063804626,
"step": 647
},
{
"epoch": 0.8179236352161565,
"grad_norm": 0.40185239911079407,
"learning_rate": 1.2488301717700735e-05,
"loss": 0.4863336682319641,
"step": 648
},
{
"epoch": 0.8191858630482802,
"grad_norm": 0.33702552318573,
"learning_rate": 1.2290314374750422e-05,
"loss": 0.3356221318244934,
"step": 649
},
{
"epoch": 0.8204480908804039,
"grad_norm": 0.38969579339027405,
"learning_rate": 1.2093806221313008e-05,
"loss": 0.6058964729309082,
"step": 650
},
{
"epoch": 0.8217103187125276,
"grad_norm": 0.4453175961971283,
"learning_rate": 1.1898780571421552e-05,
"loss": 0.44390422105789185,
"step": 651
},
{
"epoch": 0.8229725465446514,
"grad_norm": 0.39128580689430237,
"learning_rate": 1.1705240714107302e-05,
"loss": 0.6540953516960144,
"step": 652
},
{
"epoch": 0.8242347743767751,
"grad_norm": 0.3710046708583832,
"learning_rate": 1.1513189913344214e-05,
"loss": 0.5617390871047974,
"step": 653
},
{
"epoch": 0.8254970022088987,
"grad_norm": 0.4133809208869934,
"learning_rate": 1.1322631407993811e-05,
"loss": 0.6450774669647217,
"step": 654
},
{
"epoch": 0.8267592300410224,
"grad_norm": 0.3774697184562683,
"learning_rate": 1.1133568411750727e-05,
"loss": 0.3926354646682739,
"step": 655
},
{
"epoch": 0.8280214578731461,
"grad_norm": 0.39373353123664856,
"learning_rate": 1.0946004113088381e-05,
"loss": 0.7614798545837402,
"step": 656
},
{
"epoch": 0.8292836857052698,
"grad_norm": 0.3788921535015106,
"learning_rate": 1.0759941675205221e-05,
"loss": 0.6513789892196655,
"step": 657
},
{
"epoch": 0.8305459135373935,
"grad_norm": 0.47546783089637756,
"learning_rate": 1.0575384235971465e-05,
"loss": 0.43815821409225464,
"step": 658
},
{
"epoch": 0.8318081413695172,
"grad_norm": 0.4033801257610321,
"learning_rate": 1.0392334907876022e-05,
"loss": 0.7993838787078857,
"step": 659
},
{
"epoch": 0.8330703692016409,
"grad_norm": 0.3804508447647095,
"learning_rate": 1.0210796777974197e-05,
"loss": 0.5399584174156189,
"step": 660
},
{
"epoch": 0.8343325970337646,
"grad_norm": 0.40873584151268005,
"learning_rate": 1.0030772907835483e-05,
"loss": 0.4069630801677704,
"step": 661
},
{
"epoch": 0.8355948248658883,
"grad_norm": 0.31726691126823425,
"learning_rate": 9.852266333491954e-06,
"loss": 0.31673499941825867,
"step": 662
},
{
"epoch": 0.836857052698012,
"grad_norm": 0.42769894003868103,
"learning_rate": 9.675280065387116e-06,
"loss": 0.5651416778564453,
"step": 663
},
{
"epoch": 0.8381192805301357,
"grad_norm": 0.34212225675582886,
"learning_rate": 9.499817088325102e-06,
"loss": 0.3379066288471222,
"step": 664
},
{
"epoch": 0.8393815083622594,
"grad_norm": 0.3834571838378906,
"learning_rate": 9.325880361420336e-06,
"loss": 0.532379686832428,
"step": 665
},
{
"epoch": 0.840643736194383,
"grad_norm": 0.4152385890483856,
"learning_rate": 9.153472818047625e-06,
"loss": 0.5268415212631226,
"step": 666
},
{
"epoch": 0.8419059640265067,
"grad_norm": 0.43394723534584045,
"learning_rate": 8.982597365792711e-06,
"loss": 0.5578685402870178,
"step": 667
},
{
"epoch": 0.8431681918586305,
"grad_norm": 0.3674545884132385,
"learning_rate": 8.813256886403164e-06,
"loss": 0.4507666826248169,
"step": 668
},
{
"epoch": 0.8444304196907542,
"grad_norm": 0.4950237572193146,
"learning_rate": 8.645454235739903e-06,
"loss": 0.5587325096130371,
"step": 669
},
{
"epoch": 0.8456926475228779,
"grad_norm": 0.42047086358070374,
"learning_rate": 8.479192243728962e-06,
"loss": 0.46830785274505615,
"step": 670
},
{
"epoch": 0.8469548753550016,
"grad_norm": 0.33029595017433167,
"learning_rate": 8.314473714313719e-06,
"loss": 0.3492874503135681,
"step": 671
},
{
"epoch": 0.8482171031871253,
"grad_norm": 0.3771483600139618,
"learning_rate": 8.151301425407699e-06,
"loss": 0.416072815656662,
"step": 672
},
{
"epoch": 0.849479331019249,
"grad_norm": 0.3575372099876404,
"learning_rate": 7.9896781288477e-06,
"loss": 0.4314277470111847,
"step": 673
},
{
"epoch": 0.8507415588513727,
"grad_norm": 0.42138731479644775,
"learning_rate": 7.829606550347313e-06,
"loss": 0.6481724381446838,
"step": 674
},
{
"epoch": 0.8520037866834964,
"grad_norm": 0.39553171396255493,
"learning_rate": 7.671089389451058e-06,
"loss": 0.3940804600715637,
"step": 675
},
{
"epoch": 0.8532660145156201,
"grad_norm": 0.3964840769767761,
"learning_rate": 7.514129319488839e-06,
"loss": 0.7153723835945129,
"step": 676
},
{
"epoch": 0.8545282423477437,
"grad_norm": 0.4527961015701294,
"learning_rate": 7.358728987530728e-06,
"loss": 0.7575295567512512,
"step": 677
},
{
"epoch": 0.8557904701798674,
"grad_norm": 0.47758570313453674,
"learning_rate": 7.204891014342552e-06,
"loss": 0.732297420501709,
"step": 678
},
{
"epoch": 0.8570526980119911,
"grad_norm": 0.3915818929672241,
"learning_rate": 7.052617994341448e-06,
"loss": 0.5047644376754761,
"step": 679
},
{
"epoch": 0.8583149258441148,
"grad_norm": 0.42662402987480164,
"learning_rate": 6.901912495552332e-06,
"loss": 0.7435489892959595,
"step": 680
},
{
"epoch": 0.8595771536762385,
"grad_norm": 0.44890522956848145,
"learning_rate": 6.75277705956443e-06,
"loss": 0.5125769376754761,
"step": 681
},
{
"epoch": 0.8608393815083623,
"grad_norm": 0.3554657995700836,
"learning_rate": 6.605214201488486e-06,
"loss": 0.3450443744659424,
"step": 682
},
{
"epoch": 0.862101609340486,
"grad_norm": 0.32458341121673584,
"learning_rate": 6.459226409914332e-06,
"loss": 0.31173160672187805,
"step": 683
},
{
"epoch": 0.8633638371726097,
"grad_norm": 0.3945808708667755,
"learning_rate": 6.314816146868952e-06,
"loss": 0.4987742304801941,
"step": 684
},
{
"epoch": 0.8646260650047334,
"grad_norm": 0.41859179735183716,
"learning_rate": 6.171985847774864e-06,
"loss": 0.5809845924377441,
"step": 685
},
{
"epoch": 0.8658882928368571,
"grad_norm": 0.4125705361366272,
"learning_rate": 6.030737921409169e-06,
"loss": 0.6869086623191833,
"step": 686
},
{
"epoch": 0.8671505206689808,
"grad_norm": 0.5110360980033875,
"learning_rate": 5.891074749862857e-06,
"loss": 0.5902141332626343,
"step": 687
},
{
"epoch": 0.8684127485011045,
"grad_norm": 0.3964199125766754,
"learning_rate": 5.75299868850061e-06,
"loss": 0.778140127658844,
"step": 688
},
{
"epoch": 0.8696749763332281,
"grad_norm": 0.3277434706687927,
"learning_rate": 5.616512065921187e-06,
"loss": 0.2611342966556549,
"step": 689
},
{
"epoch": 0.8709372041653518,
"grad_norm": 0.3749728500843048,
"learning_rate": 5.481617183918053e-06,
"loss": 0.42815372347831726,
"step": 690
},
{
"epoch": 0.8721994319974755,
"grad_norm": 0.36340272426605225,
"learning_rate": 5.348316317440549e-06,
"loss": 0.4718218445777893,
"step": 691
},
{
"epoch": 0.8734616598295992,
"grad_norm": 0.3954283893108368,
"learning_rate": 5.21661171455563e-06,
"loss": 0.49787670373916626,
"step": 692
},
{
"epoch": 0.8747238876617229,
"grad_norm": 0.39619600772857666,
"learning_rate": 5.086505596409885e-06,
"loss": 0.568760335445404,
"step": 693
},
{
"epoch": 0.8759861154938466,
"grad_norm": 0.33868858218193054,
"learning_rate": 4.958000157192022e-06,
"loss": 0.37448927760124207,
"step": 694
},
{
"epoch": 0.8772483433259703,
"grad_norm": 0.43138137459754944,
"learning_rate": 4.831097564095999e-06,
"loss": 0.6743485331535339,
"step": 695
},
{
"epoch": 0.8785105711580941,
"grad_norm": 0.41570451855659485,
"learning_rate": 4.705799957284351e-06,
"loss": 0.6966921091079712,
"step": 696
},
{
"epoch": 0.8797727989902178,
"grad_norm": 0.3950325548648834,
"learning_rate": 4.582109449852168e-06,
"loss": 0.8221022486686707,
"step": 697
},
{
"epoch": 0.8810350268223415,
"grad_norm": 0.31951889395713806,
"learning_rate": 4.4600281277914715e-06,
"loss": 0.33876973390579224,
"step": 698
},
{
"epoch": 0.8822972546544652,
"grad_norm": 0.408273309469223,
"learning_rate": 4.339558049955927e-06,
"loss": 0.5404328107833862,
"step": 699
},
{
"epoch": 0.8835594824865888,
"grad_norm": 0.3891682028770447,
"learning_rate": 4.220701248026248e-06,
"loss": 0.48202747106552124,
"step": 700
},
{
"epoch": 0.8848217103187125,
"grad_norm": 0.40945693850517273,
"learning_rate": 4.103459726475889e-06,
"loss": 0.8016560077667236,
"step": 701
},
{
"epoch": 0.8860839381508362,
"grad_norm": 0.43001535534858704,
"learning_rate": 3.987835462537193e-06,
"loss": 0.6459006071090698,
"step": 702
},
{
"epoch": 0.8873461659829599,
"grad_norm": 0.41465309262275696,
"learning_rate": 3.873830406168111e-06,
"loss": 0.5275793671607971,
"step": 703
},
{
"epoch": 0.8886083938150836,
"grad_norm": 0.3870158791542053,
"learning_rate": 3.761446480019315e-06,
"loss": 0.8116216063499451,
"step": 704
},
{
"epoch": 0.8898706216472073,
"grad_norm": 0.3732059895992279,
"learning_rate": 3.6506855794016913e-06,
"loss": 0.3549728989601135,
"step": 705
},
{
"epoch": 0.891132849479331,
"grad_norm": 0.38289642333984375,
"learning_rate": 3.541549572254488e-06,
"loss": 0.3792566955089569,
"step": 706
},
{
"epoch": 0.8923950773114547,
"grad_norm": 0.3992280066013336,
"learning_rate": 3.43404029911375e-06,
"loss": 0.7304099798202515,
"step": 707
},
{
"epoch": 0.8936573051435784,
"grad_norm": 0.3860641121864319,
"learning_rate": 3.3281595730812575e-06,
"loss": 0.6320814490318298,
"step": 708
},
{
"epoch": 0.8949195329757021,
"grad_norm": 0.40705665946006775,
"learning_rate": 3.223909179794027e-06,
"loss": 0.7557500600814819,
"step": 709
},
{
"epoch": 0.8961817608078259,
"grad_norm": 0.3863953649997711,
"learning_rate": 3.121290877394134e-06,
"loss": 0.5255841016769409,
"step": 710
},
{
"epoch": 0.8974439886399496,
"grad_norm": 0.3851090967655182,
"learning_rate": 3.0203063964990617e-06,
"loss": 0.5183653235435486,
"step": 711
},
{
"epoch": 0.8987062164720732,
"grad_norm": 0.39725980162620544,
"learning_rate": 2.9209574401725557e-06,
"loss": 0.5958725214004517,
"step": 712
},
{
"epoch": 0.8999684443041969,
"grad_norm": 0.47921210527420044,
"learning_rate": 2.82324568389587e-06,
"loss": 0.7262052297592163,
"step": 713
},
{
"epoch": 0.9012306721363206,
"grad_norm": 0.405513733625412,
"learning_rate": 2.7271727755395214e-06,
"loss": 0.6049070954322815,
"step": 714
},
{
"epoch": 0.9024928999684443,
"grad_norm": 0.3995083272457123,
"learning_rate": 2.6327403353355264e-06,
"loss": 0.808394193649292,
"step": 715
},
{
"epoch": 0.903755127800568,
"grad_norm": 0.43631553649902344,
"learning_rate": 2.539949955849985e-06,
"loss": 0.48620936274528503,
"step": 716
},
{
"epoch": 0.9050173556326917,
"grad_norm": 0.479377806186676,
"learning_rate": 2.4488032019563402e-06,
"loss": 0.6404117941856384,
"step": 717
},
{
"epoch": 0.9062795834648154,
"grad_norm": 0.408569872379303,
"learning_rate": 2.359301610808917e-06,
"loss": 0.7001040577888489,
"step": 718
},
{
"epoch": 0.9075418112969391,
"grad_norm": 0.4069215655326843,
"learning_rate": 2.271446691817014e-06,
"loss": 0.6278159618377686,
"step": 719
},
{
"epoch": 0.9088040391290628,
"grad_norm": 0.4575406014919281,
"learning_rate": 2.1852399266194314e-06,
"loss": 0.6095160245895386,
"step": 720
},
{
"epoch": 0.9100662669611865,
"grad_norm": 0.43460536003112793,
"learning_rate": 2.100682769059548e-06,
"loss": 0.4627190828323364,
"step": 721
},
{
"epoch": 0.9113284947933102,
"grad_norm": 0.4876587986946106,
"learning_rate": 2.017776645160707e-06,
"loss": 0.4769670367240906,
"step": 722
},
{
"epoch": 0.9125907226254338,
"grad_norm": 0.4268261194229126,
"learning_rate": 1.9365229531022264e-06,
"loss": 0.49713101983070374,
"step": 723
},
{
"epoch": 0.9138529504575575,
"grad_norm": 0.4099612832069397,
"learning_rate": 1.8569230631958256e-06,
"loss": 0.45675134658813477,
"step": 724
},
{
"epoch": 0.9151151782896813,
"grad_norm": 0.39911365509033203,
"learning_rate": 1.7789783178624897e-06,
"loss": 0.4840657711029053,
"step": 725
},
{
"epoch": 0.916377406121805,
"grad_norm": 0.39041027426719666,
"learning_rate": 1.7026900316098215e-06,
"loss": 0.5516049861907959,
"step": 726
},
{
"epoch": 0.9176396339539287,
"grad_norm": 0.401254802942276,
"learning_rate": 1.6280594910099256e-06,
"loss": 0.7506740093231201,
"step": 727
},
{
"epoch": 0.9189018617860524,
"grad_norm": 0.38945209980010986,
"learning_rate": 1.5550879546776364e-06,
"loss": 0.45651984214782715,
"step": 728
},
{
"epoch": 0.9201640896181761,
"grad_norm": 0.3908751904964447,
"learning_rate": 1.4837766532493468e-06,
"loss": 0.4634789824485779,
"step": 729
},
{
"epoch": 0.9214263174502998,
"grad_norm": 0.42969706654548645,
"learning_rate": 1.414126789362269e-06,
"loss": 0.8332436084747314,
"step": 730
},
{
"epoch": 0.9226885452824235,
"grad_norm": 0.3828902542591095,
"learning_rate": 1.3461395376340502e-06,
"loss": 0.36839234828948975,
"step": 731
},
{
"epoch": 0.9239507731145472,
"grad_norm": 0.4279589354991913,
"learning_rate": 1.2798160446431006e-06,
"loss": 0.7247366309165955,
"step": 732
},
{
"epoch": 0.9252130009466709,
"grad_norm": 0.4109678566455841,
"learning_rate": 1.2151574289091749e-06,
"loss": 0.44771307706832886,
"step": 733
},
{
"epoch": 0.9264752287787946,
"grad_norm": 0.3857699930667877,
"learning_rate": 1.1521647808744873e-06,
"loss": 0.7814648151397705,
"step": 734
},
{
"epoch": 0.9277374566109182,
"grad_norm": 0.40495210886001587,
"learning_rate": 1.0908391628854041e-06,
"loss": 0.4813134968280792,
"step": 735
},
{
"epoch": 0.9289996844430419,
"grad_norm": 0.40271830558776855,
"learning_rate": 1.0311816091744698e-06,
"loss": 0.4100000858306885,
"step": 736
},
{
"epoch": 0.9302619122751656,
"grad_norm": 0.37395796179771423,
"learning_rate": 9.731931258429638e-07,
"loss": 0.4800105690956116,
"step": 737
},
{
"epoch": 0.9315241401072893,
"grad_norm": 0.3781779408454895,
"learning_rate": 9.168746908439718e-07,
"loss": 0.48567116260528564,
"step": 738
},
{
"epoch": 0.932786367939413,
"grad_norm": 0.383577436208725,
"learning_rate": 8.622272539658415e-07,
"loss": 0.4960499107837677,
"step": 739
},
{
"epoch": 0.9340485957715368,
"grad_norm": 0.40534883737564087,
"learning_rate": 8.092517368162078e-07,
"loss": 0.4538559913635254,
"step": 740
},
{
"epoch": 0.9353108236036605,
"grad_norm": 0.3785009980201721,
"learning_rate": 7.579490328064265e-07,
"loss": 0.4022294580936432,
"step": 741
},
{
"epoch": 0.9365730514357842,
"grad_norm": 0.3643127381801605,
"learning_rate": 7.083200071365203e-07,
"loss": 0.429392009973526,
"step": 742
},
{
"epoch": 0.9378352792679079,
"grad_norm": 0.4218924343585968,
"learning_rate": 6.603654967805683e-07,
"loss": 0.6960986256599426,
"step": 743
},
{
"epoch": 0.9390975071000316,
"grad_norm": 0.387144535779953,
"learning_rate": 6.140863104726391e-07,
"loss": 0.359319269657135,
"step": 744
},
{
"epoch": 0.9403597349321553,
"grad_norm": 0.386854887008667,
"learning_rate": 5.694832286930685e-07,
"loss": 0.5978315472602844,
"step": 745
},
{
"epoch": 0.9416219627642789,
"grad_norm": 0.38212618231773376,
"learning_rate": 5.265570036553813e-07,
"loss": 0.7151321172714233,
"step": 746
},
{
"epoch": 0.9428841905964026,
"grad_norm": 0.38942816853523254,
"learning_rate": 4.85308359293557e-07,
"loss": 0.34270745515823364,
"step": 747
},
{
"epoch": 0.9441464184285263,
"grad_norm": 0.4136378765106201,
"learning_rate": 4.457379912498394e-07,
"loss": 0.3653174340724945,
"step": 748
},
{
"epoch": 0.94540864626065,
"grad_norm": 0.42216548323631287,
"learning_rate": 4.078465668629905e-07,
"loss": 0.663544237613678,
"step": 749
},
{
"epoch": 0.9466708740927737,
"grad_norm": 0.4414190948009491,
"learning_rate": 3.716347251570551e-07,
"loss": 0.7294875383377075,
"step": 750
},
{
"epoch": 0.9479331019248974,
"grad_norm": 0.3959789574146271,
"learning_rate": 3.371030768305583e-07,
"loss": 0.6958010196685791,
"step": 751
},
{
"epoch": 0.9491953297570211,
"grad_norm": 0.45387375354766846,
"learning_rate": 3.042522042462359e-07,
"loss": 0.7474179267883301,
"step": 752
},
{
"epoch": 0.9504575575891449,
"grad_norm": 0.37097567319869995,
"learning_rate": 2.7308266142119785e-07,
"loss": 0.7090280055999756,
"step": 753
},
{
"epoch": 0.9517197854212686,
"grad_norm": 0.4319815933704376,
"learning_rate": 2.4359497401758024e-07,
"loss": 0.632872462272644,
"step": 754
},
{
"epoch": 0.9529820132533923,
"grad_norm": 0.412222295999527,
"learning_rate": 2.1578963933367446e-07,
"loss": 0.6069747805595398,
"step": 755
},
{
"epoch": 0.954244241085516,
"grad_norm": 0.4318292737007141,
"learning_rate": 1.8966712629558957e-07,
"loss": 0.48516613245010376,
"step": 756
},
{
"epoch": 0.9555064689176397,
"grad_norm": 0.4013379216194153,
"learning_rate": 1.6522787544926977e-07,
"loss": 0.7001821994781494,
"step": 757
},
{
"epoch": 0.9567686967497633,
"grad_norm": 0.3875749111175537,
"learning_rate": 1.424722989531113e-07,
"loss": 0.5603348016738892,
"step": 758
},
{
"epoch": 0.958030924581887,
"grad_norm": 0.2857275605201721,
"learning_rate": 1.2140078057101266e-07,
"loss": 0.2514762878417969,
"step": 759
},
{
"epoch": 0.9592931524140107,
"grad_norm": 0.38641858100891113,
"learning_rate": 1.020136756658574e-07,
"loss": 0.6449640393257141,
"step": 760
},
{
"epoch": 0.9605553802461344,
"grad_norm": 0.4277747571468353,
"learning_rate": 8.43113111936189e-08,
"loss": 0.7620565891265869,
"step": 761
},
{
"epoch": 0.9618176080782581,
"grad_norm": 0.3486212193965912,
"learning_rate": 6.829398569770939e-08,
"loss": 0.43015536665916443,
"step": 762
},
{
"epoch": 0.9630798359103818,
"grad_norm": 0.36243584752082825,
"learning_rate": 5.3961969304072715e-08,
"loss": 0.393317312002182,
"step": 763
},
{
"epoch": 0.9643420637425055,
"grad_norm": 0.38432276248931885,
"learning_rate": 4.131550371655468e-08,
"loss": 0.752675473690033,
"step": 764
},
{
"epoch": 0.9656042915746292,
"grad_norm": 0.413333535194397,
"learning_rate": 3.0354802212839705e-08,
"loss": 0.7670407891273499,
"step": 765
},
{
"epoch": 0.9668665194067529,
"grad_norm": 0.3813234269618988,
"learning_rate": 2.108004964086474e-08,
"loss": 0.4830048382282257,
"step": 766
},
{
"epoch": 0.9681287472388767,
"grad_norm": 0.2374144047498703,
"learning_rate": 1.3491402415710675e-08,
"loss": 0.1855914294719696,
"step": 767
},
{
"epoch": 0.9693909750710004,
"grad_norm": 0.48682042956352234,
"learning_rate": 7.58898851693779e-09,
"loss": 0.5933582186698914,
"step": 768
},
{
"epoch": 0.9706532029031241,
"grad_norm": 0.4472711980342865,
"learning_rate": 3.3729074864541355e-09,
"loss": 0.55843585729599,
"step": 769
},
{
"epoch": 0.9719154307352477,
"grad_norm": 0.4075043201446533,
"learning_rate": 8.432304268057856e-10,
"loss": 0.7006219625473022,
"step": 770
},
{
"epoch": 0.9719154307352477,
"eval_loss": 0.5271598100662231,
"eval_runtime": 224.8405,
"eval_samples_per_second": 2.126,
"eval_steps_per_second": 0.534,
"step": 770
}
],
"logging_steps": 1,
"max_steps": 770,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2224210803964467e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}