{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.000806234883096,
"eval_steps": 233,
"global_step": 931,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010749798441279225,
"grad_norm": 17.287843704223633,
"learning_rate": 2e-05,
"loss": 18.567,
"step": 1
},
{
"epoch": 0.0010749798441279225,
"eval_loss": 4.0416483879089355,
"eval_runtime": 6.371,
"eval_samples_per_second": 61.529,
"eval_steps_per_second": 30.765,
"step": 1
},
{
"epoch": 0.002149959688255845,
"grad_norm": 19.48370933532715,
"learning_rate": 4e-05,
"loss": 18.1211,
"step": 2
},
{
"epoch": 0.0032249395323837677,
"grad_norm": 14.203254699707031,
"learning_rate": 6e-05,
"loss": 14.5618,
"step": 3
},
{
"epoch": 0.00429991937651169,
"grad_norm": 16.90338897705078,
"learning_rate": 8e-05,
"loss": 17.2773,
"step": 4
},
{
"epoch": 0.005374899220639613,
"grad_norm": 10.459429740905762,
"learning_rate": 0.0001,
"loss": 14.4675,
"step": 5
},
{
"epoch": 0.0064498790647675355,
"grad_norm": 16.3222713470459,
"learning_rate": 0.00012,
"loss": 16.8006,
"step": 6
},
{
"epoch": 0.007524858908895459,
"grad_norm": 17.23369789123535,
"learning_rate": 0.00014,
"loss": 14.2422,
"step": 7
},
{
"epoch": 0.00859983875302338,
"grad_norm": 18.750120162963867,
"learning_rate": 0.00016,
"loss": 17.4167,
"step": 8
},
{
"epoch": 0.009674818597151304,
"grad_norm": 12.818103790283203,
"learning_rate": 0.00018,
"loss": 13.2251,
"step": 9
},
{
"epoch": 0.010749798441279226,
"grad_norm": 15.632926940917969,
"learning_rate": 0.0002,
"loss": 14.3551,
"step": 10
},
{
"epoch": 0.011824778285407149,
"grad_norm": 10.966743469238281,
"learning_rate": 0.00019999941823167997,
"loss": 12.1329,
"step": 11
},
{
"epoch": 0.012899758129535071,
"grad_norm": 10.447884559631348,
"learning_rate": 0.00019999767293348887,
"loss": 11.0189,
"step": 12
},
{
"epoch": 0.013974737973662993,
"grad_norm": 14.069694519042969,
"learning_rate": 0.00019999476412573398,
"loss": 12.6871,
"step": 13
},
{
"epoch": 0.015049717817790917,
"grad_norm": 17.129362106323242,
"learning_rate": 0.0001999906918422603,
"loss": 12.774,
"step": 14
},
{
"epoch": 0.01612469766191884,
"grad_norm": 12.345664978027344,
"learning_rate": 0.00019998545613045035,
"loss": 10.1907,
"step": 15
},
{
"epoch": 0.01719967750604676,
"grad_norm": 12.960017204284668,
"learning_rate": 0.00019997905705122353,
"loss": 9.124,
"step": 16
},
{
"epoch": 0.018274657350174684,
"grad_norm": 17.12679672241211,
"learning_rate": 0.0001999714946790355,
"loss": 11.1126,
"step": 17
},
{
"epoch": 0.019349637194302608,
"grad_norm": 13.355628967285156,
"learning_rate": 0.0001999627691018772,
"loss": 9.9217,
"step": 18
},
{
"epoch": 0.02042461703843053,
"grad_norm": 14.47995376586914,
"learning_rate": 0.00019995288042127393,
"loss": 9.8122,
"step": 19
},
{
"epoch": 0.021499596882558453,
"grad_norm": 15.26109504699707,
"learning_rate": 0.00019994182875228417,
"loss": 9.1869,
"step": 20
},
{
"epoch": 0.022574576726686373,
"grad_norm": 19.227262496948242,
"learning_rate": 0.00019992961422349805,
"loss": 8.0937,
"step": 21
},
{
"epoch": 0.023649556570814297,
"grad_norm": 15.230995178222656,
"learning_rate": 0.00019991623697703613,
"loss": 8.5341,
"step": 22
},
{
"epoch": 0.02472453641494222,
"grad_norm": 16.82895278930664,
"learning_rate": 0.00019990169716854758,
"loss": 9.1735,
"step": 23
},
{
"epoch": 0.025799516259070142,
"grad_norm": 22.261363983154297,
"learning_rate": 0.00019988599496720836,
"loss": 8.5753,
"step": 24
},
{
"epoch": 0.026874496103198066,
"grad_norm": 14.500650405883789,
"learning_rate": 0.0001998691305557194,
"loss": 8.7694,
"step": 25
},
{
"epoch": 0.027949475947325986,
"grad_norm": 13.450296401977539,
"learning_rate": 0.00019985110413030425,
"loss": 7.6744,
"step": 26
},
{
"epoch": 0.02902445579145391,
"grad_norm": 11.800576210021973,
"learning_rate": 0.00019983191590070703,
"loss": 6.7168,
"step": 27
},
{
"epoch": 0.030099435635581834,
"grad_norm": 15.437250137329102,
"learning_rate": 0.00019981156609018977,
"loss": 7.8992,
"step": 28
},
{
"epoch": 0.031174415479709755,
"grad_norm": 13.048258781433105,
"learning_rate": 0.00019979005493552996,
"loss": 7.4647,
"step": 29
},
{
"epoch": 0.03224939532383768,
"grad_norm": 17.663209915161133,
"learning_rate": 0.00019976738268701784,
"loss": 7.6277,
"step": 30
},
{
"epoch": 0.0333243751679656,
"grad_norm": 17.522117614746094,
"learning_rate": 0.00019974354960845326,
"loss": 7.3131,
"step": 31
},
{
"epoch": 0.03439935501209352,
"grad_norm": 16.121286392211914,
"learning_rate": 0.00019971855597714284,
"loss": 7.1682,
"step": 32
},
{
"epoch": 0.035474334856221444,
"grad_norm": 12.511422157287598,
"learning_rate": 0.00019969240208389665,
"loss": 6.4537,
"step": 33
},
{
"epoch": 0.03654931470034937,
"grad_norm": 14.760931015014648,
"learning_rate": 0.00019966508823302483,
"loss": 6.8972,
"step": 34
},
{
"epoch": 0.03762429454447729,
"grad_norm": 16.834484100341797,
"learning_rate": 0.00019963661474233402,
"loss": 8.2614,
"step": 35
},
{
"epoch": 0.038699274388605216,
"grad_norm": 13.5601224899292,
"learning_rate": 0.0001996069819431237,
"loss": 6.4588,
"step": 36
},
{
"epoch": 0.03977425423273313,
"grad_norm": 14.121377944946289,
"learning_rate": 0.00019957619018018242,
"loss": 6.057,
"step": 37
},
{
"epoch": 0.04084923407686106,
"grad_norm": 14.331984519958496,
"learning_rate": 0.00019954423981178354,
"loss": 5.9236,
"step": 38
},
{
"epoch": 0.04192421392098898,
"grad_norm": 14.163195610046387,
"learning_rate": 0.00019951113120968134,
"loss": 6.0719,
"step": 39
},
{
"epoch": 0.042999193765116905,
"grad_norm": 13.852533340454102,
"learning_rate": 0.00019947686475910655,
"loss": 5.6034,
"step": 40
},
{
"epoch": 0.04407417360924483,
"grad_norm": 14.488564491271973,
"learning_rate": 0.00019944144085876184,
"loss": 7.0848,
"step": 41
},
{
"epoch": 0.045149153453372746,
"grad_norm": 11.431620597839355,
"learning_rate": 0.0001994048599208173,
"loss": 5.5335,
"step": 42
},
{
"epoch": 0.04622413329750067,
"grad_norm": 13.871944427490234,
"learning_rate": 0.00019936712237090553,
"loss": 5.8063,
"step": 43
},
{
"epoch": 0.047299113141628595,
"grad_norm": 18.87192726135254,
"learning_rate": 0.00019932822864811677,
"loss": 6.0023,
"step": 44
},
{
"epoch": 0.04837409298575652,
"grad_norm": 12.797957420349121,
"learning_rate": 0.00019928817920499375,
"loss": 5.546,
"step": 45
},
{
"epoch": 0.04944907282988444,
"grad_norm": 14.95291519165039,
"learning_rate": 0.00019924697450752633,
"loss": 6.1613,
"step": 46
},
{
"epoch": 0.05052405267401236,
"grad_norm": 18.501853942871094,
"learning_rate": 0.00019920461503514635,
"loss": 6.1402,
"step": 47
},
{
"epoch": 0.051599032518140284,
"grad_norm": 19.192930221557617,
"learning_rate": 0.0001991611012807218,
"loss": 5.7711,
"step": 48
},
{
"epoch": 0.05267401236226821,
"grad_norm": 23.865346908569336,
"learning_rate": 0.00019911643375055107,
"loss": 6.6772,
"step": 49
},
{
"epoch": 0.05374899220639613,
"grad_norm": 27.616785049438477,
"learning_rate": 0.00019907061296435728,
"loss": 6.6335,
"step": 50
},
{
"epoch": 0.054823972050524056,
"grad_norm": 20.929126739501953,
"learning_rate": 0.0001990236394552821,
"loss": 6.4005,
"step": 51
},
{
"epoch": 0.05589895189465197,
"grad_norm": 10.98021411895752,
"learning_rate": 0.00019897551376987948,
"loss": 4.4406,
"step": 52
},
{
"epoch": 0.0569739317387799,
"grad_norm": 12.884988784790039,
"learning_rate": 0.00019892623646810943,
"loss": 4.7416,
"step": 53
},
{
"epoch": 0.05804891158290782,
"grad_norm": 14.663339614868164,
"learning_rate": 0.0001988758081233314,
"loss": 5.6428,
"step": 54
},
{
"epoch": 0.059123891427035745,
"grad_norm": 13.638205528259277,
"learning_rate": 0.00019882422932229765,
"loss": 6.3548,
"step": 55
},
{
"epoch": 0.06019887127116367,
"grad_norm": 15.025946617126465,
"learning_rate": 0.00019877150066514645,
"loss": 4.9333,
"step": 56
},
{
"epoch": 0.061273851115291586,
"grad_norm": 20.201622009277344,
"learning_rate": 0.000198717622765395,
"loss": 6.5034,
"step": 57
},
{
"epoch": 0.06234883095941951,
"grad_norm": 14.798491477966309,
"learning_rate": 0.00019866259624993246,
"loss": 4.757,
"step": 58
},
{
"epoch": 0.06342381080354743,
"grad_norm": 15.673213005065918,
"learning_rate": 0.00019860642175901247,
"loss": 7.0599,
"step": 59
},
{
"epoch": 0.06449879064767536,
"grad_norm": 18.45370864868164,
"learning_rate": 0.00019854909994624582,
"loss": 6.7934,
"step": 60
},
{
"epoch": 0.06557377049180328,
"grad_norm": 15.39392375946045,
"learning_rate": 0.0001984906314785928,
"loss": 5.5127,
"step": 61
},
{
"epoch": 0.0666487503359312,
"grad_norm": 16.213571548461914,
"learning_rate": 0.00019843101703635548,
"loss": 4.8815,
"step": 62
},
{
"epoch": 0.06772373018005913,
"grad_norm": 20.046100616455078,
"learning_rate": 0.00019837025731316967,
"loss": 5.3901,
"step": 63
},
{
"epoch": 0.06879871002418704,
"grad_norm": 16.978891372680664,
"learning_rate": 0.0001983083530159971,
"loss": 5.7858,
"step": 64
},
{
"epoch": 0.06987368986831496,
"grad_norm": 17.5430965423584,
"learning_rate": 0.00019824530486511687,
"loss": 6.2824,
"step": 65
},
{
"epoch": 0.07094866971244289,
"grad_norm": 15.383797645568848,
"learning_rate": 0.00019818111359411737,
"loss": 4.4531,
"step": 66
},
{
"epoch": 0.07202364955657081,
"grad_norm": 16.83544921875,
"learning_rate": 0.00019811577994988754,
"loss": 6.4399,
"step": 67
},
{
"epoch": 0.07309862940069874,
"grad_norm": 22.3226261138916,
"learning_rate": 0.00019804930469260828,
"loss": 7.8473,
"step": 68
},
{
"epoch": 0.07417360924482666,
"grad_norm": 18.50650978088379,
"learning_rate": 0.00019798168859574356,
"loss": 6.8441,
"step": 69
},
{
"epoch": 0.07524858908895458,
"grad_norm": 17.836515426635742,
"learning_rate": 0.00019791293244603142,
"loss": 5.3271,
"step": 70
},
{
"epoch": 0.07632356893308251,
"grad_norm": 16.706695556640625,
"learning_rate": 0.00019784303704347488,
"loss": 5.4312,
"step": 71
},
{
"epoch": 0.07739854877721043,
"grad_norm": 18.03818130493164,
"learning_rate": 0.00019777200320133254,
"loss": 5.9135,
"step": 72
},
{
"epoch": 0.07847352862133836,
"grad_norm": 11.856945991516113,
"learning_rate": 0.00019769983174610918,
"loss": 5.6232,
"step": 73
},
{
"epoch": 0.07954850846546627,
"grad_norm": 17.87145233154297,
"learning_rate": 0.00019762652351754616,
"loss": 4.9234,
"step": 74
},
{
"epoch": 0.08062348830959419,
"grad_norm": 16.913291931152344,
"learning_rate": 0.00019755207936861155,
"loss": 6.6548,
"step": 75
},
{
"epoch": 0.08169846815372211,
"grad_norm": 12.137495040893555,
"learning_rate": 0.00019747650016549027,
"loss": 4.1446,
"step": 76
},
{
"epoch": 0.08277344799785004,
"grad_norm": 15.74571704864502,
"learning_rate": 0.00019739978678757412,
"loss": 6.0891,
"step": 77
},
{
"epoch": 0.08384842784197796,
"grad_norm": 13.001721382141113,
"learning_rate": 0.0001973219401274513,
"loss": 4.2512,
"step": 78
},
{
"epoch": 0.08492340768610589,
"grad_norm": 20.199098587036133,
"learning_rate": 0.00019724296109089622,
"loss": 6.0262,
"step": 79
},
{
"epoch": 0.08599838753023381,
"grad_norm": 16.964731216430664,
"learning_rate": 0.00019716285059685892,
"loss": 4.7964,
"step": 80
},
{
"epoch": 0.08707336737436173,
"grad_norm": 15.965873718261719,
"learning_rate": 0.0001970816095774544,
"loss": 5.7548,
"step": 81
},
{
"epoch": 0.08814834721848966,
"grad_norm": 22.0924129486084,
"learning_rate": 0.00019699923897795163,
"loss": 7.1131,
"step": 82
},
{
"epoch": 0.08922332706261758,
"grad_norm": 16.394224166870117,
"learning_rate": 0.0001969157397567627,
"loss": 5.7294,
"step": 83
},
{
"epoch": 0.09029830690674549,
"grad_norm": 14.458036422729492,
"learning_rate": 0.0001968311128854317,
"loss": 5.4966,
"step": 84
},
{
"epoch": 0.09137328675087342,
"grad_norm": 12.637007713317871,
"learning_rate": 0.00019674535934862325,
"loss": 3.4551,
"step": 85
},
{
"epoch": 0.09244826659500134,
"grad_norm": 13.059622764587402,
"learning_rate": 0.00019665848014411118,
"loss": 5.1353,
"step": 86
},
{
"epoch": 0.09352324643912927,
"grad_norm": 20.794404983520508,
"learning_rate": 0.00019657047628276688,
"loss": 4.9761,
"step": 87
},
{
"epoch": 0.09459822628325719,
"grad_norm": 15.221658706665039,
"learning_rate": 0.00019648134878854747,
"loss": 4.8321,
"step": 88
},
{
"epoch": 0.09567320612738511,
"grad_norm": 14.838767051696777,
"learning_rate": 0.0001963910986984841,
"loss": 4.5949,
"step": 89
},
{
"epoch": 0.09674818597151304,
"grad_norm": 12.857973098754883,
"learning_rate": 0.00019629972706266952,
"loss": 4.0017,
"step": 90
},
{
"epoch": 0.09782316581564096,
"grad_norm": 15.524980545043945,
"learning_rate": 0.00019620723494424627,
"loss": 4.57,
"step": 91
},
{
"epoch": 0.09889814565976889,
"grad_norm": 11.060179710388184,
"learning_rate": 0.000196113623419394,
"loss": 4.1406,
"step": 92
},
{
"epoch": 0.0999731255038968,
"grad_norm": 18.566953659057617,
"learning_rate": 0.00019601889357731713,
"loss": 4.3026,
"step": 93
},
{
"epoch": 0.10104810534802472,
"grad_norm": 13.795799255371094,
"learning_rate": 0.00019592304652023206,
"loss": 3.585,
"step": 94
},
{
"epoch": 0.10212308519215264,
"grad_norm": 17.49445343017578,
"learning_rate": 0.0001958260833633544,
"loss": 5.1268,
"step": 95
},
{
"epoch": 0.10319806503628057,
"grad_norm": 13.12109661102295,
"learning_rate": 0.00019572800523488609,
"loss": 4.2585,
"step": 96
},
{
"epoch": 0.10427304488040849,
"grad_norm": 15.185480117797852,
"learning_rate": 0.00019562881327600198,
"loss": 4.8719,
"step": 97
},
{
"epoch": 0.10534802472453642,
"grad_norm": 11.378191947937012,
"learning_rate": 0.00019552850864083693,
"loss": 4.2474,
"step": 98
},
{
"epoch": 0.10642300456866434,
"grad_norm": 17.479673385620117,
"learning_rate": 0.0001954270924964721,
"loss": 4.1351,
"step": 99
},
{
"epoch": 0.10749798441279226,
"grad_norm": 20.179716110229492,
"learning_rate": 0.0001953245660229215,
"loss": 4.218,
"step": 100
},
{
"epoch": 0.10857296425692019,
"grad_norm": 15.806692123413086,
"learning_rate": 0.00019522093041311815,
"loss": 5.7112,
"step": 101
},
{
"epoch": 0.10964794410104811,
"grad_norm": 13.089418411254883,
"learning_rate": 0.00019511618687290043,
"loss": 3.3798,
"step": 102
},
{
"epoch": 0.11072292394517602,
"grad_norm": 15.82168197631836,
"learning_rate": 0.00019501033662099778,
"loss": 5.123,
"step": 103
},
{
"epoch": 0.11179790378930395,
"grad_norm": 17.98412322998047,
"learning_rate": 0.00019490338088901666,
"loss": 4.6133,
"step": 104
},
{
"epoch": 0.11287288363343187,
"grad_norm": 13.41305160522461,
"learning_rate": 0.0001947953209214262,
"loss": 4.4088,
"step": 105
},
{
"epoch": 0.1139478634775598,
"grad_norm": 17.843494415283203,
"learning_rate": 0.00019468615797554374,
"loss": 3.5071,
"step": 106
},
{
"epoch": 0.11502284332168772,
"grad_norm": 17.681631088256836,
"learning_rate": 0.00019457589332152008,
"loss": 5.0372,
"step": 107
},
{
"epoch": 0.11609782316581564,
"grad_norm": 17.937023162841797,
"learning_rate": 0.00019446452824232492,
"loss": 4.3635,
"step": 108
},
{
"epoch": 0.11717280300994357,
"grad_norm": 21.669342041015625,
"learning_rate": 0.00019435206403373178,
"loss": 5.2923,
"step": 109
},
{
"epoch": 0.11824778285407149,
"grad_norm": 18.59075927734375,
"learning_rate": 0.00019423850200430293,
"loss": 4.7142,
"step": 110
},
{
"epoch": 0.11932276269819941,
"grad_norm": 19.25830841064453,
"learning_rate": 0.00019412384347537414,
"loss": 5.0176,
"step": 111
},
{
"epoch": 0.12039774254232734,
"grad_norm": 21.377017974853516,
"learning_rate": 0.00019400808978103947,
"loss": 5.0599,
"step": 112
},
{
"epoch": 0.12147272238645525,
"grad_norm": 14.341522216796875,
"learning_rate": 0.0001938912422681355,
"loss": 4.9352,
"step": 113
},
{
"epoch": 0.12254770223058317,
"grad_norm": 15.528069496154785,
"learning_rate": 0.00019377330229622595,
"loss": 5.6631,
"step": 114
},
{
"epoch": 0.1236226820747111,
"grad_norm": 18.492849349975586,
"learning_rate": 0.0001936542712375855,
"loss": 4.6148,
"step": 115
},
{
"epoch": 0.12469766191883902,
"grad_norm": 20.3253116607666,
"learning_rate": 0.0001935341504771842,
"loss": 4.5666,
"step": 116
},
{
"epoch": 0.12577264176296696,
"grad_norm": 14.674714088439941,
"learning_rate": 0.00019341294141267108,
"loss": 4.8294,
"step": 117
},
{
"epoch": 0.12684762160709487,
"grad_norm": 10.383010864257812,
"learning_rate": 0.00019329064545435803,
"loss": 4.0049,
"step": 118
},
{
"epoch": 0.12792260145122278,
"grad_norm": 15.706880569458008,
"learning_rate": 0.00019316726402520334,
"loss": 4.5301,
"step": 119
},
{
"epoch": 0.12899758129535072,
"grad_norm": 14.116766929626465,
"learning_rate": 0.0001930427985607951,
"loss": 4.21,
"step": 120
},
{
"epoch": 0.13007256113947863,
"grad_norm": 12.008879661560059,
"learning_rate": 0.00019291725050933468,
"loss": 3.6814,
"step": 121
},
{
"epoch": 0.13114754098360656,
"grad_norm": 15.146964073181152,
"learning_rate": 0.00019279062133161957,
"loss": 4.0279,
"step": 122
},
{
"epoch": 0.13222252082773447,
"grad_norm": 15.026398658752441,
"learning_rate": 0.0001926629125010267,
"loss": 4.0473,
"step": 123
},
{
"epoch": 0.1332975006718624,
"grad_norm": 15.858999252319336,
"learning_rate": 0.00019253412550349509,
"loss": 4.3264,
"step": 124
},
{
"epoch": 0.13437248051599032,
"grad_norm": 18.721647262573242,
"learning_rate": 0.00019240426183750865,
"loss": 4.4262,
"step": 125
},
{
"epoch": 0.13544746036011826,
"grad_norm": 19.55031394958496,
"learning_rate": 0.0001922733230140787,
"loss": 5.5739,
"step": 126
},
{
"epoch": 0.13652244020424617,
"grad_norm": 10.331392288208008,
"learning_rate": 0.00019214131055672647,
"loss": 3.5695,
"step": 127
},
{
"epoch": 0.13759742004837408,
"grad_norm": 19.27557373046875,
"learning_rate": 0.0001920082260014652,
"loss": 5.4195,
"step": 128
},
{
"epoch": 0.13867239989250202,
"grad_norm": 21.552522659301758,
"learning_rate": 0.0001918740708967825,
"loss": 4.1473,
"step": 129
},
{
"epoch": 0.13974737973662993,
"grad_norm": 19.07160186767578,
"learning_rate": 0.0001917388468036222,
"loss": 4.3624,
"step": 130
},
{
"epoch": 0.14082235958075787,
"grad_norm": 24.328269958496094,
"learning_rate": 0.0001916025552953661,
"loss": 4.5408,
"step": 131
},
{
"epoch": 0.14189733942488578,
"grad_norm": 22.924718856811523,
"learning_rate": 0.00019146519795781587,
"loss": 4.2812,
"step": 132
},
{
"epoch": 0.14297231926901371,
"grad_norm": 15.937036514282227,
"learning_rate": 0.00019132677638917449,
"loss": 4.6842,
"step": 133
},
{
"epoch": 0.14404729911314162,
"grad_norm": 14.515525817871094,
"learning_rate": 0.00019118729220002755,
"loss": 3.2523,
"step": 134
},
{
"epoch": 0.14512227895726956,
"grad_norm": 19.804189682006836,
"learning_rate": 0.00019104674701332476,
"loss": 4.5473,
"step": 135
},
{
"epoch": 0.14619725880139747,
"grad_norm": 13.662827491760254,
"learning_rate": 0.00019090514246436087,
"loss": 4.1841,
"step": 136
},
{
"epoch": 0.14727223864552538,
"grad_norm": 22.40411376953125,
"learning_rate": 0.00019076248020075665,
"loss": 6.2449,
"step": 137
},
{
"epoch": 0.14834721848965332,
"grad_norm": 15.33782958984375,
"learning_rate": 0.00019061876188243982,
"loss": 2.8611,
"step": 138
},
{
"epoch": 0.14942219833378123,
"grad_norm": 20.899106979370117,
"learning_rate": 0.00019047398918162572,
"loss": 5.3855,
"step": 139
},
{
"epoch": 0.15049717817790917,
"grad_norm": 16.781774520874023,
"learning_rate": 0.00019032816378279768,
"loss": 4.2343,
"step": 140
},
{
"epoch": 0.15157215802203708,
"grad_norm": 15.55665397644043,
"learning_rate": 0.00019018128738268773,
"loss": 4.5545,
"step": 141
},
{
"epoch": 0.15264713786616502,
"grad_norm": 22.28097152709961,
"learning_rate": 0.00019003336169025654,
"loss": 5.1255,
"step": 142
},
{
"epoch": 0.15372211771029293,
"grad_norm": 14.668632507324219,
"learning_rate": 0.00018988438842667375,
"loss": 5.7869,
"step": 143
},
{
"epoch": 0.15479709755442086,
"grad_norm": 21.854108810424805,
"learning_rate": 0.00018973436932529793,
"loss": 5.1173,
"step": 144
},
{
"epoch": 0.15587207739854878,
"grad_norm": 16.630081176757812,
"learning_rate": 0.00018958330613165622,
"loss": 4.251,
"step": 145
},
{
"epoch": 0.1569470572426767,
"grad_norm": 16.389333724975586,
"learning_rate": 0.00018943120060342425,
"loss": 4.6531,
"step": 146
},
{
"epoch": 0.15802203708680462,
"grad_norm": 14.595359802246094,
"learning_rate": 0.0001892780545104056,
"loss": 4.1349,
"step": 147
},
{
"epoch": 0.15909701693093253,
"grad_norm": 18.753944396972656,
"learning_rate": 0.00018912386963451113,
"loss": 4.0963,
"step": 148
},
{
"epoch": 0.16017199677506047,
"grad_norm": 15.209190368652344,
"learning_rate": 0.00018896864776973837,
"loss": 3.6522,
"step": 149
},
{
"epoch": 0.16124697661918838,
"grad_norm": 20.873994827270508,
"learning_rate": 0.00018881239072215063,
"loss": 5.3913,
"step": 150
},
{
"epoch": 0.16232195646331632,
"grad_norm": 12.859075546264648,
"learning_rate": 0.00018865510030985588,
"loss": 2.6075,
"step": 151
},
{
"epoch": 0.16339693630744423,
"grad_norm": 21.292451858520508,
"learning_rate": 0.00018849677836298568,
"loss": 4.9356,
"step": 152
},
{
"epoch": 0.16447191615157217,
"grad_norm": 14.94565200805664,
"learning_rate": 0.00018833742672367393,
"loss": 3.804,
"step": 153
},
{
"epoch": 0.16554689599570008,
"grad_norm": 20.21578025817871,
"learning_rate": 0.00018817704724603536,
"loss": 5.3554,
"step": 154
},
{
"epoch": 0.16662187583982802,
"grad_norm": 18.05601692199707,
"learning_rate": 0.00018801564179614388,
"loss": 4.6274,
"step": 155
},
{
"epoch": 0.16769685568395593,
"grad_norm": 13.378555297851562,
"learning_rate": 0.00018785321225201108,
"loss": 3.8398,
"step": 156
},
{
"epoch": 0.16877183552808384,
"grad_norm": 13.038491249084473,
"learning_rate": 0.00018768976050356426,
"loss": 3.7924,
"step": 157
},
{
"epoch": 0.16984681537221177,
"grad_norm": 10.797876358032227,
"learning_rate": 0.00018752528845262433,
"loss": 3.273,
"step": 158
},
{
"epoch": 0.17092179521633968,
"grad_norm": 12.845779418945312,
"learning_rate": 0.00018735979801288392,
"loss": 3.7228,
"step": 159
},
{
"epoch": 0.17199677506046762,
"grad_norm": 12.737683296203613,
"learning_rate": 0.00018719329110988486,
"loss": 4.2175,
"step": 160
},
{
"epoch": 0.17307175490459553,
"grad_norm": 16.20618438720703,
"learning_rate": 0.00018702576968099608,
"loss": 3.4056,
"step": 161
},
{
"epoch": 0.17414673474872347,
"grad_norm": 19.91802978515625,
"learning_rate": 0.00018685723567539068,
"loss": 4.6,
"step": 162
},
{
"epoch": 0.17522171459285138,
"grad_norm": 15.743769645690918,
"learning_rate": 0.00018668769105402365,
"loss": 3.5829,
"step": 163
},
{
"epoch": 0.17629669443697932,
"grad_norm": 16.444740295410156,
"learning_rate": 0.00018651713778960875,
"loss": 4.4017,
"step": 164
},
{
"epoch": 0.17737167428110723,
"grad_norm": 15.459141731262207,
"learning_rate": 0.0001863455778665957,
"loss": 4.1357,
"step": 165
},
{
"epoch": 0.17844665412523517,
"grad_norm": 18.962736129760742,
"learning_rate": 0.00018617301328114705,
"loss": 4.5289,
"step": 166
},
{
"epoch": 0.17952163396936308,
"grad_norm": 17.9486083984375,
"learning_rate": 0.000185999446041115,
"loss": 4.1333,
"step": 167
},
{
"epoch": 0.18059661381349099,
"grad_norm": 12.445462226867676,
"learning_rate": 0.00018582487816601797,
"loss": 3.7512,
"step": 168
},
{
"epoch": 0.18167159365761892,
"grad_norm": 19.503494262695312,
"learning_rate": 0.00018564931168701712,
"loss": 4.716,
"step": 169
},
{
"epoch": 0.18274657350174683,
"grad_norm": 31.339258193969727,
"learning_rate": 0.00018547274864689285,
"loss": 6.1173,
"step": 170
},
{
"epoch": 0.18382155334587477,
"grad_norm": 10.792606353759766,
"learning_rate": 0.00018529519110002077,
"loss": 3.1399,
"step": 171
},
{
"epoch": 0.18489653319000268,
"grad_norm": 16.1004695892334,
"learning_rate": 0.00018511664111234798,
"loss": 3.8947,
"step": 172
},
{
"epoch": 0.18597151303413062,
"grad_norm": 11.773107528686523,
"learning_rate": 0.00018493710076136898,
"loss": 3.0606,
"step": 173
},
{
"epoch": 0.18704649287825853,
"grad_norm": 12.119939804077148,
"learning_rate": 0.00018475657213610166,
"loss": 2.9083,
"step": 174
},
{
"epoch": 0.18812147272238647,
"grad_norm": 17.70090103149414,
"learning_rate": 0.0001845750573370626,
"loss": 5.4718,
"step": 175
},
{
"epoch": 0.18919645256651438,
"grad_norm": 15.901100158691406,
"learning_rate": 0.00018439255847624303,
"loss": 5.1192,
"step": 176
},
{
"epoch": 0.1902714324106423,
"grad_norm": 14.755876541137695,
"learning_rate": 0.00018420907767708407,
"loss": 3.7262,
"step": 177
},
{
"epoch": 0.19134641225477023,
"grad_norm": 20.44917869567871,
"learning_rate": 0.00018402461707445205,
"loss": 4.4912,
"step": 178
},
{
"epoch": 0.19242139209889814,
"grad_norm": 12.053194046020508,
"learning_rate": 0.00018383917881461366,
"loss": 3.2561,
"step": 179
},
{
"epoch": 0.19349637194302607,
"grad_norm": 16.65236473083496,
"learning_rate": 0.000183652765055211,
"loss": 3.8193,
"step": 180
},
{
"epoch": 0.19457135178715398,
"grad_norm": 18.52997589111328,
"learning_rate": 0.00018346537796523645,
"loss": 5.1119,
"step": 181
},
{
"epoch": 0.19564633163128192,
"grad_norm": 20.083873748779297,
"learning_rate": 0.0001832770197250075,
"loss": 3.7478,
"step": 182
},
{
"epoch": 0.19672131147540983,
"grad_norm": 16.985313415527344,
"learning_rate": 0.00018308769252614124,
"loss": 4.1994,
"step": 183
},
{
"epoch": 0.19779629131953777,
"grad_norm": 20.08932113647461,
"learning_rate": 0.00018289739857152903,
"loss": 5.1951,
"step": 184
},
{
"epoch": 0.19887127116366568,
"grad_norm": 19.779159545898438,
"learning_rate": 0.00018270614007531076,
"loss": 3.849,
"step": 185
},
{
"epoch": 0.1999462510077936,
"grad_norm": 17.439552307128906,
"learning_rate": 0.00018251391926284906,
"loss": 3.9962,
"step": 186
},
{
"epoch": 0.20102123085192153,
"grad_norm": 22.15019416809082,
"learning_rate": 0.0001823207383707036,
"loss": 5.3047,
"step": 187
},
{
"epoch": 0.20209621069604944,
"grad_norm": 12.635649681091309,
"learning_rate": 0.00018212659964660476,
"loss": 2.8466,
"step": 188
},
{
"epoch": 0.20317119054017738,
"grad_norm": 21.744354248046875,
"learning_rate": 0.00018193150534942778,
"loss": 4.3091,
"step": 189
},
{
"epoch": 0.2042461703843053,
"grad_norm": 26.043798446655273,
"learning_rate": 0.00018173545774916627,
"loss": 3.7433,
"step": 190
},
{
"epoch": 0.20532115022843322,
"grad_norm": 13.015897750854492,
"learning_rate": 0.00018153845912690587,
"loss": 4.0063,
"step": 191
},
{
"epoch": 0.20639613007256113,
"grad_norm": 21.41050148010254,
"learning_rate": 0.00018134051177479777,
"loss": 3.7365,
"step": 192
},
{
"epoch": 0.20747110991668907,
"grad_norm": 20.63169288635254,
"learning_rate": 0.00018114161799603193,
"loss": 3.8878,
"step": 193
},
{
"epoch": 0.20854608976081698,
"grad_norm": 18.148544311523438,
"learning_rate": 0.00018094178010481034,
"loss": 3.4437,
"step": 194
},
{
"epoch": 0.20962106960494492,
"grad_norm": 15.158918380737305,
"learning_rate": 0.00018074100042632005,
"loss": 3.2009,
"step": 195
},
{
"epoch": 0.21069604944907283,
"grad_norm": 14.152005195617676,
"learning_rate": 0.00018053928129670624,
"loss": 3.4912,
"step": 196
},
{
"epoch": 0.21177102929320074,
"grad_norm": 13.470719337463379,
"learning_rate": 0.00018033662506304485,
"loss": 3.3799,
"step": 197
},
{
"epoch": 0.21284600913732868,
"grad_norm": 16.506755828857422,
"learning_rate": 0.00018013303408331543,
"loss": 2.9757,
"step": 198
},
{
"epoch": 0.2139209889814566,
"grad_norm": 15.626431465148926,
"learning_rate": 0.00017992851072637364,
"loss": 4.4908,
"step": 199
},
{
"epoch": 0.21499596882558453,
"grad_norm": 13.705154418945312,
"learning_rate": 0.00017972305737192366,
"loss": 3.9591,
"step": 200
},
{
"epoch": 0.21607094866971244,
"grad_norm": 17.541763305664062,
"learning_rate": 0.00017951667641049053,
"loss": 3.2296,
"step": 201
},
{
"epoch": 0.21714592851384037,
"grad_norm": 17.75946044921875,
"learning_rate": 0.0001793093702433924,
"loss": 3.4873,
"step": 202
},
{
"epoch": 0.21822090835796829,
"grad_norm": 21.355859756469727,
"learning_rate": 0.0001791011412827124,
"loss": 5.255,
"step": 203
},
{
"epoch": 0.21929588820209622,
"grad_norm": 11.548168182373047,
"learning_rate": 0.00017889199195127086,
"loss": 3.1538,
"step": 204
},
{
"epoch": 0.22037086804622413,
"grad_norm": 22.31789207458496,
"learning_rate": 0.00017868192468259686,
"loss": 4.0628,
"step": 205
},
{
"epoch": 0.22144584789035204,
"grad_norm": 13.790849685668945,
"learning_rate": 0.00017847094192090005,
"loss": 3.399,
"step": 206
},
{
"epoch": 0.22252082773447998,
"grad_norm": 9.146944046020508,
"learning_rate": 0.00017825904612104215,
"loss": 2.4616,
"step": 207
},
{
"epoch": 0.2235958075786079,
"grad_norm": 14.422385215759277,
"learning_rate": 0.00017804623974850844,
"loss": 3.5906,
"step": 208
},
{
"epoch": 0.22467078742273583,
"grad_norm": 16.845298767089844,
"learning_rate": 0.00017783252527937905,
"loss": 4.7812,
"step": 209
},
{
"epoch": 0.22574576726686374,
"grad_norm": 24.236703872680664,
"learning_rate": 0.0001776179052003001,
"loss": 5.1536,
"step": 210
},
{
"epoch": 0.22682074711099168,
"grad_norm": 21.463781356811523,
"learning_rate": 0.00017740238200845485,
"loss": 5.0244,
"step": 211
},
{
"epoch": 0.2278957269551196,
"grad_norm": 18.184011459350586,
"learning_rate": 0.00017718595821153462,
"loss": 5.0591,
"step": 212
},
{
"epoch": 0.22897070679924753,
"grad_norm": 16.528148651123047,
"learning_rate": 0.0001769686363277096,
"loss": 3.7127,
"step": 213
},
{
"epoch": 0.23004568664337544,
"grad_norm": 16.78187370300293,
"learning_rate": 0.0001767504188855995,
"loss": 4.499,
"step": 214
},
{
"epoch": 0.23112066648750335,
"grad_norm": 19.419116973876953,
"learning_rate": 0.00017653130842424427,
"loss": 3.4537,
"step": 215
},
{
"epoch": 0.23219564633163128,
"grad_norm": 14.738585472106934,
"learning_rate": 0.00017631130749307436,
"loss": 3.7363,
"step": 216
},
{
"epoch": 0.2332706261757592,
"grad_norm": 16.021595001220703,
"learning_rate": 0.0001760904186518812,
"loss": 4.2678,
"step": 217
},
{
"epoch": 0.23434560601988713,
"grad_norm": 16.80089569091797,
"learning_rate": 0.00017586864447078742,
"loss": 4.2492,
"step": 218
},
{
"epoch": 0.23542058586401504,
"grad_norm": 17.562274932861328,
"learning_rate": 0.0001756459875302169,
"loss": 4.4658,
"step": 219
},
{
"epoch": 0.23649556570814298,
"grad_norm": 13.105591773986816,
"learning_rate": 0.0001754224504208647,
"loss": 3.8664,
"step": 220
},
{
"epoch": 0.2375705455522709,
"grad_norm": 11.499171257019043,
"learning_rate": 0.00017519803574366698,
"loss": 3.7275,
"step": 221
},
{
"epoch": 0.23864552539639883,
"grad_norm": 14.275655746459961,
"learning_rate": 0.00017497274610977072,
"loss": 3.9924,
"step": 222
},
{
"epoch": 0.23972050524052674,
"grad_norm": 20.798551559448242,
"learning_rate": 0.00017474658414050342,
"loss": 4.1779,
"step": 223
},
{
"epoch": 0.24079548508465468,
"grad_norm": 17.76445770263672,
"learning_rate": 0.0001745195524673424,
"loss": 4.5601,
"step": 224
},
{
"epoch": 0.24187046492878259,
"grad_norm": 15.017122268676758,
"learning_rate": 0.00017429165373188438,
"loss": 3.23,
"step": 225
},
{
"epoch": 0.2429454447729105,
"grad_norm": 17.35443878173828,
"learning_rate": 0.00017406289058581465,
"loss": 4.0901,
"step": 226
},
{
"epoch": 0.24402042461703843,
"grad_norm": 17.933059692382812,
"learning_rate": 0.00017383326569087623,
"loss": 4.353,
"step": 227
},
{
"epoch": 0.24509540446116634,
"grad_norm": 16.176124572753906,
"learning_rate": 0.0001736027817188389,
"loss": 4.1159,
"step": 228
},
{
"epoch": 0.24617038430529428,
"grad_norm": 18.375131607055664,
"learning_rate": 0.00017337144135146817,
"loss": 4.673,
"step": 229
},
{
"epoch": 0.2472453641494222,
"grad_norm": 14.222498893737793,
"learning_rate": 0.00017313924728049393,
"loss": 3.5181,
"step": 230
},
{
"epoch": 0.24832034399355013,
"grad_norm": 14.064003944396973,
"learning_rate": 0.00017290620220757928,
"loss": 3.0101,
"step": 231
},
{
"epoch": 0.24939532383767804,
"grad_norm": 14.68502140045166,
"learning_rate": 0.00017267230884428905,
"loss": 2.8587,
"step": 232
},
{
"epoch": 0.25047030368180595,
"grad_norm": 15.439518928527832,
"learning_rate": 0.0001724375699120582,
"loss": 3.5475,
"step": 233
},
{
"epoch": 0.25047030368180595,
"eval_loss": 0.9161850214004517,
"eval_runtime": 5.6189,
"eval_samples_per_second": 69.765,
"eval_steps_per_second": 34.882,
"step": 233
},
{
"epoch": 0.2515452835259339,
"grad_norm": 16.505876541137695,
"learning_rate": 0.0001722019881421602,
"loss": 3.5136,
"step": 234
},
{
"epoch": 0.2526202633700618,
"grad_norm": 11.073945999145508,
"learning_rate": 0.0001719655662756753,
"loss": 2.7815,
"step": 235
},
{
"epoch": 0.25369524321418974,
"grad_norm": 18.41621208190918,
"learning_rate": 0.00017172830706345854,
"loss": 2.9592,
"step": 236
},
{
"epoch": 0.25477022305831765,
"grad_norm": 17.0198974609375,
"learning_rate": 0.00017149021326610776,
"loss": 3.414,
"step": 237
},
{
"epoch": 0.25584520290244556,
"grad_norm": 18.122941970825195,
"learning_rate": 0.00017125128765393155,
"loss": 3.9874,
"step": 238
},
{
"epoch": 0.2569201827465735,
"grad_norm": 16.342615127563477,
"learning_rate": 0.00017101153300691694,
"loss": 4.0199,
"step": 239
},
{
"epoch": 0.25799516259070143,
"grad_norm": 17.71503257751465,
"learning_rate": 0.00017077095211469708,
"loss": 3.5693,
"step": 240
},
{
"epoch": 0.25907014243482934,
"grad_norm": 14.306414604187012,
"learning_rate": 0.00017052954777651883,
"loss": 3.7534,
"step": 241
},
{
"epoch": 0.26014512227895725,
"grad_norm": 19.34443473815918,
"learning_rate": 0.00017028732280121008,
"loss": 4.534,
"step": 242
},
{
"epoch": 0.2612201021230852,
"grad_norm": 19.286706924438477,
"learning_rate": 0.00017004428000714722,
"loss": 4.4074,
"step": 243
},
{
"epoch": 0.26229508196721313,
"grad_norm": 14.999690055847168,
"learning_rate": 0.00016980042222222217,
"loss": 3.5992,
"step": 244
},
{
"epoch": 0.26337006181134104,
"grad_norm": 12.448857307434082,
"learning_rate": 0.0001695557522838096,
"loss": 3.4263,
"step": 245
},
{
"epoch": 0.26444504165546895,
"grad_norm": 14.967903137207031,
"learning_rate": 0.00016931027303873392,
"loss": 3.3476,
"step": 246
},
{
"epoch": 0.26552002149959686,
"grad_norm": 16.7852725982666,
"learning_rate": 0.00016906398734323606,
"loss": 3.4892,
"step": 247
},
{
"epoch": 0.2665950013437248,
"grad_norm": 12.612491607666016,
"learning_rate": 0.00016881689806294036,
"loss": 3.6491,
"step": 248
},
{
"epoch": 0.26766998118785273,
"grad_norm": 23.1019287109375,
"learning_rate": 0.00016856900807282114,
"loss": 4.2173,
"step": 249
},
{
"epoch": 0.26874496103198064,
"grad_norm": 13.499241828918457,
"learning_rate": 0.00016832032025716921,
"loss": 4.167,
"step": 250
},
{
"epoch": 0.26981994087610855,
"grad_norm": 17.28055191040039,
"learning_rate": 0.00016807083750955846,
"loss": 3.9582,
"step": 251
},
{
"epoch": 0.2708949207202365,
"grad_norm": 21.756202697753906,
"learning_rate": 0.00016782056273281207,
"loss": 4.9348,
"step": 252
},
{
"epoch": 0.27196990056436443,
"grad_norm": 19.564050674438477,
"learning_rate": 0.00016756949883896876,
"loss": 4.5317,
"step": 253
},
{
"epoch": 0.27304488040849234,
"grad_norm": 17.771780014038086,
"learning_rate": 0.0001673176487492489,
"loss": 4.1873,
"step": 254
},
{
"epoch": 0.27411986025262025,
"grad_norm": 20.27021598815918,
"learning_rate": 0.00016706501539402063,
"loss": 4.289,
"step": 255
},
{
"epoch": 0.27519484009674816,
"grad_norm": 14.25924301147461,
"learning_rate": 0.0001668116017127655,
"loss": 3.5322,
"step": 256
},
{
"epoch": 0.2762698199408761,
"grad_norm": 10.925680160522461,
"learning_rate": 0.0001665574106540446,
"loss": 3.296,
"step": 257
},
{
"epoch": 0.27734479978500404,
"grad_norm": 17.243980407714844,
"learning_rate": 0.0001663024451754641,
"loss": 3.4127,
"step": 258
},
{
"epoch": 0.27841977962913195,
"grad_norm": 11.710211753845215,
"learning_rate": 0.00016604670824364067,
"loss": 3.4865,
"step": 259
},
{
"epoch": 0.27949475947325986,
"grad_norm": 15.336100578308105,
"learning_rate": 0.00016579020283416724,
"loss": 3.9446,
"step": 260
},
{
"epoch": 0.2805697393173878,
"grad_norm": 13.642552375793457,
"learning_rate": 0.00016553293193157824,
"loss": 3.8352,
"step": 261
},
{
"epoch": 0.28164471916151573,
"grad_norm": 15.919568061828613,
"learning_rate": 0.0001652748985293149,
"loss": 2.8705,
"step": 262
},
{
"epoch": 0.28271969900564364,
"grad_norm": 16.444141387939453,
"learning_rate": 0.00016501610562969033,
"loss": 3.1671,
"step": 263
},
{
"epoch": 0.28379467884977155,
"grad_norm": 21.732051849365234,
"learning_rate": 0.00016475655624385483,
"loss": 3.4969,
"step": 264
},
{
"epoch": 0.28486965869389946,
"grad_norm": 18.68508529663086,
"learning_rate": 0.00016449625339176054,
"loss": 4.4919,
"step": 265
},
{
"epoch": 0.28594463853802743,
"grad_norm": 18.98984146118164,
"learning_rate": 0.00016423520010212656,
"loss": 3.9249,
"step": 266
},
{
"epoch": 0.28701961838215534,
"grad_norm": 9.13257122039795,
"learning_rate": 0.00016397339941240355,
"loss": 2.1867,
"step": 267
},
{
"epoch": 0.28809459822628325,
"grad_norm": 19.376365661621094,
"learning_rate": 0.00016371085436873845,
"loss": 4.499,
"step": 268
},
{
"epoch": 0.28916957807041116,
"grad_norm": 14.490234375,
"learning_rate": 0.00016344756802593905,
"loss": 3.1082,
"step": 269
},
{
"epoch": 0.2902445579145391,
"grad_norm": 14.469581604003906,
"learning_rate": 0.00016318354344743843,
"loss": 3.027,
"step": 270
},
{
"epoch": 0.29131953775866704,
"grad_norm": 19.300182342529297,
"learning_rate": 0.00016291878370525926,
"loss": 3.7289,
"step": 271
},
{
"epoch": 0.29239451760279495,
"grad_norm": 17.35890007019043,
"learning_rate": 0.00016265329187997818,
"loss": 4.74,
"step": 272
},
{
"epoch": 0.29346949744692286,
"grad_norm": 13.944664001464844,
"learning_rate": 0.00016238707106068983,
"loss": 3.7071,
"step": 273
},
{
"epoch": 0.29454447729105077,
"grad_norm": 13.07278060913086,
"learning_rate": 0.00016212012434497103,
"loss": 3.3124,
"step": 274
},
{
"epoch": 0.29561945713517873,
"grad_norm": 14.615857124328613,
"learning_rate": 0.00016185245483884457,
"loss": 3.766,
"step": 275
},
{
"epoch": 0.29669443697930664,
"grad_norm": 15.86770248413086,
"learning_rate": 0.0001615840656567433,
"loss": 4.3149,
"step": 276
},
{
"epoch": 0.29776941682343455,
"grad_norm": 16.81600570678711,
"learning_rate": 0.0001613149599214736,
"loss": 4.2699,
"step": 277
},
{
"epoch": 0.29884439666756246,
"grad_norm": 16.2762393951416,
"learning_rate": 0.00016104514076417935,
"loss": 3.4487,
"step": 278
},
{
"epoch": 0.2999193765116904,
"grad_norm": 15.317776679992676,
"learning_rate": 0.00016077461132430533,
"loss": 4.2894,
"step": 279
},
{
"epoch": 0.30099435635581834,
"grad_norm": 14.264720916748047,
"learning_rate": 0.00016050337474956067,
"loss": 3.7338,
"step": 280
},
{
"epoch": 0.30206933619994625,
"grad_norm": 16.04495620727539,
"learning_rate": 0.00016023143419588228,
"loss": 3.7665,
"step": 281
},
{
"epoch": 0.30314431604407416,
"grad_norm": 13.27541446685791,
"learning_rate": 0.0001599587928273982,
"loss": 3.0945,
"step": 282
},
{
"epoch": 0.3042192958882021,
"grad_norm": 20.908628463745117,
"learning_rate": 0.0001596854538163906,
"loss": 3.5324,
"step": 283
},
{
"epoch": 0.30529427573233003,
"grad_norm": 22.76654815673828,
"learning_rate": 0.000159411420343259,
"loss": 4.0003,
"step": 284
},
{
"epoch": 0.30636925557645794,
"grad_norm": 15.583036422729492,
"learning_rate": 0.00015913669559648334,
"loss": 4.1793,
"step": 285
},
{
"epoch": 0.30744423542058585,
"grad_norm": 19.990497589111328,
"learning_rate": 0.00015886128277258662,
"loss": 4.0978,
"step": 286
},
{
"epoch": 0.30851921526471376,
"grad_norm": 15.330724716186523,
"learning_rate": 0.00015858518507609804,
"loss": 3.8858,
"step": 287
},
{
"epoch": 0.30959419510884173,
"grad_norm": 18.55107307434082,
"learning_rate": 0.00015830840571951543,
"loss": 3.3965,
"step": 288
},
{
"epoch": 0.31066917495296964,
"grad_norm": 16.54085350036621,
"learning_rate": 0.000158030947923268,
"loss": 3.9363,
"step": 289
},
{
"epoch": 0.31174415479709755,
"grad_norm": 16.187206268310547,
"learning_rate": 0.00015775281491567887,
"loss": 3.9742,
"step": 290
},
{
"epoch": 0.31281913464122546,
"grad_norm": 24.83617401123047,
"learning_rate": 0.00015747400993292756,
"loss": 3.765,
"step": 291
},
{
"epoch": 0.3138941144853534,
"grad_norm": 19.85297393798828,
"learning_rate": 0.0001571945362190121,
"loss": 4.4267,
"step": 292
},
{
"epoch": 0.31496909432948134,
"grad_norm": 17.904691696166992,
"learning_rate": 0.0001569143970257116,
"loss": 3.953,
"step": 293
},
{
"epoch": 0.31604407417360925,
"grad_norm": 14.280094146728516,
"learning_rate": 0.00015663359561254823,
"loss": 4.2375,
"step": 294
},
{
"epoch": 0.31711905401773716,
"grad_norm": 20.711565017700195,
"learning_rate": 0.00015635213524674928,
"loss": 3.4025,
"step": 295
},
{
"epoch": 0.31819403386186507,
"grad_norm": 17.165376663208008,
"learning_rate": 0.00015607001920320927,
"loss": 4.1795,
"step": 296
},
{
"epoch": 0.31926901370599303,
"grad_norm": 14.331582069396973,
"learning_rate": 0.0001557872507644517,
"loss": 3.846,
"step": 297
},
{
"epoch": 0.32034399355012094,
"grad_norm": 13.659123420715332,
"learning_rate": 0.000155503833220591,
"loss": 3.6604,
"step": 298
},
{
"epoch": 0.32141897339424885,
"grad_norm": 23.412158966064453,
"learning_rate": 0.0001552197698692941,
"loss": 4.6184,
"step": 299
},
{
"epoch": 0.32249395323837676,
"grad_norm": 13.751562118530273,
"learning_rate": 0.00015493506401574218,
"loss": 2.5928,
"step": 300
},
{
"epoch": 0.32356893308250473,
"grad_norm": 19.892213821411133,
"learning_rate": 0.00015464971897259219,
"loss": 4.3952,
"step": 301
},
{
"epoch": 0.32464391292663264,
"grad_norm": 15.620636940002441,
"learning_rate": 0.00015436373805993825,
"loss": 4.1718,
"step": 302
},
{
"epoch": 0.32571889277076055,
"grad_norm": 25.407278060913086,
"learning_rate": 0.00015407712460527304,
"loss": 3.3582,
"step": 303
},
{
"epoch": 0.32679387261488846,
"grad_norm": 15.422714233398438,
"learning_rate": 0.0001537898819434491,
"loss": 4.1691,
"step": 304
},
{
"epoch": 0.32786885245901637,
"grad_norm": 15.950149536132812,
"learning_rate": 0.00015350201341664014,
"loss": 3.1559,
"step": 305
},
{
"epoch": 0.32894383230314433,
"grad_norm": 16.09635353088379,
"learning_rate": 0.00015321352237430185,
"loss": 3.5173,
"step": 306
},
{
"epoch": 0.33001881214727224,
"grad_norm": 14.290729522705078,
"learning_rate": 0.00015292441217313324,
"loss": 3.5474,
"step": 307
},
{
"epoch": 0.33109379199140015,
"grad_norm": 12.314271926879883,
"learning_rate": 0.00015263468617703743,
"loss": 3.81,
"step": 308
},
{
"epoch": 0.33216877183552806,
"grad_norm": 12.49377155303955,
"learning_rate": 0.0001523443477570826,
"loss": 3.0498,
"step": 309
},
{
"epoch": 0.33324375167965603,
"grad_norm": 14.970670700073242,
"learning_rate": 0.00015205340029146255,
"loss": 3.8808,
"step": 310
},
{
"epoch": 0.33431873152378394,
"grad_norm": 21.925323486328125,
"learning_rate": 0.0001517618471654577,
"loss": 3.9662,
"step": 311
},
{
"epoch": 0.33539371136791185,
"grad_norm": 15.550761222839355,
"learning_rate": 0.0001514696917713955,
"loss": 3.0303,
"step": 312
},
{
"epoch": 0.33646869121203976,
"grad_norm": 19.790069580078125,
"learning_rate": 0.00015117693750861096,
"loss": 3.3957,
"step": 313
},
{
"epoch": 0.33754367105616767,
"grad_norm": 16.23335075378418,
"learning_rate": 0.00015088358778340725,
"loss": 3.386,
"step": 314
},
{
"epoch": 0.33861865090029564,
"grad_norm": 16.61414337158203,
"learning_rate": 0.00015058964600901583,
"loss": 3.6885,
"step": 315
},
{
"epoch": 0.33969363074442355,
"grad_norm": 14.917675018310547,
"learning_rate": 0.00015029511560555708,
"loss": 3.1788,
"step": 316
},
{
"epoch": 0.34076861058855146,
"grad_norm": 16.645130157470703,
"learning_rate": 0.00015000000000000001,
"loss": 3.0699,
"step": 317
},
{
"epoch": 0.34184359043267937,
"grad_norm": 14.486559867858887,
"learning_rate": 0.0001497043026261229,
"loss": 2.9101,
"step": 318
},
{
"epoch": 0.34291857027680733,
"grad_norm": 20.343292236328125,
"learning_rate": 0.00014940802692447306,
"loss": 3.3769,
"step": 319
},
{
"epoch": 0.34399355012093524,
"grad_norm": 12.295371055603027,
"learning_rate": 0.00014911117634232678,
"loss": 3.1421,
"step": 320
},
{
"epoch": 0.34506852996506315,
"grad_norm": 16.151826858520508,
"learning_rate": 0.00014881375433364936,
"loss": 3.8197,
"step": 321
},
{
"epoch": 0.34614350980919106,
"grad_norm": 17.043066024780273,
"learning_rate": 0.0001485157643590549,
"loss": 3.6181,
"step": 322
},
{
"epoch": 0.347218489653319,
"grad_norm": 22.119674682617188,
"learning_rate": 0.00014821720988576585,
"loss": 5.0421,
"step": 323
},
{
"epoch": 0.34829346949744694,
"grad_norm": 15.809839248657227,
"learning_rate": 0.00014791809438757296,
"loss": 3.9497,
"step": 324
},
{
"epoch": 0.34936844934157485,
"grad_norm": 12.98931884765625,
"learning_rate": 0.00014761842134479463,
"loss": 3.346,
"step": 325
},
{
"epoch": 0.35044342918570276,
"grad_norm": 17.794139862060547,
"learning_rate": 0.00014731819424423651,
"loss": 3.7576,
"step": 326
},
{
"epoch": 0.35151840902983067,
"grad_norm": 12.563948631286621,
"learning_rate": 0.00014701741657915094,
"loss": 3.4395,
"step": 327
},
{
"epoch": 0.35259338887395864,
"grad_norm": 21.27597999572754,
"learning_rate": 0.0001467160918491962,
"loss": 5.3584,
"step": 328
},
{
"epoch": 0.35366836871808655,
"grad_norm": 17.49233055114746,
"learning_rate": 0.00014641422356039604,
"loss": 3.3713,
"step": 329
},
{
"epoch": 0.35474334856221446,
"grad_norm": 18.22296142578125,
"learning_rate": 0.00014611181522509846,
"loss": 2.7666,
"step": 330
},
{
"epoch": 0.35581832840634237,
"grad_norm": 19.422439575195312,
"learning_rate": 0.00014580887036193537,
"loss": 4.8558,
"step": 331
},
{
"epoch": 0.35689330825047033,
"grad_norm": 14.737062454223633,
"learning_rate": 0.0001455053924957812,
"loss": 3.5708,
"step": 332
},
{
"epoch": 0.35796828809459824,
"grad_norm": 12.72256088256836,
"learning_rate": 0.0001452013851577121,
"loss": 2.6484,
"step": 333
},
{
"epoch": 0.35904326793872615,
"grad_norm": 15.688788414001465,
"learning_rate": 0.00014489685188496488,
"loss": 3.2307,
"step": 334
},
{
"epoch": 0.36011824778285406,
"grad_norm": 12.274057388305664,
"learning_rate": 0.0001445917962208957,
"loss": 3.0441,
"step": 335
},
{
"epoch": 0.36119322762698197,
"grad_norm": 16.341550827026367,
"learning_rate": 0.000144286221714939,
"loss": 3.9343,
"step": 336
},
{
"epoch": 0.36226820747110994,
"grad_norm": 18.544235229492188,
"learning_rate": 0.00014398013192256615,
"loss": 3.5993,
"step": 337
},
{
"epoch": 0.36334318731523785,
"grad_norm": 15.311888694763184,
"learning_rate": 0.000143673530405244,
"loss": 3.3302,
"step": 338
},
{
"epoch": 0.36441816715936576,
"grad_norm": 13.064177513122559,
"learning_rate": 0.00014336642073039358,
"loss": 2.9194,
"step": 339
},
{
"epoch": 0.36549314700349367,
"grad_norm": 17.951086044311523,
"learning_rate": 0.00014305880647134847,
"loss": 3.0824,
"step": 340
},
{
"epoch": 0.36656812684762163,
"grad_norm": 23.24056625366211,
"learning_rate": 0.00014275069120731323,
"loss": 5.6002,
"step": 341
},
{
"epoch": 0.36764310669174954,
"grad_norm": 14.519164085388184,
"learning_rate": 0.0001424420785233219,
"loss": 3.6317,
"step": 342
},
{
"epoch": 0.36871808653587745,
"grad_norm": 13.191924095153809,
"learning_rate": 0.00014213297201019618,
"loss": 3.3136,
"step": 343
},
{
"epoch": 0.36979306638000536,
"grad_norm": 11.676475524902344,
"learning_rate": 0.0001418233752645035,
"loss": 3.3879,
"step": 344
},
{
"epoch": 0.3708680462241333,
"grad_norm": 12.395687103271484,
"learning_rate": 0.00014151329188851554,
"loss": 3.2117,
"step": 345
},
{
"epoch": 0.37194302606826124,
"grad_norm": 15.743541717529297,
"learning_rate": 0.0001412027254901659,
"loss": 3.259,
"step": 346
},
{
"epoch": 0.37301800591238915,
"grad_norm": 19.783029556274414,
"learning_rate": 0.0001408916796830085,
"loss": 4.3767,
"step": 347
},
{
"epoch": 0.37409298575651706,
"grad_norm": 17.02685546875,
"learning_rate": 0.0001405801580861752,
"loss": 3.9807,
"step": 348
},
{
"epoch": 0.37516796560064497,
"grad_norm": 14.147680282592773,
"learning_rate": 0.00014026816432433399,
"loss": 3.1309,
"step": 349
},
{
"epoch": 0.37624294544477294,
"grad_norm": 14.738043785095215,
"learning_rate": 0.00013995570202764656,
"loss": 2.9714,
"step": 350
},
{
"epoch": 0.37731792528890085,
"grad_norm": 17.08428382873535,
"learning_rate": 0.0001396427748317262,
"loss": 3.1942,
"step": 351
},
{
"epoch": 0.37839290513302876,
"grad_norm": 15.932269096374512,
"learning_rate": 0.00013932938637759555,
"loss": 3.6591,
"step": 352
},
{
"epoch": 0.37946788497715667,
"grad_norm": 19.795040130615234,
"learning_rate": 0.00013901554031164404,
"loss": 4.9021,
"step": 353
},
{
"epoch": 0.3805428648212846,
"grad_norm": 17.933841705322266,
"learning_rate": 0.0001387012402855857,
"loss": 4.1404,
"step": 354
},
{
"epoch": 0.38161784466541254,
"grad_norm": 24.006519317626953,
"learning_rate": 0.00013838648995641645,
"loss": 5.4279,
"step": 355
},
{
"epoch": 0.38269282450954045,
"grad_norm": 18.881019592285156,
"learning_rate": 0.0001380712929863717,
"loss": 4.1727,
"step": 356
},
{
"epoch": 0.38376780435366836,
"grad_norm": 18.83523178100586,
"learning_rate": 0.00013775565304288372,
"loss": 4.6629,
"step": 357
},
{
"epoch": 0.3848427841977963,
"grad_norm": 12.512502670288086,
"learning_rate": 0.00013743957379853884,
"loss": 2.8857,
"step": 358
},
{
"epoch": 0.38591776404192424,
"grad_norm": 18.216079711914062,
"learning_rate": 0.00013712305893103492,
"loss": 2.8652,
"step": 359
},
{
"epoch": 0.38699274388605215,
"grad_norm": 16.71242332458496,
"learning_rate": 0.00013680611212313841,
"loss": 3.6149,
"step": 360
},
{
"epoch": 0.38806772373018006,
"grad_norm": 21.3747501373291,
"learning_rate": 0.0001364887370626416,
"loss": 3.4502,
"step": 361
},
{
"epoch": 0.38914270357430797,
"grad_norm": 16.120399475097656,
"learning_rate": 0.0001361709374423195,
"loss": 2.8048,
"step": 362
},
{
"epoch": 0.3902176834184359,
"grad_norm": 17.512006759643555,
"learning_rate": 0.00013585271695988718,
"loss": 3.7296,
"step": 363
},
{
"epoch": 0.39129266326256384,
"grad_norm": 15.320964813232422,
"learning_rate": 0.00013553407931795662,
"loss": 3.4195,
"step": 364
},
{
"epoch": 0.39236764310669175,
"grad_norm": 15.187535285949707,
"learning_rate": 0.0001352150282239934,
"loss": 2.9683,
"step": 365
},
{
"epoch": 0.39344262295081966,
"grad_norm": 23.12388038635254,
"learning_rate": 0.000134895567390274,
"loss": 3.8934,
"step": 366
},
{
"epoch": 0.3945176027949476,
"grad_norm": 19.0565128326416,
"learning_rate": 0.00013457570053384226,
"loss": 3.7732,
"step": 367
},
{
"epoch": 0.39559258263907554,
"grad_norm": 13.927467346191406,
"learning_rate": 0.00013425543137646624,
"loss": 3.1398,
"step": 368
},
{
"epoch": 0.39666756248320345,
"grad_norm": 17.896343231201172,
"learning_rate": 0.00013393476364459493,
"loss": 3.4939,
"step": 369
},
{
"epoch": 0.39774254232733136,
"grad_norm": 14.155213356018066,
"learning_rate": 0.00013361370106931486,
"loss": 3.9665,
"step": 370
},
{
"epoch": 0.39881752217145927,
"grad_norm": 10.331148147583008,
"learning_rate": 0.00013329224738630678,
"loss": 2.5528,
"step": 371
},
{
"epoch": 0.3998925020155872,
"grad_norm": 16.241106033325195,
"learning_rate": 0.00013297040633580202,
"loss": 3.4909,
"step": 372
},
{
"epoch": 0.40096748185971515,
"grad_norm": 21.060741424560547,
"learning_rate": 0.00013264818166253917,
"loss": 4.3651,
"step": 373
},
{
"epoch": 0.40204246170384306,
"grad_norm": 19.373760223388672,
"learning_rate": 0.00013232557711572032,
"loss": 3.4165,
"step": 374
},
{
"epoch": 0.40311744154797097,
"grad_norm": 16.87083625793457,
"learning_rate": 0.00013200259644896762,
"loss": 3.7119,
"step": 375
},
{
"epoch": 0.4041924213920989,
"grad_norm": 15.271788597106934,
"learning_rate": 0.00013167924342027945,
"loss": 2.9353,
"step": 376
},
{
"epoch": 0.40526740123622684,
"grad_norm": 18.673574447631836,
"learning_rate": 0.00013135552179198678,
"loss": 3.4936,
"step": 377
},
{
"epoch": 0.40634238108035475,
"grad_norm": 14.075126647949219,
"learning_rate": 0.00013103143533070937,
"loss": 3.1212,
"step": 378
},
{
"epoch": 0.40741736092448266,
"grad_norm": 15.539584159851074,
"learning_rate": 0.00013070698780731193,
"loss": 3.0859,
"step": 379
},
{
"epoch": 0.4084923407686106,
"grad_norm": 14.371941566467285,
"learning_rate": 0.0001303821829968603,
"loss": 3.601,
"step": 380
},
{
"epoch": 0.4095673206127385,
"grad_norm": 18.58356285095215,
"learning_rate": 0.00013005702467857742,
"loss": 4.0211,
"step": 381
},
{
"epoch": 0.41064230045686645,
"grad_norm": 16.63950538635254,
"learning_rate": 0.00012973151663579947,
"loss": 3.3913,
"step": 382
},
{
"epoch": 0.41171728030099436,
"grad_norm": 14.509610176086426,
"learning_rate": 0.0001294056626559318,
"loss": 2.8095,
"step": 383
},
{
"epoch": 0.41279226014512227,
"grad_norm": 12.644404411315918,
"learning_rate": 0.0001290794665304049,
"loss": 2.9944,
"step": 384
},
{
"epoch": 0.4138672399892502,
"grad_norm": 11.41877555847168,
"learning_rate": 0.00012875293205463016,
"loss": 2.5601,
"step": 385
},
{
"epoch": 0.41494221983337815,
"grad_norm": 23.640005111694336,
"learning_rate": 0.00012842606302795585,
"loss": 4.2101,
"step": 386
},
{
"epoch": 0.41601719967750606,
"grad_norm": 14.225778579711914,
"learning_rate": 0.00012809886325362287,
"loss": 3.0655,
"step": 387
},
{
"epoch": 0.41709217952163397,
"grad_norm": 22.691972732543945,
"learning_rate": 0.0001277713365387205,
"loss": 4.6002,
"step": 388
},
{
"epoch": 0.4181671593657619,
"grad_norm": 18.885147094726562,
"learning_rate": 0.00012744348669414203,
"loss": 4.323,
"step": 389
},
{
"epoch": 0.41924213920988984,
"grad_norm": 17.81749153137207,
"learning_rate": 0.00012711531753454056,
"loss": 3.2404,
"step": 390
},
{
"epoch": 0.42031711905401775,
"grad_norm": 15.428961753845215,
"learning_rate": 0.0001267868328782845,
"loss": 2.9677,
"step": 391
},
{
"epoch": 0.42139209889814566,
"grad_norm": 12.817580223083496,
"learning_rate": 0.00012645803654741318,
"loss": 3.8399,
"step": 392
},
{
"epoch": 0.42246707874227357,
"grad_norm": 11.081942558288574,
"learning_rate": 0.00012612893236759238,
"loss": 2.4973,
"step": 393
},
{
"epoch": 0.4235420585864015,
"grad_norm": 17.789241790771484,
"learning_rate": 0.0001257995241680698,
"loss": 3.9219,
"step": 394
},
{
"epoch": 0.42461703843052945,
"grad_norm": 17.394941329956055,
"learning_rate": 0.00012546981578163058,
"loss": 3.189,
"step": 395
},
{
"epoch": 0.42569201827465736,
"grad_norm": 17.376474380493164,
"learning_rate": 0.00012513981104455256,
"loss": 3.4426,
"step": 396
},
{
"epoch": 0.42676699811878527,
"grad_norm": 16.225542068481445,
"learning_rate": 0.00012480951379656175,
"loss": 4.0302,
"step": 397
},
{
"epoch": 0.4278419779629132,
"grad_norm": 17.361696243286133,
"learning_rate": 0.00012447892788078772,
"loss": 3.3795,
"step": 398
},
{
"epoch": 0.42891695780704114,
"grad_norm": 14.174507141113281,
"learning_rate": 0.0001241480571437187,
"loss": 3.9256,
"step": 399
},
{
"epoch": 0.42999193765116905,
"grad_norm": 17.67365074157715,
"learning_rate": 0.00012381690543515693,
"loss": 4.5173,
"step": 400
},
{
"epoch": 0.43106691749529696,
"grad_norm": 14.265713691711426,
"learning_rate": 0.00012348547660817385,
"loss": 2.7096,
"step": 401
},
{
"epoch": 0.4321418973394249,
"grad_norm": 13.033537864685059,
"learning_rate": 0.00012315377451906537,
"loss": 2.5288,
"step": 402
},
{
"epoch": 0.4332168771835528,
"grad_norm": 9.724547386169434,
"learning_rate": 0.00012282180302730682,
"loss": 2.2101,
"step": 403
},
{
"epoch": 0.43429185702768075,
"grad_norm": 11.268912315368652,
"learning_rate": 0.00012248956599550804,
"loss": 1.9814,
"step": 404
},
{
"epoch": 0.43536683687180866,
"grad_norm": 11.205545425415039,
"learning_rate": 0.00012215706728936875,
"loss": 2.8051,
"step": 405
},
{
"epoch": 0.43644181671593657,
"grad_norm": 13.20801830291748,
"learning_rate": 0.00012182431077763317,
"loss": 3.1918,
"step": 406
},
{
"epoch": 0.4375167965600645,
"grad_norm": 19.06052589416504,
"learning_rate": 0.00012149130033204525,
"loss": 3.7518,
"step": 407
},
{
"epoch": 0.43859177640419245,
"grad_norm": 17.768795013427734,
"learning_rate": 0.00012115803982730352,
"loss": 3.7953,
"step": 408
},
{
"epoch": 0.43966675624832036,
"grad_norm": 20.065616607666016,
"learning_rate": 0.00012082453314101607,
"loss": 4.4466,
"step": 409
},
{
"epoch": 0.44074173609244827,
"grad_norm": 10.540583610534668,
"learning_rate": 0.00012049078415365543,
"loss": 2.42,
"step": 410
},
{
"epoch": 0.4418167159365762,
"grad_norm": 18.175241470336914,
"learning_rate": 0.00012015679674851328,
"loss": 4.3468,
"step": 411
},
{
"epoch": 0.4428916957807041,
"grad_norm": 14.195399284362793,
"learning_rate": 0.00011982257481165546,
"loss": 3.681,
"step": 412
},
{
"epoch": 0.44396667562483205,
"grad_norm": 16.479141235351562,
"learning_rate": 0.00011948812223187675,
"loss": 3.6083,
"step": 413
},
{
"epoch": 0.44504165546895996,
"grad_norm": 16.942975997924805,
"learning_rate": 0.0001191534429006554,
"loss": 3.8114,
"step": 414
},
{
"epoch": 0.4461166353130879,
"grad_norm": 12.2439546585083,
"learning_rate": 0.00011881854071210805,
"loss": 2.9306,
"step": 415
},
{
"epoch": 0.4471916151572158,
"grad_norm": 16.4477481842041,
"learning_rate": 0.00011848341956294437,
"loss": 3.2065,
"step": 416
},
{
"epoch": 0.44826659500134375,
"grad_norm": 15.710596084594727,
"learning_rate": 0.00011814808335242173,
"loss": 2.4522,
"step": 417
},
{
"epoch": 0.44934157484547166,
"grad_norm": 12.070144653320312,
"learning_rate": 0.00011781253598229982,
"loss": 2.3715,
"step": 418
},
{
"epoch": 0.45041655468959957,
"grad_norm": 12.033697128295898,
"learning_rate": 0.00011747678135679521,
"loss": 3.0672,
"step": 419
},
{
"epoch": 0.4514915345337275,
"grad_norm": 16.81391716003418,
"learning_rate": 0.00011714082338253603,
"loss": 2.8255,
"step": 420
},
{
"epoch": 0.4525665143778554,
"grad_norm": 15.085043907165527,
"learning_rate": 0.00011680466596851635,
"loss": 3.1703,
"step": 421
},
{
"epoch": 0.45364149422198335,
"grad_norm": 19.121475219726562,
"learning_rate": 0.0001164683130260509,
"loss": 4.1685,
"step": 422
},
{
"epoch": 0.45471647406611126,
"grad_norm": 18.101137161254883,
"learning_rate": 0.00011613176846872937,
"loss": 3.6327,
"step": 423
},
{
"epoch": 0.4557914539102392,
"grad_norm": 12.256120681762695,
"learning_rate": 0.00011579503621237102,
"loss": 3.4243,
"step": 424
},
{
"epoch": 0.4568664337543671,
"grad_norm": 10.51490306854248,
"learning_rate": 0.00011545812017497901,
"loss": 3.4243,
"step": 425
},
{
"epoch": 0.45794141359849505,
"grad_norm": 22.243715286254883,
"learning_rate": 0.00011512102427669488,
"loss": 4.0067,
"step": 426
},
{
"epoch": 0.45901639344262296,
"grad_norm": 10.290740966796875,
"learning_rate": 0.00011478375243975296,
"loss": 2.5808,
"step": 427
},
{
"epoch": 0.46009137328675087,
"grad_norm": 13.278031349182129,
"learning_rate": 0.00011444630858843461,
"loss": 2.8027,
"step": 428
},
{
"epoch": 0.4611663531308788,
"grad_norm": 14.611438751220703,
"learning_rate": 0.0001141086966490227,
"loss": 2.9664,
"step": 429
},
{
"epoch": 0.4622413329750067,
"grad_norm": 16.739944458007812,
"learning_rate": 0.00011377092054975584,
"loss": 2.5552,
"step": 430
},
{
"epoch": 0.46331631281913466,
"grad_norm": 16.316120147705078,
"learning_rate": 0.0001134329842207827,
"loss": 3.7572,
"step": 431
},
{
"epoch": 0.46439129266326257,
"grad_norm": 18.96500587463379,
"learning_rate": 0.0001130948915941163,
"loss": 4.1539,
"step": 432
},
{
"epoch": 0.4654662725073905,
"grad_norm": 11.103170394897461,
"learning_rate": 0.00011275664660358818,
"loss": 2.3669,
"step": 433
},
{
"epoch": 0.4665412523515184,
"grad_norm": 16.899873733520508,
"learning_rate": 0.00011241825318480281,
"loss": 3.4796,
"step": 434
},
{
"epoch": 0.46761623219564635,
"grad_norm": 17.01308822631836,
"learning_rate": 0.00011207971527509158,
"loss": 3.6144,
"step": 435
},
{
"epoch": 0.46869121203977426,
"grad_norm": 13.122427940368652,
"learning_rate": 0.00011174103681346711,
"loss": 3.1118,
"step": 436
},
{
"epoch": 0.4697661918839022,
"grad_norm": 14.372381210327148,
"learning_rate": 0.00011140222174057734,
"loss": 3.7498,
"step": 437
},
{
"epoch": 0.4708411717280301,
"grad_norm": 18.45073699951172,
"learning_rate": 0.00011106327399865988,
"loss": 3.9332,
"step": 438
},
{
"epoch": 0.47191615157215805,
"grad_norm": 15.843331336975098,
"learning_rate": 0.00011072419753149586,
"loss": 4.2044,
"step": 439
},
{
"epoch": 0.47299113141628596,
"grad_norm": 14.005358695983887,
"learning_rate": 0.00011038499628436416,
"loss": 3.1071,
"step": 440
},
{
"epoch": 0.47406611126041387,
"grad_norm": 15.334671974182129,
"learning_rate": 0.00011004567420399563,
"loss": 3.2788,
"step": 441
},
{
"epoch": 0.4751410911045418,
"grad_norm": 12.769185066223145,
"learning_rate": 0.00010970623523852699,
"loss": 2.3523,
"step": 442
},
{
"epoch": 0.4762160709486697,
"grad_norm": 17.81171417236328,
"learning_rate": 0.00010936668333745499,
"loss": 2.9305,
"step": 443
},
{
"epoch": 0.47729105079279766,
"grad_norm": 12.29996395111084,
"learning_rate": 0.0001090270224515904,
"loss": 2.219,
"step": 444
},
{
"epoch": 0.47836603063692557,
"grad_norm": 14.522968292236328,
"learning_rate": 0.00010868725653301206,
"loss": 3.0004,
"step": 445
},
{
"epoch": 0.4794410104810535,
"grad_norm": 13.94498062133789,
"learning_rate": 0.00010834738953502095,
"loss": 3.0531,
"step": 446
},
{
"epoch": 0.4805159903251814,
"grad_norm": 15.410560607910156,
"learning_rate": 0.0001080074254120941,
"loss": 2.3937,
"step": 447
},
{
"epoch": 0.48159097016930935,
"grad_norm": 10.539775848388672,
"learning_rate": 0.00010766736811983865,
"loss": 2.817,
"step": 448
},
{
"epoch": 0.48266595001343726,
"grad_norm": 16.945173263549805,
"learning_rate": 0.00010732722161494579,
"loss": 2.8102,
"step": 449
},
{
"epoch": 0.48374092985756517,
"grad_norm": 15.775056838989258,
"learning_rate": 0.00010698698985514475,
"loss": 3.7962,
"step": 450
},
{
"epoch": 0.4848159097016931,
"grad_norm": 16.637723922729492,
"learning_rate": 0.0001066466767991567,
"loss": 3.2478,
"step": 451
},
{
"epoch": 0.485890889545821,
"grad_norm": 25.035099029541016,
"learning_rate": 0.00010630628640664874,
"loss": 5.1592,
"step": 452
},
{
"epoch": 0.48696586938994896,
"grad_norm": 21.126035690307617,
"learning_rate": 0.00010596582263818781,
"loss": 3.2066,
"step": 453
},
{
"epoch": 0.48804084923407687,
"grad_norm": 16.712631225585938,
"learning_rate": 0.00010562528945519463,
"loss": 3.5551,
"step": 454
},
{
"epoch": 0.4891158290782048,
"grad_norm": 24.561006546020508,
"learning_rate": 0.00010528469081989749,
"loss": 4.7308,
"step": 455
},
{
"epoch": 0.4901908089223327,
"grad_norm": 23.1239013671875,
"learning_rate": 0.00010494403069528634,
"loss": 3.2912,
"step": 456
},
{
"epoch": 0.49126578876646065,
"grad_norm": 14.793785095214844,
"learning_rate": 0.00010460331304506657,
"loss": 2.3428,
"step": 457
},
{
"epoch": 0.49234076861058856,
"grad_norm": 14.798579216003418,
"learning_rate": 0.00010426254183361286,
"loss": 3.0977,
"step": 458
},
{
"epoch": 0.4934157484547165,
"grad_norm": 16.173603057861328,
"learning_rate": 0.00010392172102592313,
"loss": 3.4115,
"step": 459
},
{
"epoch": 0.4944907282988444,
"grad_norm": 24.551992416381836,
"learning_rate": 0.00010358085458757232,
"loss": 3.7147,
"step": 460
},
{
"epoch": 0.4955657081429723,
"grad_norm": 18.784887313842773,
"learning_rate": 0.00010323994648466638,
"loss": 4.1496,
"step": 461
},
{
"epoch": 0.49664068798710026,
"grad_norm": 14.511931419372559,
"learning_rate": 0.00010289900068379595,
"loss": 3.409,
"step": 462
},
{
"epoch": 0.49771566783122817,
"grad_norm": 17.688566207885742,
"learning_rate": 0.00010255802115199033,
"loss": 1.9911,
"step": 463
},
{
"epoch": 0.4987906476753561,
"grad_norm": 18.56838607788086,
"learning_rate": 0.00010221701185667141,
"loss": 3.1941,
"step": 464
},
{
"epoch": 0.499865627519484,
"grad_norm": 13.474034309387207,
"learning_rate": 0.00010187597676560718,
"loss": 3.0192,
"step": 465
},
{
"epoch": 0.5009406073636119,
"grad_norm": 12.874210357666016,
"learning_rate": 0.00010153491984686593,
"loss": 1.5535,
"step": 466
},
{
"epoch": 0.5009406073636119,
"eval_loss": 0.7541670799255371,
"eval_runtime": 5.5547,
"eval_samples_per_second": 70.571,
"eval_steps_per_second": 35.286,
"step": 466
},
{
"epoch": 0.5020155872077399,
"grad_norm": 14.241241455078125,
"learning_rate": 0.0001011938450687699,
"loss": 2.6351,
"step": 467
},
{
"epoch": 0.5030905670518678,
"grad_norm": 15.222787857055664,
"learning_rate": 0.00010085275639984904,
"loss": 3.6475,
"step": 468
},
{
"epoch": 0.5041655468959957,
"grad_norm": 12.280945777893066,
"learning_rate": 0.00010051165780879504,
"loss": 2.5128,
"step": 469
},
{
"epoch": 0.5052405267401237,
"grad_norm": 14.948744773864746,
"learning_rate": 0.00010017055326441494,
"loss": 2.9114,
"step": 470
},
{
"epoch": 0.5063155065842515,
"grad_norm": 17.295438766479492,
"learning_rate": 9.982944673558508e-05,
"loss": 3.4193,
"step": 471
},
{
"epoch": 0.5073904864283795,
"grad_norm": 18.69068145751953,
"learning_rate": 9.9488342191205e-05,
"loss": 4.0637,
"step": 472
},
{
"epoch": 0.5084654662725074,
"grad_norm": 17.060588836669922,
"learning_rate": 9.914724360015099e-05,
"loss": 3.1746,
"step": 473
},
{
"epoch": 0.5095404461166353,
"grad_norm": 13.83548641204834,
"learning_rate": 9.880615493123012e-05,
"loss": 2.757,
"step": 474
},
{
"epoch": 0.5106154259607633,
"grad_norm": 12.663851737976074,
"learning_rate": 9.846508015313408e-05,
"loss": 2.7217,
"step": 475
},
{
"epoch": 0.5116904058048911,
"grad_norm": 14.959012985229492,
"learning_rate": 9.812402323439284e-05,
"loss": 3.3345,
"step": 476
},
{
"epoch": 0.5127653856490191,
"grad_norm": 14.708747863769531,
"learning_rate": 9.778298814332863e-05,
"loss": 2.8671,
"step": 477
},
{
"epoch": 0.513840365493147,
"grad_norm": 10.635260581970215,
"learning_rate": 9.744197884800969e-05,
"loss": 2.6943,
"step": 478
},
{
"epoch": 0.5149153453372749,
"grad_norm": 14.44356632232666,
"learning_rate": 9.710099931620408e-05,
"loss": 2.1394,
"step": 479
},
{
"epoch": 0.5159903251814029,
"grad_norm": 9.918120384216309,
"learning_rate": 9.676005351533366e-05,
"loss": 2.2011,
"step": 480
},
{
"epoch": 0.5170653050255307,
"grad_norm": 17.1123046875,
"learning_rate": 9.64191454124277e-05,
"loss": 3.2862,
"step": 481
},
{
"epoch": 0.5181402848696587,
"grad_norm": 13.601349830627441,
"learning_rate": 9.60782789740769e-05,
"loss": 2.3652,
"step": 482
},
{
"epoch": 0.5192152647137867,
"grad_norm": 16.888429641723633,
"learning_rate": 9.573745816638716e-05,
"loss": 2.7698,
"step": 483
},
{
"epoch": 0.5202902445579145,
"grad_norm": 16.15688705444336,
"learning_rate": 9.539668695493344e-05,
"loss": 2.6367,
"step": 484
},
{
"epoch": 0.5213652244020425,
"grad_norm": 16.414520263671875,
"learning_rate": 9.505596930471367e-05,
"loss": 3.9243,
"step": 485
},
{
"epoch": 0.5224402042461704,
"grad_norm": 13.967657089233398,
"learning_rate": 9.471530918010253e-05,
"loss": 3.1243,
"step": 486
},
{
"epoch": 0.5235151840902983,
"grad_norm": 15.140684127807617,
"learning_rate": 9.43747105448054e-05,
"loss": 2.659,
"step": 487
},
{
"epoch": 0.5245901639344263,
"grad_norm": 13.074856758117676,
"learning_rate": 9.40341773618122e-05,
"loss": 3.3224,
"step": 488
},
{
"epoch": 0.5256651437785541,
"grad_norm": 15.123608589172363,
"learning_rate": 9.369371359335128e-05,
"loss": 3.385,
"step": 489
},
{
"epoch": 0.5267401236226821,
"grad_norm": 15.648529052734375,
"learning_rate": 9.335332320084331e-05,
"loss": 2.8329,
"step": 490
},
{
"epoch": 0.52781510346681,
"grad_norm": 15.20040225982666,
"learning_rate": 9.301301014485528e-05,
"loss": 3.5456,
"step": 491
},
{
"epoch": 0.5288900833109379,
"grad_norm": 23.638113021850586,
"learning_rate": 9.267277838505423e-05,
"loss": 4.8434,
"step": 492
},
{
"epoch": 0.5299650631550659,
"grad_norm": 11.857388496398926,
"learning_rate": 9.233263188016138e-05,
"loss": 2.2761,
"step": 493
},
{
"epoch": 0.5310400429991937,
"grad_norm": 12.123178482055664,
"learning_rate": 9.199257458790591e-05,
"loss": 3.0025,
"step": 494
},
{
"epoch": 0.5321150228433217,
"grad_norm": 11.024534225463867,
"learning_rate": 9.165261046497907e-05,
"loss": 2.265,
"step": 495
},
{
"epoch": 0.5331900026874496,
"grad_norm": 16.32103157043457,
"learning_rate": 9.131274346698796e-05,
"loss": 3.5393,
"step": 496
},
{
"epoch": 0.5342649825315775,
"grad_norm": 25.771560668945312,
"learning_rate": 9.097297754840962e-05,
"loss": 3.8375,
"step": 497
},
{
"epoch": 0.5353399623757055,
"grad_norm": 12.820847511291504,
"learning_rate": 9.063331666254503e-05,
"loss": 2.6361,
"step": 498
},
{
"epoch": 0.5364149422198333,
"grad_norm": 12.816265106201172,
"learning_rate": 9.029376476147302e-05,
"loss": 2.2486,
"step": 499
},
{
"epoch": 0.5374899220639613,
"grad_norm": 9.368247032165527,
"learning_rate": 8.995432579600439e-05,
"loss": 2.5467,
"step": 500
},
{
"epoch": 0.5385649019080893,
"grad_norm": 16.314271926879883,
"learning_rate": 8.961500371563585e-05,
"loss": 3.1917,
"step": 501
},
{
"epoch": 0.5396398817522171,
"grad_norm": 17.877838134765625,
"learning_rate": 8.927580246850418e-05,
"loss": 3.636,
"step": 502
},
{
"epoch": 0.5407148615963451,
"grad_norm": 15.371697425842285,
"learning_rate": 8.893672600134013e-05,
"loss": 4.3843,
"step": 503
},
{
"epoch": 0.541789841440473,
"grad_norm": 14.944757461547852,
"learning_rate": 8.859777825942267e-05,
"loss": 2.2189,
"step": 504
},
{
"epoch": 0.5428648212846009,
"grad_norm": 17.238252639770508,
"learning_rate": 8.825896318653293e-05,
"loss": 2.525,
"step": 505
},
{
"epoch": 0.5439398011287289,
"grad_norm": 14.758814811706543,
"learning_rate": 8.792028472490844e-05,
"loss": 3.1758,
"step": 506
},
{
"epoch": 0.5450147809728567,
"grad_norm": 9.887633323669434,
"learning_rate": 8.758174681519721e-05,
"loss": 2.2908,
"step": 507
},
{
"epoch": 0.5460897608169847,
"grad_norm": 13.68622875213623,
"learning_rate": 8.724335339641184e-05,
"loss": 2.105,
"step": 508
},
{
"epoch": 0.5471647406611126,
"grad_norm": 12.679695129394531,
"learning_rate": 8.690510840588373e-05,
"loss": 2.1756,
"step": 509
},
{
"epoch": 0.5482397205052405,
"grad_norm": 14.024535179138184,
"learning_rate": 8.656701577921732e-05,
"loss": 3.0431,
"step": 510
},
{
"epoch": 0.5493147003493685,
"grad_norm": 12.965935707092285,
"learning_rate": 8.622907945024417e-05,
"loss": 2.1099,
"step": 511
},
{
"epoch": 0.5503896801934963,
"grad_norm": 19.419710159301758,
"learning_rate": 8.589130335097732e-05,
"loss": 3.3639,
"step": 512
},
{
"epoch": 0.5514646600376243,
"grad_norm": 18.27731704711914,
"learning_rate": 8.55536914115654e-05,
"loss": 3.5416,
"step": 513
},
{
"epoch": 0.5525396398817523,
"grad_norm": 16.81820297241211,
"learning_rate": 8.521624756024705e-05,
"loss": 3.8419,
"step": 514
},
{
"epoch": 0.5536146197258801,
"grad_norm": 14.465489387512207,
"learning_rate": 8.487897572330513e-05,
"loss": 2.3487,
"step": 515
},
{
"epoch": 0.5546895995700081,
"grad_norm": 11.499032974243164,
"learning_rate": 8.454187982502101e-05,
"loss": 2.6283,
"step": 516
},
{
"epoch": 0.555764579414136,
"grad_norm": 21.029111862182617,
"learning_rate": 8.4204963787629e-05,
"loss": 4.7078,
"step": 517
},
{
"epoch": 0.5568395592582639,
"grad_norm": 18.131210327148438,
"learning_rate": 8.386823153127064e-05,
"loss": 3.6223,
"step": 518
},
{
"epoch": 0.5579145391023919,
"grad_norm": 15.802128791809082,
"learning_rate": 8.353168697394913e-05,
"loss": 2.6126,
"step": 519
},
{
"epoch": 0.5589895189465197,
"grad_norm": 16.3378849029541,
"learning_rate": 8.319533403148367e-05,
"loss": 2.8075,
"step": 520
},
{
"epoch": 0.5600644987906477,
"grad_norm": 13.895936012268066,
"learning_rate": 8.285917661746401e-05,
"loss": 2.7503,
"step": 521
},
{
"epoch": 0.5611394786347756,
"grad_norm": 13.693531036376953,
"learning_rate": 8.25232186432048e-05,
"loss": 2.9142,
"step": 522
},
{
"epoch": 0.5622144584789035,
"grad_norm": 14.982376098632812,
"learning_rate": 8.218746401770022e-05,
"loss": 3.0101,
"step": 523
},
{
"epoch": 0.5632894383230315,
"grad_norm": 18.6697998046875,
"learning_rate": 8.185191664757828e-05,
"loss": 3.6426,
"step": 524
},
{
"epoch": 0.5643644181671593,
"grad_norm": 12.057175636291504,
"learning_rate": 8.151658043705565e-05,
"loss": 2.9482,
"step": 525
},
{
"epoch": 0.5654393980112873,
"grad_norm": 14.079970359802246,
"learning_rate": 8.118145928789199e-05,
"loss": 3.1769,
"step": 526
},
{
"epoch": 0.5665143778554153,
"grad_norm": 12.814187049865723,
"learning_rate": 8.084655709934462e-05,
"loss": 2.472,
"step": 527
},
{
"epoch": 0.5675893576995431,
"grad_norm": 10.802642822265625,
"learning_rate": 8.051187776812326e-05,
"loss": 2.0466,
"step": 528
},
{
"epoch": 0.5686643375436711,
"grad_norm": 12.31850528717041,
"learning_rate": 8.017742518834454e-05,
"loss": 2.4457,
"step": 529
},
{
"epoch": 0.5697393173877989,
"grad_norm": 13.78878116607666,
"learning_rate": 7.984320325148675e-05,
"loss": 2.6326,
"step": 530
},
{
"epoch": 0.5708142972319269,
"grad_norm": 9.785225868225098,
"learning_rate": 7.950921584634461e-05,
"loss": 2.8243,
"step": 531
},
{
"epoch": 0.5718892770760549,
"grad_norm": 16.127605438232422,
"learning_rate": 7.917546685898391e-05,
"loss": 3.5011,
"step": 532
},
{
"epoch": 0.5729642569201827,
"grad_norm": 20.46214485168457,
"learning_rate": 7.884196017269648e-05,
"loss": 2.5311,
"step": 533
},
{
"epoch": 0.5740392367643107,
"grad_norm": 13.586955070495605,
"learning_rate": 7.850869966795476e-05,
"loss": 2.8393,
"step": 534
},
{
"epoch": 0.5751142166084386,
"grad_norm": 16.564584732055664,
"learning_rate": 7.817568922236682e-05,
"loss": 2.3696,
"step": 535
},
{
"epoch": 0.5761891964525665,
"grad_norm": 21.446279525756836,
"learning_rate": 7.784293271063124e-05,
"loss": 4.4285,
"step": 536
},
{
"epoch": 0.5772641762966945,
"grad_norm": 18.923242568969727,
"learning_rate": 7.751043400449197e-05,
"loss": 3.2939,
"step": 537
},
{
"epoch": 0.5783391561408223,
"grad_norm": 16.000579833984375,
"learning_rate": 7.717819697269321e-05,
"loss": 3.8915,
"step": 538
},
{
"epoch": 0.5794141359849503,
"grad_norm": 11.695368766784668,
"learning_rate": 7.684622548093461e-05,
"loss": 2.5856,
"step": 539
},
{
"epoch": 0.5804891158290783,
"grad_norm": 15.072840690612793,
"learning_rate": 7.651452339182613e-05,
"loss": 2.8462,
"step": 540
},
{
"epoch": 0.5815640956732061,
"grad_norm": 18.407136917114258,
"learning_rate": 7.618309456484308e-05,
"loss": 2.5811,
"step": 541
},
{
"epoch": 0.5826390755173341,
"grad_norm": 17.274293899536133,
"learning_rate": 7.58519428562813e-05,
"loss": 3.2455,
"step": 542
},
{
"epoch": 0.5837140553614619,
"grad_norm": 15.445805549621582,
"learning_rate": 7.552107211921229e-05,
"loss": 3.3812,
"step": 543
},
{
"epoch": 0.5847890352055899,
"grad_norm": 16.473222732543945,
"learning_rate": 7.519048620343825e-05,
"loss": 2.9544,
"step": 544
},
{
"epoch": 0.5858640150497179,
"grad_norm": 16.593351364135742,
"learning_rate": 7.486018895544748e-05,
"loss": 3.8982,
"step": 545
},
{
"epoch": 0.5869389948938457,
"grad_norm": 18.829208374023438,
"learning_rate": 7.453018421836946e-05,
"loss": 2.933,
"step": 546
},
{
"epoch": 0.5880139747379737,
"grad_norm": 21.860137939453125,
"learning_rate": 7.420047583193019e-05,
"loss": 3.5987,
"step": 547
},
{
"epoch": 0.5890889545821015,
"grad_norm": 15.280534744262695,
"learning_rate": 7.387106763240763e-05,
"loss": 2.8143,
"step": 548
},
{
"epoch": 0.5901639344262295,
"grad_norm": 15.49770736694336,
"learning_rate": 7.354196345258683e-05,
"loss": 2.4214,
"step": 549
},
{
"epoch": 0.5912389142703575,
"grad_norm": 15.602472305297852,
"learning_rate": 7.32131671217155e-05,
"loss": 3.4367,
"step": 550
},
{
"epoch": 0.5923138941144853,
"grad_norm": 14.388925552368164,
"learning_rate": 7.288468246545946e-05,
"loss": 3.4037,
"step": 551
},
{
"epoch": 0.5933888739586133,
"grad_norm": 19.788185119628906,
"learning_rate": 7.255651330585797e-05,
"loss": 3.3473,
"step": 552
},
{
"epoch": 0.5944638538027412,
"grad_norm": 13.820460319519043,
"learning_rate": 7.222866346127953e-05,
"loss": 2.5223,
"step": 553
},
{
"epoch": 0.5955388336468691,
"grad_norm": 16.7007999420166,
"learning_rate": 7.190113674637714e-05,
"loss": 2.8172,
"step": 554
},
{
"epoch": 0.5966138134909971,
"grad_norm": 16.35044288635254,
"learning_rate": 7.157393697204416e-05,
"loss": 2.9871,
"step": 555
},
{
"epoch": 0.5976887933351249,
"grad_norm": 13.284764289855957,
"learning_rate": 7.124706794536983e-05,
"loss": 3.6496,
"step": 556
},
{
"epoch": 0.5987637731792529,
"grad_norm": 12.027867317199707,
"learning_rate": 7.09205334695951e-05,
"loss": 2.2565,
"step": 557
},
{
"epoch": 0.5998387530233809,
"grad_norm": 17.299827575683594,
"learning_rate": 7.059433734406818e-05,
"loss": 2.7168,
"step": 558
},
{
"epoch": 0.6009137328675087,
"grad_norm": 17.865158081054688,
"learning_rate": 7.026848336420054e-05,
"loss": 3.6538,
"step": 559
},
{
"epoch": 0.6019887127116367,
"grad_norm": 12.184990882873535,
"learning_rate": 6.99429753214226e-05,
"loss": 1.9531,
"step": 560
},
{
"epoch": 0.6030636925557645,
"grad_norm": 15.416332244873047,
"learning_rate": 6.961781700313972e-05,
"loss": 2.8138,
"step": 561
},
{
"epoch": 0.6041386723998925,
"grad_norm": 16.23126220703125,
"learning_rate": 6.929301219268805e-05,
"loss": 2.6759,
"step": 562
},
{
"epoch": 0.6052136522440205,
"grad_norm": 21.727291107177734,
"learning_rate": 6.896856466929062e-05,
"loss": 3.2578,
"step": 563
},
{
"epoch": 0.6062886320881483,
"grad_norm": 9.553668975830078,
"learning_rate": 6.86444782080132e-05,
"loss": 2.1244,
"step": 564
},
{
"epoch": 0.6073636119322763,
"grad_norm": 21.076509475708008,
"learning_rate": 6.832075657972054e-05,
"loss": 3.2957,
"step": 565
},
{
"epoch": 0.6084385917764042,
"grad_norm": 11.683152198791504,
"learning_rate": 6.799740355103239e-05,
"loss": 2.4247,
"step": 566
},
{
"epoch": 0.6095135716205321,
"grad_norm": 13.204034805297852,
"learning_rate": 6.76744228842797e-05,
"loss": 2.9528,
"step": 567
},
{
"epoch": 0.6105885514646601,
"grad_norm": 14.521734237670898,
"learning_rate": 6.735181833746086e-05,
"loss": 2.5978,
"step": 568
},
{
"epoch": 0.6116635313087879,
"grad_norm": 17.30651092529297,
"learning_rate": 6.702959366419801e-05,
"loss": 3.5166,
"step": 569
},
{
"epoch": 0.6127385111529159,
"grad_norm": 15.7838716506958,
"learning_rate": 6.670775261369325e-05,
"loss": 2.6126,
"step": 570
},
{
"epoch": 0.6138134909970439,
"grad_norm": 13.338095664978027,
"learning_rate": 6.638629893068515e-05,
"loss": 3.1597,
"step": 571
},
{
"epoch": 0.6148884708411717,
"grad_norm": 16.482463836669922,
"learning_rate": 6.60652363554051e-05,
"loss": 2.4673,
"step": 572
},
{
"epoch": 0.6159634506852997,
"grad_norm": 13.602096557617188,
"learning_rate": 6.574456862353377e-05,
"loss": 2.7441,
"step": 573
},
{
"epoch": 0.6170384305294275,
"grad_norm": 15.007840156555176,
"learning_rate": 6.542429946615774e-05,
"loss": 2.7128,
"step": 574
},
{
"epoch": 0.6181134103735555,
"grad_norm": 11.352252006530762,
"learning_rate": 6.510443260972599e-05,
"loss": 3.2629,
"step": 575
},
{
"epoch": 0.6191883902176835,
"grad_norm": 14.53456974029541,
"learning_rate": 6.47849717760066e-05,
"loss": 1.8599,
"step": 576
},
{
"epoch": 0.6202633700618113,
"grad_norm": 12.216171264648438,
"learning_rate": 6.446592068204341e-05,
"loss": 2.2911,
"step": 577
},
{
"epoch": 0.6213383499059393,
"grad_norm": 15.579245567321777,
"learning_rate": 6.41472830401128e-05,
"loss": 2.4855,
"step": 578
},
{
"epoch": 0.6224133297500671,
"grad_norm": 19.280820846557617,
"learning_rate": 6.382906255768051e-05,
"loss": 4.8336,
"step": 579
},
{
"epoch": 0.6234883095941951,
"grad_norm": 19.063934326171875,
"learning_rate": 6.351126293735843e-05,
"loss": 2.7687,
"step": 580
},
{
"epoch": 0.6245632894383231,
"grad_norm": 13.213850975036621,
"learning_rate": 6.319388787686158e-05,
"loss": 3.1479,
"step": 581
},
{
"epoch": 0.6256382692824509,
"grad_norm": 19.5723934173584,
"learning_rate": 6.287694106896509e-05,
"loss": 4.7255,
"step": 582
},
{
"epoch": 0.6267132491265789,
"grad_norm": 12.059754371643066,
"learning_rate": 6.256042620146119e-05,
"loss": 2.5616,
"step": 583
},
{
"epoch": 0.6277882289707069,
"grad_norm": 10.33979320526123,
"learning_rate": 6.224434695711631e-05,
"loss": 1.6791,
"step": 584
},
{
"epoch": 0.6288632088148347,
"grad_norm": 12.472517967224121,
"learning_rate": 6.19287070136283e-05,
"loss": 2.8273,
"step": 585
},
{
"epoch": 0.6299381886589627,
"grad_norm": 13.145999908447266,
"learning_rate": 6.16135100435836e-05,
"loss": 2.4934,
"step": 586
},
{
"epoch": 0.6310131685030905,
"grad_norm": 12.09467887878418,
"learning_rate": 6.129875971441434e-05,
"loss": 2.5874,
"step": 587
},
{
"epoch": 0.6320881483472185,
"grad_norm": 12.754231452941895,
"learning_rate": 6.0984459688356e-05,
"loss": 3.2328,
"step": 588
},
{
"epoch": 0.6331631281913465,
"grad_norm": 20.645830154418945,
"learning_rate": 6.0670613622404496e-05,
"loss": 3.202,
"step": 589
},
{
"epoch": 0.6342381080354743,
"grad_norm": 13.244977951049805,
"learning_rate": 6.035722516827382e-05,
"loss": 2.1665,
"step": 590
},
{
"epoch": 0.6353130878796023,
"grad_norm": 17.669931411743164,
"learning_rate": 6.004429797235349e-05,
"loss": 2.6613,
"step": 591
},
{
"epoch": 0.6363880677237301,
"grad_norm": 10.736635208129883,
"learning_rate": 5.973183567566605e-05,
"loss": 2.4063,
"step": 592
},
{
"epoch": 0.6374630475678581,
"grad_norm": 18.680782318115234,
"learning_rate": 5.9419841913824824e-05,
"loss": 2.6796,
"step": 593
},
{
"epoch": 0.6385380274119861,
"grad_norm": 19.364147186279297,
"learning_rate": 5.9108320316991536e-05,
"loss": 3.8844,
"step": 594
},
{
"epoch": 0.6396130072561139,
"grad_norm": 13.238618850708008,
"learning_rate": 5.879727450983412e-05,
"loss": 3.3821,
"step": 595
},
{
"epoch": 0.6406879871002419,
"grad_norm": 21.127750396728516,
"learning_rate": 5.848670811148451e-05,
"loss": 3.8302,
"step": 596
},
{
"epoch": 0.6417629669443697,
"grad_norm": 19.388427734375,
"learning_rate": 5.817662473549651e-05,
"loss": 2.4551,
"step": 597
},
{
"epoch": 0.6428379467884977,
"grad_norm": 13.296785354614258,
"learning_rate": 5.786702798980388e-05,
"loss": 2.7313,
"step": 598
},
{
"epoch": 0.6439129266326257,
"grad_norm": 13.282078742980957,
"learning_rate": 5.755792147667811e-05,
"loss": 2.6865,
"step": 599
},
{
"epoch": 0.6449879064767535,
"grad_norm": 16.131546020507812,
"learning_rate": 5.7249308792686815e-05,
"loss": 2.9787,
"step": 600
},
{
"epoch": 0.6460628863208815,
"grad_norm": 13.292814254760742,
"learning_rate": 5.6941193528651596e-05,
"loss": 2.7872,
"step": 601
},
{
"epoch": 0.6471378661650095,
"grad_norm": 13.973624229431152,
"learning_rate": 5.663357926960644e-05,
"loss": 2.6566,
"step": 602
},
{
"epoch": 0.6482128460091373,
"grad_norm": 14.040596961975098,
"learning_rate": 5.6326469594756034e-05,
"loss": 3.0928,
"step": 603
},
{
"epoch": 0.6492878258532653,
"grad_norm": 17.149208068847656,
"learning_rate": 5.6019868077433876e-05,
"loss": 3.2098,
"step": 604
},
{
"epoch": 0.6503628056973931,
"grad_norm": 14.205854415893555,
"learning_rate": 5.5713778285061046e-05,
"loss": 2.1934,
"step": 605
},
{
"epoch": 0.6514377855415211,
"grad_norm": 20.856319427490234,
"learning_rate": 5.540820377910435e-05,
"loss": 4.2625,
"step": 606
},
{
"epoch": 0.6525127653856491,
"grad_norm": 19.353553771972656,
"learning_rate": 5.5103148115035195e-05,
"loss": 3.3242,
"step": 607
},
{
"epoch": 0.6535877452297769,
"grad_norm": 13.190366744995117,
"learning_rate": 5.479861484228794e-05,
"loss": 2.3837,
"step": 608
},
{
"epoch": 0.6546627250739049,
"grad_norm": 16.302879333496094,
"learning_rate": 5.449460750421883e-05,
"loss": 3.2505,
"step": 609
},
{
"epoch": 0.6557377049180327,
"grad_norm": 13.280342102050781,
"learning_rate": 5.419112963806468e-05,
"loss": 1.8674,
"step": 610
},
{
"epoch": 0.6568126847621607,
"grad_norm": 21.006906509399414,
"learning_rate": 5.388818477490154e-05,
"loss": 3.6557,
"step": 611
},
{
"epoch": 0.6578876646062887,
"grad_norm": 19.307729721069336,
"learning_rate": 5.358577643960403e-05,
"loss": 2.2382,
"step": 612
},
{
"epoch": 0.6589626444504165,
"grad_norm": 11.849048614501953,
"learning_rate": 5.328390815080381e-05,
"loss": 2.0229,
"step": 613
},
{
"epoch": 0.6600376242945445,
"grad_norm": 11.787457466125488,
"learning_rate": 5.2982583420849116e-05,
"loss": 2.6637,
"step": 614
},
{
"epoch": 0.6611126041386725,
"grad_norm": 15.777995109558105,
"learning_rate": 5.268180575576352e-05,
"loss": 2.893,
"step": 615
},
{
"epoch": 0.6621875839828003,
"grad_norm": 19.96381378173828,
"learning_rate": 5.238157865520539e-05,
"loss": 3.2706,
"step": 616
},
{
"epoch": 0.6632625638269283,
"grad_norm": 20.699359893798828,
"learning_rate": 5.208190561242708e-05,
"loss": 2.6676,
"step": 617
},
{
"epoch": 0.6643375436710561,
"grad_norm": 14.418145179748535,
"learning_rate": 5.178279011423417e-05,
"loss": 2.6929,
"step": 618
},
{
"epoch": 0.6654125235151841,
"grad_norm": 12.969032287597656,
"learning_rate": 5.148423564094517e-05,
"loss": 2.5543,
"step": 619
},
{
"epoch": 0.6664875033593121,
"grad_norm": 25.956907272338867,
"learning_rate": 5.118624566635066e-05,
"loss": 3.7616,
"step": 620
},
{
"epoch": 0.6675624832034399,
"grad_norm": 16.259069442749023,
"learning_rate": 5.0888823657673266e-05,
"loss": 2.6596,
"step": 621
},
{
"epoch": 0.6686374630475679,
"grad_norm": 15.737533569335938,
"learning_rate": 5.059197307552698e-05,
"loss": 3.4037,
"step": 622
},
{
"epoch": 0.6697124428916957,
"grad_norm": 15.57932186126709,
"learning_rate": 5.0295697373877096e-05,
"loss": 3.3037,
"step": 623
},
{
"epoch": 0.6707874227358237,
"grad_norm": 16.854917526245117,
"learning_rate": 5.000000000000002e-05,
"loss": 2.7998,
"step": 624
},
{
"epoch": 0.6718624025799517,
"grad_norm": 20.23788070678711,
"learning_rate": 4.9704884394442964e-05,
"loss": 2.6882,
"step": 625
},
{
"epoch": 0.6729373824240795,
"grad_norm": 13.206673622131348,
"learning_rate": 4.941035399098418e-05,
"loss": 2.8392,
"step": 626
},
{
"epoch": 0.6740123622682075,
"grad_norm": 11.313912391662598,
"learning_rate": 4.911641221659279e-05,
"loss": 2.6687,
"step": 627
},
{
"epoch": 0.6750873421123353,
"grad_norm": 17.33076286315918,
"learning_rate": 4.8823062491389094e-05,
"loss": 2.5485,
"step": 628
},
{
"epoch": 0.6761623219564633,
"grad_norm": 17.54669952392578,
"learning_rate": 4.853030822860455e-05,
"loss": 3.4131,
"step": 629
},
{
"epoch": 0.6772373018005913,
"grad_norm": 14.568754196166992,
"learning_rate": 4.823815283454235e-05,
"loss": 3.2074,
"step": 630
},
{
"epoch": 0.6783122816447191,
"grad_norm": 14.021873474121094,
"learning_rate": 4.794659970853749e-05,
"loss": 2.3931,
"step": 631
},
{
"epoch": 0.6793872614888471,
"grad_norm": 15.752306938171387,
"learning_rate": 4.765565224291743e-05,
"loss": 2.4038,
"step": 632
},
{
"epoch": 0.6804622413329751,
"grad_norm": 17.143024444580078,
"learning_rate": 4.7365313822962576e-05,
"loss": 2.8179,
"step": 633
},
{
"epoch": 0.6815372211771029,
"grad_norm": 18.36078643798828,
"learning_rate": 4.707558782686677e-05,
"loss": 2.4392,
"step": 634
},
{
"epoch": 0.6826122010212309,
"grad_norm": 17.252010345458984,
"learning_rate": 4.67864776256982e-05,
"loss": 2.6834,
"step": 635
},
{
"epoch": 0.6836871808653587,
"grad_norm": 25.259824752807617,
"learning_rate": 4.64979865833599e-05,
"loss": 3.477,
"step": 636
},
{
"epoch": 0.6847621607094867,
"grad_norm": 14.85805606842041,
"learning_rate": 4.621011805655093e-05,
"loss": 2.3259,
"step": 637
},
{
"epoch": 0.6858371405536147,
"grad_norm": 14.022159576416016,
"learning_rate": 4.592287539472701e-05,
"loss": 2.6373,
"step": 638
},
{
"epoch": 0.6869121203977425,
"grad_norm": 13.847736358642578,
"learning_rate": 4.563626194006178e-05,
"loss": 2.5128,
"step": 639
},
{
"epoch": 0.6879871002418705,
"grad_norm": 12.617048263549805,
"learning_rate": 4.535028102740785e-05,
"loss": 2.6718,
"step": 640
},
{
"epoch": 0.6890620800859983,
"grad_norm": 15.006109237670898,
"learning_rate": 4.5064935984257826e-05,
"loss": 3.0097,
"step": 641
},
{
"epoch": 0.6901370599301263,
"grad_norm": 18.60867691040039,
"learning_rate": 4.478023013070595e-05,
"loss": 3.4391,
"step": 642
},
{
"epoch": 0.6912120397742543,
"grad_norm": 18.005126953125,
"learning_rate": 4.449616677940903e-05,
"loss": 2.8679,
"step": 643
},
{
"epoch": 0.6922870196183821,
"grad_norm": 15.206576347351074,
"learning_rate": 4.421274923554835e-05,
"loss": 2.5725,
"step": 644
},
{
"epoch": 0.6933619994625101,
"grad_norm": 10.55386734008789,
"learning_rate": 4.392998079679076e-05,
"loss": 2.5634,
"step": 645
},
{
"epoch": 0.694436979306638,
"grad_norm": 17.997459411621094,
"learning_rate": 4.364786475325072e-05,
"loss": 4.0269,
"step": 646
},
{
"epoch": 0.6955119591507659,
"grad_norm": 17.598691940307617,
"learning_rate": 4.33664043874518e-05,
"loss": 3.0651,
"step": 647
},
{
"epoch": 0.6965869389948939,
"grad_norm": 19.84808349609375,
"learning_rate": 4.30856029742884e-05,
"loss": 3.4776,
"step": 648
},
{
"epoch": 0.6976619188390217,
"grad_norm": 19.29751968383789,
"learning_rate": 4.280546378098792e-05,
"loss": 3.4422,
"step": 649
},
{
"epoch": 0.6987368986831497,
"grad_norm": 19.02079963684082,
"learning_rate": 4.252599006707245e-05,
"loss": 2.9825,
"step": 650
},
{
"epoch": 0.6998118785272777,
"grad_norm": 17.91227912902832,
"learning_rate": 4.224718508432113e-05,
"loss": 2.7378,
"step": 651
},
{
"epoch": 0.7008868583714055,
"grad_norm": 20.459753036499023,
"learning_rate": 4.196905207673201e-05,
"loss": 3.7669,
"step": 652
},
{
"epoch": 0.7019618382155335,
"grad_norm": 12.434891700744629,
"learning_rate": 4.16915942804846e-05,
"loss": 2.6119,
"step": 653
},
{
"epoch": 0.7030368180596613,
"grad_norm": 17.53223419189453,
"learning_rate": 4.141481492390197e-05,
"loss": 2.7561,
"step": 654
},
{
"epoch": 0.7041117979037893,
"grad_norm": 15.005716323852539,
"learning_rate": 4.113871722741337e-05,
"loss": 2.5018,
"step": 655
},
{
"epoch": 0.7051867777479173,
"grad_norm": 19.277578353881836,
"learning_rate": 4.08633044035167e-05,
"loss": 3.2063,
"step": 656
},
{
"epoch": 0.7062617575920451,
"grad_norm": 18.722848892211914,
"learning_rate": 4.058857965674101e-05,
"loss": 2.4138,
"step": 657
},
{
"epoch": 0.7073367374361731,
"grad_norm": 12.36088752746582,
"learning_rate": 4.031454618360945e-05,
"loss": 2.4916,
"step": 658
},
{
"epoch": 0.708411717280301,
"grad_norm": 18.981199264526367,
"learning_rate": 4.0041207172601826e-05,
"loss": 3.578,
"step": 659
},
{
"epoch": 0.7094866971244289,
"grad_norm": 15.9765625,
"learning_rate": 3.976856580411774e-05,
"loss": 2.896,
"step": 660
},
{
"epoch": 0.7105616769685569,
"grad_norm": 10.438156127929688,
"learning_rate": 3.9496625250439344e-05,
"loss": 2.5757,
"step": 661
},
{
"epoch": 0.7116366568126847,
"grad_norm": 15.564522743225098,
"learning_rate": 3.922538867569466e-05,
"loss": 2.7255,
"step": 662
},
{
"epoch": 0.7127116366568127,
"grad_norm": 10.465269088745117,
"learning_rate": 3.8954859235820664e-05,
"loss": 1.888,
"step": 663
},
{
"epoch": 0.7137866165009407,
"grad_norm": 10.99638843536377,
"learning_rate": 3.8685040078526415e-05,
"loss": 2.6292,
"step": 664
},
{
"epoch": 0.7148615963450685,
"grad_norm": 14.457728385925293,
"learning_rate": 3.841593434325675e-05,
"loss": 2.3995,
"step": 665
},
{
"epoch": 0.7159365761891965,
"grad_norm": 17.016172409057617,
"learning_rate": 3.814754516115544e-05,
"loss": 2.8181,
"step": 666
},
{
"epoch": 0.7170115560333243,
"grad_norm": 18.437854766845703,
"learning_rate": 3.787987565502902e-05,
"loss": 2.1409,
"step": 667
},
{
"epoch": 0.7180865358774523,
"grad_norm": 19.192140579223633,
"learning_rate": 3.761292893931019e-05,
"loss": 3.0362,
"step": 668
},
{
"epoch": 0.7191615157215803,
"grad_norm": 18.337635040283203,
"learning_rate": 3.734670812002183e-05,
"loss": 3.1711,
"step": 669
},
{
"epoch": 0.7202364955657081,
"grad_norm": 12.289979934692383,
"learning_rate": 3.708121629474077e-05,
"loss": 2.4776,
"step": 670
},
{
"epoch": 0.7213114754098361,
"grad_norm": 16.519603729248047,
"learning_rate": 3.681645655256159e-05,
"loss": 2.5979,
"step": 671
},
{
"epoch": 0.7223864552539639,
"grad_norm": 16.15471076965332,
"learning_rate": 3.655243197406097e-05,
"loss": 3.1705,
"step": 672
},
{
"epoch": 0.7234614350980919,
"grad_norm": 14.030595779418945,
"learning_rate": 3.628914563126156e-05,
"loss": 2.5392,
"step": 673
},
{
"epoch": 0.7245364149422199,
"grad_norm": 14.324786186218262,
"learning_rate": 3.6026600587596484e-05,
"loss": 3.4405,
"step": 674
},
{
"epoch": 0.7256113947863477,
"grad_norm": 14.356512069702148,
"learning_rate": 3.576479989787345e-05,
"loss": 2.7351,
"step": 675
},
{
"epoch": 0.7266863746304757,
"grad_norm": 22.63331413269043,
"learning_rate": 3.550374660823949e-05,
"loss": 4.4579,
"step": 676
},
{
"epoch": 0.7277613544746036,
"grad_norm": 16.112449645996094,
"learning_rate": 3.52434437561452e-05,
"loss": 3.2883,
"step": 677
},
{
"epoch": 0.7288363343187315,
"grad_norm": 15.933297157287598,
"learning_rate": 3.4983894370309665e-05,
"loss": 3.1271,
"step": 678
},
{
"epoch": 0.7299113141628595,
"grad_norm": 18.30646514892578,
"learning_rate": 3.472510147068515e-05,
"loss": 3.1462,
"step": 679
},
{
"epoch": 0.7309862940069873,
"grad_norm": 14.52500057220459,
"learning_rate": 3.446706806842177e-05,
"loss": 3.5975,
"step": 680
},
{
"epoch": 0.7320612738511153,
"grad_norm": 19.05596160888672,
"learning_rate": 3.420979716583279e-05,
"loss": 3.3156,
"step": 681
},
{
"epoch": 0.7331362536952433,
"grad_norm": 17.002887725830078,
"learning_rate": 3.395329175635935e-05,
"loss": 2.8059,
"step": 682
},
{
"epoch": 0.7342112335393711,
"grad_norm": 13.947765350341797,
"learning_rate": 3.369755482453594e-05,
"loss": 2.3958,
"step": 683
},
{
"epoch": 0.7352862133834991,
"grad_norm": 14.15211009979248,
"learning_rate": 3.344258934595539e-05,
"loss": 2.7893,
"step": 684
},
{
"epoch": 0.7363611932276269,
"grad_norm": 16.130531311035156,
"learning_rate": 3.31883982872345e-05,
"loss": 2.4694,
"step": 685
},
{
"epoch": 0.7374361730717549,
"grad_norm": 19.640533447265625,
"learning_rate": 3.2934984605979424e-05,
"loss": 3.6272,
"step": 686
},
{
"epoch": 0.7385111529158829,
"grad_norm": 16.92746925354004,
"learning_rate": 3.268235125075111e-05,
"loss": 3.1489,
"step": 687
},
{
"epoch": 0.7395861327600107,
"grad_norm": 13.478626251220703,
"learning_rate": 3.243050116103128e-05,
"loss": 4.0042,
"step": 688
},
{
"epoch": 0.7406611126041387,
"grad_norm": 16.014270782470703,
"learning_rate": 3.217943726718795e-05,
"loss": 2.8463,
"step": 689
},
{
"epoch": 0.7417360924482665,
"grad_norm": 10.819585800170898,
"learning_rate": 3.1929162490441565e-05,
"loss": 2.3828,
"step": 690
},
{
"epoch": 0.7428110722923945,
"grad_norm": 13.190820693969727,
"learning_rate": 3.16796797428308e-05,
"loss": 2.5904,
"step": 691
},
{
"epoch": 0.7438860521365225,
"grad_norm": 16.281742095947266,
"learning_rate": 3.1430991927178866e-05,
"loss": 2.2573,
"step": 692
},
{
"epoch": 0.7449610319806503,
"grad_norm": 17.6481990814209,
"learning_rate": 3.1183101937059647e-05,
"loss": 2.8613,
"step": 693
},
{
"epoch": 0.7460360118247783,
"grad_norm": 15.284998893737793,
"learning_rate": 3.093601265676393e-05,
"loss": 3.2382,
"step": 694
},
{
"epoch": 0.7471109916689062,
"grad_norm": 15.303094863891602,
"learning_rate": 3.068972696126611e-05,
"loss": 2.2713,
"step": 695
},
{
"epoch": 0.7481859715130341,
"grad_norm": 17.59407615661621,
"learning_rate": 3.044424771619041e-05,
"loss": 2.976,
"step": 696
},
{
"epoch": 0.7492609513571621,
"grad_norm": 15.894051551818848,
"learning_rate": 3.0199577777777875e-05,
"loss": 3.3442,
"step": 697
},
{
"epoch": 0.7503359312012899,
"grad_norm": 13.673702239990234,
"learning_rate": 2.9955719992852804e-05,
"loss": 2.5122,
"step": 698
},
{
"epoch": 0.7514109110454179,
"grad_norm": 16.84321403503418,
"learning_rate": 2.9712677198789916e-05,
"loss": 2.9834,
"step": 699
},
{
"epoch": 0.7514109110454179,
"eval_loss": 0.6793206930160522,
"eval_runtime": 5.5979,
"eval_samples_per_second": 70.026,
"eval_steps_per_second": 35.013,
"step": 699
},
{
"epoch": 0.7524858908895459,
"grad_norm": 13.654646873474121,
"learning_rate": 2.9470452223481204e-05,
"loss": 2.9831,
"step": 700
},
{
"epoch": 0.7535608707336737,
"grad_norm": 9.951108932495117,
"learning_rate": 2.922904788530293e-05,
"loss": 2.2996,
"step": 701
},
{
"epoch": 0.7546358505778017,
"grad_norm": 9.867563247680664,
"learning_rate": 2.8988466993083097e-05,
"loss": 2.1596,
"step": 702
},
{
"epoch": 0.7557108304219295,
"grad_norm": 14.348727226257324,
"learning_rate": 2.8748712346068464e-05,
"loss": 2.8225,
"step": 703
},
{
"epoch": 0.7567858102660575,
"grad_norm": 14.35478687286377,
"learning_rate": 2.8509786733892264e-05,
"loss": 2.9542,
"step": 704
},
{
"epoch": 0.7578607901101855,
"grad_norm": 23.562644958496094,
"learning_rate": 2.827169293654147e-05,
"loss": 4.984,
"step": 705
},
{
"epoch": 0.7589357699543133,
"grad_norm": 11.014055252075195,
"learning_rate": 2.8034433724324715e-05,
"loss": 2.2337,
"step": 706
},
{
"epoch": 0.7600107497984413,
"grad_norm": 14.4437837600708,
"learning_rate": 2.77980118578398e-05,
"loss": 2.8005,
"step": 707
},
{
"epoch": 0.7610857296425692,
"grad_norm": 14.065092086791992,
"learning_rate": 2.7562430087941814e-05,
"loss": 3.0488,
"step": 708
},
{
"epoch": 0.7621607094866971,
"grad_norm": 18.38102912902832,
"learning_rate": 2.7327691155710976e-05,
"loss": 2.857,
"step": 709
},
{
"epoch": 0.7632356893308251,
"grad_norm": 15.708161354064941,
"learning_rate": 2.7093797792420728e-05,
"loss": 3.3492,
"step": 710
},
{
"epoch": 0.7643106691749529,
"grad_norm": 13.264737129211426,
"learning_rate": 2.68607527195061e-05,
"loss": 2.2699,
"step": 711
},
{
"epoch": 0.7653856490190809,
"grad_norm": 22.40721893310547,
"learning_rate": 2.6628558648531843e-05,
"loss": 3.6175,
"step": 712
},
{
"epoch": 0.7664606288632088,
"grad_norm": 16.45822525024414,
"learning_rate": 2.639721828116112e-05,
"loss": 1.7208,
"step": 713
},
{
"epoch": 0.7675356087073367,
"grad_norm": 15.185565948486328,
"learning_rate": 2.6166734309123787e-05,
"loss": 2.439,
"step": 714
},
{
"epoch": 0.7686105885514647,
"grad_norm": 13.136150360107422,
"learning_rate": 2.5937109414185366e-05,
"loss": 2.9833,
"step": 715
},
{
"epoch": 0.7696855683955925,
"grad_norm": 13.081589698791504,
"learning_rate": 2.5708346268115647e-05,
"loss": 2.4198,
"step": 716
},
{
"epoch": 0.7707605482397205,
"grad_norm": 10.561399459838867,
"learning_rate": 2.5480447532657624e-05,
"loss": 2.3794,
"step": 717
},
{
"epoch": 0.7718355280838485,
"grad_norm": 15.137680053710938,
"learning_rate": 2.525341585949662e-05,
"loss": 2.6444,
"step": 718
},
{
"epoch": 0.7729105079279763,
"grad_norm": 18.32636070251465,
"learning_rate": 2.5027253890229285e-05,
"loss": 2.978,
"step": 719
},
{
"epoch": 0.7739854877721043,
"grad_norm": 14.093949317932129,
"learning_rate": 2.4801964256333053e-05,
"loss": 2.4359,
"step": 720
},
{
"epoch": 0.7750604676162322,
"grad_norm": 15.987081527709961,
"learning_rate": 2.457754957913532e-05,
"loss": 3.6221,
"step": 721
},
{
"epoch": 0.7761354474603601,
"grad_norm": 14.7132568359375,
"learning_rate": 2.4354012469783094e-05,
"loss": 2.7112,
"step": 722
},
{
"epoch": 0.7772104273044881,
"grad_norm": 15.25096321105957,
"learning_rate": 2.4131355529212573e-05,
"loss": 2.8029,
"step": 723
},
{
"epoch": 0.7782854071486159,
"grad_norm": 11.852306365966797,
"learning_rate": 2.3909581348118805e-05,
"loss": 2.6288,
"step": 724
},
{
"epoch": 0.7793603869927439,
"grad_norm": 14.22082805633545,
"learning_rate": 2.368869250692567e-05,
"loss": 2.6691,
"step": 725
},
{
"epoch": 0.7804353668368718,
"grad_norm": 14.895218849182129,
"learning_rate": 2.346869157575574e-05,
"loss": 2.223,
"step": 726
},
{
"epoch": 0.7815103466809997,
"grad_norm": 11.507866859436035,
"learning_rate": 2.324958111440051e-05,
"loss": 2.4171,
"step": 727
},
{
"epoch": 0.7825853265251277,
"grad_norm": 10.818052291870117,
"learning_rate": 2.3031363672290406e-05,
"loss": 2.4578,
"step": 728
},
{
"epoch": 0.7836603063692555,
"grad_norm": 14.49646282196045,
"learning_rate": 2.28140417884654e-05,
"loss": 3.3386,
"step": 729
},
{
"epoch": 0.7847352862133835,
"grad_norm": 16.65618896484375,
"learning_rate": 2.2597617991545162e-05,
"loss": 2.4579,
"step": 730
},
{
"epoch": 0.7858102660575115,
"grad_norm": 21.370105743408203,
"learning_rate": 2.2382094799699917e-05,
"loss": 2.0147,
"step": 731
},
{
"epoch": 0.7868852459016393,
"grad_norm": 18.23256492614746,
"learning_rate": 2.2167474720620974e-05,
"loss": 2.8713,
"step": 732
},
{
"epoch": 0.7879602257457673,
"grad_norm": 13.314383506774902,
"learning_rate": 2.1953760251491563e-05,
"loss": 2.5137,
"step": 733
},
{
"epoch": 0.7890352055898951,
"grad_norm": 13.909300804138184,
"learning_rate": 2.174095387895786e-05,
"loss": 2.7543,
"step": 734
},
{
"epoch": 0.7901101854340231,
"grad_norm": 12.180556297302246,
"learning_rate": 2.152905807909995e-05,
"loss": 2.0877,
"step": 735
},
{
"epoch": 0.7911851652781511,
"grad_norm": 22.62627410888672,
"learning_rate": 2.131807531740315e-05,
"loss": 4.0689,
"step": 736
},
{
"epoch": 0.7922601451222789,
"grad_norm": 15.548689842224121,
"learning_rate": 2.1108008048729145e-05,
"loss": 3.4416,
"step": 737
},
{
"epoch": 0.7933351249664069,
"grad_norm": 17.072410583496094,
"learning_rate": 2.0898858717287594e-05,
"loss": 4.3289,
"step": 738
},
{
"epoch": 0.7944101048105348,
"grad_norm": 12.466560363769531,
"learning_rate": 2.0690629756607648e-05,
"loss": 2.6264,
"step": 739
},
{
"epoch": 0.7954850846546627,
"grad_norm": 10.822991371154785,
"learning_rate": 2.0483323589509483e-05,
"loss": 1.9285,
"step": 740
},
{
"epoch": 0.7965600644987907,
"grad_norm": 16.706153869628906,
"learning_rate": 2.0276942628076378e-05,
"loss": 3.2333,
"step": 741
},
{
"epoch": 0.7976350443429185,
"grad_norm": 13.810402870178223,
"learning_rate": 2.0071489273626376e-05,
"loss": 2.5904,
"step": 742
},
{
"epoch": 0.7987100241870465,
"grad_norm": 15.366273880004883,
"learning_rate": 1.9866965916684587e-05,
"loss": 2.8742,
"step": 743
},
{
"epoch": 0.7997850040311744,
"grad_norm": 11.5558500289917,
"learning_rate": 1.966337493695516e-05,
"loss": 2.3341,
"step": 744
},
{
"epoch": 0.8008599838753023,
"grad_norm": 17.35418701171875,
"learning_rate": 1.9460718703293768e-05,
"loss": 2.6252,
"step": 745
},
{
"epoch": 0.8019349637194303,
"grad_norm": 11.60128402709961,
"learning_rate": 1.925899957367996e-05,
"loss": 2.109,
"step": 746
},
{
"epoch": 0.8030099435635581,
"grad_norm": 12.664255142211914,
"learning_rate": 1.9058219895189666e-05,
"loss": 2.339,
"step": 747
},
{
"epoch": 0.8040849234076861,
"grad_norm": 16.977569580078125,
"learning_rate": 1.8858382003968078e-05,
"loss": 2.3301,
"step": 748
},
{
"epoch": 0.8051599032518141,
"grad_norm": 12.833663940429688,
"learning_rate": 1.8659488225202226e-05,
"loss": 2.461,
"step": 749
},
{
"epoch": 0.8062348830959419,
"grad_norm": 14.8052339553833,
"learning_rate": 1.846154087309414e-05,
"loss": 2.9448,
"step": 750
},
{
"epoch": 0.8073098629400699,
"grad_norm": 16.044788360595703,
"learning_rate": 1.826454225083375e-05,
"loss": 2.9095,
"step": 751
},
{
"epoch": 0.8083848427841978,
"grad_norm": 24.449602127075195,
"learning_rate": 1.8068494650572243e-05,
"loss": 3.3211,
"step": 752
},
{
"epoch": 0.8094598226283257,
"grad_norm": 18.429533004760742,
"learning_rate": 1.787340035339524e-05,
"loss": 3.7572,
"step": 753
},
{
"epoch": 0.8105348024724537,
"grad_norm": 18.038867950439453,
"learning_rate": 1.7679261629296408e-05,
"loss": 3.5666,
"step": 754
},
{
"epoch": 0.8116097823165815,
"grad_norm": 17.882884979248047,
"learning_rate": 1.7486080737150945e-05,
"loss": 3.4553,
"step": 755
},
{
"epoch": 0.8126847621607095,
"grad_norm": 18.42115592956543,
"learning_rate": 1.7293859924689258e-05,
"loss": 2.3743,
"step": 756
},
{
"epoch": 0.8137597420048374,
"grad_norm": 10.389144897460938,
"learning_rate": 1.7102601428470987e-05,
"loss": 1.7921,
"step": 757
},
{
"epoch": 0.8148347218489653,
"grad_norm": 19.917224884033203,
"learning_rate": 1.691230747385878e-05,
"loss": 4.1469,
"step": 758
},
{
"epoch": 0.8159097016930933,
"grad_norm": 22.9906005859375,
"learning_rate": 1.672298027499254e-05,
"loss": 3.5433,
"step": 759
},
{
"epoch": 0.8169846815372211,
"grad_norm": 9.970109939575195,
"learning_rate": 1.653462203476356e-05,
"loss": 1.9898,
"step": 760
},
{
"epoch": 0.8180596613813491,
"grad_norm": 15.049894332885742,
"learning_rate": 1.6347234944789014e-05,
"loss": 3.014,
"step": 761
},
{
"epoch": 0.819134641225477,
"grad_norm": 18.622976303100586,
"learning_rate": 1.6160821185386364e-05,
"loss": 2.6812,
"step": 762
},
{
"epoch": 0.8202096210696049,
"grad_norm": 18.029285430908203,
"learning_rate": 1.5975382925547965e-05,
"loss": 2.8567,
"step": 763
},
{
"epoch": 0.8212846009137329,
"grad_norm": 17.624483108520508,
"learning_rate": 1.5790922322915958e-05,
"loss": 3.039,
"step": 764
},
{
"epoch": 0.8223595807578608,
"grad_norm": 21.21023178100586,
"learning_rate": 1.5607441523756993e-05,
"loss": 3.2504,
"step": 765
},
{
"epoch": 0.8234345606019887,
"grad_norm": 15.159942626953125,
"learning_rate": 1.5424942662937435e-05,
"loss": 2.2915,
"step": 766
},
{
"epoch": 0.8245095404461167,
"grad_norm": 16.94089126586914,
"learning_rate": 1.5243427863898364e-05,
"loss": 2.781,
"step": 767
},
{
"epoch": 0.8255845202902445,
"grad_norm": 14.287386894226074,
"learning_rate": 1.5062899238631e-05,
"loss": 2.2052,
"step": 768
},
{
"epoch": 0.8266595001343725,
"grad_norm": 23.06917953491211,
"learning_rate": 1.4883358887652044e-05,
"loss": 4.0856,
"step": 769
},
{
"epoch": 0.8277344799785004,
"grad_norm": 14.637825012207031,
"learning_rate": 1.4704808899979239e-05,
"loss": 2.954,
"step": 770
},
{
"epoch": 0.8288094598226283,
"grad_norm": 9.044286727905273,
"learning_rate": 1.4527251353107163e-05,
"loss": 1.8635,
"step": 771
},
{
"epoch": 0.8298844396667563,
"grad_norm": 17.238901138305664,
"learning_rate": 1.4350688312982864e-05,
"loss": 2.5819,
"step": 772
},
{
"epoch": 0.8309594195108841,
"grad_norm": 11.831186294555664,
"learning_rate": 1.4175121833982052e-05,
"loss": 2.4721,
"step": 773
},
{
"epoch": 0.8320343993550121,
"grad_norm": 21.131061553955078,
"learning_rate": 1.4000553958885021e-05,
"loss": 4.1583,
"step": 774
},
{
"epoch": 0.83310937919914,
"grad_norm": 15.86319637298584,
"learning_rate": 1.3826986718852952e-05,
"loss": 2.5534,
"step": 775
},
{
"epoch": 0.8341843590432679,
"grad_norm": 16.642955780029297,
"learning_rate": 1.365442213340432e-05,
"loss": 2.3907,
"step": 776
},
{
"epoch": 0.8352593388873959,
"grad_norm": 19.222917556762695,
"learning_rate": 1.3482862210391245e-05,
"loss": 3.667,
"step": 777
},
{
"epoch": 0.8363343187315238,
"grad_norm": 18.172245025634766,
"learning_rate": 1.3312308945976348e-05,
"loss": 3.029,
"step": 778
},
{
"epoch": 0.8374092985756517,
"grad_norm": 15.490896224975586,
"learning_rate": 1.3142764324609303e-05,
"loss": 3.208,
"step": 779
},
{
"epoch": 0.8384842784197797,
"grad_norm": 18.96816062927246,
"learning_rate": 1.2974230319003944e-05,
"loss": 3.4618,
"step": 780
},
{
"epoch": 0.8395592582639075,
"grad_norm": 21.89808464050293,
"learning_rate": 1.2806708890115138e-05,
"loss": 4.5078,
"step": 781
},
{
"epoch": 0.8406342381080355,
"grad_norm": 14.64857006072998,
"learning_rate": 1.2640201987116117e-05,
"loss": 3.2866,
"step": 782
},
{
"epoch": 0.8417092179521634,
"grad_norm": 15.451770782470703,
"learning_rate": 1.2474711547375683e-05,
"loss": 2.3176,
"step": 783
},
{
"epoch": 0.8427841977962913,
"grad_norm": 12.625808715820312,
"learning_rate": 1.2310239496435749e-05,
"loss": 2.557,
"step": 784
},
{
"epoch": 0.8438591776404193,
"grad_norm": 16.126930236816406,
"learning_rate": 1.2146787747988919e-05,
"loss": 2.9173,
"step": 785
},
{
"epoch": 0.8449341574845471,
"grad_norm": 16.86185646057129,
"learning_rate": 1.1984358203856116e-05,
"loss": 2.4147,
"step": 786
},
{
"epoch": 0.8460091373286751,
"grad_norm": 17.082571029663086,
"learning_rate": 1.1822952753964667e-05,
"loss": 2.9913,
"step": 787
},
{
"epoch": 0.847084117172803,
"grad_norm": 15.173123359680176,
"learning_rate": 1.1662573276326061e-05,
"loss": 2.8246,
"step": 788
},
{
"epoch": 0.8481590970169309,
"grad_norm": 17.791603088378906,
"learning_rate": 1.1503221637014327e-05,
"loss": 3.7321,
"step": 789
},
{
"epoch": 0.8492340768610589,
"grad_norm": 16.170446395874023,
"learning_rate": 1.134489969014414e-05,
"loss": 3.1235,
"step": 790
},
{
"epoch": 0.8503090567051867,
"grad_norm": 13.791912078857422,
"learning_rate": 1.1187609277849376e-05,
"loss": 2.7921,
"step": 791
},
{
"epoch": 0.8513840365493147,
"grad_norm": 12.047784805297852,
"learning_rate": 1.1031352230261637e-05,
"loss": 1.8449,
"step": 792
},
{
"epoch": 0.8524590163934426,
"grad_norm": 21.286800384521484,
"learning_rate": 1.0876130365488878e-05,
"loss": 3.6973,
"step": 793
},
{
"epoch": 0.8535339962375705,
"grad_norm": 15.110943794250488,
"learning_rate": 1.072194548959442e-05,
"loss": 3.3045,
"step": 794
},
{
"epoch": 0.8546089760816985,
"grad_norm": 11.98826789855957,
"learning_rate": 1.0568799396575746e-05,
"loss": 2.831,
"step": 795
},
{
"epoch": 0.8556839559258264,
"grad_norm": 18.719282150268555,
"learning_rate": 1.0416693868343797e-05,
"loss": 3.2608,
"step": 796
},
{
"epoch": 0.8567589357699543,
"grad_norm": 14.92679500579834,
"learning_rate": 1.0265630674702076e-05,
"loss": 2.494,
"step": 797
},
{
"epoch": 0.8578339156140823,
"grad_norm": 12.818703651428223,
"learning_rate": 1.0115611573326233e-05,
"loss": 2.6008,
"step": 798
},
{
"epoch": 0.8589088954582101,
"grad_norm": 17.74081039428711,
"learning_rate": 9.966638309743482e-06,
"loss": 3.2788,
"step": 799
},
{
"epoch": 0.8599838753023381,
"grad_norm": 13.859757423400879,
"learning_rate": 9.818712617312287e-06,
"loss": 2.3176,
"step": 800
},
{
"epoch": 0.861058855146466,
"grad_norm": 26.295276641845703,
"learning_rate": 9.671836217202334e-06,
"loss": 3.3959,
"step": 801
},
{
"epoch": 0.8621338349905939,
"grad_norm": 15.147491455078125,
"learning_rate": 9.52601081837431e-06,
"loss": 3.1437,
"step": 802
},
{
"epoch": 0.8632088148347219,
"grad_norm": 12.152910232543945,
"learning_rate": 9.381238117560187e-06,
"loss": 2.4838,
"step": 803
},
{
"epoch": 0.8642837946788497,
"grad_norm": 28.314746856689453,
"learning_rate": 9.237519799243355e-06,
"loss": 3.2608,
"step": 804
},
{
"epoch": 0.8653587745229777,
"grad_norm": 14.662104606628418,
"learning_rate": 9.094857535639156e-06,
"loss": 2.9242,
"step": 805
},
{
"epoch": 0.8664337543671056,
"grad_norm": 13.321383476257324,
"learning_rate": 8.95325298667523e-06,
"loss": 2.8041,
"step": 806
},
{
"epoch": 0.8675087342112335,
"grad_norm": 16.60413360595703,
"learning_rate": 8.812707799972442e-06,
"loss": 2.5522,
"step": 807
},
{
"epoch": 0.8685837140553615,
"grad_norm": 16.389780044555664,
"learning_rate": 8.673223610825531e-06,
"loss": 2.647,
"step": 808
},
{
"epoch": 0.8696586938994894,
"grad_norm": 10.362971305847168,
"learning_rate": 8.53480204218412e-06,
"loss": 2.1054,
"step": 809
},
{
"epoch": 0.8707336737436173,
"grad_norm": 18.633100509643555,
"learning_rate": 8.397444704633906e-06,
"loss": 3.8698,
"step": 810
},
{
"epoch": 0.8718086535877452,
"grad_norm": 14.801193237304688,
"learning_rate": 8.261153196377814e-06,
"loss": 2.1947,
"step": 811
},
{
"epoch": 0.8728836334318731,
"grad_norm": 16.298856735229492,
"learning_rate": 8.1259291032175e-06,
"loss": 3.4508,
"step": 812
},
{
"epoch": 0.8739586132760011,
"grad_norm": 17.139835357666016,
"learning_rate": 7.991773998534802e-06,
"loss": 2.6625,
"step": 813
},
{
"epoch": 0.875033593120129,
"grad_norm": 14.451316833496094,
"learning_rate": 7.858689443273547e-06,
"loss": 2.5462,
"step": 814
},
{
"epoch": 0.8761085729642569,
"grad_norm": 19.52531623840332,
"learning_rate": 7.72667698592131e-06,
"loss": 2.7614,
"step": 815
},
{
"epoch": 0.8771835528083849,
"grad_norm": 23.297826766967773,
"learning_rate": 7.595738162491383e-06,
"loss": 2.0079,
"step": 816
},
{
"epoch": 0.8782585326525127,
"grad_norm": 16.850067138671875,
"learning_rate": 7.465874496504943e-06,
"loss": 3.0355,
"step": 817
},
{
"epoch": 0.8793335124966407,
"grad_norm": 19.124881744384766,
"learning_rate": 7.337087498973327e-06,
"loss": 3.1917,
"step": 818
},
{
"epoch": 0.8804084923407686,
"grad_norm": 16.559707641601562,
"learning_rate": 7.209378668380451e-06,
"loss": 2.6744,
"step": 819
},
{
"epoch": 0.8814834721848965,
"grad_norm": 11.441034317016602,
"learning_rate": 7.0827494906653526e-06,
"loss": 2.2681,
"step": 820
},
{
"epoch": 0.8825584520290245,
"grad_norm": 18.97810935974121,
"learning_rate": 6.957201439204897e-06,
"loss": 3.6872,
"step": 821
},
{
"epoch": 0.8836334318731524,
"grad_norm": 15.341508865356445,
"learning_rate": 6.832735974796689e-06,
"loss": 2.0165,
"step": 822
},
{
"epoch": 0.8847084117172803,
"grad_norm": 16.359432220458984,
"learning_rate": 6.7093545456419886e-06,
"loss": 3.3588,
"step": 823
},
{
"epoch": 0.8857833915614082,
"grad_norm": 16.616304397583008,
"learning_rate": 6.5870585873289425e-06,
"loss": 3.643,
"step": 824
},
{
"epoch": 0.8868583714055361,
"grad_norm": 14.876137733459473,
"learning_rate": 6.4658495228158146e-06,
"loss": 2.8213,
"step": 825
},
{
"epoch": 0.8879333512496641,
"grad_norm": 11.03225326538086,
"learning_rate": 6.345728762414505e-06,
"loss": 2.1051,
"step": 826
},
{
"epoch": 0.889008331093792,
"grad_norm": 17.549360275268555,
"learning_rate": 6.226697703774076e-06,
"loss": 3.0634,
"step": 827
},
{
"epoch": 0.8900833109379199,
"grad_norm": 12.46760368347168,
"learning_rate": 6.108757731864489e-06,
"loss": 2.7482,
"step": 828
},
{
"epoch": 0.8911582907820479,
"grad_norm": 11.985771179199219,
"learning_rate": 5.99191021896055e-06,
"loss": 2.4109,
"step": 829
},
{
"epoch": 0.8922332706261757,
"grad_norm": 15.262474060058594,
"learning_rate": 5.876156524625864e-06,
"loss": 3.0215,
"step": 830
},
{
"epoch": 0.8933082504703037,
"grad_norm": 12.595239639282227,
"learning_rate": 5.7614979956971075e-06,
"loss": 2.726,
"step": 831
},
{
"epoch": 0.8943832303144316,
"grad_norm": 18.540124893188477,
"learning_rate": 5.647935966268225e-06,
"loss": 1.9021,
"step": 832
},
{
"epoch": 0.8954582101585595,
"grad_norm": 12.652937889099121,
"learning_rate": 5.5354717576750816e-06,
"loss": 2.366,
"step": 833
},
{
"epoch": 0.8965331900026875,
"grad_norm": 13.983992576599121,
"learning_rate": 5.424106678479945e-06,
"loss": 2.7013,
"step": 834
},
{
"epoch": 0.8976081698468154,
"grad_norm": 15.04990291595459,
"learning_rate": 5.313842024456306e-06,
"loss": 2.3414,
"step": 835
},
{
"epoch": 0.8986831496909433,
"grad_norm": 18.419647216796875,
"learning_rate": 5.204679078573827e-06,
"loss": 3.8336,
"step": 836
},
{
"epoch": 0.8997581295350712,
"grad_norm": 17.847749710083008,
"learning_rate": 5.096619110983347e-06,
"loss": 2.9503,
"step": 837
},
{
"epoch": 0.9008331093791991,
"grad_norm": 15.034934043884277,
"learning_rate": 4.9896633790022405e-06,
"loss": 2.6579,
"step": 838
},
{
"epoch": 0.9019080892233271,
"grad_norm": 16.45047950744629,
"learning_rate": 4.883813127099579e-06,
"loss": 3.8603,
"step": 839
},
{
"epoch": 0.902983069067455,
"grad_norm": 14.44510269165039,
"learning_rate": 4.779069586881857e-06,
"loss": 2.7965,
"step": 840
},
{
"epoch": 0.9040580489115829,
"grad_norm": 14.151533126831055,
"learning_rate": 4.675433977078547e-06,
"loss": 2.3626,
"step": 841
},
{
"epoch": 0.9051330287557108,
"grad_norm": 20.559478759765625,
"learning_rate": 4.572907503527923e-06,
"loss": 2.3899,
"step": 842
},
{
"epoch": 0.9062080085998387,
"grad_norm": 11.681385040283203,
"learning_rate": 4.471491359163094e-06,
"loss": 2.3098,
"step": 843
},
{
"epoch": 0.9072829884439667,
"grad_norm": 16.471866607666016,
"learning_rate": 4.3711867239980335e-06,
"loss": 2.6948,
"step": 844
},
{
"epoch": 0.9083579682880946,
"grad_norm": 10.209484100341797,
"learning_rate": 4.271994765113952e-06,
"loss": 2.1588,
"step": 845
},
{
"epoch": 0.9094329481322225,
"grad_norm": 15.725363731384277,
"learning_rate": 4.173916636645591e-06,
"loss": 2.8127,
"step": 846
},
{
"epoch": 0.9105079279763505,
"grad_norm": 15.666772842407227,
"learning_rate": 4.0769534797679645e-06,
"loss": 2.667,
"step": 847
},
{
"epoch": 0.9115829078204783,
"grad_norm": 15.909188270568848,
"learning_rate": 3.9811064226828895e-06,
"loss": 3.317,
"step": 848
},
{
"epoch": 0.9126578876646063,
"grad_norm": 11.588101387023926,
"learning_rate": 3.8863765806060105e-06,
"loss": 1.9476,
"step": 849
},
{
"epoch": 0.9137328675087342,
"grad_norm": 10.225213050842285,
"learning_rate": 3.7927650557537555e-06,
"loss": 2.1618,
"step": 850
},
{
"epoch": 0.9148078473528621,
"grad_norm": 19.92328643798828,
"learning_rate": 3.7002729373304957e-06,
"loss": 2.6487,
"step": 851
},
{
"epoch": 0.9158828271969901,
"grad_norm": 21.881338119506836,
"learning_rate": 3.6089013015159433e-06,
"loss": 4.0831,
"step": 852
},
{
"epoch": 0.916957807041118,
"grad_norm": 16.8553524017334,
"learning_rate": 3.5186512114525282e-06,
"loss": 3.4859,
"step": 853
},
{
"epoch": 0.9180327868852459,
"grad_norm": 15.485822677612305,
"learning_rate": 3.4295237172331516e-06,
"loss": 2.7058,
"step": 854
},
{
"epoch": 0.9191077667293738,
"grad_norm": 11.821995735168457,
"learning_rate": 3.3415198558888305e-06,
"loss": 2.4467,
"step": 855
},
{
"epoch": 0.9201827465735017,
"grad_norm": 10.078314781188965,
"learning_rate": 3.2546406513767504e-06,
"loss": 1.897,
"step": 856
},
{
"epoch": 0.9212577264176297,
"grad_norm": 13.78282356262207,
"learning_rate": 3.1688871145683086e-06,
"loss": 2.9327,
"step": 857
},
{
"epoch": 0.9223327062617576,
"grad_norm": 17.97776222229004,
"learning_rate": 3.0842602432373024e-06,
"loss": 2.6016,
"step": 858
},
{
"epoch": 0.9234076861058855,
"grad_norm": 17.566068649291992,
"learning_rate": 3.0007610220483927e-06,
"loss": 2.7243,
"step": 859
},
{
"epoch": 0.9244826659500134,
"grad_norm": 12.550614356994629,
"learning_rate": 2.918390422545614e-06,
"loss": 2.2397,
"step": 860
},
{
"epoch": 0.9255576457941413,
"grad_norm": 26.070829391479492,
"learning_rate": 2.8371494031410704e-06,
"loss": 3.6046,
"step": 861
},
{
"epoch": 0.9266326256382693,
"grad_norm": 20.401790618896484,
"learning_rate": 2.757038909103793e-06,
"loss": 2.7354,
"step": 862
},
{
"epoch": 0.9277076054823972,
"grad_norm": 10.504318237304688,
"learning_rate": 2.6780598725487214e-06,
"loss": 2.4165,
"step": 863
},
{
"epoch": 0.9287825853265251,
"grad_norm": 12.440912246704102,
"learning_rate": 2.6002132124258947e-06,
"loss": 2.0556,
"step": 864
},
{
"epoch": 0.9298575651706531,
"grad_norm": 15.771154403686523,
"learning_rate": 2.5234998345097238e-06,
"loss": 3.0948,
"step": 865
},
{
"epoch": 0.930932545014781,
"grad_norm": 13.663370132446289,
"learning_rate": 2.4479206313884784e-06,
"loss": 2.6857,
"step": 866
},
{
"epoch": 0.9320075248589089,
"grad_norm": 12.99375057220459,
"learning_rate": 2.3734764824538515e-06,
"loss": 2.46,
"step": 867
},
{
"epoch": 0.9330825047030368,
"grad_norm": 10.756906509399414,
"learning_rate": 2.300168253890833e-06,
"loss": 2.2951,
"step": 868
},
{
"epoch": 0.9341574845471647,
"grad_norm": 15.61550521850586,
"learning_rate": 2.2279967986674756e-06,
"loss": 2.947,
"step": 869
},
{
"epoch": 0.9352324643912927,
"grad_norm": 17.821706771850586,
"learning_rate": 2.1569629565251546e-06,
"loss": 2.9358,
"step": 870
},
{
"epoch": 0.9363074442354206,
"grad_norm": 13.073902130126953,
"learning_rate": 2.0870675539686023e-06,
"loss": 2.665,
"step": 871
},
{
"epoch": 0.9373824240795485,
"grad_norm": 16.576457977294922,
"learning_rate": 2.0183114042564567e-06,
"loss": 3.3876,
"step": 872
},
{
"epoch": 0.9384574039236764,
"grad_norm": 12.26412296295166,
"learning_rate": 1.9506953073917365e-06,
"loss": 2.3773,
"step": 873
},
{
"epoch": 0.9395323837678043,
"grad_norm": 15.137334823608398,
"learning_rate": 1.8842200501124618e-06,
"loss": 2.353,
"step": 874
},
{
"epoch": 0.9406073636119323,
"grad_norm": 11.213092803955078,
"learning_rate": 1.818886405882636e-06,
"loss": 1.8275,
"step": 875
},
{
"epoch": 0.9416823434560602,
"grad_norm": 12.951726913452148,
"learning_rate": 1.7546951348831441e-06,
"loss": 2.4801,
"step": 876
},
{
"epoch": 0.9427573233001881,
"grad_norm": 9.286628723144531,
"learning_rate": 1.6916469840029369e-06,
"loss": 1.8095,
"step": 877
},
{
"epoch": 0.9438323031443161,
"grad_norm": 16.789724349975586,
"learning_rate": 1.6297426868303378e-06,
"loss": 2.2362,
"step": 878
},
{
"epoch": 0.944907282988444,
"grad_norm": 9.946615219116211,
"learning_rate": 1.5689829636445496e-06,
"loss": 2.0806,
"step": 879
},
{
"epoch": 0.9459822628325719,
"grad_norm": 10.376627922058105,
"learning_rate": 1.5093685214072174e-06,
"loss": 2.7637,
"step": 880
},
{
"epoch": 0.9470572426766998,
"grad_norm": 10.574774742126465,
"learning_rate": 1.4509000537541895e-06,
"loss": 2.6576,
"step": 881
},
{
"epoch": 0.9481322225208277,
"grad_norm": 19.413841247558594,
"learning_rate": 1.3935782409875476e-06,
"loss": 3.1755,
"step": 882
},
{
"epoch": 0.9492072023649557,
"grad_norm": 24.996675491333008,
"learning_rate": 1.337403750067545e-06,
"loss": 4.3345,
"step": 883
},
{
"epoch": 0.9502821822090836,
"grad_norm": 15.001642227172852,
"learning_rate": 1.2823772346050034e-06,
"loss": 3.8014,
"step": 884
},
{
"epoch": 0.9513571620532115,
"grad_norm": 20.97924041748047,
"learning_rate": 1.2284993348535723e-06,
"loss": 3.1809,
"step": 885
},
{
"epoch": 0.9524321418973394,
"grad_norm": 12.326496124267578,
"learning_rate": 1.1757706777023592e-06,
"loss": 2.0848,
"step": 886
},
{
"epoch": 0.9535071217414673,
"grad_norm": 16.77161979675293,
"learning_rate": 1.1241918766686122e-06,
"loss": 2.5849,
"step": 887
},
{
"epoch": 0.9545821015855953,
"grad_norm": 18.30064582824707,
"learning_rate": 1.0737635318905704e-06,
"loss": 2.8427,
"step": 888
},
{
"epoch": 0.9556570814297232,
"grad_norm": 16.708187103271484,
"learning_rate": 1.0244862301205249e-06,
"loss": 3.3412,
"step": 889
},
{
"epoch": 0.9567320612738511,
"grad_norm": 9.98150634765625,
"learning_rate": 9.763605447179137e-07,
"loss": 1.9673,
"step": 890
},
{
"epoch": 0.957807041117979,
"grad_norm": 16.982563018798828,
"learning_rate": 9.293870356427259e-07,
"loss": 2.7158,
"step": 891
},
{
"epoch": 0.958882020962107,
"grad_norm": 14.91385555267334,
"learning_rate": 8.835662494489638e-07,
"loss": 2.4724,
"step": 892
},
{
"epoch": 0.9599570008062349,
"grad_norm": 11.67471981048584,
"learning_rate": 8.388987192782472e-07,
"loss": 2.0277,
"step": 893
},
{
"epoch": 0.9610319806503628,
"grad_norm": 17.66205596923828,
"learning_rate": 7.953849648536516e-07,
"loss": 3.0247,
"step": 894
},
{
"epoch": 0.9621069604944907,
"grad_norm": 12.348430633544922,
"learning_rate": 7.53025492473669e-07,
"loss": 2.1941,
"step": 895
},
{
"epoch": 0.9631819403386187,
"grad_norm": 11.82780647277832,
"learning_rate": 7.118207950062905e-07,
"loss": 1.9668,
"step": 896
},
{
"epoch": 0.9642569201827466,
"grad_norm": 11.388262748718262,
"learning_rate": 6.717713518832325e-07,
"loss": 1.7028,
"step": 897
},
{
"epoch": 0.9653319000268745,
"grad_norm": 20.69382095336914,
"learning_rate": 6.328776290944749e-07,
"loss": 2.4699,
"step": 898
},
{
"epoch": 0.9664068798710024,
"grad_norm": 16.286813735961914,
"learning_rate": 5.9514007918271e-07,
"loss": 2.5732,
"step": 899
},
{
"epoch": 0.9674818597151303,
"grad_norm": 16.142581939697266,
"learning_rate": 5.585591412381797e-07,
"loss": 3.1468,
"step": 900
},
{
"epoch": 0.9685568395592583,
"grad_norm": 21.596254348754883,
"learning_rate": 5.231352408934686e-07,
"loss": 3.7028,
"step": 901
},
{
"epoch": 0.9696318194033862,
"grad_norm": 10.433320999145508,
"learning_rate": 4.88868790318675e-07,
"loss": 2.0005,
"step": 902
},
{
"epoch": 0.9707067992475141,
"grad_norm": 20.765460968017578,
"learning_rate": 4.557601882164808e-07,
"loss": 3.7903,
"step": 903
},
{
"epoch": 0.971781779091642,
"grad_norm": 18.080036163330078,
"learning_rate": 4.2380981981759994e-07,
"loss": 3.3676,
"step": 904
},
{
"epoch": 0.97285675893577,
"grad_norm": 14.852317810058594,
"learning_rate": 3.930180568762931e-07,
"loss": 2.6543,
"step": 905
},
{
"epoch": 0.9739317387798979,
"grad_norm": 18.983341217041016,
"learning_rate": 3.633852576659935e-07,
"loss": 3.433,
"step": 906
},
{
"epoch": 0.9750067186240258,
"grad_norm": 14.545281410217285,
"learning_rate": 3.3491176697517667e-07,
"loss": 2.89,
"step": 907
},
{
"epoch": 0.9760816984681537,
"grad_norm": 10.29315185546875,
"learning_rate": 3.0759791610335267e-07,
"loss": 2.0145,
"step": 908
},
{
"epoch": 0.9771566783122816,
"grad_norm": 21.322124481201172,
"learning_rate": 2.81444022857158e-07,
"loss": 3.5667,
"step": 909
},
{
"epoch": 0.9782316581564096,
"grad_norm": 13.829008102416992,
"learning_rate": 2.5645039154675864e-07,
"loss": 2.6689,
"step": 910
},
{
"epoch": 0.9793066380005375,
"grad_norm": 15.563042640686035,
"learning_rate": 2.3261731298217514e-07,
"loss": 2.4102,
"step": 911
},
{
"epoch": 0.9803816178446654,
"grad_norm": 17.429529190063477,
"learning_rate": 2.099450644700407e-07,
"loss": 2.4791,
"step": 912
},
{
"epoch": 0.9814565976887933,
"grad_norm": 15.122339248657227,
"learning_rate": 1.8843390981024834e-07,
"loss": 2.1212,
"step": 913
},
{
"epoch": 0.9825315775329213,
"grad_norm": 17.688457489013672,
"learning_rate": 1.6808409929298663e-07,
"loss": 2.279,
"step": 914
},
{
"epoch": 0.9836065573770492,
"grad_norm": 17.063894271850586,
"learning_rate": 1.488958696957421e-07,
"loss": 3.7765,
"step": 915
},
{
"epoch": 0.9846815372211771,
"grad_norm": 10.769682884216309,
"learning_rate": 1.3086944428060132e-07,
"loss": 2.1393,
"step": 916
},
{
"epoch": 0.985756517065305,
"grad_norm": 10.24843692779541,
"learning_rate": 1.1400503279163088e-07,
"loss": 2.2815,
"step": 917
},
{
"epoch": 0.986831496909433,
"grad_norm": 13.591650009155273,
"learning_rate": 9.83028314524348e-08,
"loss": 2.809,
"step": 918
},
{
"epoch": 0.9879064767535609,
"grad_norm": 13.300793647766113,
"learning_rate": 8.376302296387861e-08,
"loss": 2.9568,
"step": 919
},
{
"epoch": 0.9889814565976888,
"grad_norm": 16.980257034301758,
"learning_rate": 7.038577650195777e-08,
"loss": 2.0696,
"step": 920
},
{
"epoch": 0.9900564364418167,
"grad_norm": 17.34079360961914,
"learning_rate": 5.8171247715854696e-08,
"loss": 3.1045,
"step": 921
},
{
"epoch": 0.9911314162859446,
"grad_norm": 18.76333999633789,
"learning_rate": 4.711957872606254e-08,
"loss": 2.967,
"step": 922
},
{
"epoch": 0.9922063961300726,
"grad_norm": 15.434213638305664,
"learning_rate": 3.7230898122808665e-08,
"loss": 3.0553,
"step": 923
},
{
"epoch": 0.9932813759742005,
"grad_norm": 20.012191772460938,
"learning_rate": 2.850532096452252e-08,
"loss": 3.6372,
"step": 924
},
{
"epoch": 0.9943563558183284,
"grad_norm": 14.739432334899902,
"learning_rate": 2.0942948776481175e-08,
"loss": 1.9846,
"step": 925
},
{
"epoch": 0.9954313356624563,
"grad_norm": 14.258848190307617,
"learning_rate": 1.4543869549665801e-08,
"loss": 2.9188,
"step": 926
},
{
"epoch": 0.9965063155065843,
"grad_norm": 14.79078483581543,
"learning_rate": 9.308157739706946e-09,
"loss": 2.4791,
"step": 927
},
{
"epoch": 0.9975812953507122,
"grad_norm": 17.466236114501953,
"learning_rate": 5.23587426601857e-09,
"loss": 2.1995,
"step": 928
},
{
"epoch": 0.9986562751948401,
"grad_norm": 19.64586067199707,
"learning_rate": 2.327066511120801e-09,
"loss": 2.872,
"step": 929
},
{
"epoch": 0.999731255038968,
"grad_norm": 12.937053680419922,
"learning_rate": 5.817683200515233e-10,
"loss": 2.6175,
"step": 930
},
{
"epoch": 1.000806234883096,
"grad_norm": 16.165599822998047,
"learning_rate": 0.0,
"loss": 2.2581,
"step": 931
}
],
"logging_steps": 1,
"max_steps": 931,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 233,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7022441851256832.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}