{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2148,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013976240391334731,
"grad_norm": 7.36542272567749,
"learning_rate": 4.5e-05,
"loss": 3.7207,
"step": 10
},
{
"epoch": 0.027952480782669462,
"grad_norm": 2.479189872741699,
"learning_rate": 4.9997813884789515e-05,
"loss": 1.5402,
"step": 20
},
{
"epoch": 0.041928721174004195,
"grad_norm": 1.2499618530273438,
"learning_rate": 4.999025743417125e-05,
"loss": 1.6544,
"step": 30
},
{
"epoch": 0.055904961565338925,
"grad_norm": 1.4645347595214844,
"learning_rate": 4.997730529020747e-05,
"loss": 1.1744,
"step": 40
},
{
"epoch": 0.06988120195667366,
"grad_norm": 1.411839246749878,
"learning_rate": 4.9958960249419854e-05,
"loss": 1.4873,
"step": 50
},
{
"epoch": 0.08385744234800839,
"grad_norm": 1.320661187171936,
"learning_rate": 4.993522627272057e-05,
"loss": 1.5195,
"step": 60
},
{
"epoch": 0.09783368273934312,
"grad_norm": 1.4731823205947876,
"learning_rate": 4.9906108484557024e-05,
"loss": 1.7024,
"step": 70
},
{
"epoch": 0.11180992313067785,
"grad_norm": 1.086996078491211,
"learning_rate": 4.987161317180547e-05,
"loss": 1.4094,
"step": 80
},
{
"epoch": 0.12578616352201258,
"grad_norm": 0.963653564453125,
"learning_rate": 4.983174778241357e-05,
"loss": 1.4754,
"step": 90
},
{
"epoch": 0.13976240391334732,
"grad_norm": 1.3541700839996338,
"learning_rate": 4.978652092379231e-05,
"loss": 1.5034,
"step": 100
},
{
"epoch": 0.15373864430468204,
"grad_norm": 0.8499814867973328,
"learning_rate": 4.9735942360957535e-05,
"loss": 1.1504,
"step": 110
},
{
"epoch": 0.16771488469601678,
"grad_norm": 1.2124848365783691,
"learning_rate": 4.9680023014421605e-05,
"loss": 1.3261,
"step": 120
},
{
"epoch": 0.1816911250873515,
"grad_norm": 1.1937811374664307,
"learning_rate": 4.9618774957835484e-05,
"loss": 1.3726,
"step": 130
},
{
"epoch": 0.19566736547868624,
"grad_norm": 1.1995941400527954,
"learning_rate": 4.9552211415381935e-05,
"loss": 1.2336,
"step": 140
},
{
"epoch": 0.20964360587002095,
"grad_norm": 1.1256550550460815,
"learning_rate": 4.9480346758920217e-05,
"loss": 1.401,
"step": 150
},
{
"epoch": 0.2236198462613557,
"grad_norm": 1.2055258750915527,
"learning_rate": 4.9403196504883086e-05,
"loss": 1.268,
"step": 160
},
{
"epoch": 0.2375960866526904,
"grad_norm": 1.1542116403579712,
"learning_rate": 4.932077731092656e-05,
"loss": 1.2265,
"step": 170
},
{
"epoch": 0.25157232704402516,
"grad_norm": 1.2236621379852295,
"learning_rate": 4.923310697233336e-05,
"loss": 1.3758,
"step": 180
},
{
"epoch": 0.2655485674353599,
"grad_norm": 1.2633250951766968,
"learning_rate": 4.9140204418170705e-05,
"loss": 1.4731,
"step": 190
},
{
"epoch": 0.27952480782669464,
"grad_norm": 1.06014883518219,
"learning_rate": 4.904208970720327e-05,
"loss": 1.3239,
"step": 200
},
{
"epoch": 0.29350104821802936,
"grad_norm": 1.3706936836242676,
"learning_rate": 4.893878402356229e-05,
"loss": 1.1694,
"step": 210
},
{
"epoch": 0.3074772886093641,
"grad_norm": 1.0831369161605835,
"learning_rate": 4.8830309672171646e-05,
"loss": 1.2387,
"step": 220
},
{
"epoch": 0.3214535290006988,
"grad_norm": 1.2866735458374023,
"learning_rate": 4.871669007393197e-05,
"loss": 1.1891,
"step": 230
},
{
"epoch": 0.33542976939203356,
"grad_norm": 1.4383916854858398,
"learning_rate": 4.859794976066377e-05,
"loss": 1.2125,
"step": 240
},
{
"epoch": 0.3494060097833683,
"grad_norm": 1.179437518119812,
"learning_rate": 4.847411436981075e-05,
"loss": 1.3652,
"step": 250
},
{
"epoch": 0.363382250174703,
"grad_norm": 1.160443902015686,
"learning_rate": 4.8345210638904396e-05,
"loss": 1.3129,
"step": 260
},
{
"epoch": 0.37735849056603776,
"grad_norm": 2.0713257789611816,
"learning_rate": 4.821126639979094e-05,
"loss": 1.3108,
"step": 270
},
{
"epoch": 0.3913347309573725,
"grad_norm": 1.2409151792526245,
"learning_rate": 4.807231057262225e-05,
"loss": 1.1604,
"step": 280
},
{
"epoch": 0.4053109713487072,
"grad_norm": 1.328251600265503,
"learning_rate": 4.792837315961154e-05,
"loss": 1.3142,
"step": 290
},
{
"epoch": 0.4192872117400419,
"grad_norm": 1.1564041376113892,
"learning_rate": 4.777948523855557e-05,
"loss": 1.2178,
"step": 300
},
{
"epoch": 0.4332634521313767,
"grad_norm": 1.174281358718872,
"learning_rate": 4.7625678956124584e-05,
"loss": 1.2944,
"step": 310
},
{
"epoch": 0.4472396925227114,
"grad_norm": 1.2078189849853516,
"learning_rate": 4.7466987520921437e-05,
"loss": 1.2028,
"step": 320
},
{
"epoch": 0.4612159329140461,
"grad_norm": 1.1163133382797241,
"learning_rate": 4.730344519631149e-05,
"loss": 1.2439,
"step": 330
},
{
"epoch": 0.4751921733053808,
"grad_norm": 1.1989187002182007,
"learning_rate": 4.713508729302474e-05,
"loss": 1.2537,
"step": 340
},
{
"epoch": 0.4891684136967156,
"grad_norm": 1.1516531705856323,
"learning_rate": 4.6961950161531784e-05,
"loss": 1.2472,
"step": 350
},
{
"epoch": 0.5031446540880503,
"grad_norm": 1.3662034273147583,
"learning_rate": 4.678407118419538e-05,
"loss": 1.3128,
"step": 360
},
{
"epoch": 0.517120894479385,
"grad_norm": 1.1363831758499146,
"learning_rate": 4.6601488767199074e-05,
"loss": 1.2577,
"step": 370
},
{
"epoch": 0.5310971348707197,
"grad_norm": 1.411293864250183,
"learning_rate": 4.641424233225491e-05,
"loss": 1.3468,
"step": 380
},
{
"epoch": 0.5450733752620545,
"grad_norm": 1.1499321460723877,
"learning_rate": 4.6222372308091785e-05,
"loss": 1.2135,
"step": 390
},
{
"epoch": 0.5590496156533893,
"grad_norm": 1.371730089187622,
"learning_rate": 4.602592012172644e-05,
"loss": 1.1668,
"step": 400
},
{
"epoch": 0.573025856044724,
"grad_norm": 1.168100118637085,
"learning_rate": 4.582492818951881e-05,
"loss": 1.2127,
"step": 410
},
{
"epoch": 0.5870020964360587,
"grad_norm": 1.2999486923217773,
"learning_rate": 4.56194399080139e-05,
"loss": 1.2012,
"step": 420
},
{
"epoch": 0.6009783368273934,
"grad_norm": 1.2116057872772217,
"learning_rate": 4.54094996445719e-05,
"loss": 1.2952,
"step": 430
},
{
"epoch": 0.6149545772187281,
"grad_norm": 1.2093985080718994,
"learning_rate": 4.519515272778878e-05,
"loss": 1.3045,
"step": 440
},
{
"epoch": 0.6289308176100629,
"grad_norm": 1.236147403717041,
"learning_rate": 4.4976445437709305e-05,
"loss": 1.1097,
"step": 450
},
{
"epoch": 0.6429070580013976,
"grad_norm": 1.2068607807159424,
"learning_rate": 4.4753424995834596e-05,
"loss": 1.1982,
"step": 460
},
{
"epoch": 0.6568832983927324,
"grad_norm": 1.2430453300476074,
"learning_rate": 4.452613955492649e-05,
"loss": 1.2413,
"step": 470
},
{
"epoch": 0.6708595387840671,
"grad_norm": 1.3011972904205322,
"learning_rate": 4.4294638188610736e-05,
"loss": 1.2005,
"step": 480
},
{
"epoch": 0.6848357791754018,
"grad_norm": 1.0899052619934082,
"learning_rate": 4.405897088078147e-05,
"loss": 1.2271,
"step": 490
},
{
"epoch": 0.6988120195667366,
"grad_norm": 1.1827476024627686,
"learning_rate": 4.3819188514809044e-05,
"loss": 1.242,
"step": 500
},
{
"epoch": 0.7127882599580713,
"grad_norm": 1.1827216148376465,
"learning_rate": 4.3575342862553766e-05,
"loss": 1.0454,
"step": 510
},
{
"epoch": 0.726764500349406,
"grad_norm": 1.2986303567886353,
"learning_rate": 4.332748657318767e-05,
"loss": 1.0946,
"step": 520
},
{
"epoch": 0.7407407407407407,
"grad_norm": 1.2949241399765015,
"learning_rate": 4.307567316182699e-05,
"loss": 1.2538,
"step": 530
},
{
"epoch": 0.7547169811320755,
"grad_norm": 1.256659984588623,
"learning_rate": 4.2819956997977586e-05,
"loss": 1.3298,
"step": 540
},
{
"epoch": 0.7686932215234102,
"grad_norm": 1.4315185546875,
"learning_rate": 4.2560393293795926e-05,
"loss": 1.2817,
"step": 550
},
{
"epoch": 0.782669461914745,
"grad_norm": 1.1938406229019165,
"learning_rate": 4.229703809216812e-05,
"loss": 0.9553,
"step": 560
},
{
"epoch": 0.7966457023060797,
"grad_norm": 1.1752980947494507,
"learning_rate": 4.2029948254609613e-05,
"loss": 1.0532,
"step": 570
},
{
"epoch": 0.8106219426974144,
"grad_norm": 1.4171777963638306,
"learning_rate": 4.17591814489881e-05,
"loss": 1.1731,
"step": 580
},
{
"epoch": 0.8245981830887491,
"grad_norm": 1.2295438051223755,
"learning_rate": 4.1484796137072315e-05,
"loss": 1.2059,
"step": 590
},
{
"epoch": 0.8385744234800838,
"grad_norm": 1.2784441709518433,
"learning_rate": 4.120685156190952e-05,
"loss": 1.3045,
"step": 600
},
{
"epoch": 0.8525506638714185,
"grad_norm": 1.3020128011703491,
"learning_rate": 4.0925407735034136e-05,
"loss": 1.2221,
"step": 610
},
{
"epoch": 0.8665269042627534,
"grad_norm": 1.2527514696121216,
"learning_rate": 4.0640525423510605e-05,
"loss": 1.2331,
"step": 620
},
{
"epoch": 0.8805031446540881,
"grad_norm": 1.2841899394989014,
"learning_rate": 4.035226613681303e-05,
"loss": 1.1167,
"step": 630
},
{
"epoch": 0.8944793850454228,
"grad_norm": 1.254504680633545,
"learning_rate": 4.006069211354457e-05,
"loss": 1.2525,
"step": 640
},
{
"epoch": 0.9084556254367575,
"grad_norm": 1.295681357383728,
"learning_rate": 3.976586630799935e-05,
"loss": 1.1356,
"step": 650
},
{
"epoch": 0.9224318658280922,
"grad_norm": 1.0384072065353394,
"learning_rate": 3.946785237656992e-05,
"loss": 1.1792,
"step": 660
},
{
"epoch": 0.9364081062194269,
"grad_norm": 1.1121629476547241,
"learning_rate": 3.916671466400307e-05,
"loss": 1.209,
"step": 670
},
{
"epoch": 0.9503843466107617,
"grad_norm": 1.432450532913208,
"learning_rate": 3.886251818950702e-05,
"loss": 1.221,
"step": 680
},
{
"epoch": 0.9643605870020965,
"grad_norm": 1.472678542137146,
"learning_rate": 3.855532863271302e-05,
"loss": 1.2537,
"step": 690
},
{
"epoch": 0.9783368273934312,
"grad_norm": 1.285548448562622,
"learning_rate": 3.8245212319494354e-05,
"loss": 1.2792,
"step": 700
},
{
"epoch": 0.9923130677847659,
"grad_norm": 1.2394496202468872,
"learning_rate": 3.793223620764573e-05,
"loss": 1.1743,
"step": 710
},
{
"epoch": 1.0055904961565338,
"grad_norm": 1.683555006980896,
"learning_rate": 3.7616467872426376e-05,
"loss": 1.0054,
"step": 720
},
{
"epoch": 1.0195667365478687,
"grad_norm": 1.3049145936965942,
"learning_rate": 3.7297975491969684e-05,
"loss": 0.9015,
"step": 730
},
{
"epoch": 1.0335429769392033,
"grad_norm": 1.439926028251648,
"learning_rate": 3.697682783256278e-05,
"loss": 0.9361,
"step": 740
},
{
"epoch": 1.047519217330538,
"grad_norm": 1.7054588794708252,
"learning_rate": 3.665309423379904e-05,
"loss": 0.9668,
"step": 750
},
{
"epoch": 1.0614954577218727,
"grad_norm": 1.674256682395935,
"learning_rate": 3.632684459360685e-05,
"loss": 0.9566,
"step": 760
},
{
"epoch": 1.0754716981132075,
"grad_norm": 1.1890208721160889,
"learning_rate": 3.5998149353157815e-05,
"loss": 0.8597,
"step": 770
},
{
"epoch": 1.0894479385045424,
"grad_norm": 1.6892008781433105,
"learning_rate": 3.56670794816577e-05,
"loss": 0.8231,
"step": 780
},
{
"epoch": 1.103424178895877,
"grad_norm": 1.4274325370788574,
"learning_rate": 3.5333706461023275e-05,
"loss": 0.8254,
"step": 790
},
{
"epoch": 1.1174004192872118,
"grad_norm": 1.9769346714019775,
"learning_rate": 3.4998102270448606e-05,
"loss": 0.848,
"step": 800
},
{
"epoch": 1.1313766596785464,
"grad_norm": 1.1409764289855957,
"learning_rate": 3.466033937086381e-05,
"loss": 0.7997,
"step": 810
},
{
"epoch": 1.1453529000698812,
"grad_norm": 1.563937783241272,
"learning_rate": 3.432049068928994e-05,
"loss": 0.824,
"step": 820
},
{
"epoch": 1.159329140461216,
"grad_norm": 1.51837158203125,
"learning_rate": 3.39786296030931e-05,
"loss": 0.9618,
"step": 830
},
{
"epoch": 1.1733053808525507,
"grad_norm": 1.7045314311981201,
"learning_rate": 3.363482992414152e-05,
"loss": 0.7897,
"step": 840
},
{
"epoch": 1.1872816212438855,
"grad_norm": 1.5531268119812012,
"learning_rate": 3.328916588286858e-05,
"loss": 0.8622,
"step": 850
},
{
"epoch": 1.20125786163522,
"grad_norm": 1.0864923000335693,
"learning_rate": 3.2941712112245624e-05,
"loss": 0.8361,
"step": 860
},
{
"epoch": 1.215234102026555,
"grad_norm": 1.7823400497436523,
"learning_rate": 3.259254363166785e-05,
"loss": 0.7441,
"step": 870
},
{
"epoch": 1.2292103424178895,
"grad_norm": 1.6787755489349365,
"learning_rate": 3.2241735830756656e-05,
"loss": 0.8565,
"step": 880
},
{
"epoch": 1.2431865828092243,
"grad_norm": 1.9149354696273804,
"learning_rate": 3.188936445308221e-05,
"loss": 0.9051,
"step": 890
},
{
"epoch": 1.257162823200559,
"grad_norm": 1.828198790550232,
"learning_rate": 3.153550557980943e-05,
"loss": 0.8709,
"step": 900
},
{
"epoch": 1.2711390635918938,
"grad_norm": 1.6430469751358032,
"learning_rate": 3.118023561327123e-05,
"loss": 0.8763,
"step": 910
},
{
"epoch": 1.2851153039832286,
"grad_norm": 1.5574500560760498,
"learning_rate": 3.08236312604723e-05,
"loss": 0.9294,
"step": 920
},
{
"epoch": 1.2990915443745632,
"grad_norm": 1.695155382156372,
"learning_rate": 3.0465769516527163e-05,
"loss": 0.8626,
"step": 930
},
{
"epoch": 1.313067784765898,
"grad_norm": 1.6467136144638062,
"learning_rate": 3.010672764803606e-05,
"loss": 0.8066,
"step": 940
},
{
"epoch": 1.3270440251572326,
"grad_norm": 1.6641095876693726,
"learning_rate": 2.9746583176402083e-05,
"loss": 0.8388,
"step": 950
},
{
"epoch": 1.3410202655485675,
"grad_norm": 1.8467156887054443,
"learning_rate": 2.9385413861093474e-05,
"loss": 0.85,
"step": 960
},
{
"epoch": 1.3549965059399023,
"grad_norm": 1.6968249082565308,
"learning_rate": 2.9023297682854383e-05,
"loss": 0.8725,
"step": 970
},
{
"epoch": 1.368972746331237,
"grad_norm": 1.8281203508377075,
"learning_rate": 2.866031282686791e-05,
"loss": 0.9046,
"step": 980
},
{
"epoch": 1.3829489867225715,
"grad_norm": 1.5624685287475586,
"learning_rate": 2.829653766587499e-05,
"loss": 0.8172,
"step": 990
},
{
"epoch": 1.3969252271139063,
"grad_norm": 1.5298397541046143,
"learning_rate": 2.793205074325282e-05,
"loss": 0.9824,
"step": 1000
},
{
"epoch": 1.4109014675052411,
"grad_norm": 1.857814908027649,
"learning_rate": 2.756693075605634e-05,
"loss": 0.8879,
"step": 1010
},
{
"epoch": 1.4248777078965758,
"grad_norm": 1.6862074136734009,
"learning_rate": 2.7201256538026698e-05,
"loss": 0.8606,
"step": 1020
},
{
"epoch": 1.4388539482879106,
"grad_norm": 1.5606305599212646,
"learning_rate": 2.683510704257003e-05,
"loss": 0.9077,
"step": 1030
},
{
"epoch": 1.4528301886792452,
"grad_norm": 1.7978451251983643,
"learning_rate": 2.6468561325710527e-05,
"loss": 0.9191,
"step": 1040
},
{
"epoch": 1.46680642907058,
"grad_norm": 1.6466025114059448,
"learning_rate": 2.6101698529021267e-05,
"loss": 0.8355,
"step": 1050
},
{
"epoch": 1.4807826694619148,
"grad_norm": 1.6290067434310913,
"learning_rate": 2.5734597862536653e-05,
"loss": 1.0066,
"step": 1060
},
{
"epoch": 1.4947589098532494,
"grad_norm": 1.7666162252426147,
"learning_rate": 2.536733858764998e-05,
"loss": 0.8495,
"step": 1070
},
{
"epoch": 1.508735150244584,
"grad_norm": 1.616740107536316,
"learning_rate": 2.5e-05,
"loss": 0.8858,
"step": 1080
},
{
"epoch": 1.5227113906359189,
"grad_norm": 2.093693733215332,
"learning_rate": 2.4632661412350027e-05,
"loss": 0.8448,
"step": 1090
},
{
"epoch": 1.5366876310272537,
"grad_norm": 2.0865767002105713,
"learning_rate": 2.4265402137463356e-05,
"loss": 0.7736,
"step": 1100
},
{
"epoch": 1.5506638714185885,
"grad_norm": 1.4208672046661377,
"learning_rate": 2.389830147097874e-05,
"loss": 0.8127,
"step": 1110
},
{
"epoch": 1.5646401118099231,
"grad_norm": 1.1758469343185425,
"learning_rate": 2.3531438674289485e-05,
"loss": 0.8384,
"step": 1120
},
{
"epoch": 1.5786163522012577,
"grad_norm": 1.971389651298523,
"learning_rate": 2.316489295742997e-05,
"loss": 0.8234,
"step": 1130
},
{
"epoch": 1.5925925925925926,
"grad_norm": 1.6852281093597412,
"learning_rate": 2.2798743461973308e-05,
"loss": 0.8124,
"step": 1140
},
{
"epoch": 1.6065688329839274,
"grad_norm": 1.8564770221710205,
"learning_rate": 2.2433069243943665e-05,
"loss": 0.9065,
"step": 1150
},
{
"epoch": 1.6205450733752622,
"grad_norm": 1.841373324394226,
"learning_rate": 2.2067949256747186e-05,
"loss": 0.8598,
"step": 1160
},
{
"epoch": 1.6345213137665968,
"grad_norm": 1.6030656099319458,
"learning_rate": 2.1703462334125013e-05,
"loss": 0.7746,
"step": 1170
},
{
"epoch": 1.6484975541579314,
"grad_norm": 1.712646484375,
"learning_rate": 2.1339687173132104e-05,
"loss": 0.8264,
"step": 1180
},
{
"epoch": 1.6624737945492662,
"grad_norm": 2.1567349433898926,
"learning_rate": 2.0976702317145623e-05,
"loss": 0.839,
"step": 1190
},
{
"epoch": 1.676450034940601,
"grad_norm": 2.055509567260742,
"learning_rate": 2.061458613890653e-05,
"loss": 0.7234,
"step": 1200
},
{
"epoch": 1.6904262753319357,
"grad_norm": 1.855790615081787,
"learning_rate": 2.0253416823597926e-05,
"loss": 0.7455,
"step": 1210
},
{
"epoch": 1.7044025157232703,
"grad_norm": 1.8636531829833984,
"learning_rate": 1.9893272351963946e-05,
"loss": 0.7849,
"step": 1220
},
{
"epoch": 1.718378756114605,
"grad_norm": 1.9092494249343872,
"learning_rate": 1.953423048347284e-05,
"loss": 0.7652,
"step": 1230
},
{
"epoch": 1.73235499650594,
"grad_norm": 1.8068718910217285,
"learning_rate": 1.9176368739527706e-05,
"loss": 0.932,
"step": 1240
},
{
"epoch": 1.7463312368972748,
"grad_norm": 2.015453338623047,
"learning_rate": 1.8819764386728773e-05,
"loss": 0.7782,
"step": 1250
},
{
"epoch": 1.7603074772886094,
"grad_norm": 1.833056092262268,
"learning_rate": 1.8464494420190574e-05,
"loss": 0.8057,
"step": 1260
},
{
"epoch": 1.774283717679944,
"grad_norm": 2.147026538848877,
"learning_rate": 1.811063554691779e-05,
"loss": 0.7086,
"step": 1270
},
{
"epoch": 1.7882599580712788,
"grad_norm": 1.8407797813415527,
"learning_rate": 1.775826416924335e-05,
"loss": 0.731,
"step": 1280
},
{
"epoch": 1.8022361984626136,
"grad_norm": 2.3300631046295166,
"learning_rate": 1.740745636833216e-05,
"loss": 0.6734,
"step": 1290
},
{
"epoch": 1.8162124388539485,
"grad_norm": 2.4620823860168457,
"learning_rate": 1.7058287887754375e-05,
"loss": 0.7447,
"step": 1300
},
{
"epoch": 1.830188679245283,
"grad_norm": 1.8684221506118774,
"learning_rate": 1.671083411713143e-05,
"loss": 0.7376,
"step": 1310
},
{
"epoch": 1.8441649196366177,
"grad_norm": 2.022292375564575,
"learning_rate": 1.6365170075858487e-05,
"loss": 0.6937,
"step": 1320
},
{
"epoch": 1.8581411600279525,
"grad_norm": 1.459423542022705,
"learning_rate": 1.60213703969069e-05,
"loss": 0.7536,
"step": 1330
},
{
"epoch": 1.8721174004192873,
"grad_norm": 2.1186535358428955,
"learning_rate": 1.567950931071007e-05,
"loss": 0.7683,
"step": 1340
},
{
"epoch": 1.886093640810622,
"grad_norm": 2.1873745918273926,
"learning_rate": 1.5339660629136194e-05,
"loss": 0.7859,
"step": 1350
},
{
"epoch": 1.9000698812019565,
"grad_norm": 1.9905816316604614,
"learning_rate": 1.5001897729551393e-05,
"loss": 0.7714,
"step": 1360
},
{
"epoch": 1.9140461215932913,
"grad_norm": 1.9993782043457031,
"learning_rate": 1.4666293538976727e-05,
"loss": 0.7884,
"step": 1370
},
{
"epoch": 1.9280223619846262,
"grad_norm": 2.1728382110595703,
"learning_rate": 1.4332920518342316e-05,
"loss": 0.7586,
"step": 1380
},
{
"epoch": 1.941998602375961,
"grad_norm": 1.4806022644042969,
"learning_rate": 1.4001850646842191e-05,
"loss": 0.6538,
"step": 1390
},
{
"epoch": 1.9559748427672956,
"grad_norm": 1.89388906955719,
"learning_rate": 1.367315540639315e-05,
"loss": 0.7698,
"step": 1400
},
{
"epoch": 1.9699510831586302,
"grad_norm": 2.3090474605560303,
"learning_rate": 1.3346905766200962e-05,
"loss": 0.7763,
"step": 1410
},
{
"epoch": 1.983927323549965,
"grad_norm": 1.79945707321167,
"learning_rate": 1.3023172167437213e-05,
"loss": 0.7061,
"step": 1420
},
{
"epoch": 1.9979035639412999,
"grad_norm": 2.0168325901031494,
"learning_rate": 1.270202450803032e-05,
"loss": 0.7628,
"step": 1430
},
{
"epoch": 2.0111809923130677,
"grad_norm": 1.8525409698486328,
"learning_rate": 1.2383532127573638e-05,
"loss": 0.5339,
"step": 1440
},
{
"epoch": 2.0251572327044025,
"grad_norm": 3.1745312213897705,
"learning_rate": 1.2067763792354277e-05,
"loss": 0.5761,
"step": 1450
},
{
"epoch": 2.0391334730957373,
"grad_norm": 2.129549741744995,
"learning_rate": 1.1754787680505657e-05,
"loss": 0.5323,
"step": 1460
},
{
"epoch": 2.053109713487072,
"grad_norm": 1.1728535890579224,
"learning_rate": 1.1444671367286987e-05,
"loss": 0.5582,
"step": 1470
},
{
"epoch": 2.0670859538784065,
"grad_norm": 1.981683611869812,
"learning_rate": 1.1137481810492989e-05,
"loss": 0.5555,
"step": 1480
},
{
"epoch": 2.0810621942697414,
"grad_norm": 2.0317366123199463,
"learning_rate": 1.0833285335996934e-05,
"loss": 0.5624,
"step": 1490
},
{
"epoch": 2.095038434661076,
"grad_norm": 2.0169522762298584,
"learning_rate": 1.0532147623430085e-05,
"loss": 0.4676,
"step": 1500
},
{
"epoch": 2.109014675052411,
"grad_norm": 2.0225350856781006,
"learning_rate": 1.0234133692000652e-05,
"loss": 0.5178,
"step": 1510
},
{
"epoch": 2.1229909154437454,
"grad_norm": 1.7321361303329468,
"learning_rate": 9.939307886455435e-06,
"loss": 0.5373,
"step": 1520
},
{
"epoch": 2.1369671558350802,
"grad_norm": 1.4348721504211426,
"learning_rate": 9.647733863186966e-06,
"loss": 0.4503,
"step": 1530
},
{
"epoch": 2.150943396226415,
"grad_norm": 2.057035446166992,
"learning_rate": 9.359474576489399e-06,
"loss": 0.5713,
"step": 1540
},
{
"epoch": 2.16491963661775,
"grad_norm": 2.8267476558685303,
"learning_rate": 9.074592264965873e-06,
"loss": 0.5322,
"step": 1550
},
{
"epoch": 2.1788958770090847,
"grad_norm": 3.4240221977233887,
"learning_rate": 8.793148438090484e-06,
"loss": 0.535,
"step": 1560
},
{
"epoch": 2.192872117400419,
"grad_norm": 2.6496121883392334,
"learning_rate": 8.515203862927687e-06,
"loss": 0.4921,
"step": 1570
},
{
"epoch": 2.206848357791754,
"grad_norm": 2.2704765796661377,
"learning_rate": 8.240818551011905e-06,
"loss": 0.5066,
"step": 1580
},
{
"epoch": 2.2208245981830887,
"grad_norm": 2.1157729625701904,
"learning_rate": 7.970051745390389e-06,
"loss": 0.4063,
"step": 1590
},
{
"epoch": 2.2348008385744236,
"grad_norm": 0.6612274050712585,
"learning_rate": 7.702961907831882e-06,
"loss": 0.4551,
"step": 1600
},
{
"epoch": 2.2487770789657584,
"grad_norm": 2.5119524002075195,
"learning_rate": 7.439606706204083e-06,
"loss": 0.6081,
"step": 1610
},
{
"epoch": 2.262753319357093,
"grad_norm": 2.2395474910736084,
"learning_rate": 7.180043002022416e-06,
"loss": 0.4833,
"step": 1620
},
{
"epoch": 2.2767295597484276,
"grad_norm": 2.71502947807312,
"learning_rate": 6.9243268381730176e-06,
"loss": 0.5712,
"step": 1630
},
{
"epoch": 2.2907058001397624,
"grad_norm": 2.18860125541687,
"learning_rate": 6.6725134268123404e-06,
"loss": 0.5765,
"step": 1640
},
{
"epoch": 2.3046820405310973,
"grad_norm": 1.808046817779541,
"learning_rate": 6.424657137446241e-06,
"loss": 0.4617,
"step": 1650
},
{
"epoch": 2.318658280922432,
"grad_norm": 2.0803866386413574,
"learning_rate": 6.18081148519096e-06,
"loss": 0.5395,
"step": 1660
},
{
"epoch": 2.3326345213137665,
"grad_norm": 2.06593656539917,
"learning_rate": 5.941029119218536e-06,
"loss": 0.5479,
"step": 1670
},
{
"epoch": 2.3466107617051013,
"grad_norm": 0.7248632311820984,
"learning_rate": 5.705361811389262e-06,
"loss": 0.5115,
"step": 1680
},
{
"epoch": 2.360587002096436,
"grad_norm": 1.9568665027618408,
"learning_rate": 5.473860445073515e-06,
"loss": 0.4513,
"step": 1690
},
{
"epoch": 2.374563242487771,
"grad_norm": 1.4630240201950073,
"learning_rate": 5.246575004165408e-06,
"loss": 0.4364,
"step": 1700
},
{
"epoch": 2.3885394828791053,
"grad_norm": 1.8359235525131226,
"learning_rate": 5.0235545622907e-06,
"loss": 0.4796,
"step": 1710
},
{
"epoch": 2.40251572327044,
"grad_norm": 2.409799575805664,
"learning_rate": 4.804847272211227e-06,
"loss": 0.4301,
"step": 1720
},
{
"epoch": 2.416491963661775,
"grad_norm": 2.9206278324127197,
"learning_rate": 4.5905003554281125e-06,
"loss": 0.4912,
"step": 1730
},
{
"epoch": 2.43046820405311,
"grad_norm": 2.4633424282073975,
"learning_rate": 4.3805600919861075e-06,
"loss": 0.6244,
"step": 1740
},
{
"epoch": 2.4444444444444446,
"grad_norm": 1.9702569246292114,
"learning_rate": 4.175071810481193e-06,
"loss": 0.5056,
"step": 1750
},
{
"epoch": 2.458420684835779,
"grad_norm": 2.079681396484375,
"learning_rate": 3.974079878273562e-06,
"loss": 0.4396,
"step": 1760
},
{
"epoch": 2.472396925227114,
"grad_norm": 2.3693020343780518,
"learning_rate": 3.777627691908209e-06,
"loss": 0.4821,
"step": 1770
},
{
"epoch": 2.4863731656184487,
"grad_norm": 2.4170966148376465,
"learning_rate": 3.5857576677450933e-06,
"loss": 0.5048,
"step": 1780
},
{
"epoch": 2.5003494060097835,
"grad_norm": 2.2658681869506836,
"learning_rate": 3.398511232800927e-06,
"loss": 0.5569,
"step": 1790
},
{
"epoch": 2.514325646401118,
"grad_norm": 1.8655173778533936,
"learning_rate": 3.2159288158046224e-06,
"loss": 0.516,
"step": 1800
},
{
"epoch": 2.5283018867924527,
"grad_norm": 2.5998153686523438,
"learning_rate": 3.0380498384682154e-06,
"loss": 0.4745,
"step": 1810
},
{
"epoch": 2.5422781271837875,
"grad_norm": 2.643659830093384,
"learning_rate": 2.8649127069752644e-06,
"loss": 0.5829,
"step": 1820
},
{
"epoch": 2.5562543675751224,
"grad_norm": 2.506516933441162,
"learning_rate": 2.696554803688517e-06,
"loss": 0.5319,
"step": 1830
},
{
"epoch": 2.570230607966457,
"grad_norm": 2.2960050106048584,
"learning_rate": 2.533012479078575e-06,
"loss": 0.4738,
"step": 1840
},
{
"epoch": 2.584206848357792,
"grad_norm": 3.9678826332092285,
"learning_rate": 2.3743210438754243e-06,
"loss": 0.4935,
"step": 1850
},
{
"epoch": 2.5981830887491264,
"grad_norm": 2.771167278289795,
"learning_rate": 2.2205147614444312e-06,
"loss": 0.5127,
"step": 1860
},
{
"epoch": 2.6121593291404612,
"grad_norm": 3.70699143409729,
"learning_rate": 2.071626840388463e-06,
"loss": 0.4047,
"step": 1870
},
{
"epoch": 2.626135569531796,
"grad_norm": 2.3208189010620117,
"learning_rate": 1.9276894273777518e-06,
"loss": 0.5239,
"step": 1880
},
{
"epoch": 2.6401118099231304,
"grad_norm": 2.6511785984039307,
"learning_rate": 1.7887336002090639e-06,
"loss": 0.4359,
"step": 1890
},
{
"epoch": 2.6540880503144653,
"grad_norm": 1.7029021978378296,
"learning_rate": 1.6547893610956127e-06,
"loss": 0.4334,
"step": 1900
},
{
"epoch": 2.6680642907058,
"grad_norm": 2.3018014430999756,
"learning_rate": 1.5258856301892471e-06,
"loss": 0.5468,
"step": 1910
},
{
"epoch": 2.682040531097135,
"grad_norm": 2.5188214778900146,
"learning_rate": 1.4020502393362362e-06,
"loss": 0.5802,
"step": 1920
},
{
"epoch": 2.6960167714884697,
"grad_norm": 2.3051352500915527,
"learning_rate": 1.2833099260680382e-06,
"loss": 0.5138,
"step": 1930
},
{
"epoch": 2.7099930118798046,
"grad_norm": 2.7857372760772705,
"learning_rate": 1.1696903278283543e-06,
"loss": 0.5259,
"step": 1940
},
{
"epoch": 2.723969252271139,
"grad_norm": 2.314476251602173,
"learning_rate": 1.06121597643771e-06,
"loss": 0.5131,
"step": 1950
},
{
"epoch": 2.737945492662474,
"grad_norm": 2.091313123703003,
"learning_rate": 9.579102927967349e-07,
"loss": 0.4142,
"step": 1960
},
{
"epoch": 2.7519217330538086,
"grad_norm": 2.41235089302063,
"learning_rate": 8.59795581829298e-07,
"loss": 0.4547,
"step": 1970
},
{
"epoch": 2.765897973445143,
"grad_norm": 2.577587842941284,
"learning_rate": 7.668930276666403e-07,
"loss": 0.5207,
"step": 1980
},
{
"epoch": 2.779874213836478,
"grad_norm": 2.426898241043091,
"learning_rate": 6.792226890734444e-07,
"loss": 0.4958,
"step": 1990
},
{
"epoch": 2.7938504542278126,
"grad_norm": 1.4604806900024414,
"learning_rate": 5.968034951169155e-07,
"loss": 0.4254,
"step": 2000
},
{
"epoch": 2.8078266946191475,
"grad_norm": 3.135892152786255,
"learning_rate": 5.196532410797844e-07,
"loss": 0.4954,
"step": 2010
},
{
"epoch": 2.8218029350104823,
"grad_norm": 1.6824243068695068,
"learning_rate": 4.477885846180724e-07,
"loss": 0.585,
"step": 2020
},
{
"epoch": 2.835779175401817,
"grad_norm": 2.4747464656829834,
"learning_rate": 3.8122504216451804e-07,
"loss": 0.5324,
"step": 2030
},
{
"epoch": 2.8497554157931515,
"grad_norm": 2.6158955097198486,
"learning_rate": 3.1997698557839905e-07,
"loss": 0.5783,
"step": 2040
},
{
"epoch": 2.8637316561844863,
"grad_norm": 1.8758794069290161,
"learning_rate": 2.6405763904246706e-07,
"loss": 0.5657,
"step": 2050
},
{
"epoch": 2.877707896575821,
"grad_norm": 2.1772301197052,
"learning_rate": 2.134790762076927e-07,
"loss": 0.4668,
"step": 2060
},
{
"epoch": 2.891684136967156,
"grad_norm": 2.552476644515991,
"learning_rate": 1.682522175864315e-07,
"loss": 0.5831,
"step": 2070
},
{
"epoch": 2.9056603773584904,
"grad_norm": 2.3809163570404053,
"learning_rate": 1.283868281945322e-07,
"loss": 0.4918,
"step": 2080
},
{
"epoch": 2.919636617749825,
"grad_norm": 2.3678464889526367,
"learning_rate": 9.389151544298147e-08,
"loss": 0.4573,
"step": 2090
},
{
"epoch": 2.93361285814116,
"grad_norm": 2.0146450996398926,
"learning_rate": 6.477372727943798e-08,
"loss": 0.5716,
"step": 2100
},
{
"epoch": 2.947589098532495,
"grad_norm": 1.9456125497817993,
"learning_rate": 4.103975058015186e-08,
"loss": 0.5161,
"step": 2110
},
{
"epoch": 2.9615653389238297,
"grad_norm": 1.9012235403060913,
"learning_rate": 2.269470979253674e-08,
"loss": 0.3703,
"step": 2120
},
{
"epoch": 2.9755415793151645,
"grad_norm": 2.5397086143493652,
"learning_rate": 9.742565828751993e-09,
"loss": 0.4771,
"step": 2130
},
{
"epoch": 2.989517819706499,
"grad_norm": 2.45979380607605,
"learning_rate": 2.1861152104868387e-09,
"loss": 0.3966,
"step": 2140
}
],
"logging_steps": 10,
"max_steps": 2148,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1937068893945856e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}