ObsDrive / SFT /trainer_state.json
russellyq's picture
Upload folder using huggingface_hub
58e5dbb verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2361,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012706480304955527,
"grad_norm": 29.66147254458293,
"learning_rate": 5.070422535211268e-07,
"loss": 2.2184,
"step": 10
},
{
"epoch": 0.025412960609911054,
"grad_norm": 13.392595629867637,
"learning_rate": 1.0704225352112677e-06,
"loss": 1.8971,
"step": 20
},
{
"epoch": 0.03811944091486658,
"grad_norm": 11.776335522019831,
"learning_rate": 1.6338028169014086e-06,
"loss": 1.4505,
"step": 30
},
{
"epoch": 0.05082592121982211,
"grad_norm": 9.270920607469225,
"learning_rate": 2.19718309859155e-06,
"loss": 1.2166,
"step": 40
},
{
"epoch": 0.06353240152477764,
"grad_norm": 8.813337405138961,
"learning_rate": 2.7605633802816906e-06,
"loss": 1.1232,
"step": 50
},
{
"epoch": 0.07623888182973317,
"grad_norm": 8.759370998993289,
"learning_rate": 3.3239436619718313e-06,
"loss": 1.1167,
"step": 60
},
{
"epoch": 0.08894536213468869,
"grad_norm": 9.774431896253528,
"learning_rate": 3.887323943661972e-06,
"loss": 1.0426,
"step": 70
},
{
"epoch": 0.10165184243964422,
"grad_norm": 6.934105141250302,
"learning_rate": 4.450704225352113e-06,
"loss": 0.9485,
"step": 80
},
{
"epoch": 0.11435832274459974,
"grad_norm": 9.541786088857878,
"learning_rate": 5.014084507042254e-06,
"loss": 0.971,
"step": 90
},
{
"epoch": 0.12706480304955528,
"grad_norm": 8.771108246775684,
"learning_rate": 5.577464788732395e-06,
"loss": 1.0133,
"step": 100
},
{
"epoch": 0.1397712833545108,
"grad_norm": 8.314820650161352,
"learning_rate": 6.1408450704225356e-06,
"loss": 0.9729,
"step": 110
},
{
"epoch": 0.15247776365946633,
"grad_norm": 8.539055316805458,
"learning_rate": 6.704225352112676e-06,
"loss": 0.9679,
"step": 120
},
{
"epoch": 0.16518424396442186,
"grad_norm": 7.751785436062567,
"learning_rate": 7.267605633802818e-06,
"loss": 0.9634,
"step": 130
},
{
"epoch": 0.17789072426937738,
"grad_norm": 8.338219679133466,
"learning_rate": 7.830985915492958e-06,
"loss": 0.9674,
"step": 140
},
{
"epoch": 0.1905972045743329,
"grad_norm": 6.970312776486159,
"learning_rate": 8.3943661971831e-06,
"loss": 0.9323,
"step": 150
},
{
"epoch": 0.20330368487928843,
"grad_norm": 8.934334887580613,
"learning_rate": 8.957746478873241e-06,
"loss": 0.9296,
"step": 160
},
{
"epoch": 0.21601016518424396,
"grad_norm": 8.855402068732014,
"learning_rate": 9.521126760563381e-06,
"loss": 0.8979,
"step": 170
},
{
"epoch": 0.22871664548919948,
"grad_norm": 7.627448152527667,
"learning_rate": 1.0084507042253523e-05,
"loss": 0.9398,
"step": 180
},
{
"epoch": 0.241423125794155,
"grad_norm": 8.293311252648705,
"learning_rate": 1.0647887323943662e-05,
"loss": 0.9903,
"step": 190
},
{
"epoch": 0.25412960609911056,
"grad_norm": 7.860820039563278,
"learning_rate": 1.1211267605633804e-05,
"loss": 0.953,
"step": 200
},
{
"epoch": 0.2668360864040661,
"grad_norm": 12.101780056709387,
"learning_rate": 1.1774647887323944e-05,
"loss": 0.9014,
"step": 210
},
{
"epoch": 0.2795425667090216,
"grad_norm": 8.180635063012854,
"learning_rate": 1.2338028169014084e-05,
"loss": 0.9612,
"step": 220
},
{
"epoch": 0.29224904701397714,
"grad_norm": 7.043614458189797,
"learning_rate": 1.2901408450704227e-05,
"loss": 0.9492,
"step": 230
},
{
"epoch": 0.30495552731893266,
"grad_norm": 6.22917625029958,
"learning_rate": 1.3464788732394367e-05,
"loss": 0.9073,
"step": 240
},
{
"epoch": 0.3176620076238882,
"grad_norm": 6.158034885075652,
"learning_rate": 1.4028169014084507e-05,
"loss": 0.899,
"step": 250
},
{
"epoch": 0.3303684879288437,
"grad_norm": 6.511044998547078,
"learning_rate": 1.459154929577465e-05,
"loss": 0.9147,
"step": 260
},
{
"epoch": 0.34307496823379924,
"grad_norm": 7.2764171631878405,
"learning_rate": 1.515492957746479e-05,
"loss": 0.9637,
"step": 270
},
{
"epoch": 0.35578144853875476,
"grad_norm": 7.6464115804873405,
"learning_rate": 1.571830985915493e-05,
"loss": 0.9521,
"step": 280
},
{
"epoch": 0.3684879288437103,
"grad_norm": 7.027026272110715,
"learning_rate": 1.6281690140845072e-05,
"loss": 0.9738,
"step": 290
},
{
"epoch": 0.3811944091486658,
"grad_norm": 7.0177338278091765,
"learning_rate": 1.6845070422535213e-05,
"loss": 1.0046,
"step": 300
},
{
"epoch": 0.39390088945362134,
"grad_norm": 6.988788493899054,
"learning_rate": 1.740845070422535e-05,
"loss": 0.9481,
"step": 310
},
{
"epoch": 0.40660736975857686,
"grad_norm": 6.592900845765771,
"learning_rate": 1.7971830985915497e-05,
"loss": 0.9588,
"step": 320
},
{
"epoch": 0.4193138500635324,
"grad_norm": 5.836583770100009,
"learning_rate": 1.8535211267605635e-05,
"loss": 0.9404,
"step": 330
},
{
"epoch": 0.4320203303684879,
"grad_norm": 6.69521882642329,
"learning_rate": 1.9098591549295776e-05,
"loss": 0.9385,
"step": 340
},
{
"epoch": 0.44472681067344344,
"grad_norm": 5.945281536047344,
"learning_rate": 1.9661971830985918e-05,
"loss": 0.9394,
"step": 350
},
{
"epoch": 0.45743329097839897,
"grad_norm": 5.686864112756425,
"learning_rate": 1.9999803787597817e-05,
"loss": 0.9764,
"step": 360
},
{
"epoch": 0.4701397712833545,
"grad_norm": 5.459918154381771,
"learning_rate": 1.9997596486500402e-05,
"loss": 0.9827,
"step": 370
},
{
"epoch": 0.48284625158831,
"grad_norm": 5.164520519383337,
"learning_rate": 1.999293716197302e-05,
"loss": 0.9291,
"step": 380
},
{
"epoch": 0.49555273189326554,
"grad_norm": 6.177104322189275,
"learning_rate": 1.998582695676762e-05,
"loss": 1.0364,
"step": 390
},
{
"epoch": 0.5082592121982211,
"grad_norm": 5.346131678285112,
"learning_rate": 1.997626761474232e-05,
"loss": 0.9728,
"step": 400
},
{
"epoch": 0.5209656925031766,
"grad_norm": 5.075630186929998,
"learning_rate": 1.99642614804337e-05,
"loss": 0.957,
"step": 410
},
{
"epoch": 0.5336721728081322,
"grad_norm": 5.766342613636245,
"learning_rate": 1.9949811498481763e-05,
"loss": 0.9856,
"step": 420
},
{
"epoch": 0.5463786531130876,
"grad_norm": 4.773734435548506,
"learning_rate": 1.9932921212907753e-05,
"loss": 1.0065,
"step": 430
},
{
"epoch": 0.5590851334180432,
"grad_norm": 4.8306675684219105,
"learning_rate": 1.991359476624493e-05,
"loss": 0.9175,
"step": 440
},
{
"epoch": 0.5717916137229987,
"grad_norm": 4.8928892220094236,
"learning_rate": 1.9891836898522566e-05,
"loss": 0.9014,
"step": 450
},
{
"epoch": 0.5844980940279543,
"grad_norm": 4.466687752161082,
"learning_rate": 1.9867652946103413e-05,
"loss": 0.9324,
"step": 460
},
{
"epoch": 0.5972045743329097,
"grad_norm": 4.706203130518702,
"learning_rate": 1.9841048840374885e-05,
"loss": 0.9311,
"step": 470
},
{
"epoch": 0.6099110546378653,
"grad_norm": 4.776570809357277,
"learning_rate": 1.9812031106294314e-05,
"loss": 0.952,
"step": 480
},
{
"epoch": 0.6226175349428208,
"grad_norm": 5.443465949868151,
"learning_rate": 1.978060686078866e-05,
"loss": 0.9067,
"step": 490
},
{
"epoch": 0.6353240152477764,
"grad_norm": 4.985579208130221,
"learning_rate": 1.974678381100896e-05,
"loss": 0.9559,
"step": 500
},
{
"epoch": 0.6480304955527318,
"grad_norm": 4.406803792781764,
"learning_rate": 1.9710570252440106e-05,
"loss": 0.9082,
"step": 510
},
{
"epoch": 0.6607369758576874,
"grad_norm": 4.620905177890474,
"learning_rate": 1.9671975066866254e-05,
"loss": 0.9241,
"step": 520
},
{
"epoch": 0.6734434561626429,
"grad_norm": 3.9984494959651533,
"learning_rate": 1.9631007720192475e-05,
"loss": 0.8811,
"step": 530
},
{
"epoch": 0.6861499364675985,
"grad_norm": 4.760193821922472,
"learning_rate": 1.9587678260123146e-05,
"loss": 0.9314,
"step": 540
},
{
"epoch": 0.6988564167725541,
"grad_norm": 4.4328809578626895,
"learning_rate": 1.9541997313697614e-05,
"loss": 0.9018,
"step": 550
},
{
"epoch": 0.7115628970775095,
"grad_norm": 3.785123304702001,
"learning_rate": 1.9493976084683814e-05,
"loss": 0.9349,
"step": 560
},
{
"epoch": 0.7242693773824651,
"grad_norm": 4.623522007776074,
"learning_rate": 1.9443626350830417e-05,
"loss": 0.9283,
"step": 570
},
{
"epoch": 0.7369758576874206,
"grad_norm": 4.367382745999128,
"learning_rate": 1.9390960460978188e-05,
"loss": 0.8936,
"step": 580
},
{
"epoch": 0.7496823379923762,
"grad_norm": 4.640745350515662,
"learning_rate": 1.933599133203131e-05,
"loss": 0.9529,
"step": 590
},
{
"epoch": 0.7623888182973316,
"grad_norm": 4.0946471225054974,
"learning_rate": 1.9278732445789364e-05,
"loss": 0.8961,
"step": 600
},
{
"epoch": 0.7750952986022872,
"grad_norm": 4.250614479191838,
"learning_rate": 1.9219197845640766e-05,
"loss": 0.9028,
"step": 610
},
{
"epoch": 0.7878017789072427,
"grad_norm": 4.147828875270731,
"learning_rate": 1.9157402133118454e-05,
"loss": 0.9302,
"step": 620
},
{
"epoch": 0.8005082592121983,
"grad_norm": 5.568981020266887,
"learning_rate": 1.909336046431871e-05,
"loss": 0.9233,
"step": 630
},
{
"epoch": 0.8132147395171537,
"grad_norm": 4.311517125711432,
"learning_rate": 1.9027088546183968e-05,
"loss": 0.9694,
"step": 640
},
{
"epoch": 0.8259212198221093,
"grad_norm": 5.556351823725932,
"learning_rate": 1.8958602632650474e-05,
"loss": 0.9003,
"step": 650
},
{
"epoch": 0.8386277001270648,
"grad_norm": 4.269905367926679,
"learning_rate": 1.8887919520661867e-05,
"loss": 0.8805,
"step": 660
},
{
"epoch": 0.8513341804320204,
"grad_norm": 3.863721835826297,
"learning_rate": 1.8815056546049505e-05,
"loss": 0.9158,
"step": 670
},
{
"epoch": 0.8640406607369758,
"grad_norm": 3.963824200874715,
"learning_rate": 1.8740031579280667e-05,
"loss": 0.8835,
"step": 680
},
{
"epoch": 0.8767471410419314,
"grad_norm": 3.680960497113959,
"learning_rate": 1.8662863021075632e-05,
"loss": 0.898,
"step": 690
},
{
"epoch": 0.8894536213468869,
"grad_norm": 3.7414803428899606,
"learning_rate": 1.8583569797894673e-05,
"loss": 0.9253,
"step": 700
},
{
"epoch": 0.9021601016518425,
"grad_norm": 4.680988801232008,
"learning_rate": 1.8502171357296144e-05,
"loss": 0.848,
"step": 710
},
{
"epoch": 0.9148665819567979,
"grad_norm": 3.9671267724005785,
"learning_rate": 1.8418687663166745e-05,
"loss": 0.8965,
"step": 720
},
{
"epoch": 0.9275730622617535,
"grad_norm": 4.137039499686447,
"learning_rate": 1.833313919082515e-05,
"loss": 0.8553,
"step": 730
},
{
"epoch": 0.940279542566709,
"grad_norm": 4.784766455706121,
"learning_rate": 1.8245546922000207e-05,
"loss": 0.8695,
"step": 740
},
{
"epoch": 0.9529860228716646,
"grad_norm": 4.418195979726905,
"learning_rate": 1.815593233968492e-05,
"loss": 0.8497,
"step": 750
},
{
"epoch": 0.96569250317662,
"grad_norm": 4.103893841492413,
"learning_rate": 1.806431742286752e-05,
"loss": 0.8746,
"step": 760
},
{
"epoch": 0.9783989834815756,
"grad_norm": 3.798164417492566,
"learning_rate": 1.7970724641140864e-05,
"loss": 0.8708,
"step": 770
},
{
"epoch": 0.9911054637865311,
"grad_norm": 4.623760315878684,
"learning_rate": 1.7875176949191506e-05,
"loss": 0.94,
"step": 780
},
{
"epoch": 1.0038119440914866,
"grad_norm": 3.381126634985229,
"learning_rate": 1.7777697781169813e-05,
"loss": 0.8297,
"step": 790
},
{
"epoch": 1.0165184243964422,
"grad_norm": 4.072031882597377,
"learning_rate": 1.7678311044942464e-05,
"loss": 0.6761,
"step": 800
},
{
"epoch": 1.0292249047013977,
"grad_norm": 4.666218927514245,
"learning_rate": 1.757704111622878e-05,
"loss": 0.6868,
"step": 810
},
{
"epoch": 1.0419313850063532,
"grad_norm": 3.010389554548932,
"learning_rate": 1.747391283262231e-05,
"loss": 0.6994,
"step": 820
},
{
"epoch": 1.0546378653113089,
"grad_norm": 4.690411895539488,
"learning_rate": 1.736895148749911e-05,
"loss": 0.7141,
"step": 830
},
{
"epoch": 1.0673443456162643,
"grad_norm": 4.135758513727204,
"learning_rate": 1.7262182823814297e-05,
"loss": 0.6941,
"step": 840
},
{
"epoch": 1.0800508259212198,
"grad_norm": 4.378977675253243,
"learning_rate": 1.7153633027788252e-05,
"loss": 0.6662,
"step": 850
},
{
"epoch": 1.0927573062261753,
"grad_norm": 3.8569291056754498,
"learning_rate": 1.704332872248418e-05,
"loss": 0.6575,
"step": 860
},
{
"epoch": 1.105463786531131,
"grad_norm": 3.9650953005920666,
"learning_rate": 1.69312969612785e-05,
"loss": 0.6959,
"step": 870
},
{
"epoch": 1.1181702668360864,
"grad_norm": 3.349386344864765,
"learning_rate": 1.6817565221225698e-05,
"loss": 0.6701,
"step": 880
},
{
"epoch": 1.130876747141042,
"grad_norm": 4.530446985368436,
"learning_rate": 1.6702161396319266e-05,
"loss": 0.7168,
"step": 890
},
{
"epoch": 1.1435832274459974,
"grad_norm": 4.048659358174538,
"learning_rate": 1.658511379065039e-05,
"loss": 0.7087,
"step": 900
},
{
"epoch": 1.156289707750953,
"grad_norm": 3.897340539186477,
"learning_rate": 1.6466451111466044e-05,
"loss": 0.7509,
"step": 910
},
{
"epoch": 1.1689961880559085,
"grad_norm": 3.106349799248209,
"learning_rate": 1.6346202462128228e-05,
"loss": 0.6793,
"step": 920
},
{
"epoch": 1.181702668360864,
"grad_norm": 3.7338218401998753,
"learning_rate": 1.6224397334976023e-05,
"loss": 0.7172,
"step": 930
},
{
"epoch": 1.1944091486658195,
"grad_norm": 5.11718627522725,
"learning_rate": 1.610106560409227e-05,
"loss": 0.6759,
"step": 940
},
{
"epoch": 1.2071156289707752,
"grad_norm": 3.6889308944466177,
"learning_rate": 1.597623751797662e-05,
"loss": 0.6822,
"step": 950
},
{
"epoch": 1.2198221092757306,
"grad_norm": 3.6223318506400135,
"learning_rate": 1.584994369212673e-05,
"loss": 0.7034,
"step": 960
},
{
"epoch": 1.2325285895806861,
"grad_norm": 3.3333910693718662,
"learning_rate": 1.572221510152949e-05,
"loss": 0.767,
"step": 970
},
{
"epoch": 1.2452350698856416,
"grad_norm": 4.265447578007238,
"learning_rate": 1.5593083073064037e-05,
"loss": 0.7358,
"step": 980
},
{
"epoch": 1.2579415501905973,
"grad_norm": 3.874622904654225,
"learning_rate": 1.5462579277818498e-05,
"loss": 0.7336,
"step": 990
},
{
"epoch": 1.2706480304955527,
"grad_norm": 3.925758808832438,
"learning_rate": 1.5330735723322282e-05,
"loss": 0.7102,
"step": 1000
},
{
"epoch": 1.2833545108005082,
"grad_norm": 4.212874894353556,
"learning_rate": 1.5197584745695904e-05,
"loss": 0.7053,
"step": 1010
},
{
"epoch": 1.2960609911054637,
"grad_norm": 3.7288496569236154,
"learning_rate": 1.506315900172014e-05,
"loss": 0.7223,
"step": 1020
},
{
"epoch": 1.3087674714104194,
"grad_norm": 3.79413472563588,
"learning_rate": 1.4927491460826626e-05,
"loss": 0.7185,
"step": 1030
},
{
"epoch": 1.3214739517153749,
"grad_norm": 4.197391869723048,
"learning_rate": 1.4790615397011703e-05,
"loss": 0.6293,
"step": 1040
},
{
"epoch": 1.3341804320203303,
"grad_norm": 3.3274802014296254,
"learning_rate": 1.4652564380675616e-05,
"loss": 0.7111,
"step": 1050
},
{
"epoch": 1.346886912325286,
"grad_norm": 3.984633199779957,
"learning_rate": 1.4513372270388967e-05,
"loss": 0.6926,
"step": 1060
},
{
"epoch": 1.3595933926302415,
"grad_norm": 4.32141196403412,
"learning_rate": 1.4373073204588556e-05,
"loss": 0.7126,
"step": 1070
},
{
"epoch": 1.372299872935197,
"grad_norm": 3.7790442182857302,
"learning_rate": 1.42317015932045e-05,
"loss": 0.6873,
"step": 1080
},
{
"epoch": 1.3850063532401524,
"grad_norm": 4.2661658978513355,
"learning_rate": 1.4089292109220852e-05,
"loss": 0.7642,
"step": 1090
},
{
"epoch": 1.397712833545108,
"grad_norm": 4.2591149854567645,
"learning_rate": 1.394587968017162e-05,
"loss": 0.6799,
"step": 1100
},
{
"epoch": 1.4104193138500636,
"grad_norm": 3.689601844022756,
"learning_rate": 1.3801499479574431e-05,
"loss": 0.6536,
"step": 1110
},
{
"epoch": 1.423125794155019,
"grad_norm": 4.289242494025662,
"learning_rate": 1.3656186918303804e-05,
"loss": 0.7092,
"step": 1120
},
{
"epoch": 1.4358322744599745,
"grad_norm": 3.891766076099888,
"learning_rate": 1.3509977635906241e-05,
"loss": 0.6536,
"step": 1130
},
{
"epoch": 1.4485387547649302,
"grad_norm": 3.4313665664745465,
"learning_rate": 1.3362907491859227e-05,
"loss": 0.6474,
"step": 1140
},
{
"epoch": 1.4612452350698857,
"grad_norm": 4.303628344639665,
"learning_rate": 1.3215012556776287e-05,
"loss": 0.715,
"step": 1150
},
{
"epoch": 1.4739517153748412,
"grad_norm": 4.009317272354951,
"learning_rate": 1.3066329103560267e-05,
"loss": 0.715,
"step": 1160
},
{
"epoch": 1.4866581956797966,
"grad_norm": 3.171330560062687,
"learning_rate": 1.2916893598506981e-05,
"loss": 0.6217,
"step": 1170
},
{
"epoch": 1.499364675984752,
"grad_norm": 3.3926952435565676,
"learning_rate": 1.276674269236145e-05,
"loss": 0.7366,
"step": 1180
},
{
"epoch": 1.5120711562897078,
"grad_norm": 3.8316403134343537,
"learning_rate": 1.2615913211328894e-05,
"loss": 0.6939,
"step": 1190
},
{
"epoch": 1.5247776365946633,
"grad_norm": 4.868361745818093,
"learning_rate": 1.2464442148042679e-05,
"loss": 0.6919,
"step": 1200
},
{
"epoch": 1.537484116899619,
"grad_norm": 3.5185484888328644,
"learning_rate": 1.2312366652491476e-05,
"loss": 0.6791,
"step": 1210
},
{
"epoch": 1.5501905972045744,
"grad_norm": 3.543401291583064,
"learning_rate": 1.2159724022907786e-05,
"loss": 0.6574,
"step": 1220
},
{
"epoch": 1.5628970775095299,
"grad_norm": 3.6437779582291063,
"learning_rate": 1.2006551696620135e-05,
"loss": 0.701,
"step": 1230
},
{
"epoch": 1.5756035578144854,
"grad_norm": 3.2559101294982025,
"learning_rate": 1.1852887240871145e-05,
"loss": 0.6546,
"step": 1240
},
{
"epoch": 1.5883100381194408,
"grad_norm": 3.9272330209126634,
"learning_rate": 1.1698768343603753e-05,
"loss": 0.6643,
"step": 1250
},
{
"epoch": 1.6010165184243963,
"grad_norm": 4.624643945291569,
"learning_rate": 1.1544232804217805e-05,
"loss": 0.6982,
"step": 1260
},
{
"epoch": 1.613722998729352,
"grad_norm": 3.7368581014964803,
"learning_rate": 1.1389318524299332e-05,
"loss": 0.6591,
"step": 1270
},
{
"epoch": 1.6264294790343075,
"grad_norm": 3.4323757873137177,
"learning_rate": 1.1234063498324764e-05,
"loss": 0.6743,
"step": 1280
},
{
"epoch": 1.6391359593392631,
"grad_norm": 4.208550713330492,
"learning_rate": 1.1078505804342327e-05,
"loss": 0.7147,
"step": 1290
},
{
"epoch": 1.6518424396442186,
"grad_norm": 2.978768874310465,
"learning_rate": 1.092268359463302e-05,
"loss": 0.671,
"step": 1300
},
{
"epoch": 1.664548919949174,
"grad_norm": 3.5924777944521606,
"learning_rate": 1.0766635086353298e-05,
"loss": 0.6713,
"step": 1310
},
{
"epoch": 1.6772554002541296,
"grad_norm": 3.495623048824376,
"learning_rate": 1.06103985521619e-05,
"loss": 0.6629,
"step": 1320
},
{
"epoch": 1.689961880559085,
"grad_norm": 4.086638389260075,
"learning_rate": 1.0454012310833034e-05,
"loss": 0.7035,
"step": 1330
},
{
"epoch": 1.7026683608640405,
"grad_norm": 3.475772078501932,
"learning_rate": 1.0297514717858286e-05,
"loss": 0.6631,
"step": 1340
},
{
"epoch": 1.7153748411689962,
"grad_norm": 3.5510342885210164,
"learning_rate": 1.0140944156039481e-05,
"loss": 0.685,
"step": 1350
},
{
"epoch": 1.7280813214739519,
"grad_norm": 3.5594852661382634,
"learning_rate": 9.984339026074881e-06,
"loss": 0.6549,
"step": 1360
},
{
"epoch": 1.7407878017789074,
"grad_norm": 3.3395635194008415,
"learning_rate": 9.827737737140983e-06,
"loss": 0.6467,
"step": 1370
},
{
"epoch": 1.7534942820838628,
"grad_norm": 3.219821540782638,
"learning_rate": 9.671178697472217e-06,
"loss": 0.6543,
"step": 1380
},
{
"epoch": 1.7662007623888183,
"grad_norm": 3.384594388965041,
"learning_rate": 9.514700304940901e-06,
"loss": 0.6922,
"step": 1390
},
{
"epoch": 1.7789072426937738,
"grad_norm": 3.64590250632275,
"learning_rate": 9.358340937639746e-06,
"loss": 0.6557,
"step": 1400
},
{
"epoch": 1.7916137229987292,
"grad_norm": 3.765353121248252,
"learning_rate": 9.202138944469168e-06,
"loss": 0.688,
"step": 1410
},
{
"epoch": 1.804320203303685,
"grad_norm": 3.7449398399867624,
"learning_rate": 9.046132635731816e-06,
"loss": 0.6675,
"step": 1420
},
{
"epoch": 1.8170266836086404,
"grad_norm": 3.942030599345544,
"learning_rate": 8.890360273736504e-06,
"loss": 0.6584,
"step": 1430
},
{
"epoch": 1.829733163913596,
"grad_norm": 4.037931457583538,
"learning_rate": 8.734860063413974e-06,
"loss": 0.6735,
"step": 1440
},
{
"epoch": 1.8424396442185516,
"grad_norm": 3.6205476660211247,
"learning_rate": 8.579670142946701e-06,
"loss": 0.7102,
"step": 1450
},
{
"epoch": 1.855146124523507,
"grad_norm": 3.821487835967331,
"learning_rate": 8.42482857441506e-06,
"loss": 0.6749,
"step": 1460
},
{
"epoch": 1.8678526048284625,
"grad_norm": 3.3623194464637574,
"learning_rate": 8.270373334462193e-06,
"loss": 0.672,
"step": 1470
},
{
"epoch": 1.880559085133418,
"grad_norm": 4.020841970961885,
"learning_rate": 8.116342304979783e-06,
"loss": 0.6863,
"step": 1480
},
{
"epoch": 1.8932655654383734,
"grad_norm": 4.08254040286643,
"learning_rate": 7.962773263817114e-06,
"loss": 0.6815,
"step": 1490
},
{
"epoch": 1.9059720457433291,
"grad_norm": 4.148274894889353,
"learning_rate": 7.809703875515613e-06,
"loss": 0.6417,
"step": 1500
},
{
"epoch": 1.9186785260482846,
"grad_norm": 4.640824882446659,
"learning_rate": 7.657171682071198e-06,
"loss": 0.62,
"step": 1510
},
{
"epoch": 1.9313850063532403,
"grad_norm": 4.7797510297359835,
"learning_rate": 7.505214093726692e-06,
"loss": 0.6439,
"step": 1520
},
{
"epoch": 1.9440914866581958,
"grad_norm": 3.613563186875674,
"learning_rate": 7.353868379796518e-06,
"loss": 0.6705,
"step": 1530
},
{
"epoch": 1.9567979669631512,
"grad_norm": 3.271201239131824,
"learning_rate": 7.203171659526e-06,
"loss": 0.6324,
"step": 1540
},
{
"epoch": 1.9695044472681067,
"grad_norm": 3.89489541610708,
"learning_rate": 7.053160892987434e-06,
"loss": 0.6757,
"step": 1550
},
{
"epoch": 1.9822109275730622,
"grad_norm": 3.701828258351079,
"learning_rate": 6.903872872015209e-06,
"loss": 0.6456,
"step": 1560
},
{
"epoch": 1.9949174078780176,
"grad_norm": 3.5373164710070957,
"learning_rate": 6.755344211182221e-06,
"loss": 0.6166,
"step": 1570
},
{
"epoch": 2.007623888182973,
"grad_norm": 2.425760113176382,
"learning_rate": 6.607611338819697e-06,
"loss": 0.5016,
"step": 1580
},
{
"epoch": 2.020330368487929,
"grad_norm": 3.427501282817139,
"learning_rate": 6.460710488082774e-06,
"loss": 0.374,
"step": 1590
},
{
"epoch": 2.0330368487928845,
"grad_norm": 3.4855149165350636,
"learning_rate": 6.31467768806388e-06,
"loss": 0.3524,
"step": 1600
},
{
"epoch": 2.04574332909784,
"grad_norm": 3.5473678303457996,
"learning_rate": 6.169548754956201e-06,
"loss": 0.3485,
"step": 1610
},
{
"epoch": 2.0584498094027954,
"grad_norm": 3.2554977371598466,
"learning_rate": 6.025359283269363e-06,
"loss": 0.348,
"step": 1620
},
{
"epoch": 2.071156289707751,
"grad_norm": 3.4222657332943376,
"learning_rate": 5.882144637099465e-06,
"loss": 0.3753,
"step": 1630
},
{
"epoch": 2.0838627700127064,
"grad_norm": 2.9777568505895675,
"learning_rate": 5.739939941455644e-06,
"loss": 0.3526,
"step": 1640
},
{
"epoch": 2.096569250317662,
"grad_norm": 3.7955516489911805,
"learning_rate": 5.598780073645267e-06,
"loss": 0.3543,
"step": 1650
},
{
"epoch": 2.1092757306226178,
"grad_norm": 3.8406500166667885,
"learning_rate": 5.458699654719873e-06,
"loss": 0.3642,
"step": 1660
},
{
"epoch": 2.121982210927573,
"grad_norm": 3.813395969645494,
"learning_rate": 5.319733040983972e-06,
"loss": 0.3428,
"step": 1670
},
{
"epoch": 2.1346886912325287,
"grad_norm": 3.7266891839301763,
"learning_rate": 5.181914315568782e-06,
"loss": 0.3403,
"step": 1680
},
{
"epoch": 2.147395171537484,
"grad_norm": 3.688709734552298,
"learning_rate": 5.0452772800729375e-06,
"loss": 0.3469,
"step": 1690
},
{
"epoch": 2.1601016518424396,
"grad_norm": 3.6629109337292403,
"learning_rate": 4.909855446272288e-06,
"loss": 0.3454,
"step": 1700
},
{
"epoch": 2.172808132147395,
"grad_norm": 3.7085182263998555,
"learning_rate": 4.775682027900739e-06,
"loss": 0.341,
"step": 1710
},
{
"epoch": 2.1855146124523506,
"grad_norm": 3.481723946532174,
"learning_rate": 4.6427899325042135e-06,
"loss": 0.3352,
"step": 1720
},
{
"epoch": 2.198221092757306,
"grad_norm": 3.2839395610983027,
"learning_rate": 4.511211753369712e-06,
"loss": 0.3447,
"step": 1730
},
{
"epoch": 2.210927573062262,
"grad_norm": 3.6755308055006464,
"learning_rate": 4.380979761531431e-06,
"loss": 0.3531,
"step": 1740
},
{
"epoch": 2.2236340533672174,
"grad_norm": 3.7905960831955916,
"learning_rate": 4.2521258978559324e-06,
"loss": 0.356,
"step": 1750
},
{
"epoch": 2.236340533672173,
"grad_norm": 3.627875927246556,
"learning_rate": 4.124681765208286e-06,
"loss": 0.3266,
"step": 1760
},
{
"epoch": 2.2490470139771284,
"grad_norm": 3.3246186092589447,
"learning_rate": 3.998678620701102e-06,
"loss": 0.3386,
"step": 1770
},
{
"epoch": 2.261753494282084,
"grad_norm": 3.804007286983282,
"learning_rate": 3.874147368028396e-06,
"loss": 0.3544,
"step": 1780
},
{
"epoch": 2.2744599745870393,
"grad_norm": 3.143040423820396,
"learning_rate": 3.751118549886065e-06,
"loss": 0.3227,
"step": 1790
},
{
"epoch": 2.2871664548919948,
"grad_norm": 3.352132852945674,
"learning_rate": 3.6296223404809903e-06,
"loss": 0.3399,
"step": 1800
},
{
"epoch": 2.2998729351969507,
"grad_norm": 4.043987038976339,
"learning_rate": 3.509688538130448e-06,
"loss": 0.3369,
"step": 1810
},
{
"epoch": 2.312579415501906,
"grad_norm": 3.954856965708331,
"learning_rate": 3.39134655795374e-06,
"loss": 0.341,
"step": 1820
},
{
"epoch": 2.3252858958068616,
"grad_norm": 3.5214147520563626,
"learning_rate": 3.2746254246578167e-06,
"loss": 0.3365,
"step": 1830
},
{
"epoch": 2.337992376111817,
"grad_norm": 3.218428553726758,
"learning_rate": 3.1595537654186114e-06,
"loss": 0.3546,
"step": 1840
},
{
"epoch": 2.3506988564167726,
"grad_norm": 3.163287967416541,
"learning_rate": 3.0461598028599305e-06,
"loss": 0.3431,
"step": 1850
},
{
"epoch": 2.363405336721728,
"grad_norm": 3.0988204272069573,
"learning_rate": 2.9344713481315225e-06,
"loss": 0.3303,
"step": 1860
},
{
"epoch": 2.3761118170266835,
"grad_norm": 3.9034586935786395,
"learning_rate": 2.8245157940880784e-06,
"loss": 0.3337,
"step": 1870
},
{
"epoch": 2.388818297331639,
"grad_norm": 3.5690630552722786,
"learning_rate": 2.7163201085708424e-06,
"loss": 0.3223,
"step": 1880
},
{
"epoch": 2.4015247776365944,
"grad_norm": 3.163806174642701,
"learning_rate": 2.6099108277934105e-06,
"loss": 0.3398,
"step": 1890
},
{
"epoch": 2.4142312579415504,
"grad_norm": 3.7465583268537275,
"learning_rate": 2.505314049833457e-06,
"loss": 0.3483,
"step": 1900
},
{
"epoch": 2.426937738246506,
"grad_norm": 3.516374761436456,
"learning_rate": 2.402555428231872e-06,
"loss": 0.3273,
"step": 1910
},
{
"epoch": 2.4396442185514613,
"grad_norm": 3.5353549798113284,
"learning_rate": 2.3016601657009364e-06,
"loss": 0.3374,
"step": 1920
},
{
"epoch": 2.4523506988564168,
"grad_norm": 3.357432157861631,
"learning_rate": 2.202653007943093e-06,
"loss": 0.3464,
"step": 1930
},
{
"epoch": 2.4650571791613722,
"grad_norm": 3.6506743298663675,
"learning_rate": 2.1055582375817475e-06,
"loss": 0.325,
"step": 1940
},
{
"epoch": 2.4777636594663277,
"grad_norm": 3.907282101797735,
"learning_rate": 2.0103996682057235e-06,
"loss": 0.3255,
"step": 1950
},
{
"epoch": 2.490470139771283,
"grad_norm": 3.711785490906897,
"learning_rate": 1.9172006385286723e-06,
"loss": 0.3391,
"step": 1960
},
{
"epoch": 2.503176620076239,
"grad_norm": 3.2473323176322135,
"learning_rate": 1.8259840066650136e-06,
"loss": 0.3389,
"step": 1970
},
{
"epoch": 2.5158831003811946,
"grad_norm": 3.6433209864443916,
"learning_rate": 1.7367721445237285e-06,
"loss": 0.3258,
"step": 1980
},
{
"epoch": 2.52858958068615,
"grad_norm": 4.12961794749056,
"learning_rate": 1.6495869323213654e-06,
"loss": 0.3185,
"step": 1990
},
{
"epoch": 2.5412960609911055,
"grad_norm": 4.1376649833602865,
"learning_rate": 1.564449753215711e-06,
"loss": 0.3247,
"step": 2000
},
{
"epoch": 2.554002541296061,
"grad_norm": 4.441583691608097,
"learning_rate": 1.4813814880612942e-06,
"loss": 0.3198,
"step": 2010
},
{
"epoch": 2.5667090216010164,
"grad_norm": 2.7847992176083047,
"learning_rate": 1.4004025102881402e-06,
"loss": 0.3143,
"step": 2020
},
{
"epoch": 2.579415501905972,
"grad_norm": 3.1337243741428473,
"learning_rate": 1.321532680904959e-06,
"loss": 0.3312,
"step": 2030
},
{
"epoch": 2.5921219822109274,
"grad_norm": 3.3653692372757225,
"learning_rate": 1.2447913436279879e-06,
"loss": 0.3129,
"step": 2040
},
{
"epoch": 2.604828462515883,
"grad_norm": 3.7337029805672635,
"learning_rate": 1.1701973201367544e-06,
"loss": 0.3253,
"step": 2050
},
{
"epoch": 2.6175349428208388,
"grad_norm": 4.214904637605022,
"learning_rate": 1.09776890545782e-06,
"loss": 0.3531,
"step": 2060
},
{
"epoch": 2.6302414231257942,
"grad_norm": 3.3613800857078764,
"learning_rate": 1.0275238634777441e-06,
"loss": 0.3105,
"step": 2070
},
{
"epoch": 2.6429479034307497,
"grad_norm": 3.7555272174929595,
"learning_rate": 9.594794225862692e-07,
"loss": 0.3331,
"step": 2080
},
{
"epoch": 2.655654383735705,
"grad_norm": 3.6364434124366,
"learning_rate": 8.936522714508678e-07,
"loss": 0.3336,
"step": 2090
},
{
"epoch": 2.6683608640406606,
"grad_norm": 3.684690863431174,
"learning_rate": 8.300585549236773e-07,
"loss": 0.3232,
"step": 2100
},
{
"epoch": 2.681067344345616,
"grad_norm": 4.307296056522447,
"learning_rate": 7.687138700817598e-07,
"loss": 0.3165,
"step": 2110
},
{
"epoch": 2.693773824650572,
"grad_norm": 3.418062959790304,
"learning_rate": 7.096332624017755e-07,
"loss": 0.3126,
"step": 2120
},
{
"epoch": 2.7064803049555275,
"grad_norm": 3.5508346766299397,
"learning_rate": 6.528312220698885e-07,
"loss": 0.3303,
"step": 2130
},
{
"epoch": 2.719186785260483,
"grad_norm": 4.518044935525217,
"learning_rate": 5.983216804278869e-07,
"loss": 0.3191,
"step": 2140
},
{
"epoch": 2.7318932655654384,
"grad_norm": 3.2181606746612044,
"learning_rate": 5.461180065563787e-07,
"loss": 0.3059,
"step": 2150
},
{
"epoch": 2.744599745870394,
"grad_norm": 3.8710207257234144,
"learning_rate": 4.962330039958585e-07,
"loss": 0.3194,
"step": 2160
},
{
"epoch": 2.7573062261753494,
"grad_norm": 3.6523179671049215,
"learning_rate": 4.486789076064968e-07,
"loss": 0.3148,
"step": 2170
},
{
"epoch": 2.770012706480305,
"grad_norm": 3.5451882545264053,
"learning_rate": 4.034673805674116e-07,
"loss": 0.3285,
"step": 2180
},
{
"epoch": 2.7827191867852603,
"grad_norm": 3.0920265120158827,
"learning_rate": 3.606095115161279e-07,
"loss": 0.3172,
"step": 2190
},
{
"epoch": 2.795425667090216,
"grad_norm": 3.496745467546405,
"learning_rate": 3.201158118289793e-07,
"loss": 0.3183,
"step": 2200
},
{
"epoch": 2.8081321473951717,
"grad_norm": 3.361865186046809,
"learning_rate": 2.8199621304306425e-07,
"loss": 0.3209,
"step": 2210
},
{
"epoch": 2.820838627700127,
"grad_norm": 3.951234253131063,
"learning_rate": 2.46260064420426e-07,
"loss": 0.3165,
"step": 2220
},
{
"epoch": 2.8335451080050826,
"grad_norm": 4.033338515766138,
"learning_rate": 2.1291613065504313e-07,
"loss": 0.3233,
"step": 2230
},
{
"epoch": 2.846251588310038,
"grad_norm": 3.8424683095800303,
"learning_rate": 1.819725897231872e-07,
"loss": 0.318,
"step": 2240
},
{
"epoch": 2.8589580686149936,
"grad_norm": 3.6002718533091,
"learning_rate": 1.5343703087768225e-07,
"loss": 0.3323,
"step": 2250
},
{
"epoch": 2.871664548919949,
"grad_norm": 3.392955644057836,
"learning_rate": 1.2731645278655448e-07,
"loss": 0.3088,
"step": 2260
},
{
"epoch": 2.884371029224905,
"grad_norm": 3.9060628771881842,
"learning_rate": 1.0361726181653209e-07,
"loss": 0.3213,
"step": 2270
},
{
"epoch": 2.8970775095298604,
"grad_norm": 3.6937829161145515,
"learning_rate": 8.234527046180885e-08,
"loss": 0.3193,
"step": 2280
},
{
"epoch": 2.909783989834816,
"grad_norm": 3.7501161450474036,
"learning_rate": 6.350569591846434e-08,
"loss": 0.3334,
"step": 2290
},
{
"epoch": 2.9224904701397714,
"grad_norm": 3.4784424672902605,
"learning_rate": 4.710315880489091e-08,
"loss": 0.3273,
"step": 2300
},
{
"epoch": 2.935196950444727,
"grad_norm": 3.4126846620318707,
"learning_rate": 3.31416820285313e-08,
"loss": 0.3177,
"step": 2310
},
{
"epoch": 2.9479034307496823,
"grad_norm": 3.9848361195981785,
"learning_rate": 2.1624689799214503e-08,
"loss": 0.322,
"step": 2320
},
{
"epoch": 2.9606099110546378,
"grad_norm": 3.6028548305792203,
"learning_rate": 1.2555006789334301e-08,
"loss": 0.3038,
"step": 2330
},
{
"epoch": 2.9733163913595932,
"grad_norm": 4.107748523282802,
"learning_rate": 5.934857441062258e-09,
"loss": 0.313,
"step": 2340
},
{
"epoch": 2.9860228716645487,
"grad_norm": 4.057112734457165,
"learning_rate": 1.765865420779722e-09,
"loss": 0.315,
"step": 2350
},
{
"epoch": 2.998729351969504,
"grad_norm": 3.491454630886239,
"learning_rate": 4.9053220856354335e-11,
"loss": 0.328,
"step": 2360
},
{
"epoch": 3.0,
"step": 2361,
"total_flos": 121880697913344.0,
"train_loss": 0.6645450910134418,
"train_runtime": 16043.0595,
"train_samples_per_second": 1.177,
"train_steps_per_second": 0.147
}
],
"logging_steps": 10,
"max_steps": 2361,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 121880697913344.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}