effientReason-4b-sft-final / trainer_state.json
greyi's picture
Upload folder using huggingface_hub
1f32bea verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 50,
"global_step": 246,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02476780185758514,
"grad_norm": 0.5827791094779968,
"learning_rate": 0.0,
"loss": 0.5993257761001587,
"step": 1
},
{
"epoch": 0.04953560371517028,
"grad_norm": 0.5781313180923462,
"learning_rate": 2.5e-07,
"loss": 0.5510573387145996,
"step": 2
},
{
"epoch": 0.07430340557275542,
"grad_norm": 0.5830345153808594,
"learning_rate": 5e-07,
"loss": 0.500480055809021,
"step": 3
},
{
"epoch": 0.09907120743034056,
"grad_norm": 0.5189770460128784,
"learning_rate": 7.5e-07,
"loss": 0.5299410820007324,
"step": 4
},
{
"epoch": 0.1238390092879257,
"grad_norm": 0.520061194896698,
"learning_rate": 1e-06,
"loss": 0.5539457201957703,
"step": 5
},
{
"epoch": 0.14860681114551083,
"grad_norm": 0.5419376492500305,
"learning_rate": 1.2499999999999999e-06,
"loss": 0.5408970713615417,
"step": 6
},
{
"epoch": 0.17337461300309598,
"grad_norm": 0.5576385855674744,
"learning_rate": 1.5e-06,
"loss": 0.5969724655151367,
"step": 7
},
{
"epoch": 0.19814241486068113,
"grad_norm": 0.5351932048797607,
"learning_rate": 1.75e-06,
"loss": 0.5394197106361389,
"step": 8
},
{
"epoch": 0.22291021671826625,
"grad_norm": 0.4773852527141571,
"learning_rate": 2e-06,
"loss": 0.5735222101211548,
"step": 9
},
{
"epoch": 0.2476780185758514,
"grad_norm": 0.5032294392585754,
"learning_rate": 1.9999128816724105e-06,
"loss": 0.5828520059585571,
"step": 10
},
{
"epoch": 0.2724458204334365,
"grad_norm": 0.49014607071876526,
"learning_rate": 1.9996515418688487e-06,
"loss": 0.5568044781684875,
"step": 11
},
{
"epoch": 0.29721362229102166,
"grad_norm": 0.5634818077087402,
"learning_rate": 1.9992160261242874e-06,
"loss": 0.5982780456542969,
"step": 12
},
{
"epoch": 0.3219814241486068,
"grad_norm": 0.4928373396396637,
"learning_rate": 1.9986064103215337e-06,
"loss": 0.563035249710083,
"step": 13
},
{
"epoch": 0.34674922600619196,
"grad_norm": 0.5265209674835205,
"learning_rate": 1.9978228006780053e-06,
"loss": 0.588450014591217,
"step": 14
},
{
"epoch": 0.3715170278637771,
"grad_norm": 0.4966702461242676,
"learning_rate": 1.996865333727226e-06,
"loss": 0.5518300533294678,
"step": 15
},
{
"epoch": 0.39628482972136225,
"grad_norm": 0.5559803247451782,
"learning_rate": 1.9957341762950344e-06,
"loss": 0.5778566002845764,
"step": 16
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.5569736957550049,
"learning_rate": 1.9944295254705185e-06,
"loss": 0.556509256362915,
"step": 17
},
{
"epoch": 0.4458204334365325,
"grad_norm": 0.5971181988716125,
"learning_rate": 1.992951608571673e-06,
"loss": 0.5314251780509949,
"step": 18
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.529690146446228,
"learning_rate": 1.9913006831057965e-06,
"loss": 0.5227062702178955,
"step": 19
},
{
"epoch": 0.4953560371517028,
"grad_norm": 0.6401184797286987,
"learning_rate": 1.989477036724619e-06,
"loss": 0.5782433152198792,
"step": 20
},
{
"epoch": 0.5201238390092879,
"grad_norm": 0.539941132068634,
"learning_rate": 1.9874809871741874e-06,
"loss": 0.5736757516860962,
"step": 21
},
{
"epoch": 0.544891640866873,
"grad_norm": 0.5726771950721741,
"learning_rate": 1.9853128822394975e-06,
"loss": 0.5858570337295532,
"step": 22
},
{
"epoch": 0.5696594427244582,
"grad_norm": 0.55902498960495,
"learning_rate": 1.982973099683902e-06,
"loss": 0.5574871301651001,
"step": 23
},
{
"epoch": 0.5944272445820433,
"grad_norm": 0.527619481086731,
"learning_rate": 1.9804620471832865e-06,
"loss": 0.5171317458152771,
"step": 24
},
{
"epoch": 0.6191950464396285,
"grad_norm": 0.5026052594184875,
"learning_rate": 1.9777801622550405e-06,
"loss": 0.5416678190231323,
"step": 25
},
{
"epoch": 0.6439628482972136,
"grad_norm": 0.47064998745918274,
"learning_rate": 1.9749279121818236e-06,
"loss": 0.5682564973831177,
"step": 26
},
{
"epoch": 0.6687306501547987,
"grad_norm": 0.5842341184616089,
"learning_rate": 1.9719057939301475e-06,
"loss": 0.5644649267196655,
"step": 27
},
{
"epoch": 0.6934984520123839,
"grad_norm": 0.49904075264930725,
"learning_rate": 1.9687143340637884e-06,
"loss": 0.5811545252799988,
"step": 28
},
{
"epoch": 0.718266253869969,
"grad_norm": 1.2309396266937256,
"learning_rate": 1.9653540886520385e-06,
"loss": 0.605437695980072,
"step": 29
},
{
"epoch": 0.7430340557275542,
"grad_norm": 0.5156847834587097,
"learning_rate": 1.9618256431728192e-06,
"loss": 0.5422309637069702,
"step": 30
},
{
"epoch": 0.7678018575851393,
"grad_norm": 0.6013903617858887,
"learning_rate": 1.958129612410668e-06,
"loss": 0.54377281665802,
"step": 31
},
{
"epoch": 0.7925696594427245,
"grad_norm": 0.5307015180587769,
"learning_rate": 1.954266640349623e-06,
"loss": 0.5074729919433594,
"step": 32
},
{
"epoch": 0.8173374613003096,
"grad_norm": 0.5950272679328918,
"learning_rate": 1.950237400061015e-06,
"loss": 0.5290631055831909,
"step": 33
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.5664405226707458,
"learning_rate": 1.9460425935861946e-06,
"loss": 0.600000262260437,
"step": 34
},
{
"epoch": 0.8668730650154799,
"grad_norm": 0.5338588953018188,
"learning_rate": 1.9416829518142113e-06,
"loss": 0.5680241584777832,
"step": 35
},
{
"epoch": 0.891640866873065,
"grad_norm": 0.5495931506156921,
"learning_rate": 1.9371592343544655e-06,
"loss": 0.5304821729660034,
"step": 36
},
{
"epoch": 0.9164086687306502,
"grad_norm": 0.47950977087020874,
"learning_rate": 1.932472229404356e-06,
"loss": 0.5156245827674866,
"step": 37
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.5299308896064758,
"learning_rate": 1.9276227536119477e-06,
"loss": 0.5732549428939819,
"step": 38
},
{
"epoch": 0.9659442724458205,
"grad_norm": 0.5737171173095703,
"learning_rate": 1.9226116519336828e-06,
"loss": 0.5309604406356812,
"step": 39
},
{
"epoch": 0.9907120743034056,
"grad_norm": 0.693321168422699,
"learning_rate": 1.917439797487156e-06,
"loss": 0.5797507762908936,
"step": 40
},
{
"epoch": 1.0,
"grad_norm": 0.9561907649040222,
"learning_rate": 1.9121080913989878e-06,
"loss": 0.5909802913665771,
"step": 41
},
{
"epoch": 1.0247678018575852,
"grad_norm": 0.6066501140594482,
"learning_rate": 1.9066174626478126e-06,
"loss": 0.6078804135322571,
"step": 42
},
{
"epoch": 1.0495356037151702,
"grad_norm": 0.5243707299232483,
"learning_rate": 1.9009688679024189e-06,
"loss": 0.5241413116455078,
"step": 43
},
{
"epoch": 1.0743034055727554,
"grad_norm": 0.5240072011947632,
"learning_rate": 1.8951632913550625e-06,
"loss": 0.5645661950111389,
"step": 44
},
{
"epoch": 1.0990712074303406,
"grad_norm": 0.6983147263526917,
"learning_rate": 1.889201744549981e-06,
"loss": 0.5029958486557007,
"step": 45
},
{
"epoch": 1.1238390092879258,
"grad_norm": 0.6109921932220459,
"learning_rate": 1.8830852662071505e-06,
"loss": 0.5748687386512756,
"step": 46
},
{
"epoch": 1.1486068111455108,
"grad_norm": 0.5242897868156433,
"learning_rate": 1.8768149220412987e-06,
"loss": 0.5576164722442627,
"step": 47
},
{
"epoch": 1.173374613003096,
"grad_norm": 0.5376689434051514,
"learning_rate": 1.8703918045762194e-06,
"loss": 0.5489684343338013,
"step": 48
},
{
"epoch": 1.1981424148606812,
"grad_norm": 0.5369903445243835,
"learning_rate": 1.863817032954416e-06,
"loss": 0.5305777192115784,
"step": 49
},
{
"epoch": 1.2229102167182662,
"grad_norm": 0.482452392578125,
"learning_rate": 1.8570917527421045e-06,
"loss": 0.4907306134700775,
"step": 50
},
{
"epoch": 1.2229102167182662,
"eval_accuracy": 0.8213776795920542,
"eval_loss": 0.5645560622215271,
"eval_runtime": 16.7311,
"eval_samples_per_second": 4.064,
"eval_steps_per_second": 2.032,
"step": 50
},
{
"epoch": 1.2476780185758514,
"grad_norm": 0.5009844899177551,
"learning_rate": 1.8502171357296142e-06,
"loss": 0.5544570088386536,
"step": 51
},
{
"epoch": 1.2724458204334366,
"grad_norm": 0.5807215571403503,
"learning_rate": 1.8431943797272185e-06,
"loss": 0.5804014205932617,
"step": 52
},
{
"epoch": 1.2972136222910216,
"grad_norm": 0.5564484596252441,
"learning_rate": 1.836024708356434e-06,
"loss": 0.5661737322807312,
"step": 53
},
{
"epoch": 1.3219814241486068,
"grad_norm": 0.5095818042755127,
"learning_rate": 1.8287093708368186e-06,
"loss": 0.5299423336982727,
"step": 54
},
{
"epoch": 1.346749226006192,
"grad_norm": 0.5763193368911743,
"learning_rate": 1.8212496417683135e-06,
"loss": 0.5352605581283569,
"step": 55
},
{
"epoch": 1.3715170278637772,
"grad_norm": 0.5195797681808472,
"learning_rate": 1.81364682090916e-06,
"loss": 0.530654788017273,
"step": 56
},
{
"epoch": 1.3962848297213624,
"grad_norm": 0.5399932861328125,
"learning_rate": 1.805902232949435e-06,
"loss": 0.5673707723617554,
"step": 57
},
{
"epoch": 1.4210526315789473,
"grad_norm": 0.5126509666442871,
"learning_rate": 1.7980172272802397e-06,
"loss": 0.5673764944076538,
"step": 58
},
{
"epoch": 1.4458204334365325,
"grad_norm": 0.5293602347373962,
"learning_rate": 1.789993177758588e-06,
"loss": 0.5548557043075562,
"step": 59
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.47508999705314636,
"learning_rate": 1.7818314824680298e-06,
"loss": 0.5592916011810303,
"step": 60
},
{
"epoch": 1.4953560371517027,
"grad_norm": 0.506854236125946,
"learning_rate": 1.773533563475053e-06,
"loss": 0.5494035482406616,
"step": 61
},
{
"epoch": 1.520123839009288,
"grad_norm": 0.6375800371170044,
"learning_rate": 1.7651008665813081e-06,
"loss": 0.5607191324234009,
"step": 62
},
{
"epoch": 1.5448916408668731,
"grad_norm": 0.4859982132911682,
"learning_rate": 1.7565348610716958e-06,
"loss": 0.5413356423377991,
"step": 63
},
{
"epoch": 1.5696594427244581,
"grad_norm": 0.5644744634628296,
"learning_rate": 1.7478370394583643e-06,
"loss": 0.5568721294403076,
"step": 64
},
{
"epoch": 1.5944272445820433,
"grad_norm": 0.5623730421066284,
"learning_rate": 1.739008917220659e-06,
"loss": 0.5305633544921875,
"step": 65
},
{
"epoch": 1.6191950464396285,
"grad_norm": 0.46600863337516785,
"learning_rate": 1.7300520325410698e-06,
"loss": 0.519407331943512,
"step": 66
},
{
"epoch": 1.6439628482972135,
"grad_norm": 0.5476927161216736,
"learning_rate": 1.7209679460372249e-06,
"loss": 0.5438145399093628,
"step": 67
},
{
"epoch": 1.6687306501547987,
"grad_norm": 0.5339446663856506,
"learning_rate": 1.711758240489971e-06,
"loss": 0.5288221836090088,
"step": 68
},
{
"epoch": 1.693498452012384,
"grad_norm": 0.4474664628505707,
"learning_rate": 1.7024245205675985e-06,
"loss": 0.5665724277496338,
"step": 69
},
{
"epoch": 1.718266253869969,
"grad_norm": 0.5247179865837097,
"learning_rate": 1.6929684125462468e-06,
"loss": 0.5420582294464111,
"step": 70
},
{
"epoch": 1.7430340557275543,
"grad_norm": 0.6573188304901123,
"learning_rate": 1.6833915640265483e-06,
"loss": 0.538118839263916,
"step": 71
},
{
"epoch": 1.7678018575851393,
"grad_norm": 0.5430057644844055,
"learning_rate": 1.6736956436465573e-06,
"loss": 0.5287379026412964,
"step": 72
},
{
"epoch": 1.7925696594427245,
"grad_norm": 1.451054334640503,
"learning_rate": 1.6638823407910082e-06,
"loss": 0.5065432190895081,
"step": 73
},
{
"epoch": 1.8173374613003097,
"grad_norm": 1.7800654172897339,
"learning_rate": 1.6539533652969682e-06,
"loss": 0.5422472357749939,
"step": 74
},
{
"epoch": 1.8421052631578947,
"grad_norm": 0.5204485654830933,
"learning_rate": 1.6439104471559156e-06,
"loss": 0.4941398501396179,
"step": 75
},
{
"epoch": 1.86687306501548,
"grad_norm": 0.4798074960708618,
"learning_rate": 1.6337553362123161e-06,
"loss": 0.5543307065963745,
"step": 76
},
{
"epoch": 1.891640866873065,
"grad_norm": 0.4639158248901367,
"learning_rate": 1.6234898018587336e-06,
"loss": 0.5305337905883789,
"step": 77
},
{
"epoch": 1.91640866873065,
"grad_norm": 0.4957791566848755,
"learning_rate": 1.613115632727537e-06,
"loss": 0.4810314178466797,
"step": 78
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.542951762676239,
"learning_rate": 1.6026346363792564e-06,
"loss": 0.5742234587669373,
"step": 79
},
{
"epoch": 1.9659442724458205,
"grad_norm": 0.518661618232727,
"learning_rate": 1.592048638987638e-06,
"loss": 0.5540245771408081,
"step": 80
},
{
"epoch": 1.9907120743034055,
"grad_norm": 0.48943665623664856,
"learning_rate": 1.5813594850214597e-06,
"loss": 0.509993851184845,
"step": 81
},
{
"epoch": 2.0,
"grad_norm": 0.8778729438781738,
"learning_rate": 1.570569036923155e-06,
"loss": 0.539715051651001,
"step": 82
},
{
"epoch": 2.024767801857585,
"grad_norm": 0.4994299113750458,
"learning_rate": 1.5596791747843082e-06,
"loss": 0.5089604258537292,
"step": 83
},
{
"epoch": 2.0495356037151704,
"grad_norm": 0.5828955173492432,
"learning_rate": 1.548691796018074e-06,
"loss": 0.5253075361251831,
"step": 84
},
{
"epoch": 2.0743034055727554,
"grad_norm": 0.5461580753326416,
"learning_rate": 1.5376088150285774e-06,
"loss": 0.5154924392700195,
"step": 85
},
{
"epoch": 2.0990712074303404,
"grad_norm": 0.965928852558136,
"learning_rate": 1.5264321628773558e-06,
"loss": 0.5028945803642273,
"step": 86
},
{
"epoch": 2.123839009287926,
"grad_norm": 0.45946890115737915,
"learning_rate": 1.5151637869468958e-06,
"loss": 0.5220765471458435,
"step": 87
},
{
"epoch": 2.1486068111455108,
"grad_norm": 0.4885327219963074,
"learning_rate": 1.5038056506013295e-06,
"loss": 0.5020776391029358,
"step": 88
},
{
"epoch": 2.173374613003096,
"grad_norm": 0.5246437191963196,
"learning_rate": 1.492359732844342e-06,
"loss": 0.46335524320602417,
"step": 89
},
{
"epoch": 2.198142414860681,
"grad_norm": 0.5331137180328369,
"learning_rate": 1.4808280279743591e-06,
"loss": 0.5037820339202881,
"step": 90
},
{
"epoch": 2.222910216718266,
"grad_norm": 0.5505975484848022,
"learning_rate": 1.4692125452370662e-06,
"loss": 0.5359715223312378,
"step": 91
},
{
"epoch": 2.2476780185758516,
"grad_norm": 0.5390040278434753,
"learning_rate": 1.4575153084753232e-06,
"loss": 0.5337521433830261,
"step": 92
},
{
"epoch": 2.2724458204334366,
"grad_norm": 0.44791266322135925,
"learning_rate": 1.4457383557765383e-06,
"loss": 0.5155265927314758,
"step": 93
},
{
"epoch": 2.2972136222910216,
"grad_norm": 0.4978775382041931,
"learning_rate": 1.433883739117558e-06,
"loss": 0.4920554757118225,
"step": 94
},
{
"epoch": 2.321981424148607,
"grad_norm": 0.5269660353660583,
"learning_rate": 1.4219535240071376e-06,
"loss": 0.5533995628356934,
"step": 95
},
{
"epoch": 2.346749226006192,
"grad_norm": 0.4875043034553528,
"learning_rate": 1.4099497891260537e-06,
"loss": 0.523270845413208,
"step": 96
},
{
"epoch": 2.371517027863777,
"grad_norm": 0.5254143476486206,
"learning_rate": 1.3978746259649208e-06,
"loss": 0.5255824327468872,
"step": 97
},
{
"epoch": 2.3962848297213624,
"grad_norm": 0.5345160365104675,
"learning_rate": 1.3857301384597794e-06,
"loss": 0.5329371094703674,
"step": 98
},
{
"epoch": 2.4210526315789473,
"grad_norm": 0.46321746706962585,
"learning_rate": 1.3735184426255114e-06,
"loss": 0.5548917055130005,
"step": 99
},
{
"epoch": 2.4458204334365323,
"grad_norm": 0.5209585428237915,
"learning_rate": 1.3612416661871531e-06,
"loss": 0.5931960940361023,
"step": 100
},
{
"epoch": 2.4458204334365323,
"eval_accuracy": 0.8215138901886158,
"eval_loss": 0.562470018863678,
"eval_runtime": 16.4711,
"eval_samples_per_second": 4.128,
"eval_steps_per_second": 2.064,
"step": 100
},
{
"epoch": 2.4705882352941178,
"grad_norm": 0.483987033367157,
"learning_rate": 1.3489019482091667e-06,
"loss": 0.5425853133201599,
"step": 101
},
{
"epoch": 2.4953560371517027,
"grad_norm": 0.44485101103782654,
"learning_rate": 1.336501438722739e-06,
"loss": 0.5403157472610474,
"step": 102
},
{
"epoch": 2.5201238390092877,
"grad_norm": 0.5460787415504456,
"learning_rate": 1.324042298351166e-06,
"loss": 0.5747348666191101,
"step": 103
},
{
"epoch": 2.544891640866873,
"grad_norm": 0.45323142409324646,
"learning_rate": 1.3115266979333914e-06,
"loss": 0.5297880172729492,
"step": 104
},
{
"epoch": 2.569659442724458,
"grad_norm": 0.6902194619178772,
"learning_rate": 1.2989568181457702e-06,
"loss": 0.5073508024215698,
"step": 105
},
{
"epoch": 2.594427244582043,
"grad_norm": 0.5212258100509644,
"learning_rate": 1.2863348491221127e-06,
"loss": 0.5311723351478577,
"step": 106
},
{
"epoch": 2.6191950464396285,
"grad_norm": 0.5578774809837341,
"learning_rate": 1.273662990072083e-06,
"loss": 0.5304839015007019,
"step": 107
},
{
"epoch": 2.6439628482972135,
"grad_norm": 0.504798173904419,
"learning_rate": 1.2609434488980166e-06,
"loss": 0.4865831136703491,
"step": 108
},
{
"epoch": 2.6687306501547985,
"grad_norm": 0.4682161211967468,
"learning_rate": 1.2481784418102239e-06,
"loss": 0.5439316630363464,
"step": 109
},
{
"epoch": 2.693498452012384,
"grad_norm": 0.5871185064315796,
"learning_rate": 1.2353701929408424e-06,
"loss": 0.477615088224411,
"step": 110
},
{
"epoch": 2.718266253869969,
"grad_norm": 0.4735322594642639,
"learning_rate": 1.2225209339563143e-06,
"loss": 0.5605683326721191,
"step": 111
},
{
"epoch": 2.7430340557275543,
"grad_norm": 0.5656632781028748,
"learning_rate": 1.2096329036685466e-06,
"loss": 0.5053581595420837,
"step": 112
},
{
"epoch": 2.7678018575851393,
"grad_norm": 0.501797616481781,
"learning_rate": 1.196708347644828e-06,
"loss": 0.5080878734588623,
"step": 113
},
{
"epoch": 2.7925696594427247,
"grad_norm": 1.2063102722167969,
"learning_rate": 1.1837495178165704e-06,
"loss": 0.552485466003418,
"step": 114
},
{
"epoch": 2.8173374613003097,
"grad_norm": 0.5052933096885681,
"learning_rate": 1.1707586720869374e-06,
"loss": 0.5424617528915405,
"step": 115
},
{
"epoch": 2.8421052631578947,
"grad_norm": 0.5184856057167053,
"learning_rate": 1.1577380739374373e-06,
"loss": 0.5432671904563904,
"step": 116
},
{
"epoch": 2.86687306501548,
"grad_norm": 0.5071874260902405,
"learning_rate": 1.1446899920335405e-06,
"loss": 0.5507460832595825,
"step": 117
},
{
"epoch": 2.891640866873065,
"grad_norm": 0.519482433795929,
"learning_rate": 1.1316166998293935e-06,
"loss": 0.5559477210044861,
"step": 118
},
{
"epoch": 2.91640866873065,
"grad_norm": 0.5042552947998047,
"learning_rate": 1.1185204751717027e-06,
"loss": 0.5015457272529602,
"step": 119
},
{
"epoch": 2.9411764705882355,
"grad_norm": 1.3727635145187378,
"learning_rate": 1.1054035999028476e-06,
"loss": 0.5176253318786621,
"step": 120
},
{
"epoch": 2.9659442724458205,
"grad_norm": 0.5206997990608215,
"learning_rate": 1.092268359463302e-06,
"loss": 0.5474892258644104,
"step": 121
},
{
"epoch": 2.9907120743034055,
"grad_norm": 0.472130686044693,
"learning_rate": 1.0791170424934246e-06,
"loss": 0.4985366463661194,
"step": 122
},
{
"epoch": 3.0,
"grad_norm": 1.058793544769287,
"learning_rate": 1.0659519404346952e-06,
"loss": 0.48316121101379395,
"step": 123
},
{
"epoch": 3.024767801857585,
"grad_norm": 0.5421841740608215,
"learning_rate": 1.0527753471304623e-06,
"loss": 0.5144573450088501,
"step": 124
},
{
"epoch": 3.0495356037151704,
"grad_norm": 0.5197970271110535,
"learning_rate": 1.0395895584262695e-06,
"loss": 0.5817261934280396,
"step": 125
},
{
"epoch": 3.0743034055727554,
"grad_norm": 0.49334728717803955,
"learning_rate": 1.0263968717698363e-06,
"loss": 0.5018012523651123,
"step": 126
},
{
"epoch": 3.0990712074303404,
"grad_norm": 0.6232290267944336,
"learning_rate": 1.013199585810759e-06,
"loss": 0.5584498643875122,
"step": 127
},
{
"epoch": 3.123839009287926,
"grad_norm": 0.455437034368515,
"learning_rate": 1e-06,
"loss": 0.5036893486976624,
"step": 128
},
{
"epoch": 3.1486068111455108,
"grad_norm": 0.48946836590766907,
"learning_rate": 9.868004141892412e-07,
"loss": 0.5123312473297119,
"step": 129
},
{
"epoch": 3.173374613003096,
"grad_norm": 0.5698655843734741,
"learning_rate": 9.736031282301638e-07,
"loss": 0.5401725172996521,
"step": 130
},
{
"epoch": 3.198142414860681,
"grad_norm": 0.9283490180969238,
"learning_rate": 9.604104415737308e-07,
"loss": 0.48566514253616333,
"step": 131
},
{
"epoch": 3.222910216718266,
"grad_norm": 2.0157785415649414,
"learning_rate": 9.472246528695375e-07,
"loss": 0.4537651538848877,
"step": 132
},
{
"epoch": 3.2476780185758516,
"grad_norm": 0.5449803471565247,
"learning_rate": 9.340480595653045e-07,
"loss": 0.5530433654785156,
"step": 133
},
{
"epoch": 3.2724458204334366,
"grad_norm": 0.4725954532623291,
"learning_rate": 9.208829575065753e-07,
"loss": 0.5256283283233643,
"step": 134
},
{
"epoch": 3.2972136222910216,
"grad_norm": 0.4579267203807831,
"learning_rate": 9.077316405366981e-07,
"loss": 0.5190701484680176,
"step": 135
},
{
"epoch": 3.321981424148607,
"grad_norm": 0.544757604598999,
"learning_rate": 8.945964000971523e-07,
"loss": 0.5290215015411377,
"step": 136
},
{
"epoch": 3.346749226006192,
"grad_norm": 0.4990670084953308,
"learning_rate": 8.814795248282973e-07,
"loss": 0.5203908085823059,
"step": 137
},
{
"epoch": 3.371517027863777,
"grad_norm": 0.5583924651145935,
"learning_rate": 8.683833001706067e-07,
"loss": 0.499897837638855,
"step": 138
},
{
"epoch": 3.3962848297213624,
"grad_norm": 0.47875887155532837,
"learning_rate": 8.553100079664598e-07,
"loss": 0.4940932095050812,
"step": 139
},
{
"epoch": 3.4210526315789473,
"grad_norm": 0.4689862132072449,
"learning_rate": 8.422619260625624e-07,
"loss": 0.488369345664978,
"step": 140
},
{
"epoch": 3.4458204334365323,
"grad_norm": 0.5019742846488953,
"learning_rate": 8.292413279130624e-07,
"loss": 0.49827271699905396,
"step": 141
},
{
"epoch": 3.4705882352941178,
"grad_norm": 0.47474774718284607,
"learning_rate": 8.162504821834295e-07,
"loss": 0.5006945133209229,
"step": 142
},
{
"epoch": 3.4953560371517027,
"grad_norm": 0.5412342548370361,
"learning_rate": 8.032916523551719e-07,
"loss": 0.5021499395370483,
"step": 143
},
{
"epoch": 3.5201238390092877,
"grad_norm": 0.46898508071899414,
"learning_rate": 7.903670963314535e-07,
"loss": 0.5173486471176147,
"step": 144
},
{
"epoch": 3.544891640866873,
"grad_norm": 0.5036367177963257,
"learning_rate": 7.774790660436857e-07,
"loss": 0.5127341151237488,
"step": 145
},
{
"epoch": 3.569659442724458,
"grad_norm": 0.4592057466506958,
"learning_rate": 7.646298070591577e-07,
"loss": 0.5291725397109985,
"step": 146
},
{
"epoch": 3.594427244582043,
"grad_norm": 0.579252302646637,
"learning_rate": 7.518215581897763e-07,
"loss": 0.5540162324905396,
"step": 147
},
{
"epoch": 3.6191950464396285,
"grad_norm": 0.5662134885787964,
"learning_rate": 7.390565511019833e-07,
"loss": 0.5307095646858215,
"step": 148
},
{
"epoch": 3.6439628482972135,
"grad_norm": 0.5780702233314514,
"learning_rate": 7.263370099279171e-07,
"loss": 0.48574694991111755,
"step": 149
},
{
"epoch": 3.6687306501547985,
"grad_norm": 0.5063837766647339,
"learning_rate": 7.136651508778874e-07,
"loss": 0.5621860027313232,
"step": 150
},
{
"epoch": 3.6687306501547985,
"eval_accuracy": 0.8215492383391412,
"eval_loss": 0.5617780685424805,
"eval_runtime": 16.4087,
"eval_samples_per_second": 4.144,
"eval_steps_per_second": 2.072,
"step": 150
},
{
"epoch": 3.693498452012384,
"grad_norm": 0.5430096387863159,
"learning_rate": 7.010431818542297e-07,
"loss": 0.4991950988769531,
"step": 151
},
{
"epoch": 3.718266253869969,
"grad_norm": 0.4858173727989197,
"learning_rate": 6.884733020666084e-07,
"loss": 0.47163355350494385,
"step": 152
},
{
"epoch": 3.7430340557275543,
"grad_norm": 0.4979320168495178,
"learning_rate": 6.759577016488343e-07,
"loss": 0.5382797718048096,
"step": 153
},
{
"epoch": 3.7678018575851393,
"grad_norm": 0.47822287678718567,
"learning_rate": 6.63498561277261e-07,
"loss": 0.5248020887374878,
"step": 154
},
{
"epoch": 3.7925696594427247,
"grad_norm": 0.5561540722846985,
"learning_rate": 6.510980517908333e-07,
"loss": 0.47944825887680054,
"step": 155
},
{
"epoch": 3.8173374613003097,
"grad_norm": 0.510204553604126,
"learning_rate": 6.387583338128471e-07,
"loss": 0.5094054937362671,
"step": 156
},
{
"epoch": 3.8421052631578947,
"grad_norm": 0.4817684590816498,
"learning_rate": 6.264815573744884e-07,
"loss": 0.4909018874168396,
"step": 157
},
{
"epoch": 3.86687306501548,
"grad_norm": 0.4790090024471283,
"learning_rate": 6.142698615402204e-07,
"loss": 0.47690001130104065,
"step": 158
},
{
"epoch": 3.891640866873065,
"grad_norm": 0.4971541464328766,
"learning_rate": 6.021253740350792e-07,
"loss": 0.5042445659637451,
"step": 159
},
{
"epoch": 3.91640866873065,
"grad_norm": 0.5663966536521912,
"learning_rate": 5.900502108739465e-07,
"loss": 0.5802559852600098,
"step": 160
},
{
"epoch": 3.9411764705882355,
"grad_norm": 0.6140542030334473,
"learning_rate": 5.780464759928623e-07,
"loss": 0.5226213932037354,
"step": 161
},
{
"epoch": 3.9659442724458205,
"grad_norm": 0.510217010974884,
"learning_rate": 5.661162608824419e-07,
"loss": 0.487061470746994,
"step": 162
},
{
"epoch": 3.9907120743034055,
"grad_norm": 0.47863468527793884,
"learning_rate": 5.542616442234618e-07,
"loss": 0.49519461393356323,
"step": 163
},
{
"epoch": 4.0,
"grad_norm": 0.8134075999259949,
"learning_rate": 5.424846915246769e-07,
"loss": 0.5006481409072876,
"step": 164
},
{
"epoch": 4.024767801857585,
"grad_norm": 0.5010446906089783,
"learning_rate": 5.307874547629339e-07,
"loss": 0.5043383240699768,
"step": 165
},
{
"epoch": 4.04953560371517,
"grad_norm": 0.5629169344902039,
"learning_rate": 5.191719720256407e-07,
"loss": 0.5104990005493164,
"step": 166
},
{
"epoch": 4.074303405572755,
"grad_norm": 0.5630432367324829,
"learning_rate": 5.076402671556577e-07,
"loss": 0.4841610789299011,
"step": 167
},
{
"epoch": 4.099071207430341,
"grad_norm": 0.46193253993988037,
"learning_rate": 4.961943493986708e-07,
"loss": 0.5317561030387878,
"step": 168
},
{
"epoch": 4.123839009287925,
"grad_norm": 0.5281070470809937,
"learning_rate": 4.848362130531039e-07,
"loss": 0.5141686201095581,
"step": 169
},
{
"epoch": 4.148606811145511,
"grad_norm": 0.927697479724884,
"learning_rate": 4.7356783712264403e-07,
"loss": 0.46369314193725586,
"step": 170
},
{
"epoch": 4.173374613003096,
"grad_norm": 0.5692654252052307,
"learning_rate": 4.623911849714225e-07,
"loss": 0.48228251934051514,
"step": 171
},
{
"epoch": 4.198142414860681,
"grad_norm": 0.48862549662590027,
"learning_rate": 4.5130820398192636e-07,
"loss": 0.5285767316818237,
"step": 172
},
{
"epoch": 4.222910216718266,
"grad_norm": 0.5772708058357239,
"learning_rate": 4.40320825215692e-07,
"loss": 0.5200311541557312,
"step": 173
},
{
"epoch": 4.247678018575852,
"grad_norm": 0.5576812028884888,
"learning_rate": 4.294309630768451e-07,
"loss": 0.5052947402000427,
"step": 174
},
{
"epoch": 4.272445820433436,
"grad_norm": 0.48456260561943054,
"learning_rate": 4.1864051497854027e-07,
"loss": 0.5091853141784668,
"step": 175
},
{
"epoch": 4.2972136222910216,
"grad_norm": 0.4992901086807251,
"learning_rate": 4.079513610123618e-07,
"loss": 0.5285595655441284,
"step": 176
},
{
"epoch": 4.321981424148607,
"grad_norm": 0.560563862323761,
"learning_rate": 3.973653636207437e-07,
"loss": 0.5327163338661194,
"step": 177
},
{
"epoch": 4.346749226006192,
"grad_norm": 0.48380428552627563,
"learning_rate": 3.8688436727246296e-07,
"loss": 0.4750836491584778,
"step": 178
},
{
"epoch": 4.371517027863777,
"grad_norm": 0.4964829385280609,
"learning_rate": 3.765101981412665e-07,
"loss": 0.46454548835754395,
"step": 179
},
{
"epoch": 4.396284829721362,
"grad_norm": 0.4538560211658478,
"learning_rate": 3.6624466378768384e-07,
"loss": 0.51465904712677,
"step": 180
},
{
"epoch": 4.421052631578947,
"grad_norm": 0.6692084074020386,
"learning_rate": 3.560895528440844e-07,
"loss": 0.4617176055908203,
"step": 181
},
{
"epoch": 4.445820433436532,
"grad_norm": 0.47236230969429016,
"learning_rate": 3.4604663470303186e-07,
"loss": 0.5083804130554199,
"step": 182
},
{
"epoch": 4.470588235294118,
"grad_norm": 0.4774688184261322,
"learning_rate": 3.3611765920899183e-07,
"loss": 0.5058382749557495,
"step": 183
},
{
"epoch": 4.495356037151703,
"grad_norm": 0.47210627794265747,
"learning_rate": 3.263043563534428e-07,
"loss": 0.5376588106155396,
"step": 184
},
{
"epoch": 4.520123839009288,
"grad_norm": 0.4772137403488159,
"learning_rate": 3.166084359734513e-07,
"loss": 0.5304179191589355,
"step": 185
},
{
"epoch": 4.544891640866873,
"grad_norm": 0.4682233929634094,
"learning_rate": 3.070315874537531e-07,
"loss": 0.4820975661277771,
"step": 186
},
{
"epoch": 4.569659442724459,
"grad_norm": 0.48219650983810425,
"learning_rate": 2.975754794324015e-07,
"loss": 0.5084782838821411,
"step": 187
},
{
"epoch": 4.594427244582043,
"grad_norm": 0.43362459540367126,
"learning_rate": 2.8824175951002916e-07,
"loss": 0.47581952810287476,
"step": 188
},
{
"epoch": 4.6191950464396285,
"grad_norm": 0.567948579788208,
"learning_rate": 2.790320539627754e-07,
"loss": 0.5314459800720215,
"step": 189
},
{
"epoch": 4.643962848297214,
"grad_norm": 0.5087016224861145,
"learning_rate": 2.6994796745893e-07,
"loss": 0.4740360379219055,
"step": 190
},
{
"epoch": 4.6687306501547985,
"grad_norm": 0.5123845338821411,
"learning_rate": 2.60991082779341e-07,
"loss": 0.5245854258537292,
"step": 191
},
{
"epoch": 4.693498452012384,
"grad_norm": 0.4884699285030365,
"learning_rate": 2.521629605416354e-07,
"loss": 0.5254173278808594,
"step": 192
},
{
"epoch": 4.718266253869969,
"grad_norm": 0.5492839217185974,
"learning_rate": 2.434651389283042e-07,
"loss": 0.5060293674468994,
"step": 193
},
{
"epoch": 4.743034055727554,
"grad_norm": 0.4537581503391266,
"learning_rate": 2.3489913341869193e-07,
"loss": 0.5028636455535889,
"step": 194
},
{
"epoch": 4.767801857585139,
"grad_norm": 0.5206896662712097,
"learning_rate": 2.264664365249469e-07,
"loss": 0.509818971157074,
"step": 195
},
{
"epoch": 4.792569659442725,
"grad_norm": 0.5348969101905823,
"learning_rate": 2.181685175319702e-07,
"loss": 0.4900963306427002,
"step": 196
},
{
"epoch": 4.817337461300309,
"grad_norm": 0.478466659784317,
"learning_rate": 2.100068222414121e-07,
"loss": 0.5366532802581787,
"step": 197
},
{
"epoch": 4.842105263157895,
"grad_norm": 0.4873082637786865,
"learning_rate": 2.0198277271976049e-07,
"loss": 0.5138839483261108,
"step": 198
},
{
"epoch": 4.86687306501548,
"grad_norm": 0.5307355523109436,
"learning_rate": 1.9409776705056514e-07,
"loss": 0.48487958312034607,
"step": 199
},
{
"epoch": 4.891640866873065,
"grad_norm": 0.6182578206062317,
"learning_rate": 1.863531790908398e-07,
"loss": 0.49715912342071533,
"step": 200
},
{
"epoch": 4.891640866873065,
"eval_accuracy": 0.8215848485329422,
"eval_loss": 0.5621271133422852,
"eval_runtime": 16.3624,
"eval_samples_per_second": 4.156,
"eval_steps_per_second": 2.078,
"step": 200
},
{
"epoch": 4.91640866873065,
"grad_norm": 0.5110271573066711,
"learning_rate": 1.787503582316864e-07,
"loss": 0.5255718231201172,
"step": 201
},
{
"epoch": 4.9411764705882355,
"grad_norm": 0.4957195222377777,
"learning_rate": 1.7129062916318137e-07,
"loss": 0.5106043219566345,
"step": 202
},
{
"epoch": 4.965944272445821,
"grad_norm": 1.4632741212844849,
"learning_rate": 1.6397529164356606e-07,
"loss": 0.5344016551971436,
"step": 203
},
{
"epoch": 4.9907120743034055,
"grad_norm": 0.533440113067627,
"learning_rate": 1.5680562027278154e-07,
"loss": 0.5215489268302917,
"step": 204
},
{
"epoch": 5.0,
"grad_norm": 0.8572560548782349,
"learning_rate": 1.49782864270386e-07,
"loss": 0.5227999687194824,
"step": 205
},
{
"epoch": 5.024767801857585,
"grad_norm": 0.43222400546073914,
"learning_rate": 1.429082472578954e-07,
"loss": 0.5099145174026489,
"step": 206
},
{
"epoch": 5.04953560371517,
"grad_norm": 0.47421810030937195,
"learning_rate": 1.3618296704558364e-07,
"loss": 0.5271211862564087,
"step": 207
},
{
"epoch": 5.074303405572755,
"grad_norm": 0.5383461117744446,
"learning_rate": 1.2960819542378053e-07,
"loss": 0.548247218132019,
"step": 208
},
{
"epoch": 5.099071207430341,
"grad_norm": 0.513953685760498,
"learning_rate": 1.2318507795870137e-07,
"loss": 0.47977253794670105,
"step": 209
},
{
"epoch": 5.123839009287925,
"grad_norm": 0.5112437605857849,
"learning_rate": 1.1691473379284944e-07,
"loss": 0.4924686551094055,
"step": 210
},
{
"epoch": 5.148606811145511,
"grad_norm": 0.5439184308052063,
"learning_rate": 1.1079825545001886e-07,
"loss": 0.4926351308822632,
"step": 211
},
{
"epoch": 5.173374613003096,
"grad_norm": 0.47784221172332764,
"learning_rate": 1.0483670864493777e-07,
"loss": 0.5255255699157715,
"step": 212
},
{
"epoch": 5.198142414860681,
"grad_norm": 0.48372480273246765,
"learning_rate": 9.903113209758096e-08,
"loss": 0.5388856530189514,
"step": 213
},
{
"epoch": 5.222910216718266,
"grad_norm": 0.4922617971897125,
"learning_rate": 9.338253735218748e-08,
"loss": 0.4714866280555725,
"step": 214
},
{
"epoch": 5.247678018575852,
"grad_norm": 0.5694555044174194,
"learning_rate": 8.789190860101226e-08,
"loss": 0.49757862091064453,
"step": 215
},
{
"epoch": 5.272445820433436,
"grad_norm": 0.5285799503326416,
"learning_rate": 8.256020251284379e-08,
"loss": 0.5523006916046143,
"step": 216
},
{
"epoch": 5.2972136222910216,
"grad_norm": 0.542019784450531,
"learning_rate": 7.73883480663171e-08,
"loss": 0.4939878582954407,
"step": 217
},
{
"epoch": 5.321981424148607,
"grad_norm": 0.4783063232898712,
"learning_rate": 7.23772463880522e-08,
"loss": 0.5162045359611511,
"step": 218
},
{
"epoch": 5.346749226006192,
"grad_norm": 0.4960096776485443,
"learning_rate": 6.75277705956443e-08,
"loss": 0.5186662673950195,
"step": 219
},
{
"epoch": 5.371517027863777,
"grad_norm": 0.4951794147491455,
"learning_rate": 6.284076564553464e-08,
"loss": 0.48755860328674316,
"step": 220
},
{
"epoch": 5.396284829721362,
"grad_norm": 0.4898841381072998,
"learning_rate": 5.831704818578842e-08,
"loss": 0.5034775733947754,
"step": 221
},
{
"epoch": 5.421052631578947,
"grad_norm": 0.540875256061554,
"learning_rate": 5.395740641380531e-08,
"loss": 0.4632171094417572,
"step": 222
},
{
"epoch": 5.445820433436532,
"grad_norm": 0.45750898122787476,
"learning_rate": 4.976259993898502e-08,
"loss": 0.49796921014785767,
"step": 223
},
{
"epoch": 5.470588235294118,
"grad_norm": 0.5052651166915894,
"learning_rate": 4.573335965037706e-08,
"loss": 0.47650158405303955,
"step": 224
},
{
"epoch": 5.495356037151703,
"grad_norm": 0.4999431371688843,
"learning_rate": 4.187038758933203e-08,
"loss": 0.49834519624710083,
"step": 225
},
{
"epoch": 5.520123839009288,
"grad_norm": 0.5175738334655762,
"learning_rate": 3.817435682718095e-08,
"loss": 0.46955606341362,
"step": 226
},
{
"epoch": 5.544891640866873,
"grad_norm": 0.4690812826156616,
"learning_rate": 3.464591134796135e-08,
"loss": 0.5154824256896973,
"step": 227
},
{
"epoch": 5.569659442724459,
"grad_norm": 0.4758513867855072,
"learning_rate": 3.1285665936211516e-08,
"loss": 0.5336707830429077,
"step": 228
},
{
"epoch": 5.594427244582043,
"grad_norm": 0.442473441362381,
"learning_rate": 2.8094206069852355e-08,
"loss": 0.4967498779296875,
"step": 229
},
{
"epoch": 5.6191950464396285,
"grad_norm": 0.4868296682834625,
"learning_rate": 2.507208781817638e-08,
"loss": 0.5311983823776245,
"step": 230
},
{
"epoch": 5.643962848297214,
"grad_norm": 0.5476986169815063,
"learning_rate": 2.221983774495928e-08,
"loss": 0.5054424405097961,
"step": 231
},
{
"epoch": 5.6687306501547985,
"grad_norm": 0.4974565804004669,
"learning_rate": 1.953795281671333e-08,
"loss": 0.5006812214851379,
"step": 232
},
{
"epoch": 5.693498452012384,
"grad_norm": 0.5025091767311096,
"learning_rate": 1.7026900316098212e-08,
"loss": 0.527012825012207,
"step": 233
},
{
"epoch": 5.718266253869969,
"grad_norm": 0.46924424171447754,
"learning_rate": 1.4687117760502576e-08,
"loss": 0.4735889434814453,
"step": 234
},
{
"epoch": 5.743034055727554,
"grad_norm": 0.454560786485672,
"learning_rate": 1.2519012825812803e-08,
"loss": 0.49276185035705566,
"step": 235
},
{
"epoch": 5.767801857585139,
"grad_norm": 0.4710627496242523,
"learning_rate": 1.0522963275380492e-08,
"loss": 0.5048189759254456,
"step": 236
},
{
"epoch": 5.792569659442725,
"grad_norm": 0.4550038278102875,
"learning_rate": 8.699316894203223e-09,
"loss": 0.513171911239624,
"step": 237
},
{
"epoch": 5.817337461300309,
"grad_norm": 0.5602344870567322,
"learning_rate": 7.048391428326584e-09,
"loss": 0.5195218324661255,
"step": 238
},
{
"epoch": 5.842105263157895,
"grad_norm": 0.4764668643474579,
"learning_rate": 5.570474529481561e-09,
"loss": 0.49439120292663574,
"step": 239
},
{
"epoch": 5.86687306501548,
"grad_norm": 0.7008131146430969,
"learning_rate": 4.265823704965532e-09,
"loss": 0.5026534795761108,
"step": 240
},
{
"epoch": 5.891640866873065,
"grad_norm": 0.5155523419380188,
"learning_rate": 3.1346662727740338e-09,
"loss": 0.505569338798523,
"step": 241
},
{
"epoch": 5.91640866873065,
"grad_norm": 0.48813626170158386,
"learning_rate": 2.1771993219946718e-09,
"loss": 0.4332225024700165,
"step": 242
},
{
"epoch": 5.9411764705882355,
"grad_norm": 0.5733649134635925,
"learning_rate": 1.393589678466367e-09,
"loss": 0.5184577703475952,
"step": 243
},
{
"epoch": 5.965944272445821,
"grad_norm": 0.47005656361579895,
"learning_rate": 7.839738757123848e-10,
"loss": 0.48927992582321167,
"step": 244
},
{
"epoch": 5.9907120743034055,
"grad_norm": 0.519534170627594,
"learning_rate": 3.484581311511414e-10,
"loss": 0.5252695679664612,
"step": 245
},
{
"epoch": 6.0,
"grad_norm": 0.8245697617530823,
"learning_rate": 8.711832758934168e-11,
"loss": 0.485756516456604,
"step": 246
}
],
"logging_steps": 1,
"max_steps": 246,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 63272699183104.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}