{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.964980544747082,
"eval_steps": 500,
"global_step": 192,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01556420233463035,
"grad_norm": 32.7662513325928,
"learning_rate": 3.3333333333333333e-06,
"loss": 2.0427,
"step": 1
},
{
"epoch": 0.0311284046692607,
"grad_norm": 14.17953242397471,
"learning_rate": 6.666666666666667e-06,
"loss": 1.8704,
"step": 2
},
{
"epoch": 0.04669260700389105,
"grad_norm": 9.581323731857186,
"learning_rate": 1e-05,
"loss": 2.0032,
"step": 3
},
{
"epoch": 0.0622568093385214,
"grad_norm": 12.804841760907372,
"learning_rate": 1.3333333333333333e-05,
"loss": 2.2126,
"step": 4
},
{
"epoch": 0.07782101167315175,
"grad_norm": 8.478925783183458,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.4769,
"step": 5
},
{
"epoch": 0.0933852140077821,
"grad_norm": 8.969266052384613,
"learning_rate": 2e-05,
"loss": 1.5752,
"step": 6
},
{
"epoch": 0.10894941634241245,
"grad_norm": 11.83099546071694,
"learning_rate": 1.999871626303739e-05,
"loss": 1.5588,
"step": 7
},
{
"epoch": 0.1245136186770428,
"grad_norm": 5.954526723326734,
"learning_rate": 1.999486541836746e-05,
"loss": 1.3619,
"step": 8
},
{
"epoch": 0.14007782101167315,
"grad_norm": 36.01845182362445,
"learning_rate": 1.9988448564539475e-05,
"loss": 1.3373,
"step": 9
},
{
"epoch": 0.1556420233463035,
"grad_norm": 9.862509446330417,
"learning_rate": 1.9979467532120636e-05,
"loss": 1.8391,
"step": 10
},
{
"epoch": 0.17120622568093385,
"grad_norm": 6.924375728787968,
"learning_rate": 1.99679248831739e-05,
"loss": 1.7063,
"step": 11
},
{
"epoch": 0.1867704280155642,
"grad_norm": 5.947859490849787,
"learning_rate": 1.9953823910527057e-05,
"loss": 1.3137,
"step": 12
},
{
"epoch": 0.20233463035019456,
"grad_norm": 9.376302417818316,
"learning_rate": 1.9937168636833405e-05,
"loss": 1.4841,
"step": 13
},
{
"epoch": 0.2178988326848249,
"grad_norm": 8.09315137157763,
"learning_rate": 1.9917963813424154e-05,
"loss": 1.6061,
"step": 14
},
{
"epoch": 0.23346303501945526,
"grad_norm": 12.912871635467674,
"learning_rate": 1.9896214918953003e-05,
"loss": 1.5974,
"step": 15
},
{
"epoch": 0.2490272373540856,
"grad_norm": 4.890429692775311,
"learning_rate": 1.9871928157833235e-05,
"loss": 1.3604,
"step": 16
},
{
"epoch": 0.26459143968871596,
"grad_norm": 5.96078805337046,
"learning_rate": 1.9845110458467724e-05,
"loss": 1.4205,
"step": 17
},
{
"epoch": 0.2801556420233463,
"grad_norm": 5.406312981626749,
"learning_rate": 1.981576947127245e-05,
"loss": 1.3691,
"step": 18
},
{
"epoch": 0.29571984435797666,
"grad_norm": 6.229338294974077,
"learning_rate": 1.978391356649404e-05,
"loss": 1.4662,
"step": 19
},
{
"epoch": 0.311284046692607,
"grad_norm": 7.078018638767833,
"learning_rate": 1.9749551831821917e-05,
"loss": 1.4734,
"step": 20
},
{
"epoch": 0.32684824902723736,
"grad_norm": 11.02898750969552,
"learning_rate": 1.971269406979584e-05,
"loss": 1.3636,
"step": 21
},
{
"epoch": 0.3424124513618677,
"grad_norm": 6.82235626712361,
"learning_rate": 1.9673350795009468e-05,
"loss": 1.5607,
"step": 22
},
{
"epoch": 0.35797665369649806,
"grad_norm": 8.481695271045934,
"learning_rate": 1.963153323111082e-05,
"loss": 1.2882,
"step": 23
},
{
"epoch": 0.3735408560311284,
"grad_norm": 17.63369186879554,
"learning_rate": 1.958725330760044e-05,
"loss": 1.5022,
"step": 24
},
{
"epoch": 0.38910505836575876,
"grad_norm": 17.68569390194203,
"learning_rate": 1.9540523656428223e-05,
"loss": 1.2848,
"step": 25
},
{
"epoch": 0.4046692607003891,
"grad_norm": 6.641749083734281,
"learning_rate": 1.9491357608389824e-05,
"loss": 1.2358,
"step": 26
},
{
"epoch": 0.42023346303501946,
"grad_norm": 23.867198010532427,
"learning_rate": 1.9439769189323727e-05,
"loss": 1.2404,
"step": 27
},
{
"epoch": 0.4357976653696498,
"grad_norm": 6.690231465484989,
"learning_rate": 1.9385773116110015e-05,
"loss": 1.3622,
"step": 28
},
{
"epoch": 0.45136186770428016,
"grad_norm": 7.267926238407661,
"learning_rate": 1.9329384792472036e-05,
"loss": 1.3815,
"step": 29
},
{
"epoch": 0.4669260700389105,
"grad_norm": 7.371143249720988,
"learning_rate": 1.9270620304582077e-05,
"loss": 1.4497,
"step": 30
},
{
"epoch": 0.48249027237354086,
"grad_norm": 5.739368570493641,
"learning_rate": 1.92094964164724e-05,
"loss": 1.439,
"step": 31
},
{
"epoch": 0.4980544747081712,
"grad_norm": 10.712092924918982,
"learning_rate": 1.9146030565252894e-05,
"loss": 1.2185,
"step": 32
},
{
"epoch": 0.5136186770428015,
"grad_norm": 5.745736001033042,
"learning_rate": 1.9080240856136675e-05,
"loss": 1.4049,
"step": 33
},
{
"epoch": 0.5291828793774319,
"grad_norm": 7.632465200177505,
"learning_rate": 1.9012146057275168e-05,
"loss": 1.5059,
"step": 34
},
{
"epoch": 0.5447470817120622,
"grad_norm": 8.286309897189899,
"learning_rate": 1.8941765594403975e-05,
"loss": 1.4689,
"step": 35
},
{
"epoch": 0.5603112840466926,
"grad_norm": 8.480450264883203,
"learning_rate": 1.886911954530124e-05,
"loss": 1.5331,
"step": 36
},
{
"epoch": 0.5758754863813229,
"grad_norm": 9.499572491510447,
"learning_rate": 1.879422863405995e-05,
"loss": 1.635,
"step": 37
},
{
"epoch": 0.5914396887159533,
"grad_norm": 9.489789008744502,
"learning_rate": 1.8717114225175858e-05,
"loss": 1.4844,
"step": 38
},
{
"epoch": 0.6070038910505836,
"grad_norm": 7.2947137328088765,
"learning_rate": 1.863779831745276e-05,
"loss": 1.4507,
"step": 39
},
{
"epoch": 0.622568093385214,
"grad_norm": 5.913323265251484,
"learning_rate": 1.8556303537726753e-05,
"loss": 1.6038,
"step": 40
},
{
"epoch": 0.6381322957198443,
"grad_norm": 14.132278657345845,
"learning_rate": 1.8472653134411388e-05,
"loss": 1.3738,
"step": 41
},
{
"epoch": 0.6536964980544747,
"grad_norm": 7.698733116161007,
"learning_rate": 1.8386870970865488e-05,
"loss": 1.1948,
"step": 42
},
{
"epoch": 0.669260700389105,
"grad_norm": 6.593898707009616,
"learning_rate": 1.8298981518585514e-05,
"loss": 1.2161,
"step": 43
},
{
"epoch": 0.6848249027237354,
"grad_norm": 8.732218414278748,
"learning_rate": 1.8209009850224465e-05,
"loss": 1.3516,
"step": 44
},
{
"epoch": 0.7003891050583657,
"grad_norm": 10.91781491470466,
"learning_rate": 1.811698163243929e-05,
"loss": 1.3615,
"step": 45
},
{
"epoch": 0.7159533073929961,
"grad_norm": 10.205838168734168,
"learning_rate": 1.8022923118568827e-05,
"loss": 1.4948,
"step": 46
},
{
"epoch": 0.7315175097276264,
"grad_norm": 17.343787523950684,
"learning_rate": 1.7926861141144393e-05,
"loss": 1.4923,
"step": 47
},
{
"epoch": 0.7470817120622568,
"grad_norm": 6.218713745426841,
"learning_rate": 1.782882310423512e-05,
"loss": 1.3801,
"step": 48
},
{
"epoch": 0.7626459143968871,
"grad_norm": 9.930901357054527,
"learning_rate": 1.7728836975630283e-05,
"loss": 1.4591,
"step": 49
},
{
"epoch": 0.7782101167315175,
"grad_norm": 10.565666651397537,
"learning_rate": 1.7626931278860773e-05,
"loss": 1.2283,
"step": 50
},
{
"epoch": 0.7937743190661478,
"grad_norm": 7.173094908653802,
"learning_rate": 1.752313508506208e-05,
"loss": 1.1787,
"step": 51
},
{
"epoch": 0.8093385214007782,
"grad_norm": 7.454713889992053,
"learning_rate": 1.7417478004680982e-05,
"loss": 1.387,
"step": 52
},
{
"epoch": 0.8249027237354085,
"grad_norm": 8.888727310362047,
"learning_rate": 1.730999017902848e-05,
"loss": 1.363,
"step": 53
},
{
"epoch": 0.8404669260700389,
"grad_norm": 8.910897747960528,
"learning_rate": 1.720070227168118e-05,
"loss": 1.4924,
"step": 54
},
{
"epoch": 0.8560311284046692,
"grad_norm": 8.560591680367171,
"learning_rate": 1.708964545973382e-05,
"loss": 1.5208,
"step": 55
},
{
"epoch": 0.8715953307392996,
"grad_norm": 6.31006260674449,
"learning_rate": 1.6976851424905153e-05,
"loss": 1.1552,
"step": 56
},
{
"epoch": 0.8871595330739299,
"grad_norm": 15.810845104599778,
"learning_rate": 1.6862352344500004e-05,
"loss": 1.2454,
"step": 57
},
{
"epoch": 0.9027237354085603,
"grad_norm": 6.767459348182446,
"learning_rate": 1.674618088222985e-05,
"loss": 1.2886,
"step": 58
},
{
"epoch": 0.9182879377431906,
"grad_norm": 10.51614814940254,
"learning_rate": 1.6628370178894734e-05,
"loss": 1.2644,
"step": 59
},
{
"epoch": 0.933852140077821,
"grad_norm": 7.047043052174269,
"learning_rate": 1.6508953842928966e-05,
"loss": 1.443,
"step": 60
},
{
"epoch": 0.9494163424124513,
"grad_norm": 8.579327238483026,
"learning_rate": 1.638796594081354e-05,
"loss": 1.3322,
"step": 61
},
{
"epoch": 0.9649805447470817,
"grad_norm": 7.0287097887612235,
"learning_rate": 1.626544098735777e-05,
"loss": 1.4198,
"step": 62
},
{
"epoch": 0.980544747081712,
"grad_norm": 16.693418616763456,
"learning_rate": 1.614141393585313e-05,
"loss": 1.4243,
"step": 63
},
{
"epoch": 0.9961089494163424,
"grad_norm": 4.4493625007162185,
"learning_rate": 1.601592016810193e-05,
"loss": 1.0317,
"step": 64
},
{
"epoch": 1.0,
"grad_norm": 4.4493625007162185,
"learning_rate": 1.588899548432377e-05,
"loss": 0.3818,
"step": 65
},
{
"epoch": 1.0155642023346303,
"grad_norm": 14.571145059681054,
"learning_rate": 1.5760676092942663e-05,
"loss": 1.3258,
"step": 66
},
{
"epoch": 1.0311284046692606,
"grad_norm": 7.149351806597666,
"learning_rate": 1.563099860025766e-05,
"loss": 1.2366,
"step": 67
},
{
"epoch": 1.046692607003891,
"grad_norm": 40.50757660945441,
"learning_rate": 1.55e-05,
"loss": 1.6179,
"step": 68
},
{
"epoch": 1.0622568093385214,
"grad_norm": 15.44110442369705,
"learning_rate": 1.5367717662779732e-05,
"loss": 1.3405,
"step": 69
},
{
"epoch": 1.0778210116731517,
"grad_norm": 6.675357357971338,
"learning_rate": 1.5234189325424802e-05,
"loss": 1.1276,
"step": 70
},
{
"epoch": 1.0933852140077822,
"grad_norm": 7.1518156856898605,
"learning_rate": 1.5099453080215705e-05,
"loss": 1.2737,
"step": 71
},
{
"epoch": 1.1089494163424125,
"grad_norm": 6.2755998715712815,
"learning_rate": 1.4963547364018711e-05,
"loss": 1.2964,
"step": 72
},
{
"epoch": 1.1245136186770428,
"grad_norm": 7.749171240376019,
"learning_rate": 1.4826510947320767e-05,
"loss": 1.2542,
"step": 73
},
{
"epoch": 1.140077821011673,
"grad_norm": 8.188433813273727,
"learning_rate": 1.4688382923169289e-05,
"loss": 1.2587,
"step": 74
},
{
"epoch": 1.1556420233463034,
"grad_norm": 7.386225279732122,
"learning_rate": 1.4549202696019868e-05,
"loss": 1.3309,
"step": 75
},
{
"epoch": 1.171206225680934,
"grad_norm": 6.253316144967461,
"learning_rate": 1.4409009970495184e-05,
"loss": 1.3574,
"step": 76
},
{
"epoch": 1.1867704280155642,
"grad_norm": 10.042142885418704,
"learning_rate": 1.4267844740058273e-05,
"loss": 1.1808,
"step": 77
},
{
"epoch": 1.2023346303501945,
"grad_norm": 8.752169534398908,
"learning_rate": 1.4125747275603384e-05,
"loss": 1.2535,
"step": 78
},
{
"epoch": 1.217898832684825,
"grad_norm": 5.922268212950014,
"learning_rate": 1.3982758113967723e-05,
"loss": 1.4928,
"step": 79
},
{
"epoch": 1.2334630350194553,
"grad_norm": 13.340943215095326,
"learning_rate": 1.3838918046367302e-05,
"loss": 1.5576,
"step": 80
},
{
"epoch": 1.2490272373540856,
"grad_norm": 11.447188182283101,
"learning_rate": 1.3694268106760225e-05,
"loss": 1.3702,
"step": 81
},
{
"epoch": 1.264591439688716,
"grad_norm": 8.785930153191286,
"learning_rate": 1.3548849560140735e-05,
"loss": 1.5769,
"step": 82
},
{
"epoch": 1.2801556420233462,
"grad_norm": 11.289308481687042,
"learning_rate": 1.3402703890767365e-05,
"loss": 1.4041,
"step": 83
},
{
"epoch": 1.2957198443579767,
"grad_norm": 6.0701513028110865,
"learning_rate": 1.3255872790328485e-05,
"loss": 1.2474,
"step": 84
},
{
"epoch": 1.311284046692607,
"grad_norm": 15.420406437695464,
"learning_rate": 1.310839814604874e-05,
"loss": 1.3971,
"step": 85
},
{
"epoch": 1.3268482490272373,
"grad_norm": 11.112901019437691,
"learning_rate": 1.2960322028739664e-05,
"loss": 1.292,
"step": 86
},
{
"epoch": 1.3424124513618678,
"grad_norm": 6.574313635072488,
"learning_rate": 1.2811686680797942e-05,
"loss": 1.5592,
"step": 87
},
{
"epoch": 1.3579766536964981,
"grad_norm": 19.9661703497788,
"learning_rate": 1.2662534504154707e-05,
"loss": 1.5115,
"step": 88
},
{
"epoch": 1.3735408560311284,
"grad_norm": 18.38165853721984,
"learning_rate": 1.2512908048179336e-05,
"loss": 1.5681,
"step": 89
},
{
"epoch": 1.3891050583657587,
"grad_norm": 7.567686089019195,
"learning_rate": 1.236284999754119e-05,
"loss": 1.2417,
"step": 90
},
{
"epoch": 1.404669260700389,
"grad_norm": 12.905705238689295,
"learning_rate": 1.221240316003275e-05,
"loss": 1.2854,
"step": 91
},
{
"epoch": 1.4202334630350195,
"grad_norm": 20.818267922715442,
"learning_rate": 1.2061610454357618e-05,
"loss": 1.5286,
"step": 92
},
{
"epoch": 1.4357976653696498,
"grad_norm": 6.109213052045277,
"learning_rate": 1.1910514897886892e-05,
"loss": 1.3168,
"step": 93
},
{
"epoch": 1.45136186770428,
"grad_norm": 15.101872320411488,
"learning_rate": 1.1759159594387404e-05,
"loss": 1.5504,
"step": 94
},
{
"epoch": 1.4669260700389106,
"grad_norm": 6.233434571455187,
"learning_rate": 1.1607587721725288e-05,
"loss": 1.5917,
"step": 95
},
{
"epoch": 1.482490272373541,
"grad_norm": 9.122907207075865,
"learning_rate": 1.1455842519548417e-05,
"loss": 1.53,
"step": 96
},
{
"epoch": 1.4980544747081712,
"grad_norm": 7.271922289011854,
"learning_rate": 1.1303967276951215e-05,
"loss": 1.3232,
"step": 97
},
{
"epoch": 1.5136186770428015,
"grad_norm": 7.525926983479133,
"learning_rate": 1.115200532012538e-05,
"loss": 1.434,
"step": 98
},
{
"epoch": 1.5291828793774318,
"grad_norm": 13.459453151432884,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.2062,
"step": 99
},
{
"epoch": 1.544747081712062,
"grad_norm": 6.34554520110176,
"learning_rate": 1.0847994679874623e-05,
"loss": 1.2515,
"step": 100
},
{
"epoch": 1.5603112840466926,
"grad_norm": 8.913386857418164,
"learning_rate": 1.0696032723048787e-05,
"loss": 1.2267,
"step": 101
},
{
"epoch": 1.575875486381323,
"grad_norm": 9.547469800185302,
"learning_rate": 1.0544157480451586e-05,
"loss": 1.1735,
"step": 102
},
{
"epoch": 1.5914396887159534,
"grad_norm": 7.932054956466567,
"learning_rate": 1.0392412278274714e-05,
"loss": 1.205,
"step": 103
},
{
"epoch": 1.6070038910505837,
"grad_norm": 9.208669039900636,
"learning_rate": 1.02408404056126e-05,
"loss": 1.1383,
"step": 104
},
{
"epoch": 1.622568093385214,
"grad_norm": 7.709935193825099,
"learning_rate": 1.0089485102113113e-05,
"loss": 1.4121,
"step": 105
},
{
"epoch": 1.6381322957198443,
"grad_norm": 6.819923905554452,
"learning_rate": 9.938389545642388e-06,
"loss": 1.3696,
"step": 106
},
{
"epoch": 1.6536964980544746,
"grad_norm": 8.206400329676246,
"learning_rate": 9.787596839967254e-06,
"loss": 1.3651,
"step": 107
},
{
"epoch": 1.669260700389105,
"grad_norm": 10.260363436595911,
"learning_rate": 9.637150002458813e-06,
"loss": 1.2666,
"step": 108
},
{
"epoch": 1.6848249027237354,
"grad_norm": 26.767196552606528,
"learning_rate": 9.487091951820669e-06,
"loss": 1.479,
"step": 109
},
{
"epoch": 1.7003891050583657,
"grad_norm": 7.51563263665608,
"learning_rate": 9.337465495845299e-06,
"loss": 1.2219,
"step": 110
},
{
"epoch": 1.7159533073929962,
"grad_norm": 12.755450160808891,
"learning_rate": 9.188313319202057e-06,
"loss": 1.4279,
"step": 111
},
{
"epoch": 1.7315175097276265,
"grad_norm": 6.349610495085197,
"learning_rate": 9.039677971260337e-06,
"loss": 1.4551,
"step": 112
},
{
"epoch": 1.7470817120622568,
"grad_norm": 8.917885244170789,
"learning_rate": 8.891601853951262e-06,
"loss": 1.2766,
"step": 113
},
{
"epoch": 1.7626459143968871,
"grad_norm": 13.766988130785693,
"learning_rate": 8.744127209671516e-06,
"loss": 1.2214,
"step": 114
},
{
"epoch": 1.7782101167315174,
"grad_norm": 15.79931988964264,
"learning_rate": 8.597296109232636e-06,
"loss": 1.2607,
"step": 115
},
{
"epoch": 1.7937743190661477,
"grad_norm": 27.982585836161547,
"learning_rate": 8.451150439859264e-06,
"loss": 1.213,
"step": 116
},
{
"epoch": 1.8093385214007782,
"grad_norm": 6.366753369802077,
"learning_rate": 8.30573189323978e-06,
"loss": 1.4226,
"step": 117
},
{
"epoch": 1.8249027237354085,
"grad_norm": 8.762108357477535,
"learning_rate": 8.161081953632701e-06,
"loss": 1.2593,
"step": 118
},
{
"epoch": 1.840466926070039,
"grad_norm": 9.6117893036037,
"learning_rate": 8.01724188603228e-06,
"loss": 1.625,
"step": 119
},
{
"epoch": 1.8560311284046693,
"grad_norm": 25.20227315878213,
"learning_rate": 7.87425272439662e-06,
"loss": 1.7573,
"step": 120
},
{
"epoch": 1.8715953307392996,
"grad_norm": 18.50697294390281,
"learning_rate": 7.732155259941729e-06,
"loss": 1.3655,
"step": 121
},
{
"epoch": 1.88715953307393,
"grad_norm": 11.419496186766294,
"learning_rate": 7.590990029504816e-06,
"loss": 1.2208,
"step": 122
},
{
"epoch": 1.9027237354085602,
"grad_norm": 12.960895376933111,
"learning_rate": 7.450797303980135e-06,
"loss": 1.1531,
"step": 123
},
{
"epoch": 1.9182879377431905,
"grad_norm": 10.895377744831976,
"learning_rate": 7.311617076830715e-06,
"loss": 1.2867,
"step": 124
},
{
"epoch": 1.933852140077821,
"grad_norm": 9.198976531684588,
"learning_rate": 7.173489052679236e-06,
"loss": 1.3783,
"step": 125
},
{
"epoch": 1.9494163424124513,
"grad_norm": 7.6941427655967365,
"learning_rate": 7.0364526359812924e-06,
"loss": 1.5269,
"step": 126
},
{
"epoch": 1.9649805447470818,
"grad_norm": 7.578325575993518,
"learning_rate": 6.900546919784295e-06,
"loss": 1.479,
"step": 127
},
{
"epoch": 1.9805447470817121,
"grad_norm": 9.809861880346132,
"learning_rate": 6.7658106745752015e-06,
"loss": 1.3796,
"step": 128
},
{
"epoch": 1.9961089494163424,
"grad_norm": 11.809424540063796,
"learning_rate": 6.632282337220272e-06,
"loss": 1.8018,
"step": 129
},
{
"epoch": 2.0,
"grad_norm": 11.809424540063796,
"learning_rate": 6.500000000000003e-06,
"loss": 0.3816,
"step": 130
},
{
"epoch": 2.0155642023346303,
"grad_norm": 10.863741493343777,
"learning_rate": 6.369001399742344e-06,
"loss": 1.2037,
"step": 131
},
{
"epoch": 2.0311284046692606,
"grad_norm": 5.962018084798566,
"learning_rate": 6.239323907057342e-06,
"loss": 0.9657,
"step": 132
},
{
"epoch": 2.046692607003891,
"grad_norm": 4.788797901834975,
"learning_rate": 6.1110045156762355e-06,
"loss": 1.1664,
"step": 133
},
{
"epoch": 2.062256809338521,
"grad_norm": 6.2976648307960446,
"learning_rate": 5.984079831898073e-06,
"loss": 1.4275,
"step": 134
},
{
"epoch": 2.077821011673152,
"grad_norm": 11.390981419662713,
"learning_rate": 5.8585860641468674e-06,
"loss": 1.1395,
"step": 135
},
{
"epoch": 2.093385214007782,
"grad_norm": 6.57931364685089,
"learning_rate": 5.7345590126422315e-06,
"loss": 1.2979,
"step": 136
},
{
"epoch": 2.1089494163424125,
"grad_norm": 9.124550827147443,
"learning_rate": 5.612034059186464e-06,
"loss": 1.5149,
"step": 137
},
{
"epoch": 2.124513618677043,
"grad_norm": 7.9154220919147145,
"learning_rate": 5.491046157071034e-06,
"loss": 1.2253,
"step": 138
},
{
"epoch": 2.140077821011673,
"grad_norm": 9.043495071655409,
"learning_rate": 5.37162982110527e-06,
"loss": 1.2771,
"step": 139
},
{
"epoch": 2.1556420233463034,
"grad_norm": 7.246287181191101,
"learning_rate": 5.253819117770149e-06,
"loss": 1.28,
"step": 140
},
{
"epoch": 2.1712062256809337,
"grad_norm": 14.135867016035936,
"learning_rate": 5.137647655500002e-06,
"loss": 1.2389,
"step": 141
},
{
"epoch": 2.1867704280155644,
"grad_norm": 6.191000880702552,
"learning_rate": 5.023148575094847e-06,
"loss": 1.3685,
"step": 142
},
{
"epoch": 2.2023346303501947,
"grad_norm": 6.912497051747966,
"learning_rate": 4.910354540266184e-06,
"loss": 1.1248,
"step": 143
},
{
"epoch": 2.217898832684825,
"grad_norm": 7.634399054473615,
"learning_rate": 4.799297728318821e-06,
"loss": 1.2091,
"step": 144
},
{
"epoch": 2.2334630350194553,
"grad_norm": 8.654404179498275,
"learning_rate": 4.690009820971527e-06,
"loss": 1.2775,
"step": 145
},
{
"epoch": 2.2490272373540856,
"grad_norm": 5.486520320319506,
"learning_rate": 4.582521995319019e-06,
"loss": 1.3234,
"step": 146
},
{
"epoch": 2.264591439688716,
"grad_norm": 14.08610341346389,
"learning_rate": 4.476864914937923e-06,
"loss": 1.1865,
"step": 147
},
{
"epoch": 2.280155642023346,
"grad_norm": 11.693098373701448,
"learning_rate": 4.373068721139227e-06,
"loss": 1.4238,
"step": 148
},
{
"epoch": 2.2957198443579765,
"grad_norm": 9.49971316853895,
"learning_rate": 4.271163024369722e-06,
"loss": 1.1235,
"step": 149
},
{
"epoch": 2.311284046692607,
"grad_norm": 7.887870435405849,
"learning_rate": 4.171176895764882e-06,
"loss": 1.1697,
"step": 150
},
{
"epoch": 2.3268482490272375,
"grad_norm": 7.29104535243051,
"learning_rate": 4.07313885885561e-06,
"loss": 1.4309,
"step": 151
},
{
"epoch": 2.342412451361868,
"grad_norm": 10.320755505362524,
"learning_rate": 3.977076881431175e-06,
"loss": 1.3613,
"step": 152
},
{
"epoch": 2.357976653696498,
"grad_norm": 5.857317648653641,
"learning_rate": 3.883018367560715e-06,
"loss": 1.3462,
"step": 153
},
{
"epoch": 2.3735408560311284,
"grad_norm": 9.13512553833875,
"learning_rate": 3.7909901497755408e-06,
"loss": 1.3862,
"step": 154
},
{
"epoch": 2.3891050583657587,
"grad_norm": 6.878082659822421,
"learning_rate": 3.7010184814144916e-06,
"loss": 1.3616,
"step": 155
},
{
"epoch": 2.404669260700389,
"grad_norm": 5.910003013217337,
"learning_rate": 3.6131290291345155e-06,
"loss": 1.3136,
"step": 156
},
{
"epoch": 2.4202334630350193,
"grad_norm": 9.041793350178478,
"learning_rate": 3.527346865588614e-06,
"loss": 1.2654,
"step": 157
},
{
"epoch": 2.43579766536965,
"grad_norm": 9.955462288729418,
"learning_rate": 3.4436964622732493e-06,
"loss": 1.3949,
"step": 158
},
{
"epoch": 2.4513618677042803,
"grad_norm": 12.124015441070618,
"learning_rate": 3.3622016825472414e-06,
"loss": 1.3149,
"step": 159
},
{
"epoch": 2.4669260700389106,
"grad_norm": 5.944228819887684,
"learning_rate": 3.2828857748241404e-06,
"loss": 1.3735,
"step": 160
},
{
"epoch": 2.482490272373541,
"grad_norm": 7.465704659945909,
"learning_rate": 3.205771365940052e-06,
"loss": 1.1572,
"step": 161
},
{
"epoch": 2.498054474708171,
"grad_norm": 8.012935003044838,
"learning_rate": 3.1308804546987615e-06,
"loss": 1.2964,
"step": 162
},
{
"epoch": 2.5136186770428015,
"grad_norm": 6.396153961978255,
"learning_rate": 3.058234405596029e-06,
"loss": 1.2518,
"step": 163
},
{
"epoch": 2.529182879377432,
"grad_norm": 32.66960667166894,
"learning_rate": 2.9878539427248364e-06,
"loss": 1.3154,
"step": 164
},
{
"epoch": 2.544747081712062,
"grad_norm": 7.182443079061496,
"learning_rate": 2.919759143863326e-06,
"loss": 1.2754,
"step": 165
},
{
"epoch": 2.5603112840466924,
"grad_norm": 9.065894651134005,
"learning_rate": 2.8539694347471093e-06,
"loss": 1.5717,
"step": 166
},
{
"epoch": 2.5758754863813227,
"grad_norm": 8.155108121011244,
"learning_rate": 2.7905035835276e-06,
"loss": 1.1931,
"step": 167
},
{
"epoch": 2.5914396887159534,
"grad_norm": 10.525703817651328,
"learning_rate": 2.7293796954179254e-06,
"loss": 1.2438,
"step": 168
},
{
"epoch": 2.6070038910505837,
"grad_norm": 10.790702057048689,
"learning_rate": 2.670615207527965e-06,
"loss": 1.2728,
"step": 169
},
{
"epoch": 2.622568093385214,
"grad_norm": 6.486978399127539,
"learning_rate": 2.6142268838899844e-06,
"loss": 1.3483,
"step": 170
},
{
"epoch": 2.6381322957198443,
"grad_norm": 11.399298263764798,
"learning_rate": 2.5602308106762773e-06,
"loss": 1.4894,
"step": 171
},
{
"epoch": 2.6536964980544746,
"grad_norm": 8.803157004201939,
"learning_rate": 2.5086423916101794e-06,
"loss": 1.5442,
"step": 172
},
{
"epoch": 2.669260700389105,
"grad_norm": 7.898925628777204,
"learning_rate": 2.4594763435717788e-06,
"loss": 1.3132,
"step": 173
},
{
"epoch": 2.6848249027237356,
"grad_norm": 6.01024097821099,
"learning_rate": 2.412746692399561e-06,
"loss": 1.3329,
"step": 174
},
{
"epoch": 2.700389105058366,
"grad_norm": 9.287448544998288,
"learning_rate": 2.3684667688891813e-06,
"loss": 1.2279,
"step": 175
},
{
"epoch": 2.7159533073929962,
"grad_norm": 6.509435104920216,
"learning_rate": 2.3266492049905327e-06,
"loss": 1.1356,
"step": 176
},
{
"epoch": 2.7315175097276265,
"grad_norm": 7.274526660218056,
"learning_rate": 2.2873059302041627e-06,
"loss": 1.2053,
"step": 177
},
{
"epoch": 2.747081712062257,
"grad_norm": 7.528171905726772,
"learning_rate": 2.250448168178085e-06,
"loss": 1.2631,
"step": 178
},
{
"epoch": 2.762645914396887,
"grad_norm": 7.685404987229149,
"learning_rate": 2.216086433505963e-06,
"loss": 1.1471,
"step": 179
},
{
"epoch": 2.7782101167315174,
"grad_norm": 7.30724911654224,
"learning_rate": 2.18423052872755e-06,
"loss": 1.1335,
"step": 180
},
{
"epoch": 2.7937743190661477,
"grad_norm": 8.275377130204019,
"learning_rate": 2.154889541532279e-06,
"loss": 1.4331,
"step": 181
},
{
"epoch": 2.809338521400778,
"grad_norm": 8.26916689050228,
"learning_rate": 2.128071842166766e-06,
"loss": 1.1323,
"step": 182
},
{
"epoch": 2.8249027237354083,
"grad_norm": 8.648891670292945,
"learning_rate": 2.1037850810469977e-06,
"loss": 1.0748,
"step": 183
},
{
"epoch": 2.840466926070039,
"grad_norm": 8.864628121919203,
"learning_rate": 2.0820361865758506e-06,
"loss": 1.2159,
"step": 184
},
{
"epoch": 2.8560311284046693,
"grad_norm": 7.550747013009374,
"learning_rate": 2.0628313631665977e-06,
"loss": 1.1746,
"step": 185
},
{
"epoch": 2.8715953307392996,
"grad_norm": 5.4396609218382075,
"learning_rate": 2.0461760894729438e-06,
"loss": 1.1403,
"step": 186
},
{
"epoch": 2.88715953307393,
"grad_norm": 9.061879291284523,
"learning_rate": 2.032075116826103e-06,
"loss": 1.5448,
"step": 187
},
{
"epoch": 2.90272373540856,
"grad_norm": 6.124913133747852,
"learning_rate": 2.0205324678793635e-06,
"loss": 1.1864,
"step": 188
},
{
"epoch": 2.9182879377431905,
"grad_norm": 7.03064717545691,
"learning_rate": 2.0115514354605255e-06,
"loss": 1.3855,
"step": 189
},
{
"epoch": 2.9338521400778212,
"grad_norm": 8.91521754504974,
"learning_rate": 2.005134581632538e-06,
"loss": 1.3689,
"step": 190
},
{
"epoch": 2.9494163424124515,
"grad_norm": 8.561228532947991,
"learning_rate": 2.001283736962612e-06,
"loss": 1.5862,
"step": 191
},
{
"epoch": 2.964980544747082,
"grad_norm": 10.281179103024385,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.1926,
"step": 192
},
{
"epoch": 2.964980544747082,
"step": 192,
"total_flos": 104900150247424.0,
"train_loss": 1.356536865234375,
"train_runtime": 15762.2393,
"train_samples_per_second": 1.571,
"train_steps_per_second": 0.012
}
],
"logging_steps": 1.0,
"max_steps": 192,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 104900150247424.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}