smollamdk-2 / trainer_state.json
ukung's picture
Upload folder using huggingface_hub
8801c88 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0846560846560847,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.021164021164021163,
"grad_norm": 6.048354625701904,
"learning_rate": 1e-05,
"loss": 3.1366,
"mean_token_accuracy": 0.8490224033594131,
"step": 1
},
{
"epoch": 0.042328042328042326,
"grad_norm": 6.282668590545654,
"learning_rate": 2e-05,
"loss": 3.3445,
"mean_token_accuracy": 0.8413912802934647,
"step": 2
},
{
"epoch": 0.06349206349206349,
"grad_norm": 5.931667327880859,
"learning_rate": 3e-05,
"loss": 3.6371,
"mean_token_accuracy": 0.819273829460144,
"step": 3
},
{
"epoch": 0.08465608465608465,
"grad_norm": 5.271315097808838,
"learning_rate": 4e-05,
"loss": 2.8728,
"mean_token_accuracy": 0.864514172077179,
"step": 4
},
{
"epoch": 0.10582010582010581,
"grad_norm": 6.306461811065674,
"learning_rate": 5e-05,
"loss": 3.2277,
"mean_token_accuracy": 0.8422431498765945,
"step": 5
},
{
"epoch": 0.12698412698412698,
"grad_norm": 7.170639514923096,
"learning_rate": 4.9473684210526315e-05,
"loss": 3.1188,
"mean_token_accuracy": 0.8523378223180771,
"step": 6
},
{
"epoch": 0.14814814814814814,
"grad_norm": 6.320353984832764,
"learning_rate": 4.8947368421052635e-05,
"loss": 3.2011,
"mean_token_accuracy": 0.8276277035474777,
"step": 7
},
{
"epoch": 0.1693121693121693,
"grad_norm": 7.527361869812012,
"learning_rate": 4.842105263157895e-05,
"loss": 3.5806,
"mean_token_accuracy": 0.8307382017374039,
"step": 8
},
{
"epoch": 0.19047619047619047,
"grad_norm": 9.139302253723145,
"learning_rate": 4.789473684210526e-05,
"loss": 3.3045,
"mean_token_accuracy": 0.8234163820743561,
"step": 9
},
{
"epoch": 0.21164021164021163,
"grad_norm": 8.344583511352539,
"learning_rate": 4.736842105263158e-05,
"loss": 3.4053,
"mean_token_accuracy": 0.8328999727964401,
"step": 10
},
{
"epoch": 0.2328042328042328,
"grad_norm": 7.198660850524902,
"learning_rate": 4.68421052631579e-05,
"loss": 3.6999,
"mean_token_accuracy": 0.8166868537664413,
"step": 11
},
{
"epoch": 0.25396825396825395,
"grad_norm": 7.656338691711426,
"learning_rate": 4.6315789473684214e-05,
"loss": 3.5236,
"mean_token_accuracy": 0.8194563537836075,
"step": 12
},
{
"epoch": 0.2751322751322751,
"grad_norm": 6.596268653869629,
"learning_rate": 4.5789473684210527e-05,
"loss": 3.1002,
"mean_token_accuracy": 0.8456875830888748,
"step": 13
},
{
"epoch": 0.2962962962962963,
"grad_norm": 7.026238918304443,
"learning_rate": 4.5263157894736846e-05,
"loss": 3.5912,
"mean_token_accuracy": 0.8176615834236145,
"step": 14
},
{
"epoch": 0.31746031746031744,
"grad_norm": 6.6125640869140625,
"learning_rate": 4.473684210526316e-05,
"loss": 3.2967,
"mean_token_accuracy": 0.8385807871818542,
"step": 15
},
{
"epoch": 0.3386243386243386,
"grad_norm": 6.488234043121338,
"learning_rate": 4.421052631578947e-05,
"loss": 2.9518,
"mean_token_accuracy": 0.8566168546676636,
"step": 16
},
{
"epoch": 0.35978835978835977,
"grad_norm": 6.3693389892578125,
"learning_rate": 4.368421052631579e-05,
"loss": 3.4379,
"mean_token_accuracy": 0.8285104632377625,
"step": 17
},
{
"epoch": 0.38095238095238093,
"grad_norm": 6.490217208862305,
"learning_rate": 4.3157894736842105e-05,
"loss": 3.2799,
"mean_token_accuracy": 0.8258774131536484,
"step": 18
},
{
"epoch": 0.4021164021164021,
"grad_norm": 6.357998847961426,
"learning_rate": 4.2631578947368425e-05,
"loss": 3.556,
"mean_token_accuracy": 0.8310040086507797,
"step": 19
},
{
"epoch": 0.42328042328042326,
"grad_norm": 6.8284807205200195,
"learning_rate": 4.210526315789474e-05,
"loss": 3.225,
"mean_token_accuracy": 0.8262167721986771,
"step": 20
},
{
"epoch": 0.4444444444444444,
"grad_norm": 6.557299613952637,
"learning_rate": 4.157894736842106e-05,
"loss": 2.8915,
"mean_token_accuracy": 0.8450914323329926,
"step": 21
},
{
"epoch": 0.4656084656084656,
"grad_norm": 6.227797508239746,
"learning_rate": 4.105263157894737e-05,
"loss": 3.5964,
"mean_token_accuracy": 0.8293115049600601,
"step": 22
},
{
"epoch": 0.48677248677248675,
"grad_norm": 6.4811482429504395,
"learning_rate": 4.0526315789473684e-05,
"loss": 2.9146,
"mean_token_accuracy": 0.8511508405208588,
"step": 23
},
{
"epoch": 0.5079365079365079,
"grad_norm": 7.390919208526611,
"learning_rate": 4e-05,
"loss": 3.0839,
"mean_token_accuracy": 0.8521782755851746,
"step": 24
},
{
"epoch": 0.5291005291005291,
"grad_norm": 7.000582695007324,
"learning_rate": 3.9473684210526316e-05,
"loss": 3.419,
"mean_token_accuracy": 0.8302050828933716,
"step": 25
},
{
"epoch": 0.5502645502645502,
"grad_norm": 7.646208763122559,
"learning_rate": 3.894736842105263e-05,
"loss": 3.5312,
"mean_token_accuracy": 0.8222539722919464,
"step": 26
},
{
"epoch": 0.5714285714285714,
"grad_norm": 8.202258110046387,
"learning_rate": 3.842105263157895e-05,
"loss": 3.2736,
"mean_token_accuracy": 0.8385538011789322,
"step": 27
},
{
"epoch": 0.5925925925925926,
"grad_norm": 7.112030982971191,
"learning_rate": 3.789473684210527e-05,
"loss": 3.5607,
"mean_token_accuracy": 0.8270172476768494,
"step": 28
},
{
"epoch": 0.6137566137566137,
"grad_norm": 7.77510404586792,
"learning_rate": 3.736842105263158e-05,
"loss": 3.4509,
"mean_token_accuracy": 0.8338000029325485,
"step": 29
},
{
"epoch": 0.6349206349206349,
"grad_norm": 8.627528190612793,
"learning_rate": 3.6842105263157895e-05,
"loss": 3.4089,
"mean_token_accuracy": 0.8251242786645889,
"step": 30
},
{
"epoch": 0.656084656084656,
"grad_norm": 7.149316787719727,
"learning_rate": 3.6315789473684214e-05,
"loss": 3.0029,
"mean_token_accuracy": 0.8458760529756546,
"step": 31
},
{
"epoch": 0.6772486772486772,
"grad_norm": 8.06534194946289,
"learning_rate": 3.578947368421053e-05,
"loss": 3.2129,
"mean_token_accuracy": 0.8390202522277832,
"step": 32
},
{
"epoch": 0.6984126984126984,
"grad_norm": 6.458263397216797,
"learning_rate": 3.526315789473684e-05,
"loss": 3.7385,
"mean_token_accuracy": 0.8196459114551544,
"step": 33
},
{
"epoch": 0.7195767195767195,
"grad_norm": 6.388949394226074,
"learning_rate": 3.473684210526316e-05,
"loss": 3.0692,
"mean_token_accuracy": 0.8393152952194214,
"step": 34
},
{
"epoch": 0.7407407407407407,
"grad_norm": 7.467120170593262,
"learning_rate": 3.421052631578947e-05,
"loss": 3.4118,
"mean_token_accuracy": 0.8257147669792175,
"step": 35
},
{
"epoch": 0.7619047619047619,
"grad_norm": 7.274282932281494,
"learning_rate": 3.368421052631579e-05,
"loss": 3.2201,
"mean_token_accuracy": 0.8417303115129471,
"step": 36
},
{
"epoch": 0.783068783068783,
"grad_norm": 7.1840500831604,
"learning_rate": 3.3157894736842106e-05,
"loss": 3.2211,
"mean_token_accuracy": 0.8373906314373016,
"step": 37
},
{
"epoch": 0.8042328042328042,
"grad_norm": 6.900996685028076,
"learning_rate": 3.2631578947368426e-05,
"loss": 3.4252,
"mean_token_accuracy": 0.8245490044355392,
"step": 38
},
{
"epoch": 0.8253968253968254,
"grad_norm": 6.207952976226807,
"learning_rate": 3.210526315789474e-05,
"loss": 3.1038,
"mean_token_accuracy": 0.8480600118637085,
"step": 39
},
{
"epoch": 0.8465608465608465,
"grad_norm": 6.920236587524414,
"learning_rate": 3.157894736842105e-05,
"loss": 3.3569,
"mean_token_accuracy": 0.8315872251987457,
"step": 40
},
{
"epoch": 0.8677248677248677,
"grad_norm": 7.361562728881836,
"learning_rate": 3.105263157894737e-05,
"loss": 3.6389,
"mean_token_accuracy": 0.8231257498264313,
"step": 41
},
{
"epoch": 0.8888888888888888,
"grad_norm": 6.357892036437988,
"learning_rate": 3.0526315789473684e-05,
"loss": 3.3438,
"mean_token_accuracy": 0.8264831602573395,
"step": 42
},
{
"epoch": 0.91005291005291,
"grad_norm": 6.043882369995117,
"learning_rate": 3e-05,
"loss": 3.3171,
"mean_token_accuracy": 0.8378995358943939,
"step": 43
},
{
"epoch": 0.9312169312169312,
"grad_norm": 6.81577205657959,
"learning_rate": 2.9473684210526314e-05,
"loss": 3.2854,
"mean_token_accuracy": 0.8394952565431595,
"step": 44
},
{
"epoch": 0.9523809523809523,
"grad_norm": 6.861083030700684,
"learning_rate": 2.8947368421052634e-05,
"loss": 3.1802,
"mean_token_accuracy": 0.8438083976507187,
"step": 45
},
{
"epoch": 0.9735449735449735,
"grad_norm": 7.255928039550781,
"learning_rate": 2.842105263157895e-05,
"loss": 3.7687,
"mean_token_accuracy": 0.8107610791921616,
"step": 46
},
{
"epoch": 0.9947089947089947,
"grad_norm": 6.662189483642578,
"learning_rate": 2.7894736842105263e-05,
"loss": 2.8252,
"mean_token_accuracy": 0.8516946136951447,
"step": 47
},
{
"epoch": 1.0,
"grad_norm": 2.8546807765960693,
"learning_rate": 2.7368421052631583e-05,
"loss": 0.7518,
"mean_token_accuracy": 0.8395061492919922,
"step": 48
},
{
"epoch": 1.0211640211640212,
"grad_norm": 5.928465843200684,
"learning_rate": 2.6842105263157896e-05,
"loss": 2.9837,
"mean_token_accuracy": 0.8488393574953079,
"step": 49
},
{
"epoch": 1.0423280423280423,
"grad_norm": 7.0468363761901855,
"learning_rate": 2.6315789473684212e-05,
"loss": 2.9965,
"mean_token_accuracy": 0.8516333252191544,
"step": 50
},
{
"epoch": 1.0634920634920635,
"grad_norm": 6.113859176635742,
"learning_rate": 2.578947368421053e-05,
"loss": 2.9239,
"mean_token_accuracy": 0.846625804901123,
"step": 51
},
{
"epoch": 1.0846560846560847,
"grad_norm": 6.449796199798584,
"learning_rate": 2.5263157894736845e-05,
"loss": 2.9745,
"mean_token_accuracy": 0.8498904705047607,
"step": 52
},
{
"epoch": 1.1058201058201058,
"grad_norm": 6.227660655975342,
"learning_rate": 2.4736842105263158e-05,
"loss": 2.7134,
"mean_token_accuracy": 0.8572945147752762,
"step": 53
},
{
"epoch": 1.126984126984127,
"grad_norm": 6.191722869873047,
"learning_rate": 2.4210526315789474e-05,
"loss": 2.9167,
"mean_token_accuracy": 0.8419267982244492,
"step": 54
},
{
"epoch": 1.1481481481481481,
"grad_norm": 6.0019307136535645,
"learning_rate": 2.368421052631579e-05,
"loss": 2.8168,
"mean_token_accuracy": 0.8558094352483749,
"step": 55
},
{
"epoch": 1.1693121693121693,
"grad_norm": 6.954141139984131,
"learning_rate": 2.3157894736842107e-05,
"loss": 3.0574,
"mean_token_accuracy": 0.8407768905162811,
"step": 56
},
{
"epoch": 1.1904761904761905,
"grad_norm": 6.376841068267822,
"learning_rate": 2.2631578947368423e-05,
"loss": 3.1123,
"mean_token_accuracy": 0.8525369018316269,
"step": 57
},
{
"epoch": 1.2116402116402116,
"grad_norm": 6.693148612976074,
"learning_rate": 2.2105263157894736e-05,
"loss": 2.351,
"mean_token_accuracy": 0.8728293180465698,
"step": 58
},
{
"epoch": 1.2328042328042328,
"grad_norm": 7.382028102874756,
"learning_rate": 2.1578947368421053e-05,
"loss": 3.065,
"mean_token_accuracy": 0.8475844711065292,
"step": 59
},
{
"epoch": 1.253968253968254,
"grad_norm": 6.0569682121276855,
"learning_rate": 2.105263157894737e-05,
"loss": 2.7,
"mean_token_accuracy": 0.8730548322200775,
"step": 60
},
{
"epoch": 1.2751322751322751,
"grad_norm": 6.204995632171631,
"learning_rate": 2.0526315789473685e-05,
"loss": 2.4867,
"mean_token_accuracy": 0.8747055679559708,
"step": 61
},
{
"epoch": 1.2962962962962963,
"grad_norm": 7.9132466316223145,
"learning_rate": 2e-05,
"loss": 2.634,
"mean_token_accuracy": 0.8690300732851028,
"step": 62
},
{
"epoch": 1.3174603174603174,
"grad_norm": 6.757488250732422,
"learning_rate": 1.9473684210526315e-05,
"loss": 2.4285,
"mean_token_accuracy": 0.8734888136386871,
"step": 63
},
{
"epoch": 1.3386243386243386,
"grad_norm": 6.5959930419921875,
"learning_rate": 1.8947368421052634e-05,
"loss": 2.7514,
"mean_token_accuracy": 0.861017182469368,
"step": 64
},
{
"epoch": 1.3597883597883598,
"grad_norm": 6.472837448120117,
"learning_rate": 1.8421052631578947e-05,
"loss": 2.3507,
"mean_token_accuracy": 0.8777505159378052,
"step": 65
},
{
"epoch": 1.380952380952381,
"grad_norm": 7.2989959716796875,
"learning_rate": 1.7894736842105264e-05,
"loss": 2.5765,
"mean_token_accuracy": 0.8751945346593857,
"step": 66
},
{
"epoch": 1.402116402116402,
"grad_norm": 7.222543716430664,
"learning_rate": 1.736842105263158e-05,
"loss": 2.8945,
"mean_token_accuracy": 0.8649410307407379,
"step": 67
},
{
"epoch": 1.4232804232804233,
"grad_norm": 8.021060943603516,
"learning_rate": 1.6842105263157896e-05,
"loss": 3.104,
"mean_token_accuracy": 0.841856375336647,
"step": 68
},
{
"epoch": 1.4444444444444444,
"grad_norm": 6.828577995300293,
"learning_rate": 1.6315789473684213e-05,
"loss": 2.562,
"mean_token_accuracy": 0.8617057651281357,
"step": 69
},
{
"epoch": 1.4656084656084656,
"grad_norm": 7.925837993621826,
"learning_rate": 1.5789473684210526e-05,
"loss": 3.0748,
"mean_token_accuracy": 0.8369355648756027,
"step": 70
},
{
"epoch": 1.4867724867724867,
"grad_norm": 7.627265930175781,
"learning_rate": 1.5263157894736842e-05,
"loss": 2.7155,
"mean_token_accuracy": 0.8586581945419312,
"step": 71
},
{
"epoch": 1.507936507936508,
"grad_norm": 6.797243595123291,
"learning_rate": 1.4736842105263157e-05,
"loss": 2.9972,
"mean_token_accuracy": 0.8588763475418091,
"step": 72
},
{
"epoch": 1.529100529100529,
"grad_norm": 6.557271957397461,
"learning_rate": 1.4210526315789475e-05,
"loss": 2.7672,
"mean_token_accuracy": 0.8697129487991333,
"step": 73
},
{
"epoch": 1.5502645502645502,
"grad_norm": 7.8767828941345215,
"learning_rate": 1.3684210526315791e-05,
"loss": 2.6796,
"mean_token_accuracy": 0.8595937788486481,
"step": 74
},
{
"epoch": 1.5714285714285714,
"grad_norm": 6.961939334869385,
"learning_rate": 1.3157894736842106e-05,
"loss": 2.9875,
"mean_token_accuracy": 0.8416125029325485,
"step": 75
},
{
"epoch": 1.5925925925925926,
"grad_norm": 7.682628154754639,
"learning_rate": 1.2631578947368422e-05,
"loss": 2.7506,
"mean_token_accuracy": 0.8638267368078232,
"step": 76
},
{
"epoch": 1.6137566137566137,
"grad_norm": 7.030145168304443,
"learning_rate": 1.2105263157894737e-05,
"loss": 3.016,
"mean_token_accuracy": 0.8373551815748215,
"step": 77
},
{
"epoch": 1.6349206349206349,
"grad_norm": 6.8483357429504395,
"learning_rate": 1.1578947368421053e-05,
"loss": 2.9225,
"mean_token_accuracy": 0.8537007570266724,
"step": 78
},
{
"epoch": 1.656084656084656,
"grad_norm": 6.3635358810424805,
"learning_rate": 1.1052631578947368e-05,
"loss": 2.605,
"mean_token_accuracy": 0.8712249547243118,
"step": 79
},
{
"epoch": 1.6772486772486772,
"grad_norm": 6.841208457946777,
"learning_rate": 1.0526315789473684e-05,
"loss": 2.649,
"mean_token_accuracy": 0.8548033833503723,
"step": 80
},
{
"epoch": 1.6984126984126984,
"grad_norm": 6.934281826019287,
"learning_rate": 1e-05,
"loss": 2.7121,
"mean_token_accuracy": 0.8624187707901001,
"step": 81
},
{
"epoch": 1.7195767195767195,
"grad_norm": 7.023396968841553,
"learning_rate": 9.473684210526317e-06,
"loss": 2.7811,
"mean_token_accuracy": 0.8551411479711533,
"step": 82
},
{
"epoch": 1.7407407407407407,
"grad_norm": 6.556440830230713,
"learning_rate": 8.947368421052632e-06,
"loss": 2.4777,
"mean_token_accuracy": 0.8696417659521103,
"step": 83
},
{
"epoch": 1.7619047619047619,
"grad_norm": 6.040866374969482,
"learning_rate": 8.421052631578948e-06,
"loss": 2.578,
"mean_token_accuracy": 0.8646088093519211,
"step": 84
},
{
"epoch": 1.783068783068783,
"grad_norm": 6.976686000823975,
"learning_rate": 7.894736842105263e-06,
"loss": 3.0012,
"mean_token_accuracy": 0.8455324172973633,
"step": 85
},
{
"epoch": 1.8042328042328042,
"grad_norm": 6.477259635925293,
"learning_rate": 7.3684210526315784e-06,
"loss": 2.6865,
"mean_token_accuracy": 0.8627304285764694,
"step": 86
},
{
"epoch": 1.8253968253968254,
"grad_norm": 6.601570129394531,
"learning_rate": 6.842105263157896e-06,
"loss": 2.8802,
"mean_token_accuracy": 0.8515039086341858,
"step": 87
},
{
"epoch": 1.8465608465608465,
"grad_norm": 6.802147388458252,
"learning_rate": 6.315789473684211e-06,
"loss": 2.6333,
"mean_token_accuracy": 0.8652176707983017,
"step": 88
},
{
"epoch": 1.8677248677248677,
"grad_norm": 6.9757843017578125,
"learning_rate": 5.789473684210527e-06,
"loss": 2.5824,
"mean_token_accuracy": 0.8683836162090302,
"step": 89
},
{
"epoch": 1.8888888888888888,
"grad_norm": 8.725011825561523,
"learning_rate": 5.263157894736842e-06,
"loss": 2.0392,
"mean_token_accuracy": 0.8908423334360123,
"step": 90
},
{
"epoch": 1.91005291005291,
"grad_norm": 6.747620105743408,
"learning_rate": 4.736842105263159e-06,
"loss": 2.5522,
"mean_token_accuracy": 0.8612807095050812,
"step": 91
},
{
"epoch": 1.9312169312169312,
"grad_norm": 7.072465419769287,
"learning_rate": 4.210526315789474e-06,
"loss": 3.1815,
"mean_token_accuracy": 0.836863324046135,
"step": 92
},
{
"epoch": 1.9523809523809523,
"grad_norm": 5.726354122161865,
"learning_rate": 3.6842105263157892e-06,
"loss": 2.8141,
"mean_token_accuracy": 0.8544116318225861,
"step": 93
},
{
"epoch": 1.9735449735449735,
"grad_norm": 6.540099620819092,
"learning_rate": 3.1578947368421056e-06,
"loss": 2.2286,
"mean_token_accuracy": 0.8898429274559021,
"step": 94
},
{
"epoch": 1.9947089947089947,
"grad_norm": 6.0961222648620605,
"learning_rate": 2.631578947368421e-06,
"loss": 2.634,
"mean_token_accuracy": 0.8558461666107178,
"step": 95
},
{
"epoch": 2.0,
"grad_norm": 2.9811086654663086,
"learning_rate": 2.105263157894737e-06,
"loss": 0.3701,
"mean_token_accuracy": 0.8978873491287231,
"step": 96
},
{
"epoch": 2.0211640211640214,
"grad_norm": 5.955318450927734,
"learning_rate": 1.5789473684210528e-06,
"loss": 2.499,
"mean_token_accuracy": 0.8697298467159271,
"step": 97
},
{
"epoch": 2.0423280423280423,
"grad_norm": 5.380612373352051,
"learning_rate": 1.0526315789473685e-06,
"loss": 2.2655,
"mean_token_accuracy": 0.8864545077085495,
"step": 98
},
{
"epoch": 2.0634920634920633,
"grad_norm": 5.8451457023620605,
"learning_rate": 5.263157894736843e-07,
"loss": 1.9556,
"mean_token_accuracy": 0.8944731205701828,
"step": 99
},
{
"epoch": 2.0846560846560847,
"grad_norm": 6.115093231201172,
"learning_rate": 0.0,
"loss": 2.4345,
"mean_token_accuracy": 0.8695619106292725,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 514181058527232.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}