{
"best_global_step": 5950,
"best_metric": 0.7729265244559362,
"best_model_checkpoint": "pseudo_model/checkpoint-5950",
"epoch": 1.0,
"eval_steps": 500,
"global_step": 5950,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008403361344537815,
"grad_norm": 70.3099365234375,
"learning_rate": 1.991764705882353e-05,
"loss": 0.0639,
"step": 50
},
{
"epoch": 0.01680672268907563,
"grad_norm": 0.251952201128006,
"learning_rate": 1.983361344537815e-05,
"loss": 0.1373,
"step": 100
},
{
"epoch": 0.025210084033613446,
"grad_norm": 54.500118255615234,
"learning_rate": 1.9749579831932776e-05,
"loss": 0.187,
"step": 150
},
{
"epoch": 0.03361344537815126,
"grad_norm": 17.24431037902832,
"learning_rate": 1.9665546218487398e-05,
"loss": 0.1268,
"step": 200
},
{
"epoch": 0.04201680672268908,
"grad_norm": 0.7904719114303589,
"learning_rate": 1.958151260504202e-05,
"loss": 0.1147,
"step": 250
},
{
"epoch": 0.05042016806722689,
"grad_norm": 15.803120613098145,
"learning_rate": 1.949747899159664e-05,
"loss": 0.0939,
"step": 300
},
{
"epoch": 0.058823529411764705,
"grad_norm": 21.106233596801758,
"learning_rate": 1.9413445378151262e-05,
"loss": 0.1392,
"step": 350
},
{
"epoch": 0.06722689075630252,
"grad_norm": 4.552457809448242,
"learning_rate": 1.9329411764705883e-05,
"loss": 0.0754,
"step": 400
},
{
"epoch": 0.07563025210084033,
"grad_norm": 59.192100524902344,
"learning_rate": 1.9245378151260505e-05,
"loss": 0.0864,
"step": 450
},
{
"epoch": 0.08403361344537816,
"grad_norm": 24.423683166503906,
"learning_rate": 1.9161344537815126e-05,
"loss": 0.1462,
"step": 500
},
{
"epoch": 0.09243697478991597,
"grad_norm": 20.817564010620117,
"learning_rate": 1.907731092436975e-05,
"loss": 0.1025,
"step": 550
},
{
"epoch": 0.10084033613445378,
"grad_norm": 16.254493713378906,
"learning_rate": 1.8993277310924372e-05,
"loss": 0.1588,
"step": 600
},
{
"epoch": 0.1092436974789916,
"grad_norm": 43.414573669433594,
"learning_rate": 1.8909243697478993e-05,
"loss": 0.0737,
"step": 650
},
{
"epoch": 0.11764705882352941,
"grad_norm": 25.70012664794922,
"learning_rate": 1.8825210084033615e-05,
"loss": 0.0638,
"step": 700
},
{
"epoch": 0.12605042016806722,
"grad_norm": 4.720924377441406,
"learning_rate": 1.8741176470588236e-05,
"loss": 0.1043,
"step": 750
},
{
"epoch": 0.13445378151260504,
"grad_norm": 57.87378692626953,
"learning_rate": 1.8657142857142858e-05,
"loss": 0.1188,
"step": 800
},
{
"epoch": 0.14285714285714285,
"grad_norm": 23.307783126831055,
"learning_rate": 1.857310924369748e-05,
"loss": 0.1161,
"step": 850
},
{
"epoch": 0.15126050420168066,
"grad_norm": 44.87445831298828,
"learning_rate": 1.8489075630252104e-05,
"loss": 0.0917,
"step": 900
},
{
"epoch": 0.15966386554621848,
"grad_norm": 14.766839981079102,
"learning_rate": 1.8405042016806725e-05,
"loss": 0.1296,
"step": 950
},
{
"epoch": 0.16806722689075632,
"grad_norm": 8.999948501586914,
"learning_rate": 1.8321008403361346e-05,
"loss": 0.0966,
"step": 1000
},
{
"epoch": 0.17647058823529413,
"grad_norm": 27.70500946044922,
"learning_rate": 1.8236974789915968e-05,
"loss": 0.0847,
"step": 1050
},
{
"epoch": 0.18487394957983194,
"grad_norm": 45.68607711791992,
"learning_rate": 1.815294117647059e-05,
"loss": 0.0774,
"step": 1100
},
{
"epoch": 0.19327731092436976,
"grad_norm": 9.21766185760498,
"learning_rate": 1.806890756302521e-05,
"loss": 0.0832,
"step": 1150
},
{
"epoch": 0.20168067226890757,
"grad_norm": 84.41997528076172,
"learning_rate": 1.7984873949579832e-05,
"loss": 0.1061,
"step": 1200
},
{
"epoch": 0.21008403361344538,
"grad_norm": 11.963436126708984,
"learning_rate": 1.7900840336134457e-05,
"loss": 0.1119,
"step": 1250
},
{
"epoch": 0.2184873949579832,
"grad_norm": 30.294477462768555,
"learning_rate": 1.7816806722689078e-05,
"loss": 0.089,
"step": 1300
},
{
"epoch": 0.226890756302521,
"grad_norm": 0.17758768796920776,
"learning_rate": 1.77327731092437e-05,
"loss": 0.0671,
"step": 1350
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.24741335213184357,
"learning_rate": 1.764873949579832e-05,
"loss": 0.0875,
"step": 1400
},
{
"epoch": 0.24369747899159663,
"grad_norm": 79.4989013671875,
"learning_rate": 1.7564705882352942e-05,
"loss": 0.139,
"step": 1450
},
{
"epoch": 0.25210084033613445,
"grad_norm": 40.87698745727539,
"learning_rate": 1.7480672268907564e-05,
"loss": 0.0666,
"step": 1500
},
{
"epoch": 0.2605042016806723,
"grad_norm": 4.265988826751709,
"learning_rate": 1.7396638655462185e-05,
"loss": 0.0957,
"step": 1550
},
{
"epoch": 0.2689075630252101,
"grad_norm": 31.657917022705078,
"learning_rate": 1.7312605042016806e-05,
"loss": 0.1013,
"step": 1600
},
{
"epoch": 0.2773109243697479,
"grad_norm": 48.78037643432617,
"learning_rate": 1.722857142857143e-05,
"loss": 0.0721,
"step": 1650
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.1120712086558342,
"learning_rate": 1.7144537815126052e-05,
"loss": 0.0865,
"step": 1700
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.8699374198913574,
"learning_rate": 1.7060504201680674e-05,
"loss": 0.0779,
"step": 1750
},
{
"epoch": 0.3025210084033613,
"grad_norm": 64.08531951904297,
"learning_rate": 1.6976470588235295e-05,
"loss": 0.0938,
"step": 1800
},
{
"epoch": 0.31092436974789917,
"grad_norm": 36.72616195678711,
"learning_rate": 1.6892436974789917e-05,
"loss": 0.068,
"step": 1850
},
{
"epoch": 0.31932773109243695,
"grad_norm": 24.38673973083496,
"learning_rate": 1.6808403361344538e-05,
"loss": 0.0667,
"step": 1900
},
{
"epoch": 0.3277310924369748,
"grad_norm": 4.634633541107178,
"learning_rate": 1.672436974789916e-05,
"loss": 0.1098,
"step": 1950
},
{
"epoch": 0.33613445378151263,
"grad_norm": 0.005323654506355524,
"learning_rate": 1.6640336134453784e-05,
"loss": 0.0578,
"step": 2000
},
{
"epoch": 0.3445378151260504,
"grad_norm": 30.870010375976562,
"learning_rate": 1.6556302521008405e-05,
"loss": 0.0702,
"step": 2050
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.3597383201122284,
"learning_rate": 1.6472268907563027e-05,
"loss": 0.087,
"step": 2100
},
{
"epoch": 0.36134453781512604,
"grad_norm": 43.77628707885742,
"learning_rate": 1.6388235294117648e-05,
"loss": 0.0865,
"step": 2150
},
{
"epoch": 0.3697478991596639,
"grad_norm": 5.36348819732666,
"learning_rate": 1.630420168067227e-05,
"loss": 0.1211,
"step": 2200
},
{
"epoch": 0.37815126050420167,
"grad_norm": 5.006982326507568,
"learning_rate": 1.622016806722689e-05,
"loss": 0.0772,
"step": 2250
},
{
"epoch": 0.3865546218487395,
"grad_norm": 0.21585534512996674,
"learning_rate": 1.6136134453781512e-05,
"loss": 0.0873,
"step": 2300
},
{
"epoch": 0.3949579831932773,
"grad_norm": 5.9591193199157715,
"learning_rate": 1.6052100840336137e-05,
"loss": 0.0678,
"step": 2350
},
{
"epoch": 0.40336134453781514,
"grad_norm": 5.950344085693359,
"learning_rate": 1.596806722689076e-05,
"loss": 0.0583,
"step": 2400
},
{
"epoch": 0.4117647058823529,
"grad_norm": 62.205570220947266,
"learning_rate": 1.588403361344538e-05,
"loss": 0.0799,
"step": 2450
},
{
"epoch": 0.42016806722689076,
"grad_norm": 11.344050407409668,
"learning_rate": 1.58e-05,
"loss": 0.0925,
"step": 2500
},
{
"epoch": 0.42857142857142855,
"grad_norm": 20.887693405151367,
"learning_rate": 1.5715966386554623e-05,
"loss": 0.0515,
"step": 2550
},
{
"epoch": 0.4369747899159664,
"grad_norm": 4.148672580718994,
"learning_rate": 1.5631932773109244e-05,
"loss": 0.0836,
"step": 2600
},
{
"epoch": 0.44537815126050423,
"grad_norm": 0.019104987382888794,
"learning_rate": 1.5547899159663865e-05,
"loss": 0.0663,
"step": 2650
},
{
"epoch": 0.453781512605042,
"grad_norm": 55.08736038208008,
"learning_rate": 1.5463865546218487e-05,
"loss": 0.0573,
"step": 2700
},
{
"epoch": 0.46218487394957986,
"grad_norm": 24.77686882019043,
"learning_rate": 1.537983193277311e-05,
"loss": 0.0995,
"step": 2750
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.2827483117580414,
"learning_rate": 1.5295798319327733e-05,
"loss": 0.0931,
"step": 2800
},
{
"epoch": 0.4789915966386555,
"grad_norm": 51.127052307128906,
"learning_rate": 1.5211764705882354e-05,
"loss": 0.0674,
"step": 2850
},
{
"epoch": 0.48739495798319327,
"grad_norm": 28.012380599975586,
"learning_rate": 1.5127731092436977e-05,
"loss": 0.071,
"step": 2900
},
{
"epoch": 0.4957983193277311,
"grad_norm": 62.96744155883789,
"learning_rate": 1.5043697478991597e-05,
"loss": 0.0802,
"step": 2950
},
{
"epoch": 0.5042016806722689,
"grad_norm": 24.443973541259766,
"learning_rate": 1.4959663865546218e-05,
"loss": 0.0822,
"step": 3000
},
{
"epoch": 0.5126050420168067,
"grad_norm": 0.09382598847150803,
"learning_rate": 1.4875630252100841e-05,
"loss": 0.0893,
"step": 3050
},
{
"epoch": 0.5210084033613446,
"grad_norm": 8.969447135925293,
"learning_rate": 1.4791596638655463e-05,
"loss": 0.0587,
"step": 3100
},
{
"epoch": 0.5294117647058824,
"grad_norm": 0.33123093843460083,
"learning_rate": 1.4707563025210086e-05,
"loss": 0.0662,
"step": 3150
},
{
"epoch": 0.5378151260504201,
"grad_norm": 0.34799447655677795,
"learning_rate": 1.4623529411764707e-05,
"loss": 0.0471,
"step": 3200
},
{
"epoch": 0.5462184873949579,
"grad_norm": 0.10502827167510986,
"learning_rate": 1.453949579831933e-05,
"loss": 0.0608,
"step": 3250
},
{
"epoch": 0.5546218487394958,
"grad_norm": 46.54685974121094,
"learning_rate": 1.4455462184873952e-05,
"loss": 0.0756,
"step": 3300
},
{
"epoch": 0.5630252100840336,
"grad_norm": 25.307750701904297,
"learning_rate": 1.4371428571428571e-05,
"loss": 0.0703,
"step": 3350
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.03593587502837181,
"learning_rate": 1.4287394957983194e-05,
"loss": 0.0509,
"step": 3400
},
{
"epoch": 0.5798319327731093,
"grad_norm": 6.873576641082764,
"learning_rate": 1.4203361344537816e-05,
"loss": 0.0751,
"step": 3450
},
{
"epoch": 0.5882352941176471,
"grad_norm": 39.44701385498047,
"learning_rate": 1.4119327731092439e-05,
"loss": 0.0698,
"step": 3500
},
{
"epoch": 0.5966386554621849,
"grad_norm": 0.7871583104133606,
"learning_rate": 1.403529411764706e-05,
"loss": 0.0701,
"step": 3550
},
{
"epoch": 0.6050420168067226,
"grad_norm": 0.06720981746912003,
"learning_rate": 1.3951260504201683e-05,
"loss": 0.0613,
"step": 3600
},
{
"epoch": 0.6134453781512605,
"grad_norm": 87.64744567871094,
"learning_rate": 1.3867226890756305e-05,
"loss": 0.0657,
"step": 3650
},
{
"epoch": 0.6218487394957983,
"grad_norm": 92.26810455322266,
"learning_rate": 1.3783193277310924e-05,
"loss": 0.0915,
"step": 3700
},
{
"epoch": 0.6302521008403361,
"grad_norm": 0.13392935693264008,
"learning_rate": 1.3699159663865547e-05,
"loss": 0.0552,
"step": 3750
},
{
"epoch": 0.6386554621848739,
"grad_norm": 0.2624678313732147,
"learning_rate": 1.3615126050420169e-05,
"loss": 0.061,
"step": 3800
},
{
"epoch": 0.6470588235294118,
"grad_norm": 58.510833740234375,
"learning_rate": 1.353109243697479e-05,
"loss": 0.0475,
"step": 3850
},
{
"epoch": 0.6554621848739496,
"grad_norm": 0.19354496896266937,
"learning_rate": 1.3447058823529413e-05,
"loss": 0.0423,
"step": 3900
},
{
"epoch": 0.6638655462184874,
"grad_norm": 20.584426879882812,
"learning_rate": 1.3363025210084035e-05,
"loss": 0.1135,
"step": 3950
},
{
"epoch": 0.6722689075630253,
"grad_norm": 1.4565203189849854,
"learning_rate": 1.3278991596638658e-05,
"loss": 0.0264,
"step": 4000
},
{
"epoch": 0.680672268907563,
"grad_norm": 0.0815153419971466,
"learning_rate": 1.3194957983193279e-05,
"loss": 0.0956,
"step": 4050
},
{
"epoch": 0.6890756302521008,
"grad_norm": 45.101993560791016,
"learning_rate": 1.3110924369747899e-05,
"loss": 0.0853,
"step": 4100
},
{
"epoch": 0.6974789915966386,
"grad_norm": 8.502076148986816,
"learning_rate": 1.3026890756302522e-05,
"loss": 0.0407,
"step": 4150
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.01488969475030899,
"learning_rate": 1.2942857142857143e-05,
"loss": 0.0506,
"step": 4200
},
{
"epoch": 0.7142857142857143,
"grad_norm": 128.31639099121094,
"learning_rate": 1.2858823529411766e-05,
"loss": 0.0527,
"step": 4250
},
{
"epoch": 0.7226890756302521,
"grad_norm": 69.5967025756836,
"learning_rate": 1.2774789915966388e-05,
"loss": 0.0437,
"step": 4300
},
{
"epoch": 0.7310924369747899,
"grad_norm": 51.27083206176758,
"learning_rate": 1.269075630252101e-05,
"loss": 0.0511,
"step": 4350
},
{
"epoch": 0.7394957983193278,
"grad_norm": 1.9427380561828613,
"learning_rate": 1.2606722689075632e-05,
"loss": 0.0938,
"step": 4400
},
{
"epoch": 0.7478991596638656,
"grad_norm": 17.49405288696289,
"learning_rate": 1.2522689075630252e-05,
"loss": 0.0799,
"step": 4450
},
{
"epoch": 0.7563025210084033,
"grad_norm": 2.69028377532959,
"learning_rate": 1.2438655462184875e-05,
"loss": 0.0556,
"step": 4500
},
{
"epoch": 0.7647058823529411,
"grad_norm": 91.98591613769531,
"learning_rate": 1.2354621848739496e-05,
"loss": 0.0341,
"step": 4550
},
{
"epoch": 0.773109243697479,
"grad_norm": 37.63540267944336,
"learning_rate": 1.2270588235294119e-05,
"loss": 0.0748,
"step": 4600
},
{
"epoch": 0.7815126050420168,
"grad_norm": 24.502737045288086,
"learning_rate": 1.218655462184874e-05,
"loss": 0.0693,
"step": 4650
},
{
"epoch": 0.7899159663865546,
"grad_norm": 0.010893125087022781,
"learning_rate": 1.2102521008403364e-05,
"loss": 0.0371,
"step": 4700
},
{
"epoch": 0.7983193277310925,
"grad_norm": 39.78943634033203,
"learning_rate": 1.2018487394957985e-05,
"loss": 0.0547,
"step": 4750
},
{
"epoch": 0.8067226890756303,
"grad_norm": 21.680444717407227,
"learning_rate": 1.1934453781512605e-05,
"loss": 0.0405,
"step": 4800
},
{
"epoch": 0.8151260504201681,
"grad_norm": 91.88570404052734,
"learning_rate": 1.1850420168067228e-05,
"loss": 0.048,
"step": 4850
},
{
"epoch": 0.8235294117647058,
"grad_norm": 47.21152114868164,
"learning_rate": 1.1766386554621849e-05,
"loss": 0.0957,
"step": 4900
},
{
"epoch": 0.8319327731092437,
"grad_norm": 0.03324070945382118,
"learning_rate": 1.1682352941176472e-05,
"loss": 0.0525,
"step": 4950
},
{
"epoch": 0.8403361344537815,
"grad_norm": 0.4629770517349243,
"learning_rate": 1.1598319327731094e-05,
"loss": 0.0682,
"step": 5000
},
{
"epoch": 0.8487394957983193,
"grad_norm": 45.92827224731445,
"learning_rate": 1.1514285714285715e-05,
"loss": 0.0424,
"step": 5050
},
{
"epoch": 0.8571428571428571,
"grad_norm": 32.47571563720703,
"learning_rate": 1.1430252100840338e-05,
"loss": 0.0794,
"step": 5100
},
{
"epoch": 0.865546218487395,
"grad_norm": 17.017562866210938,
"learning_rate": 1.134621848739496e-05,
"loss": 0.0436,
"step": 5150
},
{
"epoch": 0.8739495798319328,
"grad_norm": 2.4690027236938477,
"learning_rate": 1.1262184873949579e-05,
"loss": 0.0618,
"step": 5200
},
{
"epoch": 0.8823529411764706,
"grad_norm": 91.57916259765625,
"learning_rate": 1.1178151260504202e-05,
"loss": 0.0628,
"step": 5250
},
{
"epoch": 0.8907563025210085,
"grad_norm": 0.11016395688056946,
"learning_rate": 1.1094117647058823e-05,
"loss": 0.076,
"step": 5300
},
{
"epoch": 0.8991596638655462,
"grad_norm": 42.38503646850586,
"learning_rate": 1.1010084033613447e-05,
"loss": 0.0759,
"step": 5350
},
{
"epoch": 0.907563025210084,
"grad_norm": 4.813717365264893,
"learning_rate": 1.0926050420168068e-05,
"loss": 0.0349,
"step": 5400
},
{
"epoch": 0.9159663865546218,
"grad_norm": 0.016100173816084862,
"learning_rate": 1.0842016806722691e-05,
"loss": 0.0506,
"step": 5450
},
{
"epoch": 0.9243697478991597,
"grad_norm": 0.07725575566291809,
"learning_rate": 1.0757983193277312e-05,
"loss": 0.0581,
"step": 5500
},
{
"epoch": 0.9327731092436975,
"grad_norm": 69.73644256591797,
"learning_rate": 1.0673949579831932e-05,
"loss": 0.0552,
"step": 5550
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.0879245325922966,
"learning_rate": 1.0589915966386555e-05,
"loss": 0.059,
"step": 5600
},
{
"epoch": 0.9495798319327731,
"grad_norm": 0.06376896053552628,
"learning_rate": 1.0505882352941176e-05,
"loss": 0.046,
"step": 5650
},
{
"epoch": 0.957983193277311,
"grad_norm": 98.75856018066406,
"learning_rate": 1.04218487394958e-05,
"loss": 0.0319,
"step": 5700
},
{
"epoch": 0.9663865546218487,
"grad_norm": 0.07460715621709824,
"learning_rate": 1.0337815126050421e-05,
"loss": 0.0349,
"step": 5750
},
{
"epoch": 0.9747899159663865,
"grad_norm": 88.64122772216797,
"learning_rate": 1.0253781512605044e-05,
"loss": 0.0577,
"step": 5800
},
{
"epoch": 0.9831932773109243,
"grad_norm": 0.0315723791718483,
"learning_rate": 1.0169747899159665e-05,
"loss": 0.0655,
"step": 5850
},
{
"epoch": 0.9915966386554622,
"grad_norm": 74.6689682006836,
"learning_rate": 1.0085714285714288e-05,
"loss": 0.0457,
"step": 5900
},
{
"epoch": 1.0,
"grad_norm": 0.06222504749894142,
"learning_rate": 1.0001680672268908e-05,
"loss": 0.0391,
"step": 5950
},
{
"epoch": 1.0,
"eval_f1": 0.7729265244559362,
"eval_loss": 1.5595977306365967,
"eval_runtime": 0.0922,
"eval_samples_per_second": 2364.65,
"eval_steps_per_second": 43.388,
"step": 5950
}
],
"logging_steps": 50,
"max_steps": 11900,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8495403899188410.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}