DeepSeek-R1-Distill-Qwen-7B-InRa / trainer_state.json
wh-zhu's picture
Upload folder using huggingface_hub
8d2b086 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.995962314939435,
"eval_steps": 500,
"global_step": 1113,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.026917900403768506,
"grad_norm": 13.069798469543457,
"learning_rate": 1.7857142857142859e-06,
"loss": 0.3801,
"step": 10
},
{
"epoch": 0.05383580080753701,
"grad_norm": 2.184347629547119,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.31,
"step": 20
},
{
"epoch": 0.08075370121130551,
"grad_norm": 0.8302198648452759,
"learning_rate": 5.357142857142857e-06,
"loss": 0.2645,
"step": 30
},
{
"epoch": 0.10767160161507403,
"grad_norm": 1.0656105279922485,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.2539,
"step": 40
},
{
"epoch": 0.13458950201884254,
"grad_norm": 0.5782439708709717,
"learning_rate": 8.92857142857143e-06,
"loss": 0.2512,
"step": 50
},
{
"epoch": 0.16150740242261102,
"grad_norm": 0.5837422609329224,
"learning_rate": 1.0714285714285714e-05,
"loss": 0.2508,
"step": 60
},
{
"epoch": 0.18842530282637954,
"grad_norm": 0.6315082907676697,
"learning_rate": 1.25e-05,
"loss": 0.2481,
"step": 70
},
{
"epoch": 0.21534320323014805,
"grad_norm": 0.649541974067688,
"learning_rate": 1.4285714285714287e-05,
"loss": 0.2418,
"step": 80
},
{
"epoch": 0.24226110363391656,
"grad_norm": 0.5448735356330872,
"learning_rate": 1.6071428571428572e-05,
"loss": 0.2479,
"step": 90
},
{
"epoch": 0.2691790040376851,
"grad_norm": 0.5558776259422302,
"learning_rate": 1.785714285714286e-05,
"loss": 0.2421,
"step": 100
},
{
"epoch": 0.2960969044414536,
"grad_norm": 0.3394428789615631,
"learning_rate": 1.9642857142857145e-05,
"loss": 0.2388,
"step": 110
},
{
"epoch": 0.32301480484522205,
"grad_norm": 0.3705368936061859,
"learning_rate": 1.9996848199254315e-05,
"loss": 0.2407,
"step": 120
},
{
"epoch": 0.34993270524899056,
"grad_norm": 0.3354800343513489,
"learning_rate": 1.9984047413708153e-05,
"loss": 0.2325,
"step": 130
},
{
"epoch": 0.3768506056527591,
"grad_norm": 0.2792787253856659,
"learning_rate": 1.9961413253717214e-05,
"loss": 0.2385,
"step": 140
},
{
"epoch": 0.4037685060565276,
"grad_norm": 0.6989262104034424,
"learning_rate": 1.9928968011860973e-05,
"loss": 0.2372,
"step": 150
},
{
"epoch": 0.4306864064602961,
"grad_norm": 0.4628732204437256,
"learning_rate": 1.988674364373809e-05,
"loss": 0.2332,
"step": 160
},
{
"epoch": 0.4576043068640646,
"grad_norm": 1.1485790014266968,
"learning_rate": 1.9834781736493057e-05,
"loss": 0.2362,
"step": 170
},
{
"epoch": 0.4845222072678331,
"grad_norm": 0.3115156292915344,
"learning_rate": 1.9773133467856672e-05,
"loss": 0.2347,
"step": 180
},
{
"epoch": 0.5114401076716016,
"grad_norm": 0.2576087415218353,
"learning_rate": 1.9701859555740647e-05,
"loss": 0.2404,
"step": 190
},
{
"epoch": 0.5383580080753702,
"grad_norm": 0.3003959059715271,
"learning_rate": 1.9621030198436007e-05,
"loss": 0.234,
"step": 200
},
{
"epoch": 0.5652759084791387,
"grad_norm": 0.22878509759902954,
"learning_rate": 1.9530725005474195e-05,
"loss": 0.2347,
"step": 210
},
{
"epoch": 0.5921938088829072,
"grad_norm": 0.26122385263442993,
"learning_rate": 1.9431032919218957e-05,
"loss": 0.2446,
"step": 220
},
{
"epoch": 0.6191117092866757,
"grad_norm": 0.22441260516643524,
"learning_rate": 1.9322052127266234e-05,
"loss": 0.2398,
"step": 230
},
{
"epoch": 0.6460296096904441,
"grad_norm": 0.2252231240272522,
"learning_rate": 1.9203889965738354e-05,
"loss": 0.2377,
"step": 240
},
{
"epoch": 0.6729475100942126,
"grad_norm": 0.30187228322029114,
"learning_rate": 1.9076662813567772e-05,
"loss": 0.2355,
"step": 250
},
{
"epoch": 0.6998654104979811,
"grad_norm": 0.2517610192298889,
"learning_rate": 1.894049597787443e-05,
"loss": 0.2402,
"step": 260
},
{
"epoch": 0.7267833109017496,
"grad_norm": 0.30307725071907043,
"learning_rate": 1.879552357054971e-05,
"loss": 0.2378,
"step": 270
},
{
"epoch": 0.7537012113055181,
"grad_norm": 0.26731035113334656,
"learning_rate": 1.8641888376168483e-05,
"loss": 0.2378,
"step": 280
},
{
"epoch": 0.7806191117092867,
"grad_norm": 0.22943764925003052,
"learning_rate": 1.847974171135933e-05,
"loss": 0.235,
"step": 290
},
{
"epoch": 0.8075370121130552,
"grad_norm": 0.19347825646400452,
"learning_rate": 1.830924327577149e-05,
"loss": 0.2329,
"step": 300
},
{
"epoch": 0.8344549125168237,
"grad_norm": 0.22859790921211243,
"learning_rate": 1.8130560994785325e-05,
"loss": 0.2289,
"step": 310
},
{
"epoch": 0.8613728129205922,
"grad_norm": 0.2617790699005127,
"learning_rate": 1.7943870854121126e-05,
"loss": 0.2294,
"step": 320
},
{
"epoch": 0.8882907133243607,
"grad_norm": 0.23600426316261292,
"learning_rate": 1.7749356726509286e-05,
"loss": 0.2304,
"step": 330
},
{
"epoch": 0.9152086137281292,
"grad_norm": 0.2116561233997345,
"learning_rate": 1.7547210190592446e-05,
"loss": 0.2379,
"step": 340
},
{
"epoch": 0.9421265141318977,
"grad_norm": 0.19537119567394257,
"learning_rate": 1.733763034223804e-05,
"loss": 0.2309,
"step": 350
},
{
"epoch": 0.9690444145356663,
"grad_norm": 0.22050656378269196,
"learning_rate": 1.7120823598447077e-05,
"loss": 0.2281,
"step": 360
},
{
"epoch": 0.9959623149394348,
"grad_norm": 0.1890714466571808,
"learning_rate": 1.6897003494052217e-05,
"loss": 0.2327,
"step": 370
},
{
"epoch": 1.0228802153432033,
"grad_norm": 0.1974857598543167,
"learning_rate": 1.6666390471405504e-05,
"loss": 0.2265,
"step": 380
},
{
"epoch": 1.0497981157469718,
"grad_norm": 0.2218897044658661,
"learning_rate": 1.642921166326278e-05,
"loss": 0.2385,
"step": 390
},
{
"epoch": 1.0767160161507403,
"grad_norm": 0.35485249757766724,
"learning_rate": 1.6185700669078674e-05,
"loss": 0.2274,
"step": 400
},
{
"epoch": 1.1036339165545088,
"grad_norm": 0.40264761447906494,
"learning_rate": 1.5936097324932487e-05,
"loss": 0.2287,
"step": 410
},
{
"epoch": 1.1305518169582773,
"grad_norm": 0.2551412284374237,
"learning_rate": 1.568064746731156e-05,
"loss": 0.2395,
"step": 420
},
{
"epoch": 1.1574697173620458,
"grad_norm": 0.19965523481369019,
"learning_rate": 1.5419602690984805e-05,
"loss": 0.2331,
"step": 430
},
{
"epoch": 1.1843876177658144,
"grad_norm": 0.18600021302700043,
"learning_rate": 1.5153220101204839e-05,
"loss": 0.2354,
"step": 440
},
{
"epoch": 1.2113055181695827,
"grad_norm": 0.2717427909374237,
"learning_rate": 1.4881762060482814e-05,
"loss": 0.231,
"step": 450
},
{
"epoch": 1.2382234185733512,
"grad_norm": 0.3491940498352051,
"learning_rate": 1.4605495930185303e-05,
"loss": 0.2302,
"step": 460
},
{
"epoch": 1.2651413189771197,
"grad_norm": 0.18677066266536713,
"learning_rate": 1.4324693807207785e-05,
"loss": 0.2311,
"step": 470
},
{
"epoch": 1.2920592193808882,
"grad_norm": 0.24856720864772797,
"learning_rate": 1.4039632255984078e-05,
"loss": 0.2258,
"step": 480
},
{
"epoch": 1.3189771197846567,
"grad_norm": 0.1940755695104599,
"learning_rate": 1.375059203609562e-05,
"loss": 0.2304,
"step": 490
},
{
"epoch": 1.3458950201884252,
"grad_norm": 0.2115495502948761,
"learning_rate": 1.3457857825748959e-05,
"loss": 0.2255,
"step": 500
},
{
"epoch": 1.3458950201884252,
"eval_loss": 0.24458986520767212,
"eval_runtime": 62.1158,
"eval_samples_per_second": 85.051,
"eval_steps_per_second": 21.267,
"step": 500
},
{
"epoch": 1.3728129205921937,
"grad_norm": 0.30106064677238464,
"learning_rate": 1.3161717941393703e-05,
"loss": 0.2293,
"step": 510
},
{
"epoch": 1.3997308209959622,
"grad_norm": 0.21698522567749023,
"learning_rate": 1.2862464053757196e-05,
"loss": 0.2301,
"step": 520
},
{
"epoch": 1.4266487213997308,
"grad_norm": 0.21992221474647522,
"learning_rate": 1.2560390900575472e-05,
"loss": 0.2264,
"step": 530
},
{
"epoch": 1.4535666218034993,
"grad_norm": 0.25674089789390564,
"learning_rate": 1.2255795996303526e-05,
"loss": 0.2261,
"step": 540
},
{
"epoch": 1.4804845222072678,
"grad_norm": 0.2653080224990845,
"learning_rate": 1.1948979339090758e-05,
"loss": 0.2243,
"step": 550
},
{
"epoch": 1.5074024226110363,
"grad_norm": 0.3156011998653412,
"learning_rate": 1.1640243115310219e-05,
"loss": 0.2353,
"step": 560
},
{
"epoch": 1.5343203230148048,
"grad_norm": 0.21554109454154968,
"learning_rate": 1.1329891401932631e-05,
"loss": 0.2294,
"step": 570
},
{
"epoch": 1.5612382234185733,
"grad_norm": 0.18904979526996613,
"learning_rate": 1.1018229867038358e-05,
"loss": 0.2272,
"step": 580
},
{
"epoch": 1.5881561238223418,
"grad_norm": 0.23018983006477356,
"learning_rate": 1.0705565468762274e-05,
"loss": 0.2294,
"step": 590
},
{
"epoch": 1.6150740242261103,
"grad_norm": 0.2061055600643158,
"learning_rate": 1.0392206152968058e-05,
"loss": 0.2266,
"step": 600
},
{
"epoch": 1.6419919246298789,
"grad_norm": 0.20794202387332916,
"learning_rate": 1.0078460549949647e-05,
"loss": 0.2357,
"step": 610
},
{
"epoch": 1.6689098250336474,
"grad_norm": 0.19699296355247498,
"learning_rate": 9.764637670458595e-06,
"loss": 0.224,
"step": 620
},
{
"epoch": 1.695827725437416,
"grad_norm": 0.22355449199676514,
"learning_rate": 9.451046601356725e-06,
"loss": 0.2365,
"step": 630
},
{
"epoch": 1.7227456258411844,
"grad_norm": 0.20971466600894928,
"learning_rate": 9.137996201193807e-06,
"loss": 0.2328,
"step": 640
},
{
"epoch": 1.749663526244953,
"grad_norm": 0.24429140985012054,
"learning_rate": 8.825794796010101e-06,
"loss": 0.2213,
"step": 650
},
{
"epoch": 1.7765814266487214,
"grad_norm": 0.2615514397621155,
"learning_rate": 8.514749875663397e-06,
"loss": 0.2291,
"step": 660
},
{
"epoch": 1.80349932705249,
"grad_norm": 0.29951363801956177,
"learning_rate": 8.20516779097958e-06,
"loss": 0.2294,
"step": 670
},
{
"epoch": 1.8304172274562585,
"grad_norm": 0.19812524318695068,
"learning_rate": 7.897353452025077e-06,
"loss": 0.2288,
"step": 680
},
{
"epoch": 1.857335127860027,
"grad_norm": 0.21179044246673584,
"learning_rate": 7.591610027798287e-06,
"loss": 0.2294,
"step": 690
},
{
"epoch": 1.8842530282637955,
"grad_norm": 0.193583145737648,
"learning_rate": 7.2882386476358304e-06,
"loss": 0.227,
"step": 700
},
{
"epoch": 1.911170928667564,
"grad_norm": 0.20502911508083344,
"learning_rate": 6.9875381046276605e-06,
"loss": 0.2258,
"step": 710
},
{
"epoch": 1.9380888290713325,
"grad_norm": 0.19676484167575836,
"learning_rate": 6.689804561333164e-06,
"loss": 0.2272,
"step": 720
},
{
"epoch": 1.965006729475101,
"grad_norm": 0.20092357695102692,
"learning_rate": 6.39533125808812e-06,
"loss": 0.2292,
"step": 730
},
{
"epoch": 1.9919246298788695,
"grad_norm": 0.22104892134666443,
"learning_rate": 6.104408224189746e-06,
"loss": 0.2269,
"step": 740
},
{
"epoch": 2.018842530282638,
"grad_norm": 0.1946035623550415,
"learning_rate": 5.8173219922443516e-06,
"loss": 0.2193,
"step": 750
},
{
"epoch": 2.0457604306864066,
"grad_norm": 0.22905437648296356,
"learning_rate": 5.5343553159588884e-06,
"loss": 0.2353,
"step": 760
},
{
"epoch": 2.072678331090175,
"grad_norm": 0.23081299662590027,
"learning_rate": 5.2557868916543996e-06,
"loss": 0.2229,
"step": 770
},
{
"epoch": 2.0995962314939436,
"grad_norm": 0.21353456377983093,
"learning_rate": 4.981891083775597e-06,
"loss": 0.2215,
"step": 780
},
{
"epoch": 2.126514131897712,
"grad_norm": 0.20833438634872437,
"learning_rate": 4.712937654666971e-06,
"loss": 0.2231,
"step": 790
},
{
"epoch": 2.1534320323014806,
"grad_norm": 0.20027689635753632,
"learning_rate": 4.4491914988815055e-06,
"loss": 0.2281,
"step": 800
},
{
"epoch": 2.180349932705249,
"grad_norm": 0.22123222053050995,
"learning_rate": 4.190912382283749e-06,
"loss": 0.2278,
"step": 810
},
{
"epoch": 2.2072678331090176,
"grad_norm": 0.28094470500946045,
"learning_rate": 3.9383546862041955e-06,
"loss": 0.2228,
"step": 820
},
{
"epoch": 2.234185733512786,
"grad_norm": 0.3237360417842865,
"learning_rate": 3.6917671568969006e-06,
"loss": 0.2291,
"step": 830
},
{
"epoch": 2.2611036339165547,
"grad_norm": 0.21679522097110748,
"learning_rate": 3.4513926605471504e-06,
"loss": 0.2285,
"step": 840
},
{
"epoch": 2.288021534320323,
"grad_norm": 0.21422189474105835,
"learning_rate": 3.2174679440704616e-06,
"loss": 0.2279,
"step": 850
},
{
"epoch": 2.3149394347240917,
"grad_norm": 0.2352222353219986,
"learning_rate": 2.9902234019385056e-06,
"loss": 0.2264,
"step": 860
},
{
"epoch": 2.34185733512786,
"grad_norm": 0.23439514636993408,
"learning_rate": 2.7698828492615992e-06,
"loss": 0.2269,
"step": 870
},
{
"epoch": 2.3687752355316287,
"grad_norm": 0.22924348711967468,
"learning_rate": 2.5566633013512753e-06,
"loss": 0.2267,
"step": 880
},
{
"epoch": 2.3956931359353972,
"grad_norm": 0.23167449235916138,
"learning_rate": 2.350774759980027e-06,
"loss": 0.2254,
"step": 890
},
{
"epoch": 2.4226110363391653,
"grad_norm": 0.2599547803401947,
"learning_rate": 2.1524200065487565e-06,
"loss": 0.2291,
"step": 900
},
{
"epoch": 2.449528936742934,
"grad_norm": 0.22817839682102203,
"learning_rate": 1.961794402365611e-06,
"loss": 0.2284,
"step": 910
},
{
"epoch": 2.4764468371467023,
"grad_norm": 0.2169758379459381,
"learning_rate": 1.7790856962329584e-06,
"loss": 0.2286,
"step": 920
},
{
"epoch": 2.503364737550471,
"grad_norm": 0.21095937490463257,
"learning_rate": 1.6044738395319648e-06,
"loss": 0.2253,
"step": 930
},
{
"epoch": 2.5302826379542394,
"grad_norm": 0.21286533772945404,
"learning_rate": 1.4381308089869283e-06,
"loss": 0.2193,
"step": 940
},
{
"epoch": 2.557200538358008,
"grad_norm": 0.2127334177494049,
"learning_rate": 1.2802204372839178e-06,
"loss": 0.2198,
"step": 950
},
{
"epoch": 2.5841184387617764,
"grad_norm": 0.19859924912452698,
"learning_rate": 1.130898251710547e-06,
"loss": 0.2212,
"step": 960
},
{
"epoch": 2.611036339165545,
"grad_norm": 0.23916248977184296,
"learning_rate": 9.903113209758098e-07,
"loss": 0.2245,
"step": 970
},
{
"epoch": 2.6379542395693134,
"grad_norm": 0.24261216819286346,
"learning_rate": 8.585981103608343e-07,
"loss": 0.2241,
"step": 980
},
{
"epoch": 2.664872139973082,
"grad_norm": 0.22423197329044342,
"learning_rate": 7.358883453432398e-07,
"loss": 0.2241,
"step": 990
},
{
"epoch": 2.6917900403768504,
"grad_norm": 0.30151936411857605,
"learning_rate": 6.223028838293898e-07,
"loss": 0.2265,
"step": 1000
},
{
"epoch": 2.6917900403768504,
"eval_loss": 0.2421317845582962,
"eval_runtime": 62.8408,
"eval_samples_per_second": 84.07,
"eval_steps_per_second": 21.021,
"step": 1000
},
{
"epoch": 2.718707940780619,
"grad_norm": 0.23664213716983795,
"learning_rate": 5.179535971203953e-07,
"loss": 0.2199,
"step": 1010
},
{
"epoch": 2.7456258411843875,
"grad_norm": 0.21507257223129272,
"learning_rate": 4.2294325972911274e-07,
"loss": 0.2265,
"step": 1020
},
{
"epoch": 2.772543741588156,
"grad_norm": 0.1968134194612503,
"learning_rate": 3.3736544815663017e-07,
"loss": 0.2204,
"step": 1030
},
{
"epoch": 2.7994616419919245,
"grad_norm": 0.2121606171131134,
"learning_rate": 2.6130444872797143e-07,
"loss": 0.2187,
"step": 1040
},
{
"epoch": 2.826379542395693,
"grad_norm": 0.21338069438934326,
"learning_rate": 1.9483517457776436e-07,
"loss": 0.2156,
"step": 1050
},
{
"epoch": 2.8532974427994615,
"grad_norm": 0.22012507915496826,
"learning_rate": 1.3802309186764619e-07,
"loss": 0.2176,
"step": 1060
},
{
"epoch": 2.88021534320323,
"grad_norm": 0.2376081794500351,
"learning_rate": 9.092415530807975e-08,
"loss": 0.2206,
"step": 1070
},
{
"epoch": 2.9071332436069985,
"grad_norm": 0.21504898369312286,
"learning_rate": 5.3584753048073756e-08,
"loss": 0.2233,
"step": 1080
},
{
"epoch": 2.934051144010767,
"grad_norm": 0.2161342054605484,
"learning_rate": 2.604166098709504e-08,
"loss": 0.2263,
"step": 1090
},
{
"epoch": 2.9609690444145356,
"grad_norm": 0.26196786761283875,
"learning_rate": 8.322006554171147e-09,
"loss": 0.23,
"step": 1100
},
{
"epoch": 2.987886944818304,
"grad_norm": 0.26092347502708435,
"learning_rate": 4.432419898459106e-10,
"loss": 0.2229,
"step": 1110
},
{
"epoch": 2.995962314939435,
"step": 1113,
"total_flos": 5.031637962748592e+18,
"train_loss": 0.232770404511492,
"train_runtime": 3990.8809,
"train_samples_per_second": 35.74,
"train_steps_per_second": 0.279
}
],
"logging_steps": 10,
"max_steps": 1113,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.031637962748592e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}