qwen32b-thai-lora / checkpoint-600 /trainer_state.json
devrf's picture
Upload folder using huggingface_hub
5684a7e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8314567815693746,
"eval_steps": 500,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013857613026156245,
"grad_norm": 0.1854863315820694,
"learning_rate": 2.0930232558139536e-05,
"loss": 0.8494,
"memory/device_reserved (GiB)": 89.89,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 10,
"tokens_per_second_per_gpu": 1111.65
},
{
"epoch": 0.02771522605231249,
"grad_norm": 0.09567277133464813,
"learning_rate": 4.418604651162791e-05,
"loss": 0.7842,
"memory/device_reserved (GiB)": 90.37,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 20,
"tokens_per_second_per_gpu": 1041.18
},
{
"epoch": 0.04157283907846873,
"grad_norm": 0.1211227998137474,
"learning_rate": 6.744186046511628e-05,
"loss": 0.7353,
"memory/device_reserved (GiB)": 90.37,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 30,
"tokens_per_second_per_gpu": 993.82
},
{
"epoch": 0.05543045210462498,
"grad_norm": 0.09696491807699203,
"learning_rate": 9.069767441860465e-05,
"loss": 0.6943,
"memory/device_reserved (GiB)": 90.37,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 40,
"tokens_per_second_per_gpu": 849.91
},
{
"epoch": 0.06928806513078123,
"grad_norm": 0.12007619440555573,
"learning_rate": 9.999547457436221e-05,
"loss": 0.6814,
"memory/device_reserved (GiB)": 90.38,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 50,
"tokens_per_second_per_gpu": 971.92
},
{
"epoch": 0.08314567815693746,
"grad_norm": 0.12358752638101578,
"learning_rate": 9.996782216198338e-05,
"loss": 0.69,
"memory/device_reserved (GiB)": 90.38,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 60,
"tokens_per_second_per_gpu": 862.06
},
{
"epoch": 0.09700329118309371,
"grad_norm": 0.11916535347700119,
"learning_rate": 9.991504534967746e-05,
"loss": 0.7048,
"memory/device_reserved (GiB)": 90.38,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 70,
"tokens_per_second_per_gpu": 1024.18
},
{
"epoch": 0.11086090420924996,
"grad_norm": 0.12464027106761932,
"learning_rate": 9.983717067423721e-05,
"loss": 0.6705,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 80,
"tokens_per_second_per_gpu": 1009.08
},
{
"epoch": 0.1247185172354062,
"grad_norm": 0.1264505237340927,
"learning_rate": 9.973423729195168e-05,
"loss": 0.6387,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 90,
"tokens_per_second_per_gpu": 1027.91
},
{
"epoch": 0.13857613026156246,
"grad_norm": 0.1262999027967453,
"learning_rate": 9.960629695891814e-05,
"loss": 0.6447,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 100,
"tokens_per_second_per_gpu": 935.25
},
{
"epoch": 0.1524337432877187,
"grad_norm": 0.12777547538280487,
"learning_rate": 9.945341400501838e-05,
"loss": 0.6846,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 110,
"tokens_per_second_per_gpu": 997.71
},
{
"epoch": 0.16629135631387493,
"grad_norm": 0.12563012540340424,
"learning_rate": 9.927566530157298e-05,
"loss": 0.6765,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 120,
"tokens_per_second_per_gpu": 1120.19
},
{
"epoch": 0.1801489693400312,
"grad_norm": 0.13785897195339203,
"learning_rate": 9.907314022268946e-05,
"loss": 0.6315,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 130,
"tokens_per_second_per_gpu": 801.56
},
{
"epoch": 0.19400658236618742,
"grad_norm": 0.14731284976005554,
"learning_rate": 9.884594060032406e-05,
"loss": 0.6642,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 140,
"tokens_per_second_per_gpu": 944.29
},
{
"epoch": 0.20786419539234366,
"grad_norm": 0.1298578828573227,
"learning_rate": 9.859418067307928e-05,
"loss": 0.6696,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 150,
"tokens_per_second_per_gpu": 913.99
},
{
"epoch": 0.22172180841849992,
"grad_norm": 0.13122966885566711,
"learning_rate": 9.831798702876352e-05,
"loss": 0.6768,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 160,
"tokens_per_second_per_gpu": 983.71
},
{
"epoch": 0.23557942144465616,
"grad_norm": 0.12356515228748322,
"learning_rate": 9.801749854074122e-05,
"loss": 0.6526,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 170,
"tokens_per_second_per_gpu": 903.88
},
{
"epoch": 0.2494370344708124,
"grad_norm": 0.11831440776586533,
"learning_rate": 9.769286629810572e-05,
"loss": 0.6415,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 180,
"tokens_per_second_per_gpu": 819.58
},
{
"epoch": 0.2632946474969686,
"grad_norm": 0.12409751862287521,
"learning_rate": 9.73442535297099e-05,
"loss": 0.6685,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 190,
"tokens_per_second_per_gpu": 973.62
},
{
"epoch": 0.2771522605231249,
"grad_norm": 0.1352369785308838,
"learning_rate": 9.697183552209288e-05,
"loss": 0.6329,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 200,
"tokens_per_second_per_gpu": 903.58
},
{
"epoch": 0.29100987354928115,
"grad_norm": 0.12526443600654602,
"learning_rate": 9.657579953134383e-05,
"loss": 0.6452,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 210,
"tokens_per_second_per_gpu": 859.06
},
{
"epoch": 0.3048674865754374,
"grad_norm": 0.11614521592855453,
"learning_rate": 9.615634468894752e-05,
"loss": 0.6407,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 220,
"tokens_per_second_per_gpu": 850.93
},
{
"epoch": 0.3187250996015936,
"grad_norm": 0.1328686773777008,
"learning_rate": 9.571368190165863e-05,
"loss": 0.6741,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 230,
"tokens_per_second_per_gpu": 1038.17
},
{
"epoch": 0.33258271262774985,
"grad_norm": 0.13082517683506012,
"learning_rate": 9.524803374545548e-05,
"loss": 0.6906,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 240,
"tokens_per_second_per_gpu": 1066.43
},
{
"epoch": 0.3464403256539061,
"grad_norm": 0.1282692551612854,
"learning_rate": 9.475963435362614e-05,
"loss": 0.6609,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 250,
"tokens_per_second_per_gpu": 986.84
},
{
"epoch": 0.3602979386800624,
"grad_norm": 0.13708311319351196,
"learning_rate": 9.424872929904358e-05,
"loss": 0.6169,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 260,
"tokens_per_second_per_gpu": 1126.25
},
{
"epoch": 0.3741555517062186,
"grad_norm": 0.1323172152042389,
"learning_rate": 9.371557547068878e-05,
"loss": 0.6574,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 270,
"tokens_per_second_per_gpu": 1062.03
},
{
"epoch": 0.38801316473237485,
"grad_norm": 0.12729060649871826,
"learning_rate": 9.316044094448392e-05,
"loss": 0.6583,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 280,
"tokens_per_second_per_gpu": 941.33
},
{
"epoch": 0.4018707777585311,
"grad_norm": 0.14719286561012268,
"learning_rate": 9.25836048485008e-05,
"loss": 0.6392,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 290,
"tokens_per_second_per_gpu": 883.56
},
{
"epoch": 0.4157283907846873,
"grad_norm": 0.12530402839183807,
"learning_rate": 9.198535722261181e-05,
"loss": 0.6623,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 300,
"tokens_per_second_per_gpu": 902.6
},
{
"epoch": 0.4295860038108436,
"grad_norm": 0.1330760419368744,
"learning_rate": 9.136599887265483e-05,
"loss": 0.645,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 310,
"tokens_per_second_per_gpu": 997.96
},
{
"epoch": 0.44344361683699984,
"grad_norm": 0.13317464292049408,
"learning_rate": 9.072584121918425e-05,
"loss": 0.6139,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 320,
"tokens_per_second_per_gpu": 980.08
},
{
"epoch": 0.4573012298631561,
"grad_norm": 0.12773385643959045,
"learning_rate": 9.006520614088535e-05,
"loss": 0.6658,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 330,
"tokens_per_second_per_gpu": 937.57
},
{
"epoch": 0.4711588428893123,
"grad_norm": 0.13415341079235077,
"learning_rate": 8.938442581272983e-05,
"loss": 0.6737,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 340,
"tokens_per_second_per_gpu": 1046.92
},
{
"epoch": 0.48501645591546855,
"grad_norm": 0.13382680714130402,
"learning_rate": 8.868384253895445e-05,
"loss": 0.6575,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 350,
"tokens_per_second_per_gpu": 1049.72
},
{
"epoch": 0.4988740689416248,
"grad_norm": 0.12621234357357025,
"learning_rate": 8.796380858094643e-05,
"loss": 0.6423,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 360,
"tokens_per_second_per_gpu": 926.35
},
{
"epoch": 0.5127316819677811,
"grad_norm": 0.14663882553577423,
"learning_rate": 8.722468598012245e-05,
"loss": 0.6524,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 370,
"tokens_per_second_per_gpu": 993.98
},
{
"epoch": 0.5265892949939373,
"grad_norm": 0.12107036262750626,
"learning_rate": 8.646684637588991e-05,
"loss": 0.6158,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 380,
"tokens_per_second_per_gpu": 882.74
},
{
"epoch": 0.5404469080200935,
"grad_norm": 0.12905746698379517,
"learning_rate": 8.56906708187824e-05,
"loss": 0.6359,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 390,
"tokens_per_second_per_gpu": 992.61
},
{
"epoch": 0.5543045210462498,
"grad_norm": 0.14433123171329498,
"learning_rate": 8.489654957886306e-05,
"loss": 0.6124,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 400,
"tokens_per_second_per_gpu": 871.5
},
{
"epoch": 0.568162134072406,
"grad_norm": 0.13294072449207306,
"learning_rate": 8.40848819494923e-05,
"loss": 0.6803,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 410,
"tokens_per_second_per_gpu": 950.11
},
{
"epoch": 0.5820197470985623,
"grad_norm": 0.1526036411523819,
"learning_rate": 8.325607604655839e-05,
"loss": 0.6088,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 420,
"tokens_per_second_per_gpu": 1025.1
},
{
"epoch": 0.5958773601247185,
"grad_norm": 0.1453717052936554,
"learning_rate": 8.241054860327216e-05,
"loss": 0.6669,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 430,
"tokens_per_second_per_gpu": 1074.36
},
{
"epoch": 0.6097349731508748,
"grad_norm": 0.1466919481754303,
"learning_rate": 8.154872476062868e-05,
"loss": 0.6147,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 440,
"tokens_per_second_per_gpu": 1010.83
},
{
"epoch": 0.6235925861770311,
"grad_norm": 0.12707076966762543,
"learning_rate": 8.067103785364139e-05,
"loss": 0.6096,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 450,
"tokens_per_second_per_gpu": 919.83
},
{
"epoch": 0.6374501992031872,
"grad_norm": 0.13485883176326752,
"learning_rate": 7.977792919345633e-05,
"loss": 0.6342,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 460,
"tokens_per_second_per_gpu": 1033.87
},
{
"epoch": 0.6513078122293435,
"grad_norm": 0.12489234656095505,
"learning_rate": 7.886984784545566e-05,
"loss": 0.6256,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 470,
"tokens_per_second_per_gpu": 1018.58
},
{
"epoch": 0.6651654252554997,
"grad_norm": 0.16094225645065308,
"learning_rate": 7.794725040346251e-05,
"loss": 0.6455,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 480,
"tokens_per_second_per_gpu": 933.16
},
{
"epoch": 0.679023038281656,
"grad_norm": 0.1351306140422821,
"learning_rate": 7.701060076016024e-05,
"loss": 0.6613,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 490,
"tokens_per_second_per_gpu": 970.62
},
{
"epoch": 0.6928806513078122,
"grad_norm": 0.1223958283662796,
"learning_rate": 7.606036987384184e-05,
"loss": 0.6186,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 500,
"tokens_per_second_per_gpu": 925.45
},
{
"epoch": 0.7067382643339685,
"grad_norm": 0.11873335391283035,
"learning_rate": 7.509703553160666e-05,
"loss": 0.646,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 510,
"tokens_per_second_per_gpu": 990.93
},
{
"epoch": 0.7205958773601248,
"grad_norm": 0.12990343570709229,
"learning_rate": 7.412108210912345e-05,
"loss": 0.6155,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 520,
"tokens_per_second_per_gpu": 955.97
},
{
"epoch": 0.7344534903862809,
"grad_norm": 0.1376057118177414,
"learning_rate": 7.31330003270808e-05,
"loss": 0.6443,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 530,
"tokens_per_second_per_gpu": 895.07
},
{
"epoch": 0.7483111034124372,
"grad_norm": 0.13277359306812286,
"learning_rate": 7.213328700444696e-05,
"loss": 0.6188,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 540,
"tokens_per_second_per_gpu": 907.27
},
{
"epoch": 0.7621687164385934,
"grad_norm": 0.13623632490634918,
"learning_rate": 7.112244480866356e-05,
"loss": 0.6471,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 550,
"tokens_per_second_per_gpu": 1002.06
},
{
"epoch": 0.7760263294647497,
"grad_norm": 0.13037075102329254,
"learning_rate": 7.010098200289859e-05,
"loss": 0.647,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 560,
"tokens_per_second_per_gpu": 1047.58
},
{
"epoch": 0.789883942490906,
"grad_norm": 0.12997141480445862,
"learning_rate": 6.906941219048584e-05,
"loss": 0.6071,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 570,
"tokens_per_second_per_gpu": 1073.59
},
{
"epoch": 0.8037415555170622,
"grad_norm": 0.14118416607379913,
"learning_rate": 6.802825405667905e-05,
"loss": 0.6101,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 580,
"tokens_per_second_per_gpu": 1097.31
},
{
"epoch": 0.8175991685432185,
"grad_norm": 0.126139834523201,
"learning_rate": 6.697803110785115e-05,
"loss": 0.6084,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 590,
"tokens_per_second_per_gpu": 933.94
},
{
"epoch": 0.8314567815693746,
"grad_norm": 0.1207822933793068,
"learning_rate": 6.591927140826902e-05,
"loss": 0.6416,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 600,
"tokens_per_second_per_gpu": 1007.52
}
],
"logging_steps": 10,
"max_steps": 1444,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.327934896681779e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}