qwen32b-thai-lora / checkpoint-900 /trainer_state.json
devrf's picture
Upload folder using huggingface_hub
5684a7e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.2466655118655812,
"eval_steps": 500,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013857613026156245,
"grad_norm": 0.1854863315820694,
"learning_rate": 2.0930232558139536e-05,
"loss": 0.8494,
"memory/device_reserved (GiB)": 89.89,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 10,
"tokens_per_second_per_gpu": 1111.65
},
{
"epoch": 0.02771522605231249,
"grad_norm": 0.09567277133464813,
"learning_rate": 4.418604651162791e-05,
"loss": 0.7842,
"memory/device_reserved (GiB)": 90.37,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 20,
"tokens_per_second_per_gpu": 1041.18
},
{
"epoch": 0.04157283907846873,
"grad_norm": 0.1211227998137474,
"learning_rate": 6.744186046511628e-05,
"loss": 0.7353,
"memory/device_reserved (GiB)": 90.37,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 30,
"tokens_per_second_per_gpu": 993.82
},
{
"epoch": 0.05543045210462498,
"grad_norm": 0.09696491807699203,
"learning_rate": 9.069767441860465e-05,
"loss": 0.6943,
"memory/device_reserved (GiB)": 90.37,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 40,
"tokens_per_second_per_gpu": 849.91
},
{
"epoch": 0.06928806513078123,
"grad_norm": 0.12007619440555573,
"learning_rate": 9.999547457436221e-05,
"loss": 0.6814,
"memory/device_reserved (GiB)": 90.38,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 50,
"tokens_per_second_per_gpu": 971.92
},
{
"epoch": 0.08314567815693746,
"grad_norm": 0.12358752638101578,
"learning_rate": 9.996782216198338e-05,
"loss": 0.69,
"memory/device_reserved (GiB)": 90.38,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 60,
"tokens_per_second_per_gpu": 862.06
},
{
"epoch": 0.09700329118309371,
"grad_norm": 0.11916535347700119,
"learning_rate": 9.991504534967746e-05,
"loss": 0.7048,
"memory/device_reserved (GiB)": 90.38,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 70,
"tokens_per_second_per_gpu": 1024.18
},
{
"epoch": 0.11086090420924996,
"grad_norm": 0.12464027106761932,
"learning_rate": 9.983717067423721e-05,
"loss": 0.6705,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 80,
"tokens_per_second_per_gpu": 1009.08
},
{
"epoch": 0.1247185172354062,
"grad_norm": 0.1264505237340927,
"learning_rate": 9.973423729195168e-05,
"loss": 0.6387,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 90,
"tokens_per_second_per_gpu": 1027.91
},
{
"epoch": 0.13857613026156246,
"grad_norm": 0.1262999027967453,
"learning_rate": 9.960629695891814e-05,
"loss": 0.6447,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 100,
"tokens_per_second_per_gpu": 935.25
},
{
"epoch": 0.1524337432877187,
"grad_norm": 0.12777547538280487,
"learning_rate": 9.945341400501838e-05,
"loss": 0.6846,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 110,
"tokens_per_second_per_gpu": 997.71
},
{
"epoch": 0.16629135631387493,
"grad_norm": 0.12563012540340424,
"learning_rate": 9.927566530157298e-05,
"loss": 0.6765,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 120,
"tokens_per_second_per_gpu": 1120.19
},
{
"epoch": 0.1801489693400312,
"grad_norm": 0.13785897195339203,
"learning_rate": 9.907314022268946e-05,
"loss": 0.6315,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 130,
"tokens_per_second_per_gpu": 801.56
},
{
"epoch": 0.19400658236618742,
"grad_norm": 0.14731284976005554,
"learning_rate": 9.884594060032406e-05,
"loss": 0.6642,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 140,
"tokens_per_second_per_gpu": 944.29
},
{
"epoch": 0.20786419539234366,
"grad_norm": 0.1298578828573227,
"learning_rate": 9.859418067307928e-05,
"loss": 0.6696,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 150,
"tokens_per_second_per_gpu": 913.99
},
{
"epoch": 0.22172180841849992,
"grad_norm": 0.13122966885566711,
"learning_rate": 9.831798702876352e-05,
"loss": 0.6768,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 160,
"tokens_per_second_per_gpu": 983.71
},
{
"epoch": 0.23557942144465616,
"grad_norm": 0.12356515228748322,
"learning_rate": 9.801749854074122e-05,
"loss": 0.6526,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 170,
"tokens_per_second_per_gpu": 903.88
},
{
"epoch": 0.2494370344708124,
"grad_norm": 0.11831440776586533,
"learning_rate": 9.769286629810572e-05,
"loss": 0.6415,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 180,
"tokens_per_second_per_gpu": 819.58
},
{
"epoch": 0.2632946474969686,
"grad_norm": 0.12409751862287521,
"learning_rate": 9.73442535297099e-05,
"loss": 0.6685,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 190,
"tokens_per_second_per_gpu": 973.62
},
{
"epoch": 0.2771522605231249,
"grad_norm": 0.1352369785308838,
"learning_rate": 9.697183552209288e-05,
"loss": 0.6329,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 200,
"tokens_per_second_per_gpu": 903.58
},
{
"epoch": 0.29100987354928115,
"grad_norm": 0.12526443600654602,
"learning_rate": 9.657579953134383e-05,
"loss": 0.6452,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 210,
"tokens_per_second_per_gpu": 859.06
},
{
"epoch": 0.3048674865754374,
"grad_norm": 0.11614521592855453,
"learning_rate": 9.615634468894752e-05,
"loss": 0.6407,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 220,
"tokens_per_second_per_gpu": 850.93
},
{
"epoch": 0.3187250996015936,
"grad_norm": 0.1328686773777008,
"learning_rate": 9.571368190165863e-05,
"loss": 0.6741,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 230,
"tokens_per_second_per_gpu": 1038.17
},
{
"epoch": 0.33258271262774985,
"grad_norm": 0.13082517683506012,
"learning_rate": 9.524803374545548e-05,
"loss": 0.6906,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 240,
"tokens_per_second_per_gpu": 1066.43
},
{
"epoch": 0.3464403256539061,
"grad_norm": 0.1282692551612854,
"learning_rate": 9.475963435362614e-05,
"loss": 0.6609,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 250,
"tokens_per_second_per_gpu": 986.84
},
{
"epoch": 0.3602979386800624,
"grad_norm": 0.13708311319351196,
"learning_rate": 9.424872929904358e-05,
"loss": 0.6169,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 260,
"tokens_per_second_per_gpu": 1126.25
},
{
"epoch": 0.3741555517062186,
"grad_norm": 0.1323172152042389,
"learning_rate": 9.371557547068878e-05,
"loss": 0.6574,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 270,
"tokens_per_second_per_gpu": 1062.03
},
{
"epoch": 0.38801316473237485,
"grad_norm": 0.12729060649871826,
"learning_rate": 9.316044094448392e-05,
"loss": 0.6583,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 280,
"tokens_per_second_per_gpu": 941.33
},
{
"epoch": 0.4018707777585311,
"grad_norm": 0.14719286561012268,
"learning_rate": 9.25836048485008e-05,
"loss": 0.6392,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 290,
"tokens_per_second_per_gpu": 883.56
},
{
"epoch": 0.4157283907846873,
"grad_norm": 0.12530402839183807,
"learning_rate": 9.198535722261181e-05,
"loss": 0.6623,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 300,
"tokens_per_second_per_gpu": 902.6
},
{
"epoch": 0.4295860038108436,
"grad_norm": 0.1330760419368744,
"learning_rate": 9.136599887265483e-05,
"loss": 0.645,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 310,
"tokens_per_second_per_gpu": 997.96
},
{
"epoch": 0.44344361683699984,
"grad_norm": 0.13317464292049408,
"learning_rate": 9.072584121918425e-05,
"loss": 0.6139,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 320,
"tokens_per_second_per_gpu": 980.08
},
{
"epoch": 0.4573012298631561,
"grad_norm": 0.12773385643959045,
"learning_rate": 9.006520614088535e-05,
"loss": 0.6658,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 330,
"tokens_per_second_per_gpu": 937.57
},
{
"epoch": 0.4711588428893123,
"grad_norm": 0.13415341079235077,
"learning_rate": 8.938442581272983e-05,
"loss": 0.6737,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 340,
"tokens_per_second_per_gpu": 1046.92
},
{
"epoch": 0.48501645591546855,
"grad_norm": 0.13382680714130402,
"learning_rate": 8.868384253895445e-05,
"loss": 0.6575,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 350,
"tokens_per_second_per_gpu": 1049.72
},
{
"epoch": 0.4988740689416248,
"grad_norm": 0.12621234357357025,
"learning_rate": 8.796380858094643e-05,
"loss": 0.6423,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 360,
"tokens_per_second_per_gpu": 926.35
},
{
"epoch": 0.5127316819677811,
"grad_norm": 0.14663882553577423,
"learning_rate": 8.722468598012245e-05,
"loss": 0.6524,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 370,
"tokens_per_second_per_gpu": 993.98
},
{
"epoch": 0.5265892949939373,
"grad_norm": 0.12107036262750626,
"learning_rate": 8.646684637588991e-05,
"loss": 0.6158,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 380,
"tokens_per_second_per_gpu": 882.74
},
{
"epoch": 0.5404469080200935,
"grad_norm": 0.12905746698379517,
"learning_rate": 8.56906708187824e-05,
"loss": 0.6359,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 390,
"tokens_per_second_per_gpu": 992.61
},
{
"epoch": 0.5543045210462498,
"grad_norm": 0.14433123171329498,
"learning_rate": 8.489654957886306e-05,
"loss": 0.6124,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 400,
"tokens_per_second_per_gpu": 871.5
},
{
"epoch": 0.568162134072406,
"grad_norm": 0.13294072449207306,
"learning_rate": 8.40848819494923e-05,
"loss": 0.6803,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 410,
"tokens_per_second_per_gpu": 950.11
},
{
"epoch": 0.5820197470985623,
"grad_norm": 0.1526036411523819,
"learning_rate": 8.325607604655839e-05,
"loss": 0.6088,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 420,
"tokens_per_second_per_gpu": 1025.1
},
{
"epoch": 0.5958773601247185,
"grad_norm": 0.1453717052936554,
"learning_rate": 8.241054860327216e-05,
"loss": 0.6669,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 430,
"tokens_per_second_per_gpu": 1074.36
},
{
"epoch": 0.6097349731508748,
"grad_norm": 0.1466919481754303,
"learning_rate": 8.154872476062868e-05,
"loss": 0.6147,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 440,
"tokens_per_second_per_gpu": 1010.83
},
{
"epoch": 0.6235925861770311,
"grad_norm": 0.12707076966762543,
"learning_rate": 8.067103785364139e-05,
"loss": 0.6096,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 450,
"tokens_per_second_per_gpu": 919.83
},
{
"epoch": 0.6374501992031872,
"grad_norm": 0.13485883176326752,
"learning_rate": 7.977792919345633e-05,
"loss": 0.6342,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 460,
"tokens_per_second_per_gpu": 1033.87
},
{
"epoch": 0.6513078122293435,
"grad_norm": 0.12489234656095505,
"learning_rate": 7.886984784545566e-05,
"loss": 0.6256,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 470,
"tokens_per_second_per_gpu": 1018.58
},
{
"epoch": 0.6651654252554997,
"grad_norm": 0.16094225645065308,
"learning_rate": 7.794725040346251e-05,
"loss": 0.6455,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 480,
"tokens_per_second_per_gpu": 933.16
},
{
"epoch": 0.679023038281656,
"grad_norm": 0.1351306140422821,
"learning_rate": 7.701060076016024e-05,
"loss": 0.6613,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 490,
"tokens_per_second_per_gpu": 970.62
},
{
"epoch": 0.6928806513078122,
"grad_norm": 0.1223958283662796,
"learning_rate": 7.606036987384184e-05,
"loss": 0.6186,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 500,
"tokens_per_second_per_gpu": 925.45
},
{
"epoch": 0.7067382643339685,
"grad_norm": 0.11873335391283035,
"learning_rate": 7.509703553160666e-05,
"loss": 0.646,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 510,
"tokens_per_second_per_gpu": 990.93
},
{
"epoch": 0.7205958773601248,
"grad_norm": 0.12990343570709229,
"learning_rate": 7.412108210912345e-05,
"loss": 0.6155,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 520,
"tokens_per_second_per_gpu": 955.97
},
{
"epoch": 0.7344534903862809,
"grad_norm": 0.1376057118177414,
"learning_rate": 7.31330003270808e-05,
"loss": 0.6443,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 530,
"tokens_per_second_per_gpu": 895.07
},
{
"epoch": 0.7483111034124372,
"grad_norm": 0.13277359306812286,
"learning_rate": 7.213328700444696e-05,
"loss": 0.6188,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 540,
"tokens_per_second_per_gpu": 907.27
},
{
"epoch": 0.7621687164385934,
"grad_norm": 0.13623632490634918,
"learning_rate": 7.112244480866356e-05,
"loss": 0.6471,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 550,
"tokens_per_second_per_gpu": 1002.06
},
{
"epoch": 0.7760263294647497,
"grad_norm": 0.13037075102329254,
"learning_rate": 7.010098200289859e-05,
"loss": 0.647,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 560,
"tokens_per_second_per_gpu": 1047.58
},
{
"epoch": 0.789883942490906,
"grad_norm": 0.12997141480445862,
"learning_rate": 6.906941219048584e-05,
"loss": 0.6071,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 570,
"tokens_per_second_per_gpu": 1073.59
},
{
"epoch": 0.8037415555170622,
"grad_norm": 0.14118416607379913,
"learning_rate": 6.802825405667905e-05,
"loss": 0.6101,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 580,
"tokens_per_second_per_gpu": 1097.31
},
{
"epoch": 0.8175991685432185,
"grad_norm": 0.126139834523201,
"learning_rate": 6.697803110785115e-05,
"loss": 0.6084,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 590,
"tokens_per_second_per_gpu": 933.94
},
{
"epoch": 0.8314567815693746,
"grad_norm": 0.1207822933793068,
"learning_rate": 6.591927140826902e-05,
"loss": 0.6416,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 600,
"tokens_per_second_per_gpu": 1007.52
},
{
"epoch": 0.8453143945955309,
"grad_norm": 0.1316983848810196,
"learning_rate": 6.485250731457678e-05,
"loss": 0.6102,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 610,
"tokens_per_second_per_gpu": 866.4
},
{
"epoch": 0.8591720076216872,
"grad_norm": 0.13538379967212677,
"learning_rate": 6.377827520812061e-05,
"loss": 0.6426,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 620,
"tokens_per_second_per_gpu": 1044.49
},
{
"epoch": 0.8730296206478434,
"grad_norm": 0.1406071037054062,
"learning_rate": 6.269711522525006e-05,
"loss": 0.6029,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 630,
"tokens_per_second_per_gpu": 966.75
},
{
"epoch": 0.8868872336739997,
"grad_norm": 0.1416017860174179,
"learning_rate": 6.160957098573119e-05,
"loss": 0.6103,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 640,
"tokens_per_second_per_gpu": 996.12
},
{
"epoch": 0.9007448467001559,
"grad_norm": 0.14168281853199005,
"learning_rate": 6.05161893194083e-05,
"loss": 0.6015,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 650,
"tokens_per_second_per_gpu": 1057.0
},
{
"epoch": 0.9146024597263122,
"grad_norm": 0.14854960143566132,
"learning_rate": 5.941751999125149e-05,
"loss": 0.5851,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 660,
"tokens_per_second_per_gpu": 992.44
},
{
"epoch": 0.9284600727524683,
"grad_norm": 0.13109387457370758,
"learning_rate": 5.831411542492854e-05,
"loss": 0.6221,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 670,
"tokens_per_second_per_gpu": 1056.52
},
{
"epoch": 0.9423176857786246,
"grad_norm": 0.13833071291446686,
"learning_rate": 5.720653042503978e-05,
"loss": 0.5828,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 680,
"tokens_per_second_per_gpu": 1061.91
},
{
"epoch": 0.9561752988047809,
"grad_norm": 0.1346118003129959,
"learning_rate": 5.6095321898156016e-05,
"loss": 0.5827,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 690,
"tokens_per_second_per_gpu": 1017.14
},
{
"epoch": 0.9700329118309371,
"grad_norm": 0.12667891383171082,
"learning_rate": 5.498104857279941e-05,
"loss": 0.6744,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 700,
"tokens_per_second_per_gpu": 956.99
},
{
"epoch": 0.9838905248570934,
"grad_norm": 0.13424526154994965,
"learning_rate": 5.3864270718508305e-05,
"loss": 0.6298,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 710,
"tokens_per_second_per_gpu": 953.92
},
{
"epoch": 0.9977481378832496,
"grad_norm": 0.12869331240653992,
"learning_rate": 5.274554986412716e-05,
"loss": 0.6199,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 720,
"tokens_per_second_per_gpu": 1040.0
},
{
"epoch": 1.011086090420925,
"grad_norm": 0.14841921627521515,
"learning_rate": 5.162544851546349e-05,
"loss": 0.6032,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 730,
"tokens_per_second_per_gpu": 885.8
},
{
"epoch": 1.0249437034470812,
"grad_norm": 0.15776373445987701,
"learning_rate": 5.0504529872453256e-05,
"loss": 0.5982,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 740,
"tokens_per_second_per_gpu": 1010.64
},
{
"epoch": 1.0388013164732375,
"grad_norm": 0.17491325736045837,
"learning_rate": 4.9383357545977497e-05,
"loss": 0.5993,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 750,
"tokens_per_second_per_gpu": 1085.43
},
{
"epoch": 1.0526589294993938,
"grad_norm": 0.15103822946548462,
"learning_rate": 4.8262495274472225e-05,
"loss": 0.5512,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 760,
"tokens_per_second_per_gpu": 1024.19
},
{
"epoch": 1.06651654252555,
"grad_norm": 0.15340133011341095,
"learning_rate": 4.7142506640474274e-05,
"loss": 0.5822,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 770,
"tokens_per_second_per_gpu": 1189.62
},
{
"epoch": 1.0803741555517061,
"grad_norm": 0.17283514142036438,
"learning_rate": 4.602395478724539e-05,
"loss": 0.5395,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 780,
"tokens_per_second_per_gpu": 1009.01
},
{
"epoch": 1.0942317685778624,
"grad_norm": 0.15003962814807892,
"learning_rate": 4.490740213561727e-05,
"loss": 0.5358,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 790,
"tokens_per_second_per_gpu": 1047.2
},
{
"epoch": 1.1080893816040187,
"grad_norm": 0.1629399210214615,
"learning_rate": 4.379341010119992e-05,
"loss": 0.601,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 800,
"tokens_per_second_per_gpu": 942.61
},
{
"epoch": 1.121946994630175,
"grad_norm": 0.17462220788002014,
"learning_rate": 4.268253881209532e-05,
"loss": 0.5845,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 810,
"tokens_per_second_per_gpu": 1010.69
},
{
"epoch": 1.1358046076563313,
"grad_norm": 0.17755432426929474,
"learning_rate": 4.157534682725856e-05,
"loss": 0.5637,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 820,
"tokens_per_second_per_gpu": 911.61
},
{
"epoch": 1.1496622206824874,
"grad_norm": 0.1729191541671753,
"learning_rate": 4.047239085564794e-05,
"loss": 0.5921,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 830,
"tokens_per_second_per_gpu": 914.41
},
{
"epoch": 1.1635198337086436,
"grad_norm": 0.15745393931865692,
"learning_rate": 3.937422547630519e-05,
"loss": 0.6086,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 840,
"tokens_per_second_per_gpu": 1000.17
},
{
"epoch": 1.1773774467348,
"grad_norm": 0.16624517738819122,
"learning_rate": 3.828140285950676e-05,
"loss": 0.5603,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 850,
"tokens_per_second_per_gpu": 991.45
},
{
"epoch": 1.1912350597609562,
"grad_norm": 0.17486163973808289,
"learning_rate": 3.7194472489126176e-05,
"loss": 0.5715,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 860,
"tokens_per_second_per_gpu": 1005.74
},
{
"epoch": 1.2050926727871123,
"grad_norm": 0.18270528316497803,
"learning_rate": 3.611398088634721e-05,
"loss": 0.5577,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 870,
"tokens_per_second_per_gpu": 965.77
},
{
"epoch": 1.2189502858132686,
"grad_norm": 0.1544029712677002,
"learning_rate": 3.5040471334866695e-05,
"loss": 0.5706,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 880,
"tokens_per_second_per_gpu": 885.82
},
{
"epoch": 1.2328078988394249,
"grad_norm": 0.1610870063304901,
"learning_rate": 3.397448360772516e-05,
"loss": 0.5791,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 890,
"tokens_per_second_per_gpu": 903.16
},
{
"epoch": 1.2466655118655812,
"grad_norm": 0.15956935286521912,
"learning_rate": 3.291655369590269e-05,
"loss": 0.5978,
"memory/device_reserved (GiB)": 90.59,
"memory/max_active (GiB)": 85.72,
"memory/max_allocated (GiB)": 85.72,
"step": 900,
"tokens_per_second_per_gpu": 954.24
}
],
"logging_steps": 10,
"max_steps": 1444,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.990168442752205e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}