PEFT
Safetensors
English
lora
router-agent
vertex-ai
router-gemma3-peft / trainer_state.json
Alovestocode's picture
Upload PEFT adapter for router (Gemma 3 27B)
ea7d6f8 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 61,
"global_step": 183,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_loss": 1.987602710723877,
"eval_runtime": 21.27,
"eval_samples_per_second": 38.458,
"eval_steps_per_second": 2.445,
"memory/device_reserved (GiB)": 65.8,
"memory/max_active (GiB)": 48.55,
"memory/max_allocated (GiB)": 48.55,
"step": 0
},
{
"epoch": 0.01639344262295082,
"grad_norm": 2.1048970222473145,
"learning_rate": 0.0,
"loss": 1.9829,
"memory/device_reserved (GiB)": 33.32,
"memory/max_active (GiB)": 32.48,
"memory/max_allocated (GiB)": 32.48,
"step": 1
},
{
"epoch": 0.03278688524590164,
"grad_norm": 2.132734537124634,
"learning_rate": 1.1111111111111112e-05,
"loss": 1.9692,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 2
},
{
"epoch": 0.04918032786885246,
"grad_norm": 2.0775797367095947,
"learning_rate": 2.2222222222222223e-05,
"loss": 1.9615,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 3
},
{
"epoch": 0.06557377049180328,
"grad_norm": 1.9622066020965576,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.9353,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 4
},
{
"epoch": 0.08196721311475409,
"grad_norm": 1.344926118850708,
"learning_rate": 4.4444444444444447e-05,
"loss": 1.83,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 5
},
{
"epoch": 0.09836065573770492,
"grad_norm": 0.977587878704071,
"learning_rate": 5.555555555555556e-05,
"loss": 1.7399,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 6
},
{
"epoch": 0.11475409836065574,
"grad_norm": 0.8041461706161499,
"learning_rate": 6.666666666666667e-05,
"loss": 1.6341,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 7
},
{
"epoch": 0.13114754098360656,
"grad_norm": 0.6574768424034119,
"learning_rate": 7.777777777777778e-05,
"loss": 1.5328,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 8
},
{
"epoch": 0.14754098360655737,
"grad_norm": 0.5925188660621643,
"learning_rate": 8.888888888888889e-05,
"loss": 1.4402,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 9
},
{
"epoch": 0.16393442622950818,
"grad_norm": 0.6970644593238831,
"learning_rate": 0.0001,
"loss": 1.4063,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 10
},
{
"epoch": 0.18032786885245902,
"grad_norm": 0.8010526895523071,
"learning_rate": 0.00011111111111111112,
"loss": 1.3407,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 11
},
{
"epoch": 0.19672131147540983,
"grad_norm": 0.759565532207489,
"learning_rate": 0.00012222222222222224,
"loss": 1.2844,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 12
},
{
"epoch": 0.21311475409836064,
"grad_norm": 0.6551400423049927,
"learning_rate": 0.00013333333333333334,
"loss": 1.223,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 13
},
{
"epoch": 0.22950819672131148,
"grad_norm": 0.557424008846283,
"learning_rate": 0.00014444444444444444,
"loss": 1.146,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 14
},
{
"epoch": 0.2459016393442623,
"grad_norm": 0.4815196990966797,
"learning_rate": 0.00015555555555555556,
"loss": 1.087,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 15
},
{
"epoch": 0.26229508196721313,
"grad_norm": 0.4362974762916565,
"learning_rate": 0.0001666666666666667,
"loss": 1.0415,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 16
},
{
"epoch": 0.2786885245901639,
"grad_norm": 0.44927817583084106,
"learning_rate": 0.00017777777777777779,
"loss": 1.0049,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 17
},
{
"epoch": 0.29508196721311475,
"grad_norm": 0.49147677421569824,
"learning_rate": 0.00018888888888888888,
"loss": 0.9776,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 18
},
{
"epoch": 0.3114754098360656,
"grad_norm": 0.39037176966667175,
"learning_rate": 0.0002,
"loss": 0.9379,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 19
},
{
"epoch": 0.32786885245901637,
"grad_norm": 0.28441116213798523,
"learning_rate": 0.0001999818745523526,
"loss": 0.9064,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 20
},
{
"epoch": 0.3442622950819672,
"grad_norm": 0.3153410851955414,
"learning_rate": 0.00019992750478004738,
"loss": 0.8899,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 21
},
{
"epoch": 0.36065573770491804,
"grad_norm": 0.28423598408699036,
"learning_rate": 0.00019983691039261357,
"loss": 0.8671,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 22
},
{
"epoch": 0.3770491803278688,
"grad_norm": 0.2221442312002182,
"learning_rate": 0.00019971012423132775,
"loss": 0.8383,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 23
},
{
"epoch": 0.39344262295081966,
"grad_norm": 0.20366857945919037,
"learning_rate": 0.00019954719225730847,
"loss": 0.8386,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 24
},
{
"epoch": 0.4098360655737705,
"grad_norm": 0.22895343601703644,
"learning_rate": 0.00019934817353485501,
"loss": 0.8219,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 25
},
{
"epoch": 0.4262295081967213,
"grad_norm": 0.2062416672706604,
"learning_rate": 0.00019911314021003613,
"loss": 0.8057,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 26
},
{
"epoch": 0.4426229508196721,
"grad_norm": 0.18311655521392822,
"learning_rate": 0.00019884217748453623,
"loss": 0.804,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 27
},
{
"epoch": 0.45901639344262296,
"grad_norm": 0.19581766426563263,
"learning_rate": 0.00019853538358476932,
"loss": 0.7831,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 28
},
{
"epoch": 0.47540983606557374,
"grad_norm": 0.19657522439956665,
"learning_rate": 0.00019819286972627066,
"loss": 0.7815,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 29
},
{
"epoch": 0.4918032786885246,
"grad_norm": 0.1872270107269287,
"learning_rate": 0.00019781476007338058,
"loss": 0.7798,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 30
},
{
"epoch": 0.5081967213114754,
"grad_norm": 0.17483267188072205,
"learning_rate": 0.00019740119169423337,
"loss": 0.7738,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 31
},
{
"epoch": 0.5245901639344263,
"grad_norm": 0.18843160569667816,
"learning_rate": 0.00019695231451106912,
"loss": 0.7684,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 32
},
{
"epoch": 0.5409836065573771,
"grad_norm": 0.1772245317697525,
"learning_rate": 0.0001964682912458856,
"loss": 0.7661,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 33
},
{
"epoch": 0.5573770491803278,
"grad_norm": 0.1664779931306839,
"learning_rate": 0.00019594929736144976,
"loss": 0.7521,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 34
},
{
"epoch": 0.5737704918032787,
"grad_norm": 0.17712299525737762,
"learning_rate": 0.00019539552099769126,
"loss": 0.7545,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 35
},
{
"epoch": 0.5901639344262295,
"grad_norm": 0.1669715940952301,
"learning_rate": 0.00019480716290349995,
"loss": 0.7549,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 36
},
{
"epoch": 0.6065573770491803,
"grad_norm": 0.17512178421020508,
"learning_rate": 0.00019418443636395248,
"loss": 0.7451,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 37
},
{
"epoch": 0.6229508196721312,
"grad_norm": 0.1587410718202591,
"learning_rate": 0.00019352756712299468,
"loss": 0.7317,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 38
},
{
"epoch": 0.639344262295082,
"grad_norm": 0.1550321877002716,
"learning_rate": 0.00019283679330160726,
"loss": 0.7294,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 39
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.19032655656337738,
"learning_rate": 0.000192112365311485,
"loss": 0.7241,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 40
},
{
"epoch": 0.6721311475409836,
"grad_norm": 0.15234725177288055,
"learning_rate": 0.0001913545457642601,
"loss": 0.7165,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 41
},
{
"epoch": 0.6885245901639344,
"grad_norm": 0.1558808833360672,
"learning_rate": 0.0001905636093763031,
"loss": 0.7212,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 42
},
{
"epoch": 0.7049180327868853,
"grad_norm": 0.15694938600063324,
"learning_rate": 0.00018973984286913584,
"loss": 0.7055,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 43
},
{
"epoch": 0.7213114754098361,
"grad_norm": 0.14805985987186432,
"learning_rate": 0.00018888354486549237,
"loss": 0.7137,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 44
},
{
"epoch": 0.7377049180327869,
"grad_norm": 0.14736340939998627,
"learning_rate": 0.00018799502578106534,
"loss": 0.6931,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 45
},
{
"epoch": 0.7540983606557377,
"grad_norm": 0.22682452201843262,
"learning_rate": 0.00018707460771197774,
"loss": 0.6979,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 46
},
{
"epoch": 0.7704918032786885,
"grad_norm": 0.14981690049171448,
"learning_rate": 0.00018612262431802007,
"loss": 0.7045,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 47
},
{
"epoch": 0.7868852459016393,
"grad_norm": 0.14373798668384552,
"learning_rate": 0.0001851394207016957,
"loss": 0.7089,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 48
},
{
"epoch": 0.8032786885245902,
"grad_norm": 0.1548561304807663,
"learning_rate": 0.00018412535328311814,
"loss": 0.6927,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 49
},
{
"epoch": 0.819672131147541,
"grad_norm": 0.1519075483083725,
"learning_rate": 0.00018308078967080546,
"loss": 0.6908,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 50
},
{
"epoch": 0.8360655737704918,
"grad_norm": 0.15933729708194733,
"learning_rate": 0.00018200610852841913,
"loss": 0.6732,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 51
},
{
"epoch": 0.8524590163934426,
"grad_norm": 0.14739681780338287,
"learning_rate": 0.00018090169943749476,
"loss": 0.6868,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 52
},
{
"epoch": 0.8688524590163934,
"grad_norm": 0.20240117609500885,
"learning_rate": 0.00017976796275621555,
"loss": 0.6789,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 53
},
{
"epoch": 0.8852459016393442,
"grad_norm": 0.15849758684635162,
"learning_rate": 0.00017860530947427875,
"loss": 0.6948,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 54
},
{
"epoch": 0.9016393442622951,
"grad_norm": 0.19192887842655182,
"learning_rate": 0.00017741416106390826,
"loss": 0.6705,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 55
},
{
"epoch": 0.9180327868852459,
"grad_norm": 0.14879536628723145,
"learning_rate": 0.0001761949493270671,
"loss": 0.6765,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 56
},
{
"epoch": 0.9344262295081968,
"grad_norm": 0.15834036469459534,
"learning_rate": 0.0001749481162389254,
"loss": 0.6639,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 57
},
{
"epoch": 0.9508196721311475,
"grad_norm": 0.1480400264263153,
"learning_rate": 0.0001736741137876405,
"loss": 0.6801,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 58
},
{
"epoch": 0.9672131147540983,
"grad_norm": 0.15552930533885956,
"learning_rate": 0.00017237340381050703,
"loss": 0.6721,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 59
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.15891212224960327,
"learning_rate": 0.0001710464578265369,
"loss": 0.669,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 60
},
{
"epoch": 1.0,
"grad_norm": 0.154049351811409,
"learning_rate": 0.00016969375686552937,
"loss": 0.6585,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 61
},
{
"epoch": 1.0,
"eval_loss": 0.6697277426719666,
"eval_runtime": 15.4593,
"eval_samples_per_second": 52.913,
"eval_steps_per_second": 3.364,
"memory/device_reserved (GiB)": 73.37,
"memory/max_active (GiB)": 48.8,
"memory/max_allocated (GiB)": 48.8,
"step": 61
},
{
"epoch": 1.0163934426229508,
"grad_norm": 0.15614724159240723,
"learning_rate": 0.00016831579129369346,
"loss": 0.6731,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 62
},
{
"epoch": 1.0327868852459017,
"grad_norm": 0.16120535135269165,
"learning_rate": 0.00016691306063588583,
"loss": 0.654,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 63
},
{
"epoch": 1.0491803278688525,
"grad_norm": 0.1574525684118271,
"learning_rate": 0.00016548607339452853,
"loss": 0.6625,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 64
},
{
"epoch": 1.0655737704918034,
"grad_norm": 0.1553254872560501,
"learning_rate": 0.00016403534686527225,
"loss": 0.6608,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 65
},
{
"epoch": 1.0819672131147542,
"grad_norm": 0.1639672964811325,
"learning_rate": 0.00016256140694947217,
"loss": 0.6601,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 66
},
{
"epoch": 1.098360655737705,
"grad_norm": 0.1632416546344757,
"learning_rate": 0.00016106478796354382,
"loss": 0.6588,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 67
},
{
"epoch": 1.1147540983606556,
"grad_norm": 0.15430399775505066,
"learning_rate": 0.0001595460324452688,
"loss": 0.6559,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 68
},
{
"epoch": 1.1311475409836065,
"grad_norm": 0.14951854944229126,
"learning_rate": 0.00015800569095711982,
"loss": 0.6511,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 69
},
{
"epoch": 1.1475409836065573,
"grad_norm": 0.15766219794750214,
"learning_rate": 0.00015644432188667695,
"loss": 0.6476,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 70
},
{
"epoch": 1.1639344262295082,
"grad_norm": 0.15970833599567413,
"learning_rate": 0.000154862491244207,
"loss": 0.6491,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 71
},
{
"epoch": 1.180327868852459,
"grad_norm": 0.1682189404964447,
"learning_rate": 0.00015326077245747999,
"loss": 0.6448,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 72
},
{
"epoch": 1.1967213114754098,
"grad_norm": 0.1654193103313446,
"learning_rate": 0.0001516397461638962,
"loss": 0.647,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 73
},
{
"epoch": 1.2131147540983607,
"grad_norm": 0.1643504798412323,
"learning_rate": 0.00015000000000000001,
"loss": 0.6411,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 74
},
{
"epoch": 1.2295081967213115,
"grad_norm": 0.15997686982154846,
"learning_rate": 0.00014834212838845637,
"loss": 0.634,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 75
},
{
"epoch": 1.2459016393442623,
"grad_norm": 0.16089341044425964,
"learning_rate": 0.00014666673232256738,
"loss": 0.6323,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 76
},
{
"epoch": 1.2622950819672132,
"grad_norm": 0.1618468165397644,
"learning_rate": 0.0001449744191484066,
"loss": 0.6363,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 77
},
{
"epoch": 1.278688524590164,
"grad_norm": 0.16787968575954437,
"learning_rate": 0.00014326580234465085,
"loss": 0.6393,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 78
},
{
"epoch": 1.2950819672131146,
"grad_norm": 0.16223575174808502,
"learning_rate": 0.00014154150130018866,
"loss": 0.6343,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 79
},
{
"epoch": 1.3114754098360657,
"grad_norm": 0.1583593338727951,
"learning_rate": 0.00013980214108958624,
"loss": 0.6279,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 80
},
{
"epoch": 1.3278688524590163,
"grad_norm": 0.17309962213039398,
"learning_rate": 0.0001380483522464923,
"loss": 0.6309,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 81
},
{
"epoch": 1.3442622950819672,
"grad_norm": 0.1649285852909088,
"learning_rate": 0.0001362807705350641,
"loss": 0.6278,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 82
},
{
"epoch": 1.360655737704918,
"grad_norm": 0.1694308966398239,
"learning_rate": 0.00013450003671949706,
"loss": 0.6268,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 83
},
{
"epoch": 1.3770491803278688,
"grad_norm": 0.15790660679340363,
"learning_rate": 0.00013270679633174218,
"loss": 0.6124,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 84
},
{
"epoch": 1.3934426229508197,
"grad_norm": 0.17729204893112183,
"learning_rate": 0.00013090169943749476,
"loss": 0.625,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 85
},
{
"epoch": 1.4098360655737705,
"grad_norm": 0.1664123386144638,
"learning_rate": 0.0001290854004005399,
"loss": 0.6103,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 86
},
{
"epoch": 1.4262295081967213,
"grad_norm": 0.17104221880435944,
"learning_rate": 0.0001272585576455398,
"loss": 0.6105,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 87
},
{
"epoch": 1.4426229508196722,
"grad_norm": 0.18972143530845642,
"learning_rate": 0.00012542183341934872,
"loss": 0.6135,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 88
},
{
"epoch": 1.459016393442623,
"grad_norm": 0.17770206928253174,
"learning_rate": 0.00012357589355094275,
"loss": 0.6057,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 89
},
{
"epoch": 1.4754098360655736,
"grad_norm": 0.1827758103609085,
"learning_rate": 0.00012172140721005079,
"loss": 0.6115,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 90
},
{
"epoch": 1.4918032786885247,
"grad_norm": 0.17028623819351196,
"learning_rate": 0.00011985904666457455,
"loss": 0.6143,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 91
},
{
"epoch": 1.5081967213114753,
"grad_norm": 0.17973949015140533,
"learning_rate": 0.00011798948703688539,
"loss": 0.6155,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 92
},
{
"epoch": 1.5245901639344264,
"grad_norm": 0.17090703547000885,
"learning_rate": 0.00011611340605908642,
"loss": 0.6121,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 93
},
{
"epoch": 1.540983606557377,
"grad_norm": 0.16992104053497314,
"learning_rate": 0.00011423148382732853,
"loss": 0.6176,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 94
},
{
"epoch": 1.5573770491803278,
"grad_norm": 0.17985564470291138,
"learning_rate": 0.00011234440255526948,
"loss": 0.6104,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 95
},
{
"epoch": 1.5737704918032787,
"grad_norm": 0.1746932864189148,
"learning_rate": 0.00011045284632676536,
"loss": 0.6123,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 96
},
{
"epoch": 1.5901639344262295,
"grad_norm": 0.17701321840286255,
"learning_rate": 0.00010855750084788398,
"loss": 0.6174,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 97
},
{
"epoch": 1.6065573770491803,
"grad_norm": 0.17487658560276031,
"learning_rate": 0.00010665905319833041,
"loss": 0.6065,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 98
},
{
"epoch": 1.6229508196721312,
"grad_norm": 0.1731928288936615,
"learning_rate": 0.00010475819158237425,
"loss": 0.6054,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 99
},
{
"epoch": 1.639344262295082,
"grad_norm": 0.17205819487571716,
"learning_rate": 0.00010285560507936961,
"loss": 0.6033,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 100
},
{
"epoch": 1.6557377049180326,
"grad_norm": 0.17849475145339966,
"learning_rate": 0.00010095198339395769,
"loss": 0.6027,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 101
},
{
"epoch": 1.6721311475409837,
"grad_norm": 0.17146815359592438,
"learning_rate": 9.904801660604234e-05,
"loss": 0.5987,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 102
},
{
"epoch": 1.6885245901639343,
"grad_norm": 0.47297370433807373,
"learning_rate": 9.71443949206304e-05,
"loss": 0.6055,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 103
},
{
"epoch": 1.7049180327868854,
"grad_norm": 0.1885872781276703,
"learning_rate": 9.524180841762577e-05,
"loss": 0.5936,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 104
},
{
"epoch": 1.721311475409836,
"grad_norm": 0.1687517911195755,
"learning_rate": 9.334094680166962e-05,
"loss": 0.6003,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 105
},
{
"epoch": 1.737704918032787,
"grad_norm": 0.16893097758293152,
"learning_rate": 9.144249915211605e-05,
"loss": 0.5874,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 106
},
{
"epoch": 1.7540983606557377,
"grad_norm": 0.18470335006713867,
"learning_rate": 8.954715367323468e-05,
"loss": 0.5953,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 107
},
{
"epoch": 1.7704918032786885,
"grad_norm": 0.18364070355892181,
"learning_rate": 8.765559744473053e-05,
"loss": 0.6019,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 108
},
{
"epoch": 1.7868852459016393,
"grad_norm": 0.17885719239711761,
"learning_rate": 8.57685161726715e-05,
"loss": 0.6081,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 109
},
{
"epoch": 1.8032786885245902,
"grad_norm": 0.17162950336933136,
"learning_rate": 8.38865939409136e-05,
"loss": 0.5941,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 110
},
{
"epoch": 1.819672131147541,
"grad_norm": 0.1798235923051834,
"learning_rate": 8.201051296311462e-05,
"loss": 0.5948,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 111
},
{
"epoch": 1.8360655737704918,
"grad_norm": 0.18231722712516785,
"learning_rate": 8.014095333542548e-05,
"loss": 0.5784,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 112
},
{
"epoch": 1.8524590163934427,
"grad_norm": 0.17540279030799866,
"learning_rate": 7.827859278994925e-05,
"loss": 0.593,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 113
},
{
"epoch": 1.8688524590163933,
"grad_norm": 0.17537765204906464,
"learning_rate": 7.642410644905726e-05,
"loss": 0.5902,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 114
},
{
"epoch": 1.8852459016393444,
"grad_norm": 0.17418642342090607,
"learning_rate": 7.457816658065134e-05,
"loss": 0.6039,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 115
},
{
"epoch": 1.901639344262295,
"grad_norm": 0.17102915048599243,
"learning_rate": 7.274144235446023e-05,
"loss": 0.5853,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 116
},
{
"epoch": 1.918032786885246,
"grad_norm": 0.1854507327079773,
"learning_rate": 7.09145995994601e-05,
"loss": 0.592,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 117
},
{
"epoch": 1.9344262295081966,
"grad_norm": 0.17946381866931915,
"learning_rate": 6.909830056250527e-05,
"loss": 0.5818,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 118
},
{
"epoch": 1.9508196721311475,
"grad_norm": 0.17315614223480225,
"learning_rate": 6.729320366825784e-05,
"loss": 0.5939,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 119
},
{
"epoch": 1.9672131147540983,
"grad_norm": 0.17754870653152466,
"learning_rate": 6.549996328050296e-05,
"loss": 0.5875,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 120
},
{
"epoch": 1.9836065573770492,
"grad_norm": 0.17100785672664642,
"learning_rate": 6.371922946493591e-05,
"loss": 0.5864,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 121
},
{
"epoch": 2.0,
"grad_norm": 0.1712140440940857,
"learning_rate": 6.19516477535077e-05,
"loss": 0.5808,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 122
},
{
"epoch": 2.0,
"eval_loss": 0.6171819567680359,
"eval_runtime": 15.4867,
"eval_samples_per_second": 52.82,
"eval_steps_per_second": 3.358,
"memory/device_reserved (GiB)": 73.37,
"memory/max_active (GiB)": 48.8,
"memory/max_allocated (GiB)": 48.8,
"step": 122
},
{
"epoch": 2.0163934426229506,
"grad_norm": 0.17188939452171326,
"learning_rate": 6.019785891041381e-05,
"loss": 0.5943,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 123
},
{
"epoch": 2.0327868852459017,
"grad_norm": 0.17433030903339386,
"learning_rate": 5.845849869981137e-05,
"loss": 0.5771,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 124
},
{
"epoch": 2.0491803278688523,
"grad_norm": 0.17646503448486328,
"learning_rate": 5.6734197655349156e-05,
"loss": 0.5889,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 125
},
{
"epoch": 2.0655737704918034,
"grad_norm": 0.18202351033687592,
"learning_rate": 5.5025580851593436e-05,
"loss": 0.5864,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 126
},
{
"epoch": 2.081967213114754,
"grad_norm": 0.17786148190498352,
"learning_rate": 5.333326767743263e-05,
"loss": 0.5869,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 127
},
{
"epoch": 2.098360655737705,
"grad_norm": 0.18139372766017914,
"learning_rate": 5.1657871611543605e-05,
"loss": 0.5852,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 128
},
{
"epoch": 2.1147540983606556,
"grad_norm": 0.17242832481861115,
"learning_rate": 5.000000000000002e-05,
"loss": 0.5842,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 129
},
{
"epoch": 2.1311475409836067,
"grad_norm": 0.1741384118795395,
"learning_rate": 4.836025383610382e-05,
"loss": 0.5798,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 130
},
{
"epoch": 2.1475409836065573,
"grad_norm": 0.18157395720481873,
"learning_rate": 4.673922754252002e-05,
"loss": 0.5794,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 131
},
{
"epoch": 2.1639344262295084,
"grad_norm": 0.18070365488529205,
"learning_rate": 4.513750875579303e-05,
"loss": 0.5807,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 132
},
{
"epoch": 2.180327868852459,
"grad_norm": 0.17728549242019653,
"learning_rate": 4.355567811332311e-05,
"loss": 0.5794,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 133
},
{
"epoch": 2.19672131147541,
"grad_norm": 0.17409607768058777,
"learning_rate": 4.19943090428802e-05,
"loss": 0.5805,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 134
},
{
"epoch": 2.2131147540983607,
"grad_norm": 0.17317527532577515,
"learning_rate": 4.045396755473121e-05,
"loss": 0.5761,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 135
},
{
"epoch": 2.2295081967213113,
"grad_norm": 0.18648019433021545,
"learning_rate": 3.893521203645618e-05,
"loss": 0.5706,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 136
},
{
"epoch": 2.2459016393442623,
"grad_norm": 0.1756281554698944,
"learning_rate": 3.7438593050527845e-05,
"loss": 0.5698,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 137
},
{
"epoch": 2.262295081967213,
"grad_norm": 0.17808155715465546,
"learning_rate": 3.5964653134727776e-05,
"loss": 0.5727,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 138
},
{
"epoch": 2.278688524590164,
"grad_norm": 0.2911638021469116,
"learning_rate": 3.45139266054715e-05,
"loss": 0.5782,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 139
},
{
"epoch": 2.2950819672131146,
"grad_norm": 0.18621869385242462,
"learning_rate": 3.308693936411421e-05,
"loss": 0.5746,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 140
},
{
"epoch": 2.3114754098360657,
"grad_norm": 0.18035250902175903,
"learning_rate": 3.1684208706306574e-05,
"loss": 0.5685,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 141
},
{
"epoch": 2.3278688524590163,
"grad_norm": 0.17170463502407074,
"learning_rate": 3.030624313447067e-05,
"loss": 0.5708,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 142
},
{
"epoch": 2.3442622950819674,
"grad_norm": 0.1746087372303009,
"learning_rate": 2.8953542173463133e-05,
"loss": 0.5706,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 143
},
{
"epoch": 2.360655737704918,
"grad_norm": 0.18402113020420074,
"learning_rate": 2.7626596189492983e-05,
"loss": 0.5712,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 144
},
{
"epoch": 2.3770491803278686,
"grad_norm": 0.1822223961353302,
"learning_rate": 2.6325886212359498e-05,
"loss": 0.5579,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 145
},
{
"epoch": 2.3934426229508197,
"grad_norm": 0.18002824485301971,
"learning_rate": 2.5051883761074614e-05,
"loss": 0.5693,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 146
},
{
"epoch": 2.4098360655737707,
"grad_norm": 0.17535454034805298,
"learning_rate": 2.3805050672932928e-05,
"loss": 0.5554,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 147
},
{
"epoch": 2.4262295081967213,
"grad_norm": 0.17632795870304108,
"learning_rate": 2.2585838936091754e-05,
"loss": 0.5584,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 148
},
{
"epoch": 2.442622950819672,
"grad_norm": 0.1735614687204361,
"learning_rate": 2.139469052572127e-05,
"loss": 0.5603,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 149
},
{
"epoch": 2.459016393442623,
"grad_norm": 0.17682839930057526,
"learning_rate": 2.0232037243784475e-05,
"loss": 0.5545,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 150
},
{
"epoch": 2.4754098360655736,
"grad_norm": 0.18232908844947815,
"learning_rate": 1.9098300562505266e-05,
"loss": 0.5619,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 151
},
{
"epoch": 2.4918032786885247,
"grad_norm": 0.1740136593580246,
"learning_rate": 1.7993891471580893e-05,
"loss": 0.5652,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 152
},
{
"epoch": 2.5081967213114753,
"grad_norm": 0.17907975614070892,
"learning_rate": 1.6919210329194533e-05,
"loss": 0.5675,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 153
},
{
"epoch": 2.5245901639344264,
"grad_norm": 0.17249107360839844,
"learning_rate": 1.587464671688187e-05,
"loss": 0.5646,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 154
},
{
"epoch": 2.540983606557377,
"grad_norm": 0.17240655422210693,
"learning_rate": 1.4860579298304312e-05,
"loss": 0.5706,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 155
},
{
"epoch": 2.557377049180328,
"grad_norm": 0.17784570157527924,
"learning_rate": 1.3877375681979943e-05,
"loss": 0.5641,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 156
},
{
"epoch": 2.5737704918032787,
"grad_norm": 0.17441287636756897,
"learning_rate": 1.2925392288022298e-05,
"loss": 0.5652,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 157
},
{
"epoch": 2.5901639344262293,
"grad_norm": 0.17398421466350555,
"learning_rate": 1.2004974218934695e-05,
"loss": 0.5731,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 158
},
{
"epoch": 2.6065573770491803,
"grad_norm": 0.1747160106897354,
"learning_rate": 1.1116455134507664e-05,
"loss": 0.5609,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 159
},
{
"epoch": 2.6229508196721314,
"grad_norm": 0.17699241638183594,
"learning_rate": 1.026015713086418e-05,
"loss": 0.5622,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 160
},
{
"epoch": 2.639344262295082,
"grad_norm": 0.17194418609142303,
"learning_rate": 9.436390623696911e-06,
"loss": 0.5603,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 161
},
{
"epoch": 2.6557377049180326,
"grad_norm": 0.1708948016166687,
"learning_rate": 8.645454235739903e-06,
"loss": 0.5606,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 162
},
{
"epoch": 2.6721311475409837,
"grad_norm": 0.17234554886817932,
"learning_rate": 7.887634688515e-06,
"loss": 0.5577,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 163
},
{
"epoch": 2.6885245901639343,
"grad_norm": 0.17192111909389496,
"learning_rate": 7.163206698392744e-06,
"loss": 0.5649,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 164
},
{
"epoch": 2.7049180327868854,
"grad_norm": 0.17286840081214905,
"learning_rate": 6.472432877005341e-06,
"loss": 0.5534,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 165
},
{
"epoch": 2.721311475409836,
"grad_norm": 0.1688247174024582,
"learning_rate": 5.8155636360475385e-06,
"loss": 0.5613,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 166
},
{
"epoch": 2.737704918032787,
"grad_norm": 0.168919175863266,
"learning_rate": 5.192837096500058e-06,
"loss": 0.5501,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 167
},
{
"epoch": 2.7540983606557377,
"grad_norm": 0.17062221467494965,
"learning_rate": 4.604479002308737e-06,
"loss": 0.5569,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 168
},
{
"epoch": 2.7704918032786887,
"grad_norm": 0.1695357859134674,
"learning_rate": 4.050702638550275e-06,
"loss": 0.563,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 169
},
{
"epoch": 2.7868852459016393,
"grad_norm": 0.1687244176864624,
"learning_rate": 3.5317087541144377e-06,
"loss": 0.5703,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 170
},
{
"epoch": 2.80327868852459,
"grad_norm": 0.1708020716905594,
"learning_rate": 3.047685488930874e-06,
"loss": 0.558,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 171
},
{
"epoch": 2.819672131147541,
"grad_norm": 0.17512136697769165,
"learning_rate": 2.5988083057666533e-06,
"loss": 0.5588,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 172
},
{
"epoch": 2.836065573770492,
"grad_norm": 0.16872957348823547,
"learning_rate": 2.1852399266194314e-06,
"loss": 0.544,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 173
},
{
"epoch": 2.8524590163934427,
"grad_norm": 0.16911788284778595,
"learning_rate": 1.8071302737293295e-06,
"loss": 0.5596,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 174
},
{
"epoch": 2.8688524590163933,
"grad_norm": 0.17218004167079926,
"learning_rate": 1.4646164152307018e-06,
"loss": 0.5569,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 175
},
{
"epoch": 2.8852459016393444,
"grad_norm": 0.17117170989513397,
"learning_rate": 1.157822515463758e-06,
"loss": 0.5707,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 176
},
{
"epoch": 2.901639344262295,
"grad_norm": 0.1660652756690979,
"learning_rate": 8.868597899638898e-07,
"loss": 0.5546,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 177
},
{
"epoch": 2.918032786885246,
"grad_norm": 0.5194974541664124,
"learning_rate": 6.518264651449779e-07,
"loss": 0.5608,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 178
},
{
"epoch": 2.9344262295081966,
"grad_norm": 0.1674128770828247,
"learning_rate": 4.5280774269154115e-07,
"loss": 0.5513,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 179
},
{
"epoch": 2.9508196721311473,
"grad_norm": 0.1717360019683838,
"learning_rate": 2.898757686722542e-07,
"loss": 0.5644,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 180
},
{
"epoch": 2.9672131147540983,
"grad_norm": 0.1789117455482483,
"learning_rate": 1.630896073864352e-07,
"loss": 0.5587,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 181
},
{
"epoch": 2.9836065573770494,
"grad_norm": 0.16960012912750244,
"learning_rate": 7.249521995263964e-08,
"loss": 0.5582,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 182
},
{
"epoch": 3.0,
"grad_norm": 0.16996316611766815,
"learning_rate": 1.81254476474213e-08,
"loss": 0.5541,
"memory/device_reserved (GiB)": 33.36,
"memory/max_active (GiB)": 32.7,
"memory/max_allocated (GiB)": 32.7,
"step": 183
},
{
"epoch": 3.0,
"eval_loss": 0.6080317497253418,
"eval_runtime": 15.4278,
"eval_samples_per_second": 53.021,
"eval_steps_per_second": 3.371,
"memory/device_reserved (GiB)": 73.37,
"memory/max_active (GiB)": 48.8,
"memory/max_allocated (GiB)": 48.8,
"step": 183
}
],
"logging_steps": 1,
"max_steps": 183,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 61,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.760779115062362e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}