Deepseek-MoE-T / trainer_state.json
EricZhang1412's picture
Upload folder using huggingface_hub
6e3639b verified
{
"best_metric": 1.6074212789535522,
"best_model_checkpoint": "./deepseek-MoE-lora/checkpoint-1000",
"epoch": 1.3736263736263736,
"eval_steps": 100,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013736263736263736,
"grad_norm": 29.239500045776367,
"learning_rate": 1.1441647597254004e-06,
"loss": 9.2221,
"mean_token_accuracy": 0.12948232870548965,
"num_tokens": 121829.0,
"step": 10
},
{
"epoch": 0.027472527472527472,
"grad_norm": 30.485370635986328,
"learning_rate": 2.288329519450801e-06,
"loss": 8.9209,
"mean_token_accuracy": 0.13155823573470116,
"num_tokens": 232251.0,
"step": 20
},
{
"epoch": 0.04120879120879121,
"grad_norm": 26.796140670776367,
"learning_rate": 3.4324942791762018e-06,
"loss": 8.6267,
"mean_token_accuracy": 0.1354693438857794,
"num_tokens": 349170.0,
"step": 30
},
{
"epoch": 0.054945054945054944,
"grad_norm": 26.021697998046875,
"learning_rate": 4.576659038901602e-06,
"loss": 8.4408,
"mean_token_accuracy": 0.13846412636339664,
"num_tokens": 459909.0,
"step": 40
},
{
"epoch": 0.06868131868131869,
"grad_norm": 29.667829513549805,
"learning_rate": 5.720823798627003e-06,
"loss": 8.2092,
"mean_token_accuracy": 0.13972207996994257,
"num_tokens": 570165.0,
"step": 50
},
{
"epoch": 0.08241758241758242,
"grad_norm": 21.387392044067383,
"learning_rate": 6.8649885583524035e-06,
"loss": 7.3725,
"mean_token_accuracy": 0.15276159830391406,
"num_tokens": 684371.0,
"step": 60
},
{
"epoch": 0.09615384615384616,
"grad_norm": 22.36768913269043,
"learning_rate": 8.009153318077805e-06,
"loss": 6.6599,
"mean_token_accuracy": 0.16514867953956128,
"num_tokens": 799113.0,
"step": 70
},
{
"epoch": 0.10989010989010989,
"grad_norm": 12.144390106201172,
"learning_rate": 9.153318077803204e-06,
"loss": 5.8577,
"mean_token_accuracy": 0.18936714343726635,
"num_tokens": 919909.0,
"step": 80
},
{
"epoch": 0.12362637362637363,
"grad_norm": 5.9125447273254395,
"learning_rate": 1.0297482837528604e-05,
"loss": 5.3595,
"mean_token_accuracy": 0.209349187836051,
"num_tokens": 1032496.0,
"step": 90
},
{
"epoch": 0.13736263736263737,
"grad_norm": 4.612669944763184,
"learning_rate": 1.1441647597254006e-05,
"loss": 5.1678,
"step": 100
},
{
"epoch": 0.13736263736263737,
"eval_loss": 4.984684944152832,
"eval_mean_token_accuracy": 0.2370274927881029,
"eval_num_tokens": 1141793.0,
"eval_runtime": 311.0531,
"eval_samples_per_second": 16.64,
"eval_steps_per_second": 1.042,
"step": 100
},
{
"epoch": 0.1510989010989011,
"grad_norm": 4.400078296661377,
"learning_rate": 1.2585812356979407e-05,
"loss": 4.8803,
"mean_token_accuracy": 0.23492281436920165,
"num_tokens": 1254329.0,
"step": 110
},
{
"epoch": 0.16483516483516483,
"grad_norm": 4.174052715301514,
"learning_rate": 1.3729977116704807e-05,
"loss": 4.5865,
"mean_token_accuracy": 0.2669687133282423,
"num_tokens": 1373912.0,
"step": 120
},
{
"epoch": 0.17857142857142858,
"grad_norm": 4.239316463470459,
"learning_rate": 1.4874141876430206e-05,
"loss": 4.3418,
"mean_token_accuracy": 0.29036078676581384,
"num_tokens": 1491018.0,
"step": 130
},
{
"epoch": 0.19230769230769232,
"grad_norm": 3.802030563354492,
"learning_rate": 1.601830663615561e-05,
"loss": 4.0019,
"mean_token_accuracy": 0.3230995647609234,
"num_tokens": 1604500.0,
"step": 140
},
{
"epoch": 0.20604395604395603,
"grad_norm": 3.057298183441162,
"learning_rate": 1.716247139588101e-05,
"loss": 3.7445,
"mean_token_accuracy": 0.3446956820785999,
"num_tokens": 1723869.0,
"step": 150
},
{
"epoch": 0.21978021978021978,
"grad_norm": 2.6854259967803955,
"learning_rate": 1.8306636155606407e-05,
"loss": 3.4938,
"mean_token_accuracy": 0.3709439463913441,
"num_tokens": 1840593.0,
"step": 160
},
{
"epoch": 0.23351648351648352,
"grad_norm": 2.322840452194214,
"learning_rate": 1.945080091533181e-05,
"loss": 3.2394,
"mean_token_accuracy": 0.3960001617670059,
"num_tokens": 1948634.0,
"step": 170
},
{
"epoch": 0.24725274725274726,
"grad_norm": 1.996971607208252,
"learning_rate": 2.0594965675057208e-05,
"loss": 3.0746,
"mean_token_accuracy": 0.41375042870640755,
"num_tokens": 2058974.0,
"step": 180
},
{
"epoch": 0.260989010989011,
"grad_norm": 1.6177858114242554,
"learning_rate": 2.173913043478261e-05,
"loss": 2.8488,
"mean_token_accuracy": 0.4406682662665844,
"num_tokens": 2180905.0,
"step": 190
},
{
"epoch": 0.27472527472527475,
"grad_norm": 1.3668256998062134,
"learning_rate": 2.2883295194508012e-05,
"loss": 2.7354,
"step": 200
},
{
"epoch": 0.27472527472527475,
"eval_loss": 2.6662864685058594,
"eval_mean_token_accuracy": 0.4655627465726417,
"eval_num_tokens": 2297024.0,
"eval_runtime": 310.9656,
"eval_samples_per_second": 16.645,
"eval_steps_per_second": 1.042,
"step": 200
},
{
"epoch": 0.28846153846153844,
"grad_norm": 1.2411565780639648,
"learning_rate": 2.402745995423341e-05,
"loss": 2.5765,
"mean_token_accuracy": 0.4659102514386177,
"num_tokens": 2415020.0,
"step": 210
},
{
"epoch": 0.3021978021978022,
"grad_norm": 1.1622848510742188,
"learning_rate": 2.5171624713958813e-05,
"loss": 2.522,
"mean_token_accuracy": 0.4841405004262924,
"num_tokens": 2530662.0,
"step": 220
},
{
"epoch": 0.3159340659340659,
"grad_norm": 1.0268805027008057,
"learning_rate": 2.6315789473684212e-05,
"loss": 2.4053,
"mean_token_accuracy": 0.49754476696252825,
"num_tokens": 2644661.0,
"step": 230
},
{
"epoch": 0.32967032967032966,
"grad_norm": 1.050316572189331,
"learning_rate": 2.7459954233409614e-05,
"loss": 2.3136,
"mean_token_accuracy": 0.5062366120517254,
"num_tokens": 2764206.0,
"step": 240
},
{
"epoch": 0.3434065934065934,
"grad_norm": 0.974026083946228,
"learning_rate": 2.8604118993135016e-05,
"loss": 2.2151,
"mean_token_accuracy": 0.5198402456939221,
"num_tokens": 2879504.0,
"step": 250
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.95684415102005,
"learning_rate": 2.974828375286041e-05,
"loss": 2.192,
"mean_token_accuracy": 0.523027028143406,
"num_tokens": 2994634.0,
"step": 260
},
{
"epoch": 0.3708791208791209,
"grad_norm": 1.0790362358093262,
"learning_rate": 3.089244851258582e-05,
"loss": 2.0856,
"mean_token_accuracy": 0.537685776501894,
"num_tokens": 3110559.0,
"step": 270
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.8086666464805603,
"learning_rate": 3.203661327231122e-05,
"loss": 2.0586,
"mean_token_accuracy": 0.54344697073102,
"num_tokens": 3224576.0,
"step": 280
},
{
"epoch": 0.3983516483516483,
"grad_norm": 0.8853408694267273,
"learning_rate": 3.3180778032036615e-05,
"loss": 1.9996,
"mean_token_accuracy": 0.5515147194266319,
"num_tokens": 3341645.0,
"step": 290
},
{
"epoch": 0.41208791208791207,
"grad_norm": 0.9164928793907166,
"learning_rate": 3.432494279176202e-05,
"loss": 1.98,
"step": 300
},
{
"epoch": 0.41208791208791207,
"eval_loss": 1.9627922773361206,
"eval_mean_token_accuracy": 0.5530768693597229,
"eval_num_tokens": 3452305.0,
"eval_runtime": 326.8485,
"eval_samples_per_second": 15.836,
"eval_steps_per_second": 0.991,
"step": 300
},
{
"epoch": 0.4258241758241758,
"grad_norm": 0.91264408826828,
"learning_rate": 3.546910755148742e-05,
"loss": 1.9447,
"mean_token_accuracy": 0.5537945911288261,
"num_tokens": 3569623.0,
"step": 310
},
{
"epoch": 0.43956043956043955,
"grad_norm": 0.77753084897995,
"learning_rate": 3.6613272311212814e-05,
"loss": 1.8766,
"mean_token_accuracy": 0.5657263264060021,
"num_tokens": 3686940.0,
"step": 320
},
{
"epoch": 0.4532967032967033,
"grad_norm": 1.0137959718704224,
"learning_rate": 3.7757437070938216e-05,
"loss": 1.8847,
"mean_token_accuracy": 0.5627776235342026,
"num_tokens": 3802302.0,
"step": 330
},
{
"epoch": 0.46703296703296704,
"grad_norm": 0.8629505038261414,
"learning_rate": 3.890160183066362e-05,
"loss": 1.8672,
"mean_token_accuracy": 0.5701652288436889,
"num_tokens": 3915449.0,
"step": 340
},
{
"epoch": 0.4807692307692308,
"grad_norm": 0.916632354259491,
"learning_rate": 4.0045766590389014e-05,
"loss": 1.8513,
"mean_token_accuracy": 0.5717276155948638,
"num_tokens": 4032982.0,
"step": 350
},
{
"epoch": 0.4945054945054945,
"grad_norm": 0.9153818488121033,
"learning_rate": 4.1189931350114416e-05,
"loss": 1.8632,
"mean_token_accuracy": 0.5680810511112213,
"num_tokens": 4144430.0,
"step": 360
},
{
"epoch": 0.5082417582417582,
"grad_norm": 0.9300126433372498,
"learning_rate": 4.233409610983982e-05,
"loss": 1.7932,
"mean_token_accuracy": 0.5819852232933045,
"num_tokens": 4255862.0,
"step": 370
},
{
"epoch": 0.521978021978022,
"grad_norm": 1.1194523572921753,
"learning_rate": 4.347826086956522e-05,
"loss": 1.8185,
"mean_token_accuracy": 0.5755217462778092,
"num_tokens": 4364858.0,
"step": 380
},
{
"epoch": 0.5357142857142857,
"grad_norm": 0.8476864099502563,
"learning_rate": 4.462242562929062e-05,
"loss": 1.7662,
"mean_token_accuracy": 0.5817034646868706,
"num_tokens": 4478176.0,
"step": 390
},
{
"epoch": 0.5494505494505495,
"grad_norm": 0.9283179640769958,
"learning_rate": 4.5766590389016025e-05,
"loss": 1.801,
"step": 400
},
{
"epoch": 0.5494505494505495,
"eval_loss": 1.7755845785140991,
"eval_mean_token_accuracy": 0.5817039539048701,
"eval_num_tokens": 4596634.0,
"eval_runtime": 316.8533,
"eval_samples_per_second": 16.336,
"eval_steps_per_second": 1.023,
"step": 400
},
{
"epoch": 0.5631868131868132,
"grad_norm": 1.0799726247787476,
"learning_rate": 4.691075514874142e-05,
"loss": 1.7898,
"mean_token_accuracy": 0.5777772672474384,
"num_tokens": 4708369.0,
"step": 410
},
{
"epoch": 0.5769230769230769,
"grad_norm": 0.7919142246246338,
"learning_rate": 4.805491990846682e-05,
"loss": 1.7842,
"mean_token_accuracy": 0.5787179872393609,
"num_tokens": 4827961.0,
"step": 420
},
{
"epoch": 0.5906593406593407,
"grad_norm": 0.9103861451148987,
"learning_rate": 4.9199084668192224e-05,
"loss": 1.7741,
"mean_token_accuracy": 0.5839256420731544,
"num_tokens": 4938283.0,
"step": 430
},
{
"epoch": 0.6043956043956044,
"grad_norm": 0.864320695400238,
"learning_rate": 4.998937902711889e-05,
"loss": 1.7477,
"mean_token_accuracy": 0.5855789959430695,
"num_tokens": 5049612.0,
"step": 440
},
{
"epoch": 0.6181318681318682,
"grad_norm": 1.1230181455612183,
"learning_rate": 4.995397578418183e-05,
"loss": 1.7505,
"mean_token_accuracy": 0.5864449262619018,
"num_tokens": 5164775.0,
"step": 450
},
{
"epoch": 0.6318681318681318,
"grad_norm": 0.9053744673728943,
"learning_rate": 4.991857254124478e-05,
"loss": 1.7381,
"mean_token_accuracy": 0.5900515034794808,
"num_tokens": 5278039.0,
"step": 460
},
{
"epoch": 0.6456043956043956,
"grad_norm": 1.0245819091796875,
"learning_rate": 4.9883169298307723e-05,
"loss": 1.7427,
"mean_token_accuracy": 0.5880876764655113,
"num_tokens": 5390124.0,
"step": 470
},
{
"epoch": 0.6593406593406593,
"grad_norm": 0.8946035504341125,
"learning_rate": 4.9847766055370674e-05,
"loss": 1.71,
"mean_token_accuracy": 0.5933957740664482,
"num_tokens": 5497492.0,
"step": 480
},
{
"epoch": 0.6730769230769231,
"grad_norm": 0.8355729579925537,
"learning_rate": 4.9812362812433624e-05,
"loss": 1.6985,
"mean_token_accuracy": 0.5948651686310769,
"num_tokens": 5613483.0,
"step": 490
},
{
"epoch": 0.6868131868131868,
"grad_norm": 0.9272288680076599,
"learning_rate": 4.977695956949657e-05,
"loss": 1.7286,
"step": 500
},
{
"epoch": 0.6868131868131868,
"eval_loss": 1.7027286291122437,
"eval_mean_token_accuracy": 0.593273350302084,
"eval_num_tokens": 5729153.0,
"eval_runtime": 311.0607,
"eval_samples_per_second": 16.64,
"eval_steps_per_second": 1.042,
"step": 500
},
{
"epoch": 0.7005494505494505,
"grad_norm": 0.9919455051422119,
"learning_rate": 4.974155632655951e-05,
"loss": 1.7034,
"mean_token_accuracy": 0.5900716975331306,
"num_tokens": 5840054.0,
"step": 510
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.9207066893577576,
"learning_rate": 4.970615308362247e-05,
"loss": 1.6816,
"mean_token_accuracy": 0.5953675732016563,
"num_tokens": 5951050.0,
"step": 520
},
{
"epoch": 0.728021978021978,
"grad_norm": 1.035968542098999,
"learning_rate": 4.967074984068541e-05,
"loss": 1.6997,
"mean_token_accuracy": 0.5936009138822556,
"num_tokens": 6059659.0,
"step": 530
},
{
"epoch": 0.7417582417582418,
"grad_norm": 0.9535728096961975,
"learning_rate": 4.9635346597748354e-05,
"loss": 1.6733,
"mean_token_accuracy": 0.597626781463623,
"num_tokens": 6166152.0,
"step": 540
},
{
"epoch": 0.7554945054945055,
"grad_norm": 0.9060805439949036,
"learning_rate": 4.9599943354811304e-05,
"loss": 1.6729,
"mean_token_accuracy": 0.59917561262846,
"num_tokens": 6281975.0,
"step": 550
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.9403394460678101,
"learning_rate": 4.9564540111874254e-05,
"loss": 1.6765,
"mean_token_accuracy": 0.5959734156727791,
"num_tokens": 6393517.0,
"step": 560
},
{
"epoch": 0.782967032967033,
"grad_norm": 1.018798589706421,
"learning_rate": 4.95291368689372e-05,
"loss": 1.6512,
"mean_token_accuracy": 0.6024029836058616,
"num_tokens": 6512243.0,
"step": 570
},
{
"epoch": 0.7967032967032966,
"grad_norm": 0.9490474462509155,
"learning_rate": 4.949373362600014e-05,
"loss": 1.7011,
"mean_token_accuracy": 0.592784532904625,
"num_tokens": 6627920.0,
"step": 580
},
{
"epoch": 0.8104395604395604,
"grad_norm": 0.9688321948051453,
"learning_rate": 4.945833038306309e-05,
"loss": 1.6464,
"mean_token_accuracy": 0.6033695384860038,
"num_tokens": 6739972.0,
"step": 590
},
{
"epoch": 0.8241758241758241,
"grad_norm": 1.112197756767273,
"learning_rate": 4.9422927140126034e-05,
"loss": 1.6757,
"step": 600
},
{
"epoch": 0.8241758241758241,
"eval_loss": 1.6667677164077759,
"eval_mean_token_accuracy": 0.5988459940309878,
"eval_num_tokens": 6859022.0,
"eval_runtime": 311.0117,
"eval_samples_per_second": 16.642,
"eval_steps_per_second": 1.042,
"step": 600
},
{
"epoch": 0.8379120879120879,
"grad_norm": 0.8909268379211426,
"learning_rate": 4.9387523897188984e-05,
"loss": 1.6889,
"mean_token_accuracy": 0.5944475166499614,
"num_tokens": 6970961.0,
"step": 610
},
{
"epoch": 0.8516483516483516,
"grad_norm": 0.9639058709144592,
"learning_rate": 4.9352120654251934e-05,
"loss": 1.6563,
"mean_token_accuracy": 0.6006275966763497,
"num_tokens": 7081524.0,
"step": 620
},
{
"epoch": 0.8653846153846154,
"grad_norm": 1.0538073778152466,
"learning_rate": 4.931671741131488e-05,
"loss": 1.645,
"mean_token_accuracy": 0.6039176091551781,
"num_tokens": 7191486.0,
"step": 630
},
{
"epoch": 0.8791208791208791,
"grad_norm": 1.0572949647903442,
"learning_rate": 4.928131416837782e-05,
"loss": 1.6553,
"mean_token_accuracy": 0.6014241844415664,
"num_tokens": 7306262.0,
"step": 640
},
{
"epoch": 0.8928571428571429,
"grad_norm": 1.0915870666503906,
"learning_rate": 4.924591092544078e-05,
"loss": 1.6737,
"mean_token_accuracy": 0.5958378210663795,
"num_tokens": 7418990.0,
"step": 650
},
{
"epoch": 0.9065934065934066,
"grad_norm": 0.9485398530960083,
"learning_rate": 4.921050768250372e-05,
"loss": 1.6078,
"mean_token_accuracy": 0.6094794437289238,
"num_tokens": 7537671.0,
"step": 660
},
{
"epoch": 0.9203296703296703,
"grad_norm": 1.1498547792434692,
"learning_rate": 4.9175104439566664e-05,
"loss": 1.6356,
"mean_token_accuracy": 0.6057997912168502,
"num_tokens": 7651191.0,
"step": 670
},
{
"epoch": 0.9340659340659341,
"grad_norm": 1.0218619108200073,
"learning_rate": 4.9139701196629614e-05,
"loss": 1.6491,
"mean_token_accuracy": 0.6027492165565491,
"num_tokens": 7769287.0,
"step": 680
},
{
"epoch": 0.9478021978021978,
"grad_norm": 1.183112621307373,
"learning_rate": 4.9104297953692564e-05,
"loss": 1.617,
"mean_token_accuracy": 0.6064566567540168,
"num_tokens": 7886626.0,
"step": 690
},
{
"epoch": 0.9615384615384616,
"grad_norm": 1.1547908782958984,
"learning_rate": 4.906889471075551e-05,
"loss": 1.6585,
"step": 700
},
{
"epoch": 0.9615384615384616,
"eval_loss": 1.6455810070037842,
"eval_mean_token_accuracy": 0.6022681238842599,
"eval_num_tokens": 7995973.0,
"eval_runtime": 310.453,
"eval_samples_per_second": 16.672,
"eval_steps_per_second": 1.044,
"step": 700
},
{
"epoch": 0.9752747252747253,
"grad_norm": 0.961618959903717,
"learning_rate": 4.903349146781845e-05,
"loss": 1.6406,
"mean_token_accuracy": 0.6022139228880405,
"num_tokens": 8109073.0,
"step": 710
},
{
"epoch": 0.989010989010989,
"grad_norm": 1.27222740650177,
"learning_rate": 4.89980882248814e-05,
"loss": 1.6678,
"mean_token_accuracy": 0.5999792292714119,
"num_tokens": 8222282.0,
"step": 720
},
{
"epoch": 1.0027472527472527,
"grad_norm": 1.0560694932937622,
"learning_rate": 4.896268498194435e-05,
"loss": 1.638,
"mean_token_accuracy": 0.6034953713417053,
"num_tokens": 8335446.0,
"step": 730
},
{
"epoch": 1.0164835164835164,
"grad_norm": 1.034684419631958,
"learning_rate": 4.8927281739007295e-05,
"loss": 1.621,
"mean_token_accuracy": 0.6036525964736938,
"num_tokens": 8450623.0,
"step": 740
},
{
"epoch": 1.0302197802197801,
"grad_norm": 1.0682315826416016,
"learning_rate": 4.8891878496070245e-05,
"loss": 1.6376,
"mean_token_accuracy": 0.6037934944033623,
"num_tokens": 8561023.0,
"step": 750
},
{
"epoch": 1.043956043956044,
"grad_norm": 1.2298721075057983,
"learning_rate": 4.885647525313319e-05,
"loss": 1.6339,
"mean_token_accuracy": 0.6060884281992912,
"num_tokens": 8675222.0,
"step": 760
},
{
"epoch": 1.0576923076923077,
"grad_norm": 1.0562454462051392,
"learning_rate": 4.882107201019613e-05,
"loss": 1.6333,
"mean_token_accuracy": 0.6048480987548828,
"num_tokens": 8784755.0,
"step": 770
},
{
"epoch": 1.0714285714285714,
"grad_norm": 0.9854257702827454,
"learning_rate": 4.878566876725909e-05,
"loss": 1.5999,
"mean_token_accuracy": 0.6091530665755271,
"num_tokens": 8900580.0,
"step": 780
},
{
"epoch": 1.085164835164835,
"grad_norm": 1.1997156143188477,
"learning_rate": 4.875026552432203e-05,
"loss": 1.6677,
"mean_token_accuracy": 0.5998320803046227,
"num_tokens": 9016189.0,
"step": 790
},
{
"epoch": 1.098901098901099,
"grad_norm": 1.0334490537643433,
"learning_rate": 4.8714862281384975e-05,
"loss": 1.6272,
"step": 800
},
{
"epoch": 1.098901098901099,
"eval_loss": 1.6325246095657349,
"eval_mean_token_accuracy": 0.6042487479654359,
"eval_num_tokens": 9132964.0,
"eval_runtime": 310.2316,
"eval_samples_per_second": 16.684,
"eval_steps_per_second": 1.044,
"step": 800
},
{
"epoch": 1.1126373626373627,
"grad_norm": 1.03342604637146,
"learning_rate": 4.8679459038447925e-05,
"loss": 1.6429,
"mean_token_accuracy": 0.6036744929850102,
"num_tokens": 9249801.0,
"step": 810
},
{
"epoch": 1.1263736263736264,
"grad_norm": 1.1043306589126587,
"learning_rate": 4.8644055795510875e-05,
"loss": 1.6237,
"mean_token_accuracy": 0.6055251255631446,
"num_tokens": 9365589.0,
"step": 820
},
{
"epoch": 1.14010989010989,
"grad_norm": 1.131428837776184,
"learning_rate": 4.860865255257382e-05,
"loss": 1.6363,
"mean_token_accuracy": 0.6017774865031242,
"num_tokens": 9475982.0,
"step": 830
},
{
"epoch": 1.1538461538461537,
"grad_norm": 0.9910945892333984,
"learning_rate": 4.857324930963676e-05,
"loss": 1.6104,
"mean_token_accuracy": 0.6087021440267563,
"num_tokens": 9592179.0,
"step": 840
},
{
"epoch": 1.1675824175824177,
"grad_norm": 1.1637762784957886,
"learning_rate": 4.853784606669971e-05,
"loss": 1.642,
"mean_token_accuracy": 0.6020917922258378,
"num_tokens": 9708684.0,
"step": 850
},
{
"epoch": 1.1813186813186813,
"grad_norm": 1.1667078733444214,
"learning_rate": 4.850244282376266e-05,
"loss": 1.6368,
"mean_token_accuracy": 0.6061318039894104,
"num_tokens": 9822155.0,
"step": 860
},
{
"epoch": 1.195054945054945,
"grad_norm": 1.1195344924926758,
"learning_rate": 4.8467039580825605e-05,
"loss": 1.5997,
"mean_token_accuracy": 0.6092845112085342,
"num_tokens": 9935041.0,
"step": 870
},
{
"epoch": 1.2087912087912087,
"grad_norm": 1.0910887718200684,
"learning_rate": 4.8431636337888555e-05,
"loss": 1.6381,
"mean_token_accuracy": 0.6027830302715301,
"num_tokens": 10051143.0,
"step": 880
},
{
"epoch": 1.2225274725274726,
"grad_norm": 1.171608805656433,
"learning_rate": 4.83962330949515e-05,
"loss": 1.604,
"mean_token_accuracy": 0.6083497762680053,
"num_tokens": 10164538.0,
"step": 890
},
{
"epoch": 1.2362637362637363,
"grad_norm": 1.1128859519958496,
"learning_rate": 4.836082985201444e-05,
"loss": 1.6191,
"step": 900
},
{
"epoch": 1.2362637362637363,
"eval_loss": 1.6174854040145874,
"eval_mean_token_accuracy": 0.605991005713557,
"eval_num_tokens": 10279282.0,
"eval_runtime": 310.7449,
"eval_samples_per_second": 16.657,
"eval_steps_per_second": 1.043,
"step": 900
},
{
"epoch": 1.25,
"grad_norm": 1.0883232355117798,
"learning_rate": 4.83254266090774e-05,
"loss": 1.5946,
"mean_token_accuracy": 0.6100020207464695,
"num_tokens": 10387510.0,
"step": 910
},
{
"epoch": 1.2637362637362637,
"grad_norm": 1.3262733221054077,
"learning_rate": 4.829002336614034e-05,
"loss": 1.6276,
"mean_token_accuracy": 0.606501379609108,
"num_tokens": 10499682.0,
"step": 920
},
{
"epoch": 1.2774725274725274,
"grad_norm": 1.113751769065857,
"learning_rate": 4.8254620123203285e-05,
"loss": 1.5819,
"mean_token_accuracy": 0.6106113165616989,
"num_tokens": 10618406.0,
"step": 930
},
{
"epoch": 1.2912087912087913,
"grad_norm": 1.1370590925216675,
"learning_rate": 4.8219216880266235e-05,
"loss": 1.6192,
"mean_token_accuracy": 0.6069919407367707,
"num_tokens": 10737112.0,
"step": 940
},
{
"epoch": 1.304945054945055,
"grad_norm": 1.2001625299453735,
"learning_rate": 4.8183813637329185e-05,
"loss": 1.5989,
"mean_token_accuracy": 0.6121641129255295,
"num_tokens": 10855000.0,
"step": 950
},
{
"epoch": 1.3186813186813187,
"grad_norm": 1.2227309942245483,
"learning_rate": 4.814841039439213e-05,
"loss": 1.6093,
"mean_token_accuracy": 0.6081742540001869,
"num_tokens": 10969157.0,
"step": 960
},
{
"epoch": 1.3324175824175823,
"grad_norm": 1.0985106229782104,
"learning_rate": 4.811300715145507e-05,
"loss": 1.6226,
"mean_token_accuracy": 0.6075394690036774,
"num_tokens": 11084095.0,
"step": 970
},
{
"epoch": 1.3461538461538463,
"grad_norm": 1.1740081310272217,
"learning_rate": 4.807760390851802e-05,
"loss": 1.5927,
"mean_token_accuracy": 0.6087717518210412,
"num_tokens": 11200093.0,
"step": 980
},
{
"epoch": 1.35989010989011,
"grad_norm": 1.2875778675079346,
"learning_rate": 4.804220066558097e-05,
"loss": 1.6199,
"mean_token_accuracy": 0.6059487432241439,
"num_tokens": 11310360.0,
"step": 990
},
{
"epoch": 1.3736263736263736,
"grad_norm": 1.12953519821167,
"learning_rate": 4.8006797422643916e-05,
"loss": 1.5867,
"step": 1000
},
{
"epoch": 1.3736263736263736,
"eval_loss": 1.6074212789535522,
"eval_mean_token_accuracy": 0.6077834514924038,
"eval_num_tokens": 11427781.0,
"eval_runtime": 309.2704,
"eval_samples_per_second": 16.736,
"eval_steps_per_second": 1.048,
"step": 1000
}
],
"logging_steps": 10,
"max_steps": 14560,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.451849537677951e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}