arpandeepk's picture
Upload folder using huggingface_hub
ddea42e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.643880926130099,
"eval_steps": 100,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.208277304470539,
"epoch": 0.044101433296582136,
"grad_norm": 0.7034044861793518,
"learning_rate": 3.6e-05,
"loss": 1.8075,
"mean_token_accuracy": 0.7044319450855255,
"num_tokens": 162027.0,
"step": 10
},
{
"entropy": 1.2619145691394806,
"epoch": 0.08820286659316427,
"grad_norm": 0.56658935546875,
"learning_rate": 7.6e-05,
"loss": 1.7395,
"mean_token_accuracy": 0.7111444145441055,
"num_tokens": 322033.0,
"step": 20
},
{
"entropy": 1.3051191717386246,
"epoch": 0.13230429988974643,
"grad_norm": 0.9701246023178101,
"learning_rate": 0.000116,
"loss": 1.4759,
"mean_token_accuracy": 0.7372552484273911,
"num_tokens": 493134.0,
"step": 30
},
{
"entropy": 1.2549610823392867,
"epoch": 0.17640573318632854,
"grad_norm": 0.7130260467529297,
"learning_rate": 0.00015600000000000002,
"loss": 1.1858,
"mean_token_accuracy": 0.7737368658185005,
"num_tokens": 657636.0,
"step": 40
},
{
"entropy": 1.0649129822850227,
"epoch": 0.2205071664829107,
"grad_norm": 0.31629839539527893,
"learning_rate": 0.000196,
"loss": 1.0158,
"mean_token_accuracy": 0.7959408611059189,
"num_tokens": 809381.0,
"step": 50
},
{
"entropy": 0.9025423139333725,
"epoch": 0.26460859977949286,
"grad_norm": 0.2634122669696808,
"learning_rate": 0.0001971473851030111,
"loss": 0.8932,
"mean_token_accuracy": 0.8144056230783463,
"num_tokens": 970344.0,
"step": 60
},
{
"entropy": 0.8994027122855186,
"epoch": 0.308710033076075,
"grad_norm": 0.29325830936431885,
"learning_rate": 0.00019397781299524563,
"loss": 0.8828,
"mean_token_accuracy": 0.8157614529132843,
"num_tokens": 1129434.0,
"step": 70
},
{
"entropy": 0.8638058304786682,
"epoch": 0.3528114663726571,
"grad_norm": 0.22787100076675415,
"learning_rate": 0.0001908082408874802,
"loss": 0.8518,
"mean_token_accuracy": 0.8206798136234283,
"num_tokens": 1290818.0,
"step": 80
},
{
"entropy": 0.8639321938157082,
"epoch": 0.39691289966923926,
"grad_norm": 0.27325868606567383,
"learning_rate": 0.00018763866877971475,
"loss": 0.8633,
"mean_token_accuracy": 0.81620042771101,
"num_tokens": 1452401.0,
"step": 90
},
{
"entropy": 0.8432708650827407,
"epoch": 0.4410143329658214,
"grad_norm": 0.29134032130241394,
"learning_rate": 0.0001844690966719493,
"loss": 0.8225,
"mean_token_accuracy": 0.8236450552940369,
"num_tokens": 1616085.0,
"step": 100
},
{
"epoch": 0.4410143329658214,
"eval_entropy": 0.7245843861952866,
"eval_loss": 0.6365505456924438,
"eval_mean_token_accuracy": 0.8594882246291283,
"eval_num_tokens": 1616085.0,
"eval_runtime": 8.1048,
"eval_samples_per_second": 99.447,
"eval_steps_per_second": 12.462,
"step": 100
},
{
"entropy": 0.7841475278139114,
"epoch": 0.48511576626240355,
"grad_norm": 0.25806924700737,
"learning_rate": 0.00018129952456418384,
"loss": 0.7675,
"mean_token_accuracy": 0.8291461855173111,
"num_tokens": 1780700.0,
"step": 110
},
{
"entropy": 0.8083247914910316,
"epoch": 0.5292171995589857,
"grad_norm": 0.2405901849269867,
"learning_rate": 0.00017812995245641838,
"loss": 0.7929,
"mean_token_accuracy": 0.8249855980277061,
"num_tokens": 1944758.0,
"step": 120
},
{
"entropy": 0.8017621666193009,
"epoch": 0.5733186328555678,
"grad_norm": 0.2787521183490753,
"learning_rate": 0.00017496038034865293,
"loss": 0.8007,
"mean_token_accuracy": 0.8232445836067199,
"num_tokens": 2108373.0,
"step": 130
},
{
"entropy": 0.783287800848484,
"epoch": 0.61742006615215,
"grad_norm": 0.24215690791606903,
"learning_rate": 0.0001717908082408875,
"loss": 0.7806,
"mean_token_accuracy": 0.82769885212183,
"num_tokens": 2279985.0,
"step": 140
},
{
"entropy": 0.7877024173736572,
"epoch": 0.6615214994487321,
"grad_norm": 0.32073867321014404,
"learning_rate": 0.00016862123613312205,
"loss": 0.7749,
"mean_token_accuracy": 0.8289618909358978,
"num_tokens": 2437290.0,
"step": 150
},
{
"entropy": 0.7758993163704873,
"epoch": 0.7056229327453142,
"grad_norm": 0.24493683874607086,
"learning_rate": 0.0001654516640253566,
"loss": 0.7613,
"mean_token_accuracy": 0.8307989597320556,
"num_tokens": 2599040.0,
"step": 160
},
{
"entropy": 0.7710816964507103,
"epoch": 0.7497243660418964,
"grad_norm": 0.26176023483276367,
"learning_rate": 0.00016228209191759114,
"loss": 0.7701,
"mean_token_accuracy": 0.8283671870827675,
"num_tokens": 2763931.0,
"step": 170
},
{
"entropy": 0.7687478274106979,
"epoch": 0.7938257993384785,
"grad_norm": 0.33372247219085693,
"learning_rate": 0.00015911251980982568,
"loss": 0.7599,
"mean_token_accuracy": 0.8326913744211197,
"num_tokens": 2921138.0,
"step": 180
},
{
"entropy": 0.7683898612856865,
"epoch": 0.8379272326350606,
"grad_norm": 0.30479806661605835,
"learning_rate": 0.00015594294770206023,
"loss": 0.7652,
"mean_token_accuracy": 0.8292697682976723,
"num_tokens": 3078075.0,
"step": 190
},
{
"entropy": 0.7426786199212074,
"epoch": 0.8820286659316428,
"grad_norm": 0.30385708808898926,
"learning_rate": 0.0001527733755942948,
"loss": 0.7484,
"mean_token_accuracy": 0.834077812731266,
"num_tokens": 3250532.0,
"step": 200
},
{
"epoch": 0.8820286659316428,
"eval_entropy": 0.6650676030923824,
"eval_loss": 0.5983571410179138,
"eval_mean_token_accuracy": 0.8642630500368552,
"eval_num_tokens": 3250532.0,
"eval_runtime": 8.088,
"eval_samples_per_second": 99.654,
"eval_steps_per_second": 12.488,
"step": 200
},
{
"entropy": 0.7351636976003647,
"epoch": 0.9261300992282249,
"grad_norm": 0.3023395836353302,
"learning_rate": 0.00014960380348652932,
"loss": 0.7276,
"mean_token_accuracy": 0.8379455998539924,
"num_tokens": 3412136.0,
"step": 210
},
{
"entropy": 0.7627917662262916,
"epoch": 0.9702315325248071,
"grad_norm": 0.27449166774749756,
"learning_rate": 0.00014643423137876386,
"loss": 0.7657,
"mean_token_accuracy": 0.832044218480587,
"num_tokens": 3578511.0,
"step": 220
},
{
"entropy": 0.7629164243355776,
"epoch": 1.0132304299889747,
"grad_norm": 0.3300953209400177,
"learning_rate": 0.0001432646592709984,
"loss": 0.7491,
"mean_token_accuracy": 0.8328371659303323,
"num_tokens": 3722301.0,
"step": 230
},
{
"entropy": 0.746186052262783,
"epoch": 1.0573318632855568,
"grad_norm": 0.3027808666229248,
"learning_rate": 0.00014009508716323295,
"loss": 0.7462,
"mean_token_accuracy": 0.8322931200265884,
"num_tokens": 3896679.0,
"step": 240
},
{
"entropy": 0.7389517679810524,
"epoch": 1.101433296582139,
"grad_norm": 0.2887279689311981,
"learning_rate": 0.00013692551505546752,
"loss": 0.7406,
"mean_token_accuracy": 0.8339706152677536,
"num_tokens": 4060607.0,
"step": 250
},
{
"entropy": 0.7400932610034943,
"epoch": 1.145534729878721,
"grad_norm": 0.29185837507247925,
"learning_rate": 0.00013375594294770207,
"loss": 0.7335,
"mean_token_accuracy": 0.8348478749394417,
"num_tokens": 4235108.0,
"step": 260
},
{
"entropy": 0.7405846387147903,
"epoch": 1.1896361631753032,
"grad_norm": 0.3650504946708679,
"learning_rate": 0.0001305863708399366,
"loss": 0.7242,
"mean_token_accuracy": 0.8392871618270874,
"num_tokens": 4399960.0,
"step": 270
},
{
"entropy": 0.7342920154333115,
"epoch": 1.2337375964718853,
"grad_norm": 0.32232147455215454,
"learning_rate": 0.00012741679873217116,
"loss": 0.7329,
"mean_token_accuracy": 0.8371502041816712,
"num_tokens": 4562192.0,
"step": 280
},
{
"entropy": 0.7346035555005074,
"epoch": 1.2778390297684674,
"grad_norm": 0.35367703437805176,
"learning_rate": 0.0001242472266244057,
"loss": 0.7251,
"mean_token_accuracy": 0.8377319499850273,
"num_tokens": 4721881.0,
"step": 290
},
{
"entropy": 0.7108615353703499,
"epoch": 1.3219404630650495,
"grad_norm": 0.2766059637069702,
"learning_rate": 0.00012107765451664026,
"loss": 0.7134,
"mean_token_accuracy": 0.8399718284606934,
"num_tokens": 4885379.0,
"step": 300
},
{
"epoch": 1.3219404630650495,
"eval_entropy": 0.6444886132042007,
"eval_loss": 0.5822195410728455,
"eval_mean_token_accuracy": 0.866522473864036,
"eval_num_tokens": 4885379.0,
"eval_runtime": 8.0528,
"eval_samples_per_second": 100.089,
"eval_steps_per_second": 12.542,
"step": 300
},
{
"entropy": 0.7376143395900726,
"epoch": 1.3660418963616316,
"grad_norm": 0.2916136384010315,
"learning_rate": 0.0001179080824088748,
"loss": 0.7378,
"mean_token_accuracy": 0.8362751066684723,
"num_tokens": 5049776.0,
"step": 310
},
{
"entropy": 0.7360161304473877,
"epoch": 1.4101433296582138,
"grad_norm": 0.3246314525604248,
"learning_rate": 0.00011473851030110936,
"loss": 0.7383,
"mean_token_accuracy": 0.8336721956729889,
"num_tokens": 5209191.0,
"step": 320
},
{
"entropy": 0.7085310086607933,
"epoch": 1.454244762954796,
"grad_norm": 0.2847578823566437,
"learning_rate": 0.00011156893819334391,
"loss": 0.7041,
"mean_token_accuracy": 0.8415971100330353,
"num_tokens": 5375904.0,
"step": 330
},
{
"entropy": 0.7118788257241249,
"epoch": 1.4983461962513782,
"grad_norm": 0.3348420262336731,
"learning_rate": 0.00010839936608557845,
"loss": 0.7121,
"mean_token_accuracy": 0.839908429980278,
"num_tokens": 5530092.0,
"step": 340
},
{
"entropy": 0.7398809120059013,
"epoch": 1.5424476295479603,
"grad_norm": 0.31565436720848083,
"learning_rate": 0.00010522979397781301,
"loss": 0.738,
"mean_token_accuracy": 0.8339577659964561,
"num_tokens": 5680997.0,
"step": 350
},
{
"entropy": 0.7200135216116905,
"epoch": 1.5865490628445424,
"grad_norm": 0.3452684283256531,
"learning_rate": 0.00010206022187004756,
"loss": 0.726,
"mean_token_accuracy": 0.8387370139360428,
"num_tokens": 5847675.0,
"step": 360
},
{
"entropy": 0.7074938386678695,
"epoch": 1.6306504961411246,
"grad_norm": 0.307235985994339,
"learning_rate": 9.889064976228209e-05,
"loss": 0.689,
"mean_token_accuracy": 0.8433170482516289,
"num_tokens": 6014120.0,
"step": 370
},
{
"entropy": 0.713779816031456,
"epoch": 1.6747519294377067,
"grad_norm": 0.33486098051071167,
"learning_rate": 9.572107765451665e-05,
"loss": 0.7166,
"mean_token_accuracy": 0.8390020251274108,
"num_tokens": 6175759.0,
"step": 380
},
{
"entropy": 0.7257090628147125,
"epoch": 1.718853362734289,
"grad_norm": 0.29937514662742615,
"learning_rate": 9.255150554675119e-05,
"loss": 0.7296,
"mean_token_accuracy": 0.8377068281173706,
"num_tokens": 6337886.0,
"step": 390
},
{
"entropy": 0.7071069806814194,
"epoch": 1.7629547960308711,
"grad_norm": 0.29501640796661377,
"learning_rate": 8.938193343898574e-05,
"loss": 0.7023,
"mean_token_accuracy": 0.8423839300870896,
"num_tokens": 6499459.0,
"step": 400
},
{
"epoch": 1.7629547960308711,
"eval_entropy": 0.6217773565561464,
"eval_loss": 0.5744128227233887,
"eval_mean_token_accuracy": 0.8680370243469088,
"eval_num_tokens": 6499459.0,
"eval_runtime": 8.0487,
"eval_samples_per_second": 100.14,
"eval_steps_per_second": 12.549,
"step": 400
},
{
"entropy": 0.7010652974247933,
"epoch": 1.8070562293274532,
"grad_norm": 0.3465122580528259,
"learning_rate": 8.62123613312203e-05,
"loss": 0.7056,
"mean_token_accuracy": 0.8412188425660133,
"num_tokens": 6661490.0,
"step": 410
},
{
"entropy": 0.7035703644156456,
"epoch": 1.8511576626240354,
"grad_norm": 0.2653435170650482,
"learning_rate": 8.304278922345484e-05,
"loss": 0.7047,
"mean_token_accuracy": 0.8421282634139061,
"num_tokens": 6806303.0,
"step": 420
},
{
"entropy": 0.7134277895092964,
"epoch": 1.8952590959206175,
"grad_norm": 0.3735100328922272,
"learning_rate": 7.987321711568939e-05,
"loss": 0.7222,
"mean_token_accuracy": 0.8374130159616471,
"num_tokens": 6977619.0,
"step": 430
},
{
"entropy": 0.6837946087121963,
"epoch": 1.9393605292171996,
"grad_norm": 0.3172140121459961,
"learning_rate": 7.670364500792393e-05,
"loss": 0.6872,
"mean_token_accuracy": 0.8447714239358902,
"num_tokens": 7134964.0,
"step": 440
},
{
"entropy": 0.7037998244166375,
"epoch": 1.9834619625137817,
"grad_norm": 0.3320825397968292,
"learning_rate": 7.353407290015848e-05,
"loss": 0.7065,
"mean_token_accuracy": 0.8428374692797661,
"num_tokens": 7302144.0,
"step": 450
},
{
"entropy": 0.6984619360703689,
"epoch": 2.0264608599779494,
"grad_norm": 0.285248339176178,
"learning_rate": 7.036450079239303e-05,
"loss": 0.6904,
"mean_token_accuracy": 0.8467852473258972,
"num_tokens": 7443873.0,
"step": 460
},
{
"entropy": 0.7012363314628601,
"epoch": 2.0705622932745316,
"grad_norm": 0.32722488045692444,
"learning_rate": 6.719492868462758e-05,
"loss": 0.6951,
"mean_token_accuracy": 0.8441829264163971,
"num_tokens": 7602458.0,
"step": 470
},
{
"entropy": 0.7040385738015175,
"epoch": 2.1146637265711137,
"grad_norm": 0.3185954689979553,
"learning_rate": 6.402535657686212e-05,
"loss": 0.7082,
"mean_token_accuracy": 0.8414013043045998,
"num_tokens": 7763939.0,
"step": 480
},
{
"entropy": 0.6956586122512818,
"epoch": 2.158765159867696,
"grad_norm": 0.34575557708740234,
"learning_rate": 6.0855784469096676e-05,
"loss": 0.695,
"mean_token_accuracy": 0.8428655683994293,
"num_tokens": 7927670.0,
"step": 490
},
{
"entropy": 0.702117745578289,
"epoch": 2.202866593164278,
"grad_norm": 0.3255711793899536,
"learning_rate": 5.768621236133123e-05,
"loss": 0.7111,
"mean_token_accuracy": 0.8387768477201462,
"num_tokens": 8090934.0,
"step": 500
},
{
"epoch": 2.202866593164278,
"eval_entropy": 0.6130540751584685,
"eval_loss": 0.5708180665969849,
"eval_mean_token_accuracy": 0.8685296850629373,
"eval_num_tokens": 8090934.0,
"eval_runtime": 8.0458,
"eval_samples_per_second": 100.176,
"eval_steps_per_second": 12.553,
"step": 500
},
{
"entropy": 0.6894026726484299,
"epoch": 2.24696802646086,
"grad_norm": 0.3195938467979431,
"learning_rate": 5.451664025356578e-05,
"loss": 0.6884,
"mean_token_accuracy": 0.8427316024899483,
"num_tokens": 8260273.0,
"step": 510
},
{
"entropy": 0.6812330767512321,
"epoch": 2.291069459757442,
"grad_norm": 0.4105236232280731,
"learning_rate": 5.134706814580032e-05,
"loss": 0.6849,
"mean_token_accuracy": 0.8467972829937935,
"num_tokens": 8418421.0,
"step": 520
},
{
"entropy": 0.6863151758909225,
"epoch": 2.3351708930540243,
"grad_norm": 0.3161937892436981,
"learning_rate": 4.817749603803487e-05,
"loss": 0.6813,
"mean_token_accuracy": 0.8450520291924477,
"num_tokens": 8584244.0,
"step": 530
},
{
"entropy": 0.6781483091413975,
"epoch": 2.3792723263506064,
"grad_norm": 0.37656792998313904,
"learning_rate": 4.5007923930269414e-05,
"loss": 0.6754,
"mean_token_accuracy": 0.8465494722127914,
"num_tokens": 8748714.0,
"step": 540
},
{
"entropy": 0.6889190331101418,
"epoch": 2.4233737596471885,
"grad_norm": 0.3911200761795044,
"learning_rate": 4.1838351822503966e-05,
"loss": 0.7013,
"mean_token_accuracy": 0.8418177396059037,
"num_tokens": 8908894.0,
"step": 550
},
{
"entropy": 0.6950134262442589,
"epoch": 2.4674751929437706,
"grad_norm": 0.32076194882392883,
"learning_rate": 3.866877971473851e-05,
"loss": 0.7057,
"mean_token_accuracy": 0.8429578125476838,
"num_tokens": 9068647.0,
"step": 560
},
{
"entropy": 0.6590564094483853,
"epoch": 2.5115766262403527,
"grad_norm": 0.33828845620155334,
"learning_rate": 3.549920760697306e-05,
"loss": 0.6653,
"mean_token_accuracy": 0.8498437628149986,
"num_tokens": 9241573.0,
"step": 570
},
{
"entropy": 0.6963606104254723,
"epoch": 2.555678059536935,
"grad_norm": 0.3341946303844452,
"learning_rate": 3.2329635499207614e-05,
"loss": 0.7008,
"mean_token_accuracy": 0.8427884921431541,
"num_tokens": 9402668.0,
"step": 580
},
{
"entropy": 0.7054937720298767,
"epoch": 2.599779492833517,
"grad_norm": 0.2775990962982178,
"learning_rate": 2.9160063391442156e-05,
"loss": 0.7011,
"mean_token_accuracy": 0.841389861702919,
"num_tokens": 9566515.0,
"step": 590
},
{
"entropy": 0.7010829344391822,
"epoch": 2.643880926130099,
"grad_norm": 0.35286372900009155,
"learning_rate": 2.5990491283676704e-05,
"loss": 0.6916,
"mean_token_accuracy": 0.8444753900170326,
"num_tokens": 9737713.0,
"step": 600
},
{
"epoch": 2.643880926130099,
"eval_entropy": 0.6011341580069891,
"eval_loss": 0.5674853324890137,
"eval_mean_token_accuracy": 0.8689056859158053,
"eval_num_tokens": 9737713.0,
"eval_runtime": 8.0369,
"eval_samples_per_second": 100.287,
"eval_steps_per_second": 12.567,
"step": 600
}
],
"logging_steps": 10,
"max_steps": 681,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.797942892344115e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}