felcas93's picture
Upload folder using huggingface_hub
7757329 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.962962962962963,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.7838703233003617,
"epoch": 0.07407407407407407,
"grad_norm": 2.2315239906311035,
"learning_rate": 8.032786885245902e-05,
"loss": 1.5419,
"mean_token_accuracy": 0.675746806114912,
"num_tokens": 406250.0,
"step": 50
},
{
"entropy": 0.42496320378035307,
"epoch": 0.14814814814814814,
"grad_norm": 0.3177862763404846,
"learning_rate": 9.990765991730485e-05,
"loss": 0.3491,
"mean_token_accuracy": 0.9115325964987278,
"num_tokens": 810885.0,
"step": 100
},
{
"entropy": 0.17912146519869565,
"epoch": 0.2222222222222222,
"grad_norm": 0.30210021138191223,
"learning_rate": 9.950545603782162e-05,
"loss": 0.1608,
"mean_token_accuracy": 0.9592751894891262,
"num_tokens": 1216948.0,
"step": 150
},
{
"entropy": 0.13433791074901819,
"epoch": 0.2962962962962963,
"grad_norm": 0.4639471173286438,
"learning_rate": 9.878674879048427e-05,
"loss": 0.1177,
"mean_token_accuracy": 0.9702847249805927,
"num_tokens": 1623039.0,
"step": 200
},
{
"entropy": 0.11072772483341396,
"epoch": 0.37037037037037035,
"grad_norm": 0.239689439535141,
"learning_rate": 9.775613308830824e-05,
"loss": 0.0996,
"mean_token_accuracy": 0.9741994588077069,
"num_tokens": 2029168.0,
"step": 250
},
{
"entropy": 0.1015299869235605,
"epoch": 0.4444444444444444,
"grad_norm": 0.14000816643238068,
"learning_rate": 9.642019796948866e-05,
"loss": 0.0946,
"mean_token_accuracy": 0.9754497842490673,
"num_tokens": 2433618.0,
"step": 300
},
{
"entropy": 0.0944658778142184,
"epoch": 0.5185185185185185,
"grad_norm": 0.2911929786205292,
"learning_rate": 9.478748447168449e-05,
"loss": 0.0888,
"mean_token_accuracy": 0.9764833557605743,
"num_tokens": 2837007.0,
"step": 350
},
{
"entropy": 0.09286680690012872,
"epoch": 0.5925925925925926,
"grad_norm": 0.1825980246067047,
"learning_rate": 9.28684310265789e-05,
"loss": 0.0884,
"mean_token_accuracy": 0.9767837685346603,
"num_tokens": 3241067.0,
"step": 400
},
{
"entropy": 0.09173870420083403,
"epoch": 0.6666666666666666,
"grad_norm": 0.16845248639583588,
"learning_rate": 9.067530672382544e-05,
"loss": 0.0871,
"mean_token_accuracy": 0.9771137611567974,
"num_tokens": 3644774.0,
"step": 450
},
{
"entropy": 0.08837487244978547,
"epoch": 0.7407407407407407,
"grad_norm": 0.15667857229709625,
"learning_rate": 8.822213287104348e-05,
"loss": 0.0846,
"mean_token_accuracy": 0.9784404304623604,
"num_tokens": 4050472.0,
"step": 500
},
{
"entropy": 0.0885874280706048,
"epoch": 0.8148148148148148,
"grad_norm": 0.10147374123334885,
"learning_rate": 8.552459335135381e-05,
"loss": 0.0848,
"mean_token_accuracy": 0.977893346697092,
"num_tokens": 4453374.0,
"step": 550
},
{
"entropy": 0.08721992008388042,
"epoch": 0.8888888888888888,
"grad_norm": 0.08062940090894699,
"learning_rate": 8.259993435156559e-05,
"loss": 0.0844,
"mean_token_accuracy": 0.9785151568055153,
"num_tokens": 4859149.0,
"step": 600
},
{
"entropy": 0.08216898602433503,
"epoch": 0.9629629629629629,
"grad_norm": 0.1335250586271286,
"learning_rate": 7.946685410208296e-05,
"loss": 0.0798,
"mean_token_accuracy": 0.9796955060958862,
"num_tokens": 5264437.0,
"step": 650
},
{
"entropy": 0.08460669645108282,
"epoch": 1.037037037037037,
"grad_norm": 0.0986652821302414,
"learning_rate": 7.614538333345735e-05,
"loss": 0.0822,
"mean_token_accuracy": 0.9785672229528427,
"num_tokens": 5669023.0,
"step": 700
},
{
"entropy": 0.0850414677709341,
"epoch": 1.1111111111111112,
"grad_norm": 0.07220367342233658,
"learning_rate": 7.265675721386285e-05,
"loss": 0.0824,
"mean_token_accuracy": 0.9784497334063054,
"num_tokens": 6073001.0,
"step": 750
},
{
"entropy": 0.08401141031645239,
"epoch": 1.1851851851851851,
"grad_norm": 0.1240311786532402,
"learning_rate": 6.902327958623736e-05,
"loss": 0.0826,
"mean_token_accuracy": 0.9786691051721573,
"num_tokens": 6478325.0,
"step": 800
},
{
"entropy": 0.08358457050286233,
"epoch": 1.2592592592592593,
"grad_norm": 0.10120349377393723,
"learning_rate": 6.526818037306228e-05,
"loss": 0.0811,
"mean_token_accuracy": 0.9787746147811413,
"num_tokens": 6882747.0,
"step": 850
},
{
"entropy": 0.0835177150182426,
"epoch": 1.3333333333333333,
"grad_norm": 0.10029594600200653,
"learning_rate": 6.14154670604355e-05,
"loss": 0.0818,
"mean_token_accuracy": 0.9788197261095047,
"num_tokens": 7287039.0,
"step": 900
},
{
"entropy": 0.08201941348612309,
"epoch": 1.4074074074074074,
"grad_norm": 0.09068141877651215,
"learning_rate": 5.7489771210944564e-05,
"loss": 0.0802,
"mean_token_accuracy": 0.9791601756215096,
"num_tokens": 7692281.0,
"step": 950
},
{
"entropy": 0.08500135038048029,
"epoch": 1.4814814814814814,
"grad_norm": 0.10811195522546768,
"learning_rate": 5.351619098663021e-05,
"loss": 0.0829,
"mean_token_accuracy": 0.9783452861011028,
"num_tokens": 8096455.0,
"step": 1000
},
{
"entropy": 0.08261076767928899,
"epoch": 1.5555555555555556,
"grad_norm": 0.07817448675632477,
"learning_rate": 4.952013068883795e-05,
"loss": 0.0807,
"mean_token_accuracy": 0.9787566863000393,
"num_tokens": 8501481.0,
"step": 1050
},
{
"entropy": 0.08088674335740506,
"epoch": 1.6296296296296298,
"grad_norm": 0.0741722360253334,
"learning_rate": 4.5527138340828776e-05,
"loss": 0.0794,
"mean_token_accuracy": 0.9796176181733608,
"num_tokens": 8907814.0,
"step": 1100
},
{
"entropy": 0.08092430792748928,
"epoch": 1.7037037037037037,
"grad_norm": 0.0863470658659935,
"learning_rate": 4.156274235153189e-05,
"loss": 0.0792,
"mean_token_accuracy": 0.9792905601859093,
"num_tokens": 9312142.0,
"step": 1150
},
{
"entropy": 0.07963837143965065,
"epoch": 1.7777777777777777,
"grad_norm": 0.10823621600866318,
"learning_rate": 3.765228830469794e-05,
"loss": 0.0791,
"mean_token_accuracy": 0.9794147987663746,
"num_tokens": 9716258.0,
"step": 1200
},
{
"entropy": 0.08162923349067569,
"epoch": 1.8518518518518519,
"grad_norm": 0.1495106816291809,
"learning_rate": 3.3820776916908857e-05,
"loss": 0.0801,
"mean_token_accuracy": 0.9793653392791748,
"num_tokens": 10121713.0,
"step": 1250
},
{
"entropy": 0.08032218031585217,
"epoch": 1.925925925925926,
"grad_norm": 0.08044654875993729,
"learning_rate": 3.0092704200428058e-05,
"loss": 0.079,
"mean_token_accuracy": 0.9795299915969372,
"num_tokens": 10526002.0,
"step": 1300
},
{
"entropy": 0.07872624884359539,
"epoch": 2.0,
"grad_norm": 0.07752422988414764,
"learning_rate": 2.649190485277792e-05,
"loss": 0.0775,
"mean_token_accuracy": 0.980090646147728,
"num_tokens": 10932428.0,
"step": 1350
},
{
"entropy": 0.08063295830972493,
"epoch": 2.074074074074074,
"grad_norm": 0.09133461862802505,
"learning_rate": 2.3041399874302905e-05,
"loss": 0.0793,
"mean_token_accuracy": 0.9794050461053848,
"num_tokens": 11337209.0,
"step": 1400
},
{
"entropy": 0.08033578357659281,
"epoch": 2.148148148148148,
"grad_norm": 0.06361774355173111,
"learning_rate": 1.976324938794482e-05,
"loss": 0.0792,
"mean_token_accuracy": 0.9797105365991592,
"num_tokens": 11741968.0,
"step": 1450
},
{
"entropy": 0.07973854598589242,
"epoch": 2.2222222222222223,
"grad_norm": 0.09148402512073517,
"learning_rate": 1.667841160219835e-05,
"loss": 0.0778,
"mean_token_accuracy": 0.9796544459462165,
"num_tokens": 12147108.0,
"step": 1500
},
{
"entropy": 0.07991634771227836,
"epoch": 2.2962962962962963,
"grad_norm": 0.058334823697805405,
"learning_rate": 1.3806608818939203e-05,
"loss": 0.0787,
"mean_token_accuracy": 0.9793905445933342,
"num_tokens": 12551885.0,
"step": 1550
},
{
"entropy": 0.07991615429520607,
"epoch": 2.3703703703703702,
"grad_norm": 0.07122901827096939,
"learning_rate": 1.1166201342777438e-05,
"loss": 0.0785,
"mean_token_accuracy": 0.979671506434679,
"num_tokens": 12956475.0,
"step": 1600
},
{
"entropy": 0.07969259418547153,
"epoch": 2.4444444444444446,
"grad_norm": 0.11193029582500458,
"learning_rate": 8.774070098071668e-06,
"loss": 0.0787,
"mean_token_accuracy": 0.979515576660633,
"num_tokens": 13362716.0,
"step": 1650
},
{
"entropy": 0.08067716302350164,
"epoch": 2.5185185185185186,
"grad_norm": 0.09012539684772491,
"learning_rate": 6.645508704069003e-06,
"loss": 0.0802,
"mean_token_accuracy": 0.9791687172651291,
"num_tokens": 13766986.0,
"step": 1700
},
{
"entropy": 0.07974634082056582,
"epoch": 2.5925925925925926,
"grad_norm": 0.09309827536344528,
"learning_rate": 4.794125698167262e-06,
"loss": 0.0787,
"mean_token_accuracy": 0.9794514080882073,
"num_tokens": 14171018.0,
"step": 1750
},
{
"entropy": 0.08121541824191808,
"epoch": 2.6666666666666665,
"grad_norm": 0.06920253485441208,
"learning_rate": 3.231757532415458e-06,
"loss": 0.0794,
"mean_token_accuracy": 0.9792174778878688,
"num_tokens": 14575902.0,
"step": 1800
},
{
"entropy": 0.07990395256318152,
"epoch": 2.7407407407407405,
"grad_norm": 0.05824149027466774,
"learning_rate": 1.9683928994924385e-06,
"loss": 0.0781,
"mean_token_accuracy": 0.9798404219746589,
"num_tokens": 14980838.0,
"step": 1850
},
{
"entropy": 0.08023261365480722,
"epoch": 2.814814814814815,
"grad_norm": 0.08101186901330948,
"learning_rate": 1.0121088719706296e-06,
"loss": 0.0795,
"mean_token_accuracy": 0.9791903717815876,
"num_tokens": 15385944.0,
"step": 1900
},
{
"entropy": 0.08040859408676625,
"epoch": 2.888888888888889,
"grad_norm": 0.07091067731380463,
"learning_rate": 3.6901926314575894e-07,
"loss": 0.0797,
"mean_token_accuracy": 0.9792876356840133,
"num_tokens": 15791071.0,
"step": 1950
},
{
"entropy": 0.0779489404708147,
"epoch": 2.962962962962963,
"grad_norm": 0.05987590551376343,
"learning_rate": 4.323553957759629e-08,
"loss": 0.0778,
"mean_token_accuracy": 0.9800369493663311,
"num_tokens": 16196052.0,
"step": 2000
}
],
"logging_steps": 50,
"max_steps": 2025,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.765523181366723e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}