odoom's picture
Upload folder using huggingface_hub
97e7dd6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 100,
"global_step": 240,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.4948265369981528,
"epoch": 0.12568735271013354,
"grad_norm": 0.0615234375,
"learning_rate": 0.00015000000000000001,
"loss": 1.7029500961303712,
"mean_token_accuracy": 0.6537273893132806,
"num_tokens": 156024.0,
"step": 10
},
{
"entropy": 1.1234326036646962,
"epoch": 0.2513747054202671,
"grad_norm": 0.037353515625,
"learning_rate": 0.00019953520716943371,
"loss": 1.1771140098571777,
"mean_token_accuracy": 0.7510503690689803,
"num_tokens": 298620.0,
"step": 20
},
{
"entropy": 1.0369394151493907,
"epoch": 0.3770620581304006,
"grad_norm": 0.0267333984375,
"learning_rate": 0.0001972690659618564,
"loss": 1.1110390663146972,
"mean_token_accuracy": 0.7662029687315226,
"num_tokens": 449966.0,
"step": 30
},
{
"entropy": 1.0444004433229566,
"epoch": 0.5027494108405341,
"grad_norm": 0.02880859375,
"learning_rate": 0.0001931591088051279,
"loss": 1.1269343376159668,
"mean_token_accuracy": 0.7651370905339718,
"num_tokens": 608754.0,
"step": 40
},
{
"entropy": 1.051739121414721,
"epoch": 0.6284367635506677,
"grad_norm": 0.0255126953125,
"learning_rate": 0.00018728324335139814,
"loss": 1.088887882232666,
"mean_token_accuracy": 0.7666051037609577,
"num_tokens": 764583.0,
"step": 50
},
{
"entropy": 0.9839291835203767,
"epoch": 0.7541241162608012,
"grad_norm": 0.029052734375,
"learning_rate": 0.0001797528515115709,
"loss": 1.020584487915039,
"mean_token_accuracy": 0.7803368698805571,
"num_tokens": 912716.0,
"step": 60
},
{
"entropy": 1.0362637933343648,
"epoch": 0.8798114689709348,
"grad_norm": 0.0244140625,
"learning_rate": 0.00017071067811865476,
"loss": 1.0753373146057128,
"mean_token_accuracy": 0.7713750531896949,
"num_tokens": 1064494.0,
"step": 70
},
{
"entropy": 0.9550455894345552,
"epoch": 1.0,
"grad_norm": 0.02880859375,
"learning_rate": 0.0001603281250808719,
"loss": 0.9675676345825195,
"mean_token_accuracy": 0.7857394160008898,
"num_tokens": 1201271.0,
"step": 80
},
{
"entropy": 0.9374621393159032,
"epoch": 1.1256873527101336,
"grad_norm": 0.0244140625,
"learning_rate": 0.00014880200231609983,
"loss": 0.9870312690734864,
"mean_token_accuracy": 0.7901170210912823,
"num_tokens": 1349133.0,
"step": 90
},
{
"entropy": 0.949882148578763,
"epoch": 1.251374705420267,
"grad_norm": 0.03173828125,
"learning_rate": 0.00013635079705638298,
"loss": 0.9436046600341796,
"mean_token_accuracy": 0.788494935259223,
"num_tokens": 1500927.0,
"step": 100
},
{
"epoch": 1.251374705420267,
"eval_entropy": 0.9404270783276625,
"eval_loss": 0.9446325302124023,
"eval_mean_token_accuracy": 0.7829881757497787,
"eval_num_tokens": 1500927.0,
"eval_runtime": 86.1079,
"eval_samples_per_second": 1.649,
"eval_steps_per_second": 1.649,
"step": 100
},
{
"entropy": 0.9173299714922905,
"epoch": 1.3770620581304005,
"grad_norm": 0.0233154296875,
"learning_rate": 0.0001232105322409468,
"loss": 0.9481925964355469,
"mean_token_accuracy": 0.7930883213877677,
"num_tokens": 1646678.0,
"step": 110
},
{
"entropy": 0.9073959412053227,
"epoch": 1.5027494108405341,
"grad_norm": 0.03076171875,
"learning_rate": 0.00010963029250531418,
"loss": 0.9351880073547363,
"mean_token_accuracy": 0.7968914289027452,
"num_tokens": 1799797.0,
"step": 120
},
{
"entropy": 0.9462396390736103,
"epoch": 1.6284367635506678,
"grad_norm": 0.029296875,
"learning_rate": 9.586750257511867e-05,
"loss": 0.9954720497131347,
"mean_token_accuracy": 0.789978607185185,
"num_tokens": 1958692.0,
"step": 130
},
{
"entropy": 0.9035223769024014,
"epoch": 1.7541241162608012,
"grad_norm": 0.02880859375,
"learning_rate": 8.218304756658072e-05,
"loss": 0.9473580360412598,
"mean_token_accuracy": 0.7965094247832895,
"num_tokens": 2098856.0,
"step": 140
},
{
"entropy": 0.9002945913001895,
"epoch": 1.8798114689709347,
"grad_norm": 0.035888671875,
"learning_rate": 6.883632769240589e-05,
"loss": 0.9326913833618165,
"mean_token_accuracy": 0.800568002089858,
"num_tokens": 2253582.0,
"step": 150
},
{
"entropy": 0.8597502765897053,
"epoch": 2.0,
"grad_norm": 0.031494140625,
"learning_rate": 5.608034111526298e-05,
"loss": 0.8665840148925781,
"mean_token_accuracy": 0.8103327634287816,
"num_tokens": 2402542.0,
"step": 160
},
{
"entropy": 0.8727964337915182,
"epoch": 2.1256873527101336,
"grad_norm": 0.03173828125,
"learning_rate": 4.415688815743858e-05,
"loss": 0.9112902641296386,
"mean_token_accuracy": 0.8074018105864524,
"num_tokens": 2554831.0,
"step": 170
},
{
"entropy": 0.8327508143149316,
"epoch": 2.2513747054202673,
"grad_norm": 0.03955078125,
"learning_rate": 3.329198777485869e-05,
"loss": 0.8397786140441894,
"mean_token_accuracy": 0.816638495773077,
"num_tokens": 2700898.0,
"step": 180
},
{
"entropy": 0.8549322345294058,
"epoch": 2.3770620581304005,
"grad_norm": 0.041015625,
"learning_rate": 2.3691593180019366e-05,
"loss": 0.8526265144348144,
"mean_token_accuracy": 0.8113520180806517,
"num_tokens": 2854824.0,
"step": 190
},
{
"entropy": 0.8102049398235976,
"epoch": 2.502749410840534,
"grad_norm": 0.06787109375,
"learning_rate": 1.553768782775351e-05,
"loss": 0.8202457427978516,
"mean_token_accuracy": 0.8225593730807305,
"num_tokens": 3003572.0,
"step": 200
},
{
"epoch": 2.502749410840534,
"eval_entropy": 0.8503286918284187,
"eval_loss": 0.8580905795097351,
"eval_mean_token_accuracy": 0.8025841964802272,
"eval_num_tokens": 3003572.0,
"eval_runtime": 85.9551,
"eval_samples_per_second": 1.652,
"eval_steps_per_second": 1.652,
"step": 200
},
{
"entropy": 0.8026974092237651,
"epoch": 2.628436763550668,
"grad_norm": 0.04736328125,
"learning_rate": 8.98483576766631e-06,
"loss": 0.805226993560791,
"mean_token_accuracy": 0.8234594237059355,
"num_tokens": 3151440.0,
"step": 210
},
{
"entropy": 0.7800698866136372,
"epoch": 2.754124116260801,
"grad_norm": 0.05029296875,
"learning_rate": 4.1572517541747294e-06,
"loss": 0.7574594020843506,
"mean_token_accuracy": 0.8299322757869959,
"num_tokens": 3316394.0,
"step": 220
},
{
"entropy": 0.7608709936961532,
"epoch": 2.8798114689709347,
"grad_norm": 0.04736328125,
"learning_rate": 1.146446652649169e-06,
"loss": 0.7368635654449462,
"mean_token_accuracy": 0.8316391207277775,
"num_tokens": 3459207.0,
"step": 230
},
{
"entropy": 0.7886644739146326,
"epoch": 3.0,
"grad_norm": 0.052001953125,
"learning_rate": 9.49277494008971e-09,
"loss": 0.737561845779419,
"mean_token_accuracy": 0.828946531208512,
"num_tokens": 3603813.0,
"step": 240
}
],
"logging_steps": 10,
"max_steps": 240,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5556679422901453e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}