{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.2222222222222222,
"grad_norm": 0.09050317108631134,
"learning_rate": 1.9994502159417576e-05,
"loss": 1.4979,
"mean_token_accuracy": 0.6438735589385033,
"num_tokens": 164625.0,
"step": 20
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.11526794731616974,
"learning_rate": 1.9976842788356054e-05,
"loss": 1.4552,
"mean_token_accuracy": 0.6504906862974167,
"num_tokens": 327150.0,
"step": 40
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.13697730004787445,
"learning_rate": 1.9947028171171742e-05,
"loss": 1.4086,
"mean_token_accuracy": 0.6580756276845932,
"num_tokens": 492219.0,
"step": 60
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.1464962661266327,
"learning_rate": 1.990509463238309e-05,
"loss": 1.3719,
"mean_token_accuracy": 0.6640863925218582,
"num_tokens": 659628.0,
"step": 80
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.17173181474208832,
"learning_rate": 1.985109326154774e-05,
"loss": 1.34,
"mean_token_accuracy": 0.6709366589784622,
"num_tokens": 820158.0,
"step": 100
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.17650607228279114,
"learning_rate": 1.9785089851017788e-05,
"loss": 1.2904,
"mean_token_accuracy": 0.6785845950245857,
"num_tokens": 984740.0,
"step": 120
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.19704826176166534,
"learning_rate": 1.970716481578191e-05,
"loss": 1.2811,
"mean_token_accuracy": 0.68048547655344,
"num_tokens": 1150127.0,
"step": 140
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.20888939499855042,
"learning_rate": 1.9617413095492114e-05,
"loss": 1.2714,
"mean_token_accuracy": 0.681582860648632,
"num_tokens": 1318081.0,
"step": 160
},
{
"epoch": 2.0,
"grad_norm": 0.3165840208530426,
"learning_rate": 1.9515944038794384e-05,
"loss": 1.2361,
"mean_token_accuracy": 0.687538705766201,
"num_tokens": 1478412.0,
"step": 180
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.24854803085327148,
"learning_rate": 1.940288127010419e-05,
"loss": 1.2336,
"mean_token_accuracy": 0.6887955293059349,
"num_tokens": 1644237.0,
"step": 200
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.27059808373451233,
"learning_rate": 1.92783625389892e-05,
"loss": 1.2052,
"mean_token_accuracy": 0.6917410314083099,
"num_tokens": 1807943.0,
"step": 220
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.26881545782089233,
"learning_rate": 1.9142539552342638e-05,
"loss": 1.2103,
"mean_token_accuracy": 0.6928626969456673,
"num_tokens": 1971497.0,
"step": 240
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.2584948241710663,
"learning_rate": 1.8995577789551806e-05,
"loss": 1.1922,
"mean_token_accuracy": 0.6955347016453743,
"num_tokens": 2136802.0,
"step": 260
},
{
"epoch": 3.111111111111111,
"grad_norm": 0.2711962163448334,
"learning_rate": 1.8837656300886937e-05,
"loss": 1.1743,
"mean_token_accuracy": 0.698191574215889,
"num_tokens": 2300632.0,
"step": 280
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.2702801823616028,
"learning_rate": 1.866896748935603e-05,
"loss": 1.1858,
"mean_token_accuracy": 0.6950699493288994,
"num_tokens": 2466370.0,
"step": 300
},
{
"epoch": 3.5555555555555554,
"grad_norm": 0.28295591473579407,
"learning_rate": 1.8489716876291417e-05,
"loss": 1.1753,
"mean_token_accuracy": 0.6978182002902031,
"num_tokens": 2627868.0,
"step": 320
},
{
"epoch": 3.7777777777777777,
"grad_norm": 0.3407115042209625,
"learning_rate": 1.8300122850953678e-05,
"loss": 1.1755,
"mean_token_accuracy": 0.6989751189947129,
"num_tokens": 2793579.0,
"step": 340
},
{
"epoch": 4.0,
"grad_norm": 0.35823503136634827,
"learning_rate": 1.8100416404457962e-05,
"loss": 1.1583,
"mean_token_accuracy": 0.7020597368478775,
"num_tokens": 2956824.0,
"step": 360
},
{
"epoch": 4.222222222222222,
"grad_norm": 0.2857590615749359,
"learning_rate": 1.789084084834691e-05,
"loss": 1.1574,
"mean_token_accuracy": 0.701052300632,
"num_tokens": 3123009.0,
"step": 380
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.3219825327396393,
"learning_rate": 1.7671651518153e-05,
"loss": 1.171,
"mean_token_accuracy": 0.6980017140507698,
"num_tokens": 3286307.0,
"step": 400
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.35857293009757996,
"learning_rate": 1.744311546231154e-05,
"loss": 1.1406,
"mean_token_accuracy": 0.7054756179451942,
"num_tokens": 3453142.0,
"step": 420
},
{
"epoch": 4.888888888888889,
"grad_norm": 0.335440069437027,
"learning_rate": 1.7205511116803306e-05,
"loss": 1.1385,
"mean_token_accuracy": 0.705261904001236,
"num_tokens": 3616579.0,
"step": 440
},
{
"epoch": 5.111111111111111,
"grad_norm": 0.3440147042274475,
"learning_rate": 1.6959127965923144e-05,
"loss": 1.1405,
"mean_token_accuracy": 0.7062455296516419,
"num_tokens": 3777095.0,
"step": 460
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.3388345241546631,
"learning_rate": 1.6704266189587992e-05,
"loss": 1.117,
"mean_token_accuracy": 0.7082562401890755,
"num_tokens": 3944069.0,
"step": 480
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.3118428587913513,
"learning_rate": 1.644123629761387e-05,
"loss": 1.1518,
"mean_token_accuracy": 0.7026130899786949,
"num_tokens": 4109276.0,
"step": 500
},
{
"epoch": 5.777777777777778,
"grad_norm": 0.3456493616104126,
"learning_rate": 1.617035875140749e-05,
"loss": 1.1284,
"mean_token_accuracy": 0.7070924416184425,
"num_tokens": 4274290.0,
"step": 520
},
{
"epoch": 6.0,
"grad_norm": 0.47243574261665344,
"learning_rate": 1.5891963573533424e-05,
"loss": 1.1284,
"mean_token_accuracy": 0.7076667413115502,
"num_tokens": 4435236.0,
"step": 540
},
{
"epoch": 6.222222222222222,
"grad_norm": 0.4088926315307617,
"learning_rate": 1.560638994563242e-05,
"loss": 1.1174,
"mean_token_accuracy": 0.7105918556451798,
"num_tokens": 4599819.0,
"step": 560
},
{
"epoch": 6.444444444444445,
"grad_norm": 0.36049386858940125,
"learning_rate": 1.531398579518083e-05,
"loss": 1.1235,
"mean_token_accuracy": 0.707967433333397,
"num_tokens": 4765887.0,
"step": 580
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.3518804907798767,
"learning_rate": 1.5015107371594576e-05,
"loss": 1.1311,
"mean_token_accuracy": 0.7057292729616165,
"num_tokens": 4930033.0,
"step": 600
},
{
"epoch": 6.888888888888889,
"grad_norm": 0.3334203362464905,
"learning_rate": 1.47101188121941e-05,
"loss": 1.1146,
"mean_token_accuracy": 0.7091757193207741,
"num_tokens": 5093643.0,
"step": 620
},
{
"epoch": 7.111111111111111,
"grad_norm": 0.39216911792755127,
"learning_rate": 1.4399391698559153e-05,
"loss": 1.104,
"mean_token_accuracy": 0.7117078930139542,
"num_tokens": 5258211.0,
"step": 640
},
{
"epoch": 7.333333333333333,
"grad_norm": 0.3939253091812134,
"learning_rate": 1.408330460381385e-05,
"loss": 1.1039,
"mean_token_accuracy": 0.7122085765004158,
"num_tokens": 5419681.0,
"step": 660
},
{
"epoch": 7.555555555555555,
"grad_norm": 0.3723820745944977,
"learning_rate": 1.3762242631393656e-05,
"loss": 1.1038,
"mean_token_accuracy": 0.7110082015395165,
"num_tokens": 5586943.0,
"step": 680
},
{
"epoch": 7.777777777777778,
"grad_norm": 0.4541132152080536,
"learning_rate": 1.3436596945856164e-05,
"loss": 1.1159,
"mean_token_accuracy": 0.7090325355529785,
"num_tokens": 5748647.0,
"step": 700
},
{
"epoch": 8.0,
"grad_norm": 0.43293851613998413,
"learning_rate": 1.310676429630732e-05,
"loss": 1.108,
"mean_token_accuracy": 0.7101380705833436,
"num_tokens": 5913648.0,
"step": 720
},
{
"epoch": 8.222222222222221,
"grad_norm": 0.37585848569869995,
"learning_rate": 1.2773146533023782e-05,
"loss": 1.098,
"mean_token_accuracy": 0.712922240793705,
"num_tokens": 6078733.0,
"step": 740
},
{
"epoch": 8.444444444444445,
"grad_norm": 0.3709869086742401,
"learning_rate": 1.2436150117860226e-05,
"loss": 1.1003,
"mean_token_accuracy": 0.7116939216852188,
"num_tokens": 6243253.0,
"step": 760
},
{
"epoch": 8.666666666666666,
"grad_norm": 0.47251901030540466,
"learning_rate": 1.2096185629038219e-05,
"loss": 1.0873,
"mean_token_accuracy": 0.7131141215562821,
"num_tokens": 6406632.0,
"step": 780
},
{
"epoch": 8.88888888888889,
"grad_norm": 0.38695353269577026,
"learning_rate": 1.1753667260919872e-05,
"loss": 1.1016,
"mean_token_accuracy": 0.7131000861525536,
"num_tokens": 6574031.0,
"step": 800
},
{
"epoch": 9.11111111111111,
"grad_norm": 0.3999341130256653,
"learning_rate": 1.1409012319375828e-05,
"loss": 1.1238,
"mean_token_accuracy": 0.7076400697231293,
"num_tokens": 6735168.0,
"step": 820
},
{
"epoch": 9.333333333333334,
"grad_norm": 0.4220835268497467,
"learning_rate": 1.1062640713362333e-05,
"loss": 1.0777,
"mean_token_accuracy": 0.717046993970871,
"num_tokens": 6903690.0,
"step": 840
},
{
"epoch": 9.555555555555555,
"grad_norm": 0.44131582975387573,
"learning_rate": 1.071497444332686e-05,
"loss": 1.1077,
"mean_token_accuracy": 0.7106189414858818,
"num_tokens": 7064704.0,
"step": 860
},
{
"epoch": 9.777777777777779,
"grad_norm": 0.4432401657104492,
"learning_rate": 1.0366437087065564e-05,
"loss": 1.0708,
"mean_token_accuracy": 0.7174170568585396,
"num_tokens": 7229634.0,
"step": 880
},
{
"epoch": 10.0,
"grad_norm": 0.4888554811477661,
"learning_rate": 1.0017453283658984e-05,
"loss": 1.0756,
"mean_token_accuracy": 0.7171271413564682,
"num_tokens": 7392060.0,
"step": 900
}
],
"logging_steps": 20,
"max_steps": 1800,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.7138456604472115e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}