| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 10.0, |
| "eval_steps": 500, |
| "global_step": 900, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.2222222222222222, |
| "grad_norm": 0.09050317108631134, |
| "learning_rate": 1.9994502159417576e-05, |
| "loss": 1.4979, |
| "mean_token_accuracy": 0.6438735589385033, |
| "num_tokens": 164625.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.4444444444444444, |
| "grad_norm": 0.11526794731616974, |
| "learning_rate": 1.9976842788356054e-05, |
| "loss": 1.4552, |
| "mean_token_accuracy": 0.6504906862974167, |
| "num_tokens": 327150.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.13697730004787445, |
| "learning_rate": 1.9947028171171742e-05, |
| "loss": 1.4086, |
| "mean_token_accuracy": 0.6580756276845932, |
| "num_tokens": 492219.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "grad_norm": 0.1464962661266327, |
| "learning_rate": 1.990509463238309e-05, |
| "loss": 1.3719, |
| "mean_token_accuracy": 0.6640863925218582, |
| "num_tokens": 659628.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.1111111111111112, |
| "grad_norm": 0.17173181474208832, |
| "learning_rate": 1.985109326154774e-05, |
| "loss": 1.34, |
| "mean_token_accuracy": 0.6709366589784622, |
| "num_tokens": 820158.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.3333333333333333, |
| "grad_norm": 0.17650607228279114, |
| "learning_rate": 1.9785089851017788e-05, |
| "loss": 1.2904, |
| "mean_token_accuracy": 0.6785845950245857, |
| "num_tokens": 984740.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.5555555555555556, |
| "grad_norm": 0.19704826176166534, |
| "learning_rate": 1.970716481578191e-05, |
| "loss": 1.2811, |
| "mean_token_accuracy": 0.68048547655344, |
| "num_tokens": 1150127.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.7777777777777777, |
| "grad_norm": 0.20888939499855042, |
| "learning_rate": 1.9617413095492114e-05, |
| "loss": 1.2714, |
| "mean_token_accuracy": 0.681582860648632, |
| "num_tokens": 1318081.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.3165840208530426, |
| "learning_rate": 1.9515944038794384e-05, |
| "loss": 1.2361, |
| "mean_token_accuracy": 0.687538705766201, |
| "num_tokens": 1478412.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.2222222222222223, |
| "grad_norm": 0.24854803085327148, |
| "learning_rate": 1.940288127010419e-05, |
| "loss": 1.2336, |
| "mean_token_accuracy": 0.6887955293059349, |
| "num_tokens": 1644237.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.4444444444444446, |
| "grad_norm": 0.27059808373451233, |
| "learning_rate": 1.92783625389892e-05, |
| "loss": 1.2052, |
| "mean_token_accuracy": 0.6917410314083099, |
| "num_tokens": 1807943.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.6666666666666665, |
| "grad_norm": 0.26881545782089233, |
| "learning_rate": 1.9142539552342638e-05, |
| "loss": 1.2103, |
| "mean_token_accuracy": 0.6928626969456673, |
| "num_tokens": 1971497.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.888888888888889, |
| "grad_norm": 0.2584948241710663, |
| "learning_rate": 1.8995577789551806e-05, |
| "loss": 1.1922, |
| "mean_token_accuracy": 0.6955347016453743, |
| "num_tokens": 2136802.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 3.111111111111111, |
| "grad_norm": 0.2711962163448334, |
| "learning_rate": 1.8837656300886937e-05, |
| "loss": 1.1743, |
| "mean_token_accuracy": 0.698191574215889, |
| "num_tokens": 2300632.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 0.2702801823616028, |
| "learning_rate": 1.866896748935603e-05, |
| "loss": 1.1858, |
| "mean_token_accuracy": 0.6950699493288994, |
| "num_tokens": 2466370.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 3.5555555555555554, |
| "grad_norm": 0.28295591473579407, |
| "learning_rate": 1.8489716876291417e-05, |
| "loss": 1.1753, |
| "mean_token_accuracy": 0.6978182002902031, |
| "num_tokens": 2627868.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 3.7777777777777777, |
| "grad_norm": 0.3407115042209625, |
| "learning_rate": 1.8300122850953678e-05, |
| "loss": 1.1755, |
| "mean_token_accuracy": 0.6989751189947129, |
| "num_tokens": 2793579.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.35823503136634827, |
| "learning_rate": 1.8100416404457962e-05, |
| "loss": 1.1583, |
| "mean_token_accuracy": 0.7020597368478775, |
| "num_tokens": 2956824.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 4.222222222222222, |
| "grad_norm": 0.2857590615749359, |
| "learning_rate": 1.789084084834691e-05, |
| "loss": 1.1574, |
| "mean_token_accuracy": 0.701052300632, |
| "num_tokens": 3123009.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 4.444444444444445, |
| "grad_norm": 0.3219825327396393, |
| "learning_rate": 1.7671651518153e-05, |
| "loss": 1.171, |
| "mean_token_accuracy": 0.6980017140507698, |
| "num_tokens": 3286307.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 4.666666666666667, |
| "grad_norm": 0.35857293009757996, |
| "learning_rate": 1.744311546231154e-05, |
| "loss": 1.1406, |
| "mean_token_accuracy": 0.7054756179451942, |
| "num_tokens": 3453142.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 4.888888888888889, |
| "grad_norm": 0.335440069437027, |
| "learning_rate": 1.7205511116803306e-05, |
| "loss": 1.1385, |
| "mean_token_accuracy": 0.705261904001236, |
| "num_tokens": 3616579.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 5.111111111111111, |
| "grad_norm": 0.3440147042274475, |
| "learning_rate": 1.6959127965923144e-05, |
| "loss": 1.1405, |
| "mean_token_accuracy": 0.7062455296516419, |
| "num_tokens": 3777095.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 5.333333333333333, |
| "grad_norm": 0.3388345241546631, |
| "learning_rate": 1.6704266189587992e-05, |
| "loss": 1.117, |
| "mean_token_accuracy": 0.7082562401890755, |
| "num_tokens": 3944069.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 5.555555555555555, |
| "grad_norm": 0.3118428587913513, |
| "learning_rate": 1.644123629761387e-05, |
| "loss": 1.1518, |
| "mean_token_accuracy": 0.7026130899786949, |
| "num_tokens": 4109276.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 5.777777777777778, |
| "grad_norm": 0.3456493616104126, |
| "learning_rate": 1.617035875140749e-05, |
| "loss": 1.1284, |
| "mean_token_accuracy": 0.7070924416184425, |
| "num_tokens": 4274290.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 0.47243574261665344, |
| "learning_rate": 1.5891963573533424e-05, |
| "loss": 1.1284, |
| "mean_token_accuracy": 0.7076667413115502, |
| "num_tokens": 4435236.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 6.222222222222222, |
| "grad_norm": 0.4088926315307617, |
| "learning_rate": 1.560638994563242e-05, |
| "loss": 1.1174, |
| "mean_token_accuracy": 0.7105918556451798, |
| "num_tokens": 4599819.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 6.444444444444445, |
| "grad_norm": 0.36049386858940125, |
| "learning_rate": 1.531398579518083e-05, |
| "loss": 1.1235, |
| "mean_token_accuracy": 0.707967433333397, |
| "num_tokens": 4765887.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 0.3518804907798767, |
| "learning_rate": 1.5015107371594576e-05, |
| "loss": 1.1311, |
| "mean_token_accuracy": 0.7057292729616165, |
| "num_tokens": 4930033.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 6.888888888888889, |
| "grad_norm": 0.3334203362464905, |
| "learning_rate": 1.47101188121941e-05, |
| "loss": 1.1146, |
| "mean_token_accuracy": 0.7091757193207741, |
| "num_tokens": 5093643.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 7.111111111111111, |
| "grad_norm": 0.39216911792755127, |
| "learning_rate": 1.4399391698559153e-05, |
| "loss": 1.104, |
| "mean_token_accuracy": 0.7117078930139542, |
| "num_tokens": 5258211.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 7.333333333333333, |
| "grad_norm": 0.3939253091812134, |
| "learning_rate": 1.408330460381385e-05, |
| "loss": 1.1039, |
| "mean_token_accuracy": 0.7122085765004158, |
| "num_tokens": 5419681.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 7.555555555555555, |
| "grad_norm": 0.3723820745944977, |
| "learning_rate": 1.3762242631393656e-05, |
| "loss": 1.1038, |
| "mean_token_accuracy": 0.7110082015395165, |
| "num_tokens": 5586943.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 7.777777777777778, |
| "grad_norm": 0.4541132152080536, |
| "learning_rate": 1.3436596945856164e-05, |
| "loss": 1.1159, |
| "mean_token_accuracy": 0.7090325355529785, |
| "num_tokens": 5748647.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 0.43293851613998413, |
| "learning_rate": 1.310676429630732e-05, |
| "loss": 1.108, |
| "mean_token_accuracy": 0.7101380705833436, |
| "num_tokens": 5913648.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 8.222222222222221, |
| "grad_norm": 0.37585848569869995, |
| "learning_rate": 1.2773146533023782e-05, |
| "loss": 1.098, |
| "mean_token_accuracy": 0.712922240793705, |
| "num_tokens": 6078733.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 8.444444444444445, |
| "grad_norm": 0.3709869086742401, |
| "learning_rate": 1.2436150117860226e-05, |
| "loss": 1.1003, |
| "mean_token_accuracy": 0.7116939216852188, |
| "num_tokens": 6243253.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 8.666666666666666, |
| "grad_norm": 0.47251901030540466, |
| "learning_rate": 1.2096185629038219e-05, |
| "loss": 1.0873, |
| "mean_token_accuracy": 0.7131141215562821, |
| "num_tokens": 6406632.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 8.88888888888889, |
| "grad_norm": 0.38695353269577026, |
| "learning_rate": 1.1753667260919872e-05, |
| "loss": 1.1016, |
| "mean_token_accuracy": 0.7131000861525536, |
| "num_tokens": 6574031.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 9.11111111111111, |
| "grad_norm": 0.3999341130256653, |
| "learning_rate": 1.1409012319375828e-05, |
| "loss": 1.1238, |
| "mean_token_accuracy": 0.7076400697231293, |
| "num_tokens": 6735168.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 9.333333333333334, |
| "grad_norm": 0.4220835268497467, |
| "learning_rate": 1.1062640713362333e-05, |
| "loss": 1.0777, |
| "mean_token_accuracy": 0.717046993970871, |
| "num_tokens": 6903690.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 9.555555555555555, |
| "grad_norm": 0.44131582975387573, |
| "learning_rate": 1.071497444332686e-05, |
| "loss": 1.1077, |
| "mean_token_accuracy": 0.7106189414858818, |
| "num_tokens": 7064704.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 9.777777777777779, |
| "grad_norm": 0.4432401657104492, |
| "learning_rate": 1.0366437087065564e-05, |
| "loss": 1.0708, |
| "mean_token_accuracy": 0.7174170568585396, |
| "num_tokens": 7229634.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.4888554811477661, |
| "learning_rate": 1.0017453283658984e-05, |
| "loss": 1.0756, |
| "mean_token_accuracy": 0.7171271413564682, |
| "num_tokens": 7392060.0, |
| "step": 900 |
| } |
| ], |
| "logging_steps": 20, |
| "max_steps": 1800, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.7138456604472115e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|