| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 461, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.021719948415122516, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019869848156182213, | |
| "loss": 6.0531, | |
| "mean_token_accuracy": 0.37881034857127815, | |
| "num_tokens": 78573.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04343989683024503, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019725234996384672, | |
| "loss": 5.9903, | |
| "mean_token_accuracy": 0.3887314551044255, | |
| "num_tokens": 157496.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.06515984524536754, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019580621836587129, | |
| "loss": 5.9929, | |
| "mean_token_accuracy": 0.3850487937917933, | |
| "num_tokens": 236512.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.08687979366049006, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019436008676789588, | |
| "loss": 5.986, | |
| "mean_token_accuracy": 0.3865805763518438, | |
| "num_tokens": 315608.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.10859974207561257, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019291395516992047, | |
| "loss": 6.0318, | |
| "mean_token_accuracy": 0.38490854618139564, | |
| "num_tokens": 394791.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.13031969049073508, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019146782357194506, | |
| "loss": 5.9875, | |
| "mean_token_accuracy": 0.38546840590424836, | |
| "num_tokens": 473456.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.1520396389058576, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019002169197396965, | |
| "loss": 5.9773, | |
| "mean_token_accuracy": 0.3837947838823311, | |
| "num_tokens": 552247.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.17375958732098012, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001885755603759942, | |
| "loss": 6.0197, | |
| "mean_token_accuracy": 0.38375892234034836, | |
| "num_tokens": 631096.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.19547953573610263, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001871294287780188, | |
| "loss": 6.0014, | |
| "mean_token_accuracy": 0.38519658900331705, | |
| "num_tokens": 710275.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.21719948415122514, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001856832971800434, | |
| "loss": 6.0217, | |
| "mean_token_accuracy": 0.3826989881345071, | |
| "num_tokens": 789174.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.23891943256634765, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018423716558206799, | |
| "loss": 6.0294, | |
| "mean_token_accuracy": 0.3842650496866554, | |
| "num_tokens": 867922.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.26063938098147016, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018279103398409255, | |
| "loss": 5.9548, | |
| "mean_token_accuracy": 0.3875111517496407, | |
| "num_tokens": 947073.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2823593293965927, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018134490238611714, | |
| "loss": 6.0071, | |
| "mean_token_accuracy": 0.38376112943515184, | |
| "num_tokens": 1026211.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3040792778117152, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017989877078814173, | |
| "loss": 6.0105, | |
| "mean_token_accuracy": 0.38463024909142407, | |
| "num_tokens": 1105147.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3257992262268377, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001784526391901663, | |
| "loss": 6.0449, | |
| "mean_token_accuracy": 0.38241248747799544, | |
| "num_tokens": 1183837.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.34751917464196025, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017700650759219091, | |
| "loss": 5.9946, | |
| "mean_token_accuracy": 0.3821917780907825, | |
| "num_tokens": 1262676.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.36923912305708273, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017556037599421548, | |
| "loss": 6.0049, | |
| "mean_token_accuracy": 0.3842250820598565, | |
| "num_tokens": 1341759.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.39095907147220527, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017411424439624007, | |
| "loss": 5.9736, | |
| "mean_token_accuracy": 0.3861917880363762, | |
| "num_tokens": 1420987.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.41267901988732775, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017266811279826466, | |
| "loss": 6.0431, | |
| "mean_token_accuracy": 0.38146835477091373, | |
| "num_tokens": 1499903.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.4343989683024503, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017122198120028922, | |
| "loss": 6.0069, | |
| "mean_token_accuracy": 0.3861038032686338, | |
| "num_tokens": 1578880.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4561189167175728, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016977584960231381, | |
| "loss": 6.0091, | |
| "mean_token_accuracy": 0.3845075036631897, | |
| "num_tokens": 1657709.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.4778388651326953, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001683297180043384, | |
| "loss": 6.0029, | |
| "mean_token_accuracy": 0.38288453239947556, | |
| "num_tokens": 1736620.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.49955881354781784, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000166883586406363, | |
| "loss": 5.9933, | |
| "mean_token_accuracy": 0.38531269936356694, | |
| "num_tokens": 1815478.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5212787619629403, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016543745480838756, | |
| "loss": 6.0334, | |
| "mean_token_accuracy": 0.3833939728909172, | |
| "num_tokens": 1894256.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5429987103780628, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016399132321041215, | |
| "loss": 6.0484, | |
| "mean_token_accuracy": 0.3841994108865038, | |
| "num_tokens": 1972997.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5647186587931854, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016254519161243674, | |
| "loss": 6.0285, | |
| "mean_token_accuracy": 0.38443261398933826, | |
| "num_tokens": 2051890.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5864386072083079, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016109906001446133, | |
| "loss": 5.983, | |
| "mean_token_accuracy": 0.385862308065407, | |
| "num_tokens": 2130746.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6081585556234304, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015965292841648592, | |
| "loss": 6.0046, | |
| "mean_token_accuracy": 0.3859730801777914, | |
| "num_tokens": 2209603.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6298785040385529, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001582067968185105, | |
| "loss": 5.9887, | |
| "mean_token_accuracy": 0.38499277490191164, | |
| "num_tokens": 2288743.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6515984524536754, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015676066522053508, | |
| "loss": 5.9788, | |
| "mean_token_accuracy": 0.3864025830756873, | |
| "num_tokens": 2367648.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6733184008687979, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015531453362255964, | |
| "loss": 6.0162, | |
| "mean_token_accuracy": 0.38221894631860776, | |
| "num_tokens": 2446256.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6950383492839205, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015386840202458423, | |
| "loss": 6.0017, | |
| "mean_token_accuracy": 0.38538019855041056, | |
| "num_tokens": 2525065.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.716758297699043, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015242227042660883, | |
| "loss": 5.9569, | |
| "mean_token_accuracy": 0.3852926092222333, | |
| "num_tokens": 2604183.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7384782461141655, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015097613882863342, | |
| "loss": 6.0072, | |
| "mean_token_accuracy": 0.38616998645011336, | |
| "num_tokens": 2683084.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7601981945292879, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000149530007230658, | |
| "loss": 6.0428, | |
| "mean_token_accuracy": 0.3831095602363348, | |
| "num_tokens": 2761824.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7819181429444105, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014808387563268257, | |
| "loss": 5.9781, | |
| "mean_token_accuracy": 0.3861894382047467, | |
| "num_tokens": 2840709.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.803638091359533, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014663774403470716, | |
| "loss": 5.9898, | |
| "mean_token_accuracy": 0.3846066597965546, | |
| "num_tokens": 2919385.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.8253580397746555, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014519161243673173, | |
| "loss": 6.0098, | |
| "mean_token_accuracy": 0.38357423364650456, | |
| "num_tokens": 2998308.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.8470779881897781, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014374548083875634, | |
| "loss": 6.0349, | |
| "mean_token_accuracy": 0.3809299209038727, | |
| "num_tokens": 3076962.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.8687979366049006, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001422993492407809, | |
| "loss": 6.0294, | |
| "mean_token_accuracy": 0.38217092433478683, | |
| "num_tokens": 3155983.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.890517885020023, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001408532176428055, | |
| "loss": 5.9728, | |
| "mean_token_accuracy": 0.3870704318396747, | |
| "num_tokens": 3234960.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.9122378334351456, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001394070860448301, | |
| "loss": 6.0541, | |
| "mean_token_accuracy": 0.38209462116938087, | |
| "num_tokens": 3313797.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.9339577818502681, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013796095444685465, | |
| "loss": 5.9871, | |
| "mean_token_accuracy": 0.38401242352556436, | |
| "num_tokens": 3392880.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.9556777302653906, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013651482284887927, | |
| "loss": 5.9901, | |
| "mean_token_accuracy": 0.38375150066567587, | |
| "num_tokens": 3471597.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.9773976786805131, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013506869125090384, | |
| "loss": 5.9797, | |
| "mean_token_accuracy": 0.38728573790285736, | |
| "num_tokens": 3550697.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9991176270956357, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013362255965292843, | |
| "loss": 5.9971, | |
| "mean_token_accuracy": 0.38420643559657036, | |
| "num_tokens": 3629481.0, | |
| "step": 460 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1383, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.553509208459428e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |