| { |
| "best_global_step": 281, |
| "best_metric": 0.18149949610233307, |
| "best_model_checkpoint": "/app/output/bf-router-v0.5/checkpoint-281", |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 281, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.4157426357269287, |
| "epoch": 0.003561887800534283, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.0, |
| "loss": 4.336674213409424, |
| "mean_token_accuracy": 0.4680640399456024, |
| "num_tokens": 3092.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 1.5884604156017303, |
| "epoch": 0.03561887800534283, |
| "grad_norm": 0.59375, |
| "learning_rate": 6.923076923076924e-05, |
| "loss": 3.986017862955729, |
| "mean_token_accuracy": 0.4735516524977154, |
| "num_tokens": 30650.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 2.2315500736236573, |
| "epoch": 0.07123775601068566, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.00014615384615384615, |
| "loss": 2.16544189453125, |
| "mean_token_accuracy": 0.6272616922855377, |
| "num_tokens": 61735.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.8066075079143047, |
| "epoch": 0.10685663401602849, |
| "grad_norm": 0.298828125, |
| "learning_rate": 0.00019999334629824895, |
| "loss": 0.7935717582702637, |
| "mean_token_accuracy": 0.857068446278572, |
| "num_tokens": 92484.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.5340980306267739, |
| "epoch": 0.14247551202137132, |
| "grad_norm": 0.10693359375, |
| "learning_rate": 0.00019987508289735716, |
| "loss": 0.5261529922485352, |
| "mean_token_accuracy": 0.9004581302404404, |
| "num_tokens": 123010.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.4001643668860197, |
| "epoch": 0.17809439002671415, |
| "grad_norm": 0.08251953125, |
| "learning_rate": 0.0001996091607179287, |
| "loss": 0.3956298351287842, |
| "mean_token_accuracy": 0.9209299519658088, |
| "num_tokens": 153710.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.34828933551907537, |
| "epoch": 0.21371326803205698, |
| "grad_norm": 0.10009765625, |
| "learning_rate": 0.00019919597290851538, |
| "loss": 0.3648662090301514, |
| "mean_token_accuracy": 0.9263768374919892, |
| "num_tokens": 184577.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.2983129292726517, |
| "epoch": 0.2493321460373998, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 0.00019863613034027224, |
| "loss": 0.29642860889434813, |
| "mean_token_accuracy": 0.9371836110949516, |
| "num_tokens": 215040.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.280751570686698, |
| "epoch": 0.28495102404274264, |
| "grad_norm": 0.099609375, |
| "learning_rate": 0.00019793046070382437, |
| "loss": 0.2813178300857544, |
| "mean_token_accuracy": 0.9388988897204399, |
| "num_tokens": 245683.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.2960826952010393, |
| "epoch": 0.3205699020480855, |
| "grad_norm": 0.11962890625, |
| "learning_rate": 0.00019708000728558064, |
| "loss": 0.31115410327911375, |
| "mean_token_accuracy": 0.9335825845599175, |
| "num_tokens": 276430.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.28416929207742214, |
| "epoch": 0.3561887800534283, |
| "grad_norm": 0.103515625, |
| "learning_rate": 0.00019608602742530283, |
| "loss": 0.27938365936279297, |
| "mean_token_accuracy": 0.9393619552254677, |
| "num_tokens": 307296.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.23417662251740695, |
| "epoch": 0.39180765805877116, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 0.00019494999065721108, |
| "loss": 0.23734774589538574, |
| "mean_token_accuracy": 0.9487677246332169, |
| "num_tokens": 337962.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.24019648060202597, |
| "epoch": 0.42742653606411396, |
| "grad_norm": 0.06640625, |
| "learning_rate": 0.0001936735765373737, |
| "loss": 0.245324969291687, |
| "mean_token_accuracy": 0.9457737937569618, |
| "num_tokens": 368782.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.269666058011353, |
| "epoch": 0.4630454140694568, |
| "grad_norm": 0.09765625, |
| "learning_rate": 0.00019225867216059325, |
| "loss": 0.2686375617980957, |
| "mean_token_accuracy": 0.9398508563637733, |
| "num_tokens": 399329.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.18328354582190515, |
| "epoch": 0.4986642920747996, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 0.00019070736937046035, |
| "loss": 0.18070420026779174, |
| "mean_token_accuracy": 0.9568557634949684, |
| "num_tokens": 429759.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.21165105439722537, |
| "epoch": 0.5342831700801425, |
| "grad_norm": 0.0810546875, |
| "learning_rate": 0.0001890219616666997, |
| "loss": 0.22004930973052977, |
| "mean_token_accuracy": 0.9494227185845375, |
| "num_tokens": 460409.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.23916025627404452, |
| "epoch": 0.5699020480854853, |
| "grad_norm": 0.07568359375, |
| "learning_rate": 0.00018720494081438078, |
| "loss": 0.2379377841949463, |
| "mean_token_accuracy": 0.9455192387104034, |
| "num_tokens": 491344.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.2238690486177802, |
| "epoch": 0.6055209260908282, |
| "grad_norm": 0.08984375, |
| "learning_rate": 0.00018525899316000608, |
| "loss": 0.2245168685913086, |
| "mean_token_accuracy": 0.9485938593745231, |
| "num_tokens": 522029.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.17032606173306702, |
| "epoch": 0.641139804096171, |
| "grad_norm": 0.080078125, |
| "learning_rate": 0.00018318699565992357, |
| "loss": 0.17549347877502441, |
| "mean_token_accuracy": 0.9574712902307511, |
| "num_tokens": 552306.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.21890966054052113, |
| "epoch": 0.6767586821015138, |
| "grad_norm": 0.0673828125, |
| "learning_rate": 0.00018099201162693476, |
| "loss": 0.21968293190002441, |
| "mean_token_accuracy": 0.9482985377311707, |
| "num_tokens": 583181.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.20646683853119613, |
| "epoch": 0.7123775601068566, |
| "grad_norm": 0.09765625, |
| "learning_rate": 0.00017867728620138708, |
| "loss": 0.20968315601348878, |
| "mean_token_accuracy": 0.9519409075379371, |
| "num_tokens": 613818.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.23000301327556372, |
| "epoch": 0.7479964381121995, |
| "grad_norm": 0.08935546875, |
| "learning_rate": 0.00017624624155344626, |
| "loss": 0.2383474349975586, |
| "mean_token_accuracy": 0.9458372846245766, |
| "num_tokens": 644576.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.21420854832977057, |
| "epoch": 0.7836153161175423, |
| "grad_norm": 0.0966796875, |
| "learning_rate": 0.0001737024718236413, |
| "loss": 0.21017465591430665, |
| "mean_token_accuracy": 0.9511569887399673, |
| "num_tokens": 675564.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.19280126914381981, |
| "epoch": 0.8192341941228851, |
| "grad_norm": 0.08154296875, |
| "learning_rate": 0.00017104973780916294, |
| "loss": 0.1936139941215515, |
| "mean_token_accuracy": 0.954449312388897, |
| "num_tokens": 706309.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.22524890769273043, |
| "epoch": 0.8548530721282279, |
| "grad_norm": 0.10546875, |
| "learning_rate": 0.00016829196140377085, |
| "loss": 0.22950620651245118, |
| "mean_token_accuracy": 0.9476036429405212, |
| "num_tokens": 737135.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.18842433094978334, |
| "epoch": 0.8904719501335708, |
| "grad_norm": 0.087890625, |
| "learning_rate": 0.00016543321979953007, |
| "loss": 0.19049547910690307, |
| "mean_token_accuracy": 0.9543813273310662, |
| "num_tokens": 767853.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.1991994746029377, |
| "epoch": 0.9260908281389136, |
| "grad_norm": 0.07763671875, |
| "learning_rate": 0.00016247773945894962, |
| "loss": 0.19527161121368408, |
| "mean_token_accuracy": 0.9535173490643502, |
| "num_tokens": 798511.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.18196075949817897, |
| "epoch": 0.9617097061442564, |
| "grad_norm": 0.0771484375, |
| "learning_rate": 0.00015942988986643352, |
| "loss": 0.19091761112213135, |
| "mean_token_accuracy": 0.9535432115197182, |
| "num_tokens": 829447.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.18307169582694768, |
| "epoch": 0.9973285841495992, |
| "grad_norm": 0.08837890625, |
| "learning_rate": 0.00015629417706828423, |
| "loss": 0.1818631410598755, |
| "mean_token_accuracy": 0.955574706196785, |
| "num_tokens": 860142.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_entropy": 0.18207345770817276, |
| "eval_loss": 0.18149949610233307, |
| "eval_mean_token_accuracy": 0.9555717660180221, |
| "eval_num_tokens": 862512.0, |
| "eval_runtime": 35.3256, |
| "eval_samples_per_second": 15.881, |
| "eval_steps_per_second": 3.991, |
| "step": 281 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 843, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.028075887327232e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|