| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 3000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 13.5625, | |
| "learning_rate": 1.3333333333333336e-07, | |
| "loss": 2.1142, | |
| "mean_token_accuracy": 0.5323337733745575, | |
| "num_tokens": 1733.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 84.0, | |
| "learning_rate": 3.0000000000000004e-07, | |
| "loss": 1.7051, | |
| "mean_token_accuracy": 0.6366191267967224, | |
| "num_tokens": 3780.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 110.5, | |
| "learning_rate": 4.666666666666667e-07, | |
| "loss": 1.9183, | |
| "mean_token_accuracy": 0.6092176318168641, | |
| "num_tokens": 6241.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 6.333333333333334e-07, | |
| "loss": 1.6002, | |
| "mean_token_accuracy": 0.6509096503257752, | |
| "num_tokens": 8399.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 71.0, | |
| "learning_rate": 8.000000000000001e-07, | |
| "loss": 1.5134, | |
| "mean_token_accuracy": 0.6915769815444947, | |
| "num_tokens": 11215.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 60.25, | |
| "learning_rate": 9.666666666666668e-07, | |
| "loss": 1.3989, | |
| "mean_token_accuracy": 0.7019677758216858, | |
| "num_tokens": 13673.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 16.0, | |
| "learning_rate": 1.1333333333333334e-06, | |
| "loss": 2.3128, | |
| "mean_token_accuracy": 0.5302097499370575, | |
| "num_tokens": 14834.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 12.0625, | |
| "learning_rate": 1.3e-06, | |
| "loss": 1.0989, | |
| "mean_token_accuracy": 0.7303242325782776, | |
| "num_tokens": 18087.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 5.75, | |
| "learning_rate": 1.4666666666666669e-06, | |
| "loss": 1.4854, | |
| "mean_token_accuracy": 0.6774025321006775, | |
| "num_tokens": 21156.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 11.625, | |
| "learning_rate": 1.6333333333333335e-06, | |
| "loss": 1.2891, | |
| "mean_token_accuracy": 0.6656664133071899, | |
| "num_tokens": 24379.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 85.0, | |
| "learning_rate": 1.8000000000000001e-06, | |
| "loss": 2.3012, | |
| "mean_token_accuracy": 0.5090380042791367, | |
| "num_tokens": 26962.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 8.75, | |
| "learning_rate": 1.9666666666666668e-06, | |
| "loss": 1.6577, | |
| "mean_token_accuracy": 0.6504465699195862, | |
| "num_tokens": 29127.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.065, | |
| "grad_norm": 8.875, | |
| "learning_rate": 2.133333333333334e-06, | |
| "loss": 1.0609, | |
| "mean_token_accuracy": 0.7339364051818847, | |
| "num_tokens": 32645.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 6.34375, | |
| "learning_rate": 2.3000000000000004e-06, | |
| "loss": 1.271, | |
| "mean_token_accuracy": 0.7069566965103149, | |
| "num_tokens": 35749.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "grad_norm": 7.125, | |
| "learning_rate": 2.466666666666667e-06, | |
| "loss": 1.1678, | |
| "mean_token_accuracy": 0.7368309020996093, | |
| "num_tokens": 38324.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 60.0, | |
| "learning_rate": 2.6333333333333332e-06, | |
| "loss": 1.3974, | |
| "mean_token_accuracy": 0.664735347032547, | |
| "num_tokens": 41203.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.085, | |
| "grad_norm": 50.25, | |
| "learning_rate": 2.8000000000000003e-06, | |
| "loss": 2.3673, | |
| "mean_token_accuracy": 0.5113712131977082, | |
| "num_tokens": 42375.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 53.5, | |
| "learning_rate": 2.9666666666666673e-06, | |
| "loss": 1.8348, | |
| "mean_token_accuracy": 0.6246361255645752, | |
| "num_tokens": 44196.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.095, | |
| "grad_norm": 26.125, | |
| "learning_rate": 3.133333333333334e-06, | |
| "loss": 2.1825, | |
| "mean_token_accuracy": 0.553260189294815, | |
| "num_tokens": 45435.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 20.875, | |
| "learning_rate": 3.3000000000000006e-06, | |
| "loss": 1.6649, | |
| "mean_token_accuracy": 0.653439199924469, | |
| "num_tokens": 46787.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.105, | |
| "grad_norm": 6.15625, | |
| "learning_rate": 3.4666666666666672e-06, | |
| "loss": 2.0613, | |
| "mean_token_accuracy": 0.589026153087616, | |
| "num_tokens": 48337.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 6.21875, | |
| "learning_rate": 3.633333333333334e-06, | |
| "loss": 1.3082, | |
| "mean_token_accuracy": 0.6807661652565002, | |
| "num_tokens": 51045.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.115, | |
| "grad_norm": 7.09375, | |
| "learning_rate": 3.8000000000000005e-06, | |
| "loss": 0.7279, | |
| "mean_token_accuracy": 0.7977130174636841, | |
| "num_tokens": 55698.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 46.75, | |
| "learning_rate": 3.966666666666667e-06, | |
| "loss": 1.6527, | |
| "mean_token_accuracy": 0.5857374429702759, | |
| "num_tokens": 57880.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 8.5, | |
| "learning_rate": 4.133333333333333e-06, | |
| "loss": 1.1536, | |
| "mean_token_accuracy": 0.6969289302825927, | |
| "num_tokens": 60652.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 23.25, | |
| "learning_rate": 4.3e-06, | |
| "loss": 1.4197, | |
| "mean_token_accuracy": 0.6533516108989715, | |
| "num_tokens": 62909.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.135, | |
| "grad_norm": 33.0, | |
| "learning_rate": 4.4666666666666665e-06, | |
| "loss": 1.6691, | |
| "mean_token_accuracy": 0.610290253162384, | |
| "num_tokens": 64644.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.6875, | |
| "learning_rate": 4.633333333333334e-06, | |
| "loss": 1.0595, | |
| "mean_token_accuracy": 0.7040035367012024, | |
| "num_tokens": 67286.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.145, | |
| "grad_norm": 6.09375, | |
| "learning_rate": 4.800000000000001e-06, | |
| "loss": 1.6933, | |
| "mean_token_accuracy": 0.6413110613822937, | |
| "num_tokens": 68571.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 15.6875, | |
| "learning_rate": 4.966666666666667e-06, | |
| "loss": 1.5301, | |
| "mean_token_accuracy": 0.6430149674415588, | |
| "num_tokens": 69810.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.155, | |
| "grad_norm": 32.25, | |
| "learning_rate": 5.133333333333334e-06, | |
| "loss": 1.6792, | |
| "mean_token_accuracy": 0.6632359743118286, | |
| "num_tokens": 70724.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 7.90625, | |
| "learning_rate": 5.300000000000001e-06, | |
| "loss": 1.2571, | |
| "mean_token_accuracy": 0.7145259499549865, | |
| "num_tokens": 73380.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.165, | |
| "grad_norm": 26.375, | |
| "learning_rate": 5.466666666666667e-06, | |
| "loss": 1.3008, | |
| "mean_token_accuracy": 0.6873301148414612, | |
| "num_tokens": 75374.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 27.5, | |
| "learning_rate": 5.633333333333334e-06, | |
| "loss": 0.9376, | |
| "mean_token_accuracy": 0.7572417616844177, | |
| "num_tokens": 78638.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.175, | |
| "grad_norm": 25.75, | |
| "learning_rate": 5.8e-06, | |
| "loss": 1.2469, | |
| "mean_token_accuracy": 0.7168016552925109, | |
| "num_tokens": 81253.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 5.966666666666667e-06, | |
| "loss": 1.6604, | |
| "mean_token_accuracy": 0.6027044415473938, | |
| "num_tokens": 82921.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.185, | |
| "grad_norm": 34.25, | |
| "learning_rate": 6.133333333333334e-06, | |
| "loss": 1.3829, | |
| "mean_token_accuracy": 0.6629180788993836, | |
| "num_tokens": 84865.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 11.25, | |
| "learning_rate": 6.300000000000001e-06, | |
| "loss": 1.1711, | |
| "mean_token_accuracy": 0.7101378679275513, | |
| "num_tokens": 87233.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.195, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 6.466666666666667e-06, | |
| "loss": 1.6846, | |
| "mean_token_accuracy": 0.6098420560359955, | |
| "num_tokens": 89523.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 32.5, | |
| "learning_rate": 6.633333333333334e-06, | |
| "loss": 1.3466, | |
| "mean_token_accuracy": 0.6577545762062073, | |
| "num_tokens": 91844.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.205, | |
| "grad_norm": 14.0, | |
| "learning_rate": 6.800000000000001e-06, | |
| "loss": 1.2093, | |
| "mean_token_accuracy": 0.7090275406837463, | |
| "num_tokens": 94168.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 7.15625, | |
| "learning_rate": 6.966666666666667e-06, | |
| "loss": 1.3387, | |
| "mean_token_accuracy": 0.664970874786377, | |
| "num_tokens": 96323.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.215, | |
| "grad_norm": 32.75, | |
| "learning_rate": 7.133333333333334e-06, | |
| "loss": 1.1935, | |
| "mean_token_accuracy": 0.6921853065490723, | |
| "num_tokens": 98444.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 7.3e-06, | |
| "loss": 1.1413, | |
| "mean_token_accuracy": 0.7070739269256592, | |
| "num_tokens": 101442.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.225, | |
| "grad_norm": 23.75, | |
| "learning_rate": 7.4666666666666675e-06, | |
| "loss": 1.0761, | |
| "mean_token_accuracy": 0.7386624693870545, | |
| "num_tokens": 103604.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 7.633333333333334e-06, | |
| "loss": 1.1318, | |
| "mean_token_accuracy": 0.7022495746612549, | |
| "num_tokens": 105540.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.235, | |
| "grad_norm": 34.5, | |
| "learning_rate": 7.800000000000002e-06, | |
| "loss": 1.4643, | |
| "mean_token_accuracy": 0.6770303070545196, | |
| "num_tokens": 107522.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 8.25, | |
| "learning_rate": 7.966666666666668e-06, | |
| "loss": 1.1224, | |
| "mean_token_accuracy": 0.7211631774902344, | |
| "num_tokens": 110450.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.245, | |
| "grad_norm": 5.53125, | |
| "learning_rate": 8.133333333333334e-06, | |
| "loss": 1.1466, | |
| "mean_token_accuracy": 0.7181923449039459, | |
| "num_tokens": 113134.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 8.3e-06, | |
| "loss": 1.1408, | |
| "mean_token_accuracy": 0.7148800849914551, | |
| "num_tokens": 115554.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.255, | |
| "grad_norm": 30.25, | |
| "learning_rate": 8.466666666666668e-06, | |
| "loss": 1.1868, | |
| "mean_token_accuracy": 0.6954805672168731, | |
| "num_tokens": 118247.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 21.75, | |
| "learning_rate": 8.633333333333334e-06, | |
| "loss": 1.2475, | |
| "mean_token_accuracy": 0.6629476428031922, | |
| "num_tokens": 120275.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.265, | |
| "grad_norm": 5.75, | |
| "learning_rate": 8.8e-06, | |
| "loss": 1.1846, | |
| "mean_token_accuracy": 0.687124228477478, | |
| "num_tokens": 122539.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 29.625, | |
| "learning_rate": 8.966666666666667e-06, | |
| "loss": 1.4612, | |
| "mean_token_accuracy": 0.6541434586048126, | |
| "num_tokens": 123637.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.275, | |
| "grad_norm": 5.3125, | |
| "learning_rate": 9.133333333333335e-06, | |
| "loss": 0.8047, | |
| "mean_token_accuracy": 0.7711950898170471, | |
| "num_tokens": 127360.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 8.0, | |
| "learning_rate": 9.3e-06, | |
| "loss": 1.1051, | |
| "mean_token_accuracy": 0.703935158252716, | |
| "num_tokens": 130138.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.285, | |
| "grad_norm": 7.65625, | |
| "learning_rate": 9.466666666666667e-06, | |
| "loss": 1.2678, | |
| "mean_token_accuracy": 0.7222112536430358, | |
| "num_tokens": 131621.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 9.0, | |
| "learning_rate": 9.633333333333335e-06, | |
| "loss": 1.1627, | |
| "mean_token_accuracy": 0.6856825113296509, | |
| "num_tokens": 134152.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.295, | |
| "grad_norm": 26.0, | |
| "learning_rate": 9.800000000000001e-06, | |
| "loss": 1.0517, | |
| "mean_token_accuracy": 0.728261661529541, | |
| "num_tokens": 136289.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 40.5, | |
| "learning_rate": 9.966666666666667e-06, | |
| "loss": 1.1132, | |
| "mean_token_accuracy": 0.7170758843421936, | |
| "num_tokens": 138790.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.305, | |
| "grad_norm": 8.0, | |
| "learning_rate": 1.0133333333333335e-05, | |
| "loss": 1.4069, | |
| "mean_token_accuracy": 0.6523711323738098, | |
| "num_tokens": 141292.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 1.0300000000000001e-05, | |
| "loss": 0.72, | |
| "mean_token_accuracy": 0.7980146527290344, | |
| "num_tokens": 145289.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.315, | |
| "grad_norm": 29.875, | |
| "learning_rate": 1.0466666666666668e-05, | |
| "loss": 1.1226, | |
| "mean_token_accuracy": 0.7180093169212342, | |
| "num_tokens": 147253.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 1.0633333333333334e-05, | |
| "loss": 0.7705, | |
| "mean_token_accuracy": 0.7848826766014099, | |
| "num_tokens": 150324.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.325, | |
| "grad_norm": 12.6875, | |
| "learning_rate": 1.0800000000000002e-05, | |
| "loss": 0.8428, | |
| "mean_token_accuracy": 0.7722244143486023, | |
| "num_tokens": 154155.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 4.5, | |
| "learning_rate": 1.0966666666666668e-05, | |
| "loss": 1.1001, | |
| "mean_token_accuracy": 0.7051229119300843, | |
| "num_tokens": 157400.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.335, | |
| "grad_norm": 35.25, | |
| "learning_rate": 1.1133333333333334e-05, | |
| "loss": 1.0369, | |
| "mean_token_accuracy": 0.7228875994682312, | |
| "num_tokens": 160018.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 1.13e-05, | |
| "loss": 1.0404, | |
| "mean_token_accuracy": 0.7233722567558288, | |
| "num_tokens": 163034.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.345, | |
| "grad_norm": 41.75, | |
| "learning_rate": 1.1466666666666668e-05, | |
| "loss": 1.536, | |
| "mean_token_accuracy": 0.6506137490272522, | |
| "num_tokens": 163880.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 30.75, | |
| "learning_rate": 1.1633333333333334e-05, | |
| "loss": 1.5428, | |
| "mean_token_accuracy": 0.607353800535202, | |
| "num_tokens": 164994.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.355, | |
| "grad_norm": 4.78125, | |
| "learning_rate": 1.18e-05, | |
| "loss": 0.7706, | |
| "mean_token_accuracy": 0.786465299129486, | |
| "num_tokens": 168825.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 31.75, | |
| "learning_rate": 1.1966666666666668e-05, | |
| "loss": 1.0517, | |
| "mean_token_accuracy": 0.7385401725769043, | |
| "num_tokens": 172234.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.365, | |
| "grad_norm": 6.6875, | |
| "learning_rate": 1.2133333333333335e-05, | |
| "loss": 1.4293, | |
| "mean_token_accuracy": 0.6525760054588318, | |
| "num_tokens": 174168.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 25.625, | |
| "learning_rate": 1.23e-05, | |
| "loss": 1.5997, | |
| "mean_token_accuracy": 0.6229295372962952, | |
| "num_tokens": 175506.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 1.2466666666666667e-05, | |
| "loss": 1.0306, | |
| "mean_token_accuracy": 0.7189781785011291, | |
| "num_tokens": 177941.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 7.53125, | |
| "learning_rate": 1.2633333333333335e-05, | |
| "loss": 0.9724, | |
| "mean_token_accuracy": 0.7246036767959595, | |
| "num_tokens": 180506.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.385, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 1.2800000000000001e-05, | |
| "loss": 1.1149, | |
| "mean_token_accuracy": 0.7029499173164367, | |
| "num_tokens": 182002.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 26.375, | |
| "learning_rate": 1.2966666666666667e-05, | |
| "loss": 1.5045, | |
| "mean_token_accuracy": 0.6098679423332214, | |
| "num_tokens": 182656.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.395, | |
| "grad_norm": 15.75, | |
| "learning_rate": 1.3133333333333334e-05, | |
| "loss": 0.9356, | |
| "mean_token_accuracy": 0.7362943291664124, | |
| "num_tokens": 185089.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 1.3300000000000001e-05, | |
| "loss": 0.9694, | |
| "mean_token_accuracy": 0.7282199025154114, | |
| "num_tokens": 188035.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.405, | |
| "grad_norm": 7.28125, | |
| "learning_rate": 1.3466666666666668e-05, | |
| "loss": 1.2325, | |
| "mean_token_accuracy": 0.6913779973983765, | |
| "num_tokens": 190462.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 29.125, | |
| "learning_rate": 1.3633333333333334e-05, | |
| "loss": 1.2702, | |
| "mean_token_accuracy": 0.663710606098175, | |
| "num_tokens": 192716.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.415, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 1.38e-05, | |
| "loss": 1.1096, | |
| "mean_token_accuracy": 0.7282086968421936, | |
| "num_tokens": 195440.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 7.9375, | |
| "learning_rate": 1.3966666666666668e-05, | |
| "loss": 0.8709, | |
| "mean_token_accuracy": 0.757840347290039, | |
| "num_tokens": 199110.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.425, | |
| "grad_norm": 29.375, | |
| "learning_rate": 1.4133333333333334e-05, | |
| "loss": 1.1679, | |
| "mean_token_accuracy": 0.7137652635574341, | |
| "num_tokens": 201034.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 6.84375, | |
| "learning_rate": 1.43e-05, | |
| "loss": 0.8863, | |
| "mean_token_accuracy": 0.7540697813034057, | |
| "num_tokens": 204163.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.435, | |
| "grad_norm": 7.65625, | |
| "learning_rate": 1.4466666666666668e-05, | |
| "loss": 0.9015, | |
| "mean_token_accuracy": 0.7539669752120972, | |
| "num_tokens": 207778.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 24.0, | |
| "learning_rate": 1.4633333333333334e-05, | |
| "loss": 1.0044, | |
| "mean_token_accuracy": 0.7438582420349121, | |
| "num_tokens": 211196.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.445, | |
| "grad_norm": 16.125, | |
| "learning_rate": 1.48e-05, | |
| "loss": 1.003, | |
| "mean_token_accuracy": 0.722856342792511, | |
| "num_tokens": 213420.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 10.0, | |
| "learning_rate": 1.4966666666666667e-05, | |
| "loss": 1.3499, | |
| "mean_token_accuracy": 0.64574693441391, | |
| "num_tokens": 215785.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.455, | |
| "grad_norm": 19.25, | |
| "learning_rate": 1.5133333333333335e-05, | |
| "loss": 0.8628, | |
| "mean_token_accuracy": 0.7608179092407227, | |
| "num_tokens": 218181.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 7.4375, | |
| "learning_rate": 1.5300000000000003e-05, | |
| "loss": 0.9714, | |
| "mean_token_accuracy": 0.737330162525177, | |
| "num_tokens": 221215.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.465, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 1.546666666666667e-05, | |
| "loss": 1.2211, | |
| "mean_token_accuracy": 0.6857584714889526, | |
| "num_tokens": 223524.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 8.75, | |
| "learning_rate": 1.5633333333333335e-05, | |
| "loss": 0.9083, | |
| "mean_token_accuracy": 0.7348023653030396, | |
| "num_tokens": 226999.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.475, | |
| "grad_norm": 21.375, | |
| "learning_rate": 1.58e-05, | |
| "loss": 1.2031, | |
| "mean_token_accuracy": 0.6628332495689392, | |
| "num_tokens": 229176.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 38.5, | |
| "learning_rate": 1.5966666666666667e-05, | |
| "loss": 1.0468, | |
| "mean_token_accuracy": 0.7459447026252747, | |
| "num_tokens": 232128.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.485, | |
| "grad_norm": 26.875, | |
| "learning_rate": 1.6133333333333334e-05, | |
| "loss": 1.376, | |
| "mean_token_accuracy": 0.6213439345359802, | |
| "num_tokens": 233115.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 6.75, | |
| "learning_rate": 1.63e-05, | |
| "loss": 1.3646, | |
| "mean_token_accuracy": 0.6519601762294769, | |
| "num_tokens": 234739.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.495, | |
| "grad_norm": 4.0, | |
| "learning_rate": 1.646666666666667e-05, | |
| "loss": 0.7117, | |
| "mean_token_accuracy": 0.7921866178512573, | |
| "num_tokens": 238072.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 7.90625, | |
| "learning_rate": 1.6633333333333336e-05, | |
| "loss": 1.241, | |
| "mean_token_accuracy": 0.6775242328643799, | |
| "num_tokens": 239427.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.505, | |
| "grad_norm": 5.09375, | |
| "learning_rate": 1.6800000000000002e-05, | |
| "loss": 0.9237, | |
| "mean_token_accuracy": 0.7540452361106873, | |
| "num_tokens": 241443.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 8.875, | |
| "learning_rate": 1.6966666666666668e-05, | |
| "loss": 0.9896, | |
| "mean_token_accuracy": 0.7450487017631531, | |
| "num_tokens": 244266.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.515, | |
| "grad_norm": 9.25, | |
| "learning_rate": 1.7133333333333334e-05, | |
| "loss": 1.0898, | |
| "mean_token_accuracy": 0.6930008172988892, | |
| "num_tokens": 247220.0, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 6.1875, | |
| "learning_rate": 1.73e-05, | |
| "loss": 0.7898, | |
| "mean_token_accuracy": 0.7681538939476014, | |
| "num_tokens": 251322.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.525, | |
| "grad_norm": 25.625, | |
| "learning_rate": 1.7466666666666667e-05, | |
| "loss": 1.1583, | |
| "mean_token_accuracy": 0.7108910560607911, | |
| "num_tokens": 253357.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 30.0, | |
| "learning_rate": 1.7633333333333336e-05, | |
| "loss": 0.9222, | |
| "mean_token_accuracy": 0.7563987374305725, | |
| "num_tokens": 256122.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.535, | |
| "grad_norm": 30.75, | |
| "learning_rate": 1.7800000000000002e-05, | |
| "loss": 0.9896, | |
| "mean_token_accuracy": 0.722571051120758, | |
| "num_tokens": 258447.0, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 1.796666666666667e-05, | |
| "loss": 0.7903, | |
| "mean_token_accuracy": 0.7722970724105835, | |
| "num_tokens": 261917.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.545, | |
| "grad_norm": 7.25, | |
| "learning_rate": 1.8133333333333335e-05, | |
| "loss": 0.6733, | |
| "mean_token_accuracy": 0.798851752281189, | |
| "num_tokens": 265658.0, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 23.125, | |
| "learning_rate": 1.83e-05, | |
| "loss": 1.2436, | |
| "mean_token_accuracy": 0.67679682970047, | |
| "num_tokens": 267665.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.555, | |
| "grad_norm": 26.75, | |
| "learning_rate": 1.8466666666666667e-05, | |
| "loss": 1.1331, | |
| "mean_token_accuracy": 0.686410254240036, | |
| "num_tokens": 269966.0, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 27.375, | |
| "learning_rate": 1.8633333333333333e-05, | |
| "loss": 0.9328, | |
| "mean_token_accuracy": 0.7283676505088806, | |
| "num_tokens": 273033.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.565, | |
| "grad_norm": 4.375, | |
| "learning_rate": 1.88e-05, | |
| "loss": 1.0776, | |
| "mean_token_accuracy": 0.7249770760536194, | |
| "num_tokens": 275754.0, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 25.25, | |
| "learning_rate": 1.896666666666667e-05, | |
| "loss": 0.9722, | |
| "mean_token_accuracy": 0.7202863574028016, | |
| "num_tokens": 279072.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.575, | |
| "grad_norm": 6.5625, | |
| "learning_rate": 1.9133333333333335e-05, | |
| "loss": 1.0787, | |
| "mean_token_accuracy": 0.7136101365089417, | |
| "num_tokens": 281884.0, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 1.93e-05, | |
| "loss": 0.8845, | |
| "mean_token_accuracy": 0.7490099549293519, | |
| "num_tokens": 285077.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.585, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 1.9466666666666668e-05, | |
| "loss": 1.1224, | |
| "mean_token_accuracy": 0.7014864623546601, | |
| "num_tokens": 287813.0, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 37.0, | |
| "learning_rate": 1.9633333333333334e-05, | |
| "loss": 1.1778, | |
| "mean_token_accuracy": 0.6806494235992432, | |
| "num_tokens": 288638.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.595, | |
| "grad_norm": 10.125, | |
| "learning_rate": 1.98e-05, | |
| "loss": 0.9242, | |
| "mean_token_accuracy": 0.760206151008606, | |
| "num_tokens": 291339.0, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 14.0625, | |
| "learning_rate": 1.9966666666666666e-05, | |
| "loss": 0.9388, | |
| "mean_token_accuracy": 0.7776451945304871, | |
| "num_tokens": 292680.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.605, | |
| "grad_norm": 30.625, | |
| "learning_rate": 1.999986292247427e-05, | |
| "loss": 0.8811, | |
| "mean_token_accuracy": 0.7725589990615844, | |
| "num_tokens": 295802.0, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 7.25, | |
| "learning_rate": 1.9999306051466772e-05, | |
| "loss": 0.9805, | |
| "mean_token_accuracy": 0.7329005718231201, | |
| "num_tokens": 298109.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.615, | |
| "grad_norm": 7.5625, | |
| "learning_rate": 1.999832084346831e-05, | |
| "loss": 0.8401, | |
| "mean_token_accuracy": 0.7744085669517518, | |
| "num_tokens": 299060.0, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 4.28125, | |
| "learning_rate": 1.9996907340681907e-05, | |
| "loss": 0.8956, | |
| "mean_token_accuracy": 0.7751296997070313, | |
| "num_tokens": 301210.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 21.125, | |
| "learning_rate": 1.9995065603657317e-05, | |
| "loss": 1.0128, | |
| "mean_token_accuracy": 0.7251328110694886, | |
| "num_tokens": 303234.0, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 1.9992795711288432e-05, | |
| "loss": 1.1435, | |
| "mean_token_accuracy": 0.6821866631507874, | |
| "num_tokens": 305773.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.635, | |
| "grad_norm": 5.03125, | |
| "learning_rate": 1.9990097760809878e-05, | |
| "loss": 0.9333, | |
| "mean_token_accuracy": 0.7206153392791748, | |
| "num_tokens": 307991.0, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 7.375, | |
| "learning_rate": 1.998697186779288e-05, | |
| "loss": 1.3283, | |
| "mean_token_accuracy": 0.6572122693061828, | |
| "num_tokens": 309669.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.645, | |
| "grad_norm": 4.96875, | |
| "learning_rate": 1.9983418166140286e-05, | |
| "loss": 0.7746, | |
| "mean_token_accuracy": 0.7766924381256104, | |
| "num_tokens": 312663.0, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 28.125, | |
| "learning_rate": 1.997943680808085e-05, | |
| "loss": 1.1281, | |
| "mean_token_accuracy": 0.6910940647125244, | |
| "num_tokens": 314698.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.655, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 1.9975027964162704e-05, | |
| "loss": 0.8849, | |
| "mean_token_accuracy": 0.7341102123260498, | |
| "num_tokens": 317519.0, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 5.84375, | |
| "learning_rate": 1.997019182324604e-05, | |
| "loss": 0.5954, | |
| "mean_token_accuracy": 0.8246450662612915, | |
| "num_tokens": 321897.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.665, | |
| "grad_norm": 24.625, | |
| "learning_rate": 1.9964928592495046e-05, | |
| "loss": 1.0302, | |
| "mean_token_accuracy": 0.6987462162971496, | |
| "num_tokens": 323882.0, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 6.40625, | |
| "learning_rate": 1.9959238497369006e-05, | |
| "loss": 0.9251, | |
| "mean_token_accuracy": 0.7502574563026428, | |
| "num_tokens": 325803.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.675, | |
| "grad_norm": 7.3125, | |
| "learning_rate": 1.9953121781612657e-05, | |
| "loss": 0.9518, | |
| "mean_token_accuracy": 0.7389394760131835, | |
| "num_tokens": 329701.0, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 5.71875, | |
| "learning_rate": 1.9946578707245744e-05, | |
| "loss": 1.0196, | |
| "mean_token_accuracy": 0.7160016298294067, | |
| "num_tokens": 333538.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.685, | |
| "grad_norm": 5.40625, | |
| "learning_rate": 1.99396095545518e-05, | |
| "loss": 1.145, | |
| "mean_token_accuracy": 0.71680166721344, | |
| "num_tokens": 335053.0, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 30.25, | |
| "learning_rate": 1.9932214622066123e-05, | |
| "loss": 1.2193, | |
| "mean_token_accuracy": 0.6923537135124207, | |
| "num_tokens": 336284.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.695, | |
| "grad_norm": 6.53125, | |
| "learning_rate": 1.9924394226563016e-05, | |
| "loss": 0.7077, | |
| "mean_token_accuracy": 0.8013210415840148, | |
| "num_tokens": 340181.0, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 27.125, | |
| "learning_rate": 1.9916148703042193e-05, | |
| "loss": 1.2146, | |
| "mean_token_accuracy": 0.6644866585731506, | |
| "num_tokens": 342323.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.705, | |
| "grad_norm": 17.375, | |
| "learning_rate": 1.9907478404714438e-05, | |
| "loss": 1.0603, | |
| "mean_token_accuracy": 0.7222308039665222, | |
| "num_tokens": 343975.0, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 29.25, | |
| "learning_rate": 1.9898383702986473e-05, | |
| "loss": 1.0548, | |
| "mean_token_accuracy": 0.7053543448448181, | |
| "num_tokens": 344944.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.715, | |
| "grad_norm": 5.0625, | |
| "learning_rate": 1.988886498744505e-05, | |
| "loss": 0.703, | |
| "mean_token_accuracy": 0.8008648157119751, | |
| "num_tokens": 348397.0, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 10.25, | |
| "learning_rate": 1.987892266584026e-05, | |
| "loss": 1.1011, | |
| "mean_token_accuracy": 0.7074811816215515, | |
| "num_tokens": 351294.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.725, | |
| "grad_norm": 33.75, | |
| "learning_rate": 1.9868557164068073e-05, | |
| "loss": 0.9834, | |
| "mean_token_accuracy": 0.706681752204895, | |
| "num_tokens": 353157.0, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 34.0, | |
| "learning_rate": 1.985776892615209e-05, | |
| "loss": 0.858, | |
| "mean_token_accuracy": 0.7515957832336426, | |
| "num_tokens": 355800.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.735, | |
| "grad_norm": 7.6875, | |
| "learning_rate": 1.984655841422451e-05, | |
| "loss": 0.9033, | |
| "mean_token_accuracy": 0.7307195067405701, | |
| "num_tokens": 358533.0, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 1.9834926108506357e-05, | |
| "loss": 0.9196, | |
| "mean_token_accuracy": 0.740574061870575, | |
| "num_tokens": 361710.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.745, | |
| "grad_norm": 27.125, | |
| "learning_rate": 1.982287250728689e-05, | |
| "loss": 1.2976, | |
| "mean_token_accuracy": 0.6771107912063599, | |
| "num_tokens": 363820.0, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 6.40625, | |
| "learning_rate": 1.981039812690227e-05, | |
| "loss": 0.9639, | |
| "mean_token_accuracy": 0.7433285117149353, | |
| "num_tokens": 366953.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.755, | |
| "grad_norm": 26.0, | |
| "learning_rate": 1.979750350171343e-05, | |
| "loss": 0.9865, | |
| "mean_token_accuracy": 0.7108042955398559, | |
| "num_tokens": 369294.0, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 20.375, | |
| "learning_rate": 1.9784189184083203e-05, | |
| "loss": 1.2112, | |
| "mean_token_accuracy": 0.6762296676635742, | |
| "num_tokens": 370620.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.765, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 1.977045574435264e-05, | |
| "loss": 0.8026, | |
| "mean_token_accuracy": 0.7732225179672241, | |
| "num_tokens": 373947.0, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 27.625, | |
| "learning_rate": 1.9756303770816588e-05, | |
| "loss": 0.9244, | |
| "mean_token_accuracy": 0.7461455583572387, | |
| "num_tokens": 376618.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.775, | |
| "grad_norm": 23.625, | |
| "learning_rate": 1.9741733869698497e-05, | |
| "loss": 1.0857, | |
| "mean_token_accuracy": 0.7027423620223999, | |
| "num_tokens": 378773.0, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 28.375, | |
| "learning_rate": 1.972674666512443e-05, | |
| "loss": 1.1611, | |
| "mean_token_accuracy": 0.687284791469574, | |
| "num_tokens": 380436.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.785, | |
| "grad_norm": 5.5, | |
| "learning_rate": 1.971134279909636e-05, | |
| "loss": 0.9853, | |
| "mean_token_accuracy": 0.7149757027626038, | |
| "num_tokens": 383360.0, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 29.625, | |
| "learning_rate": 1.9695522931464637e-05, | |
| "loss": 1.0704, | |
| "mean_token_accuracy": 0.7092410445213317, | |
| "num_tokens": 384901.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.795, | |
| "grad_norm": 4.90625, | |
| "learning_rate": 1.9679287739899733e-05, | |
| "loss": 0.9147, | |
| "mean_token_accuracy": 0.7358805298805237, | |
| "num_tokens": 388295.0, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 24.0, | |
| "learning_rate": 1.9662637919863224e-05, | |
| "loss": 0.9703, | |
| "mean_token_accuracy": 0.7364350974559783, | |
| "num_tokens": 391062.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.805, | |
| "grad_norm": 19.875, | |
| "learning_rate": 1.9645574184577982e-05, | |
| "loss": 1.314, | |
| "mean_token_accuracy": 0.6746392011642456, | |
| "num_tokens": 392268.0, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 6.53125, | |
| "learning_rate": 1.9628097264997637e-05, | |
| "loss": 0.7936, | |
| "mean_token_accuracy": 0.7952145218849183, | |
| "num_tokens": 395200.0, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.815, | |
| "grad_norm": 5.40625, | |
| "learning_rate": 1.9610207909775252e-05, | |
| "loss": 0.7545, | |
| "mean_token_accuracy": 0.7765708088874816, | |
| "num_tokens": 398196.0, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 1.9591906885231275e-05, | |
| "loss": 0.9946, | |
| "mean_token_accuracy": 0.7330313444137573, | |
| "num_tokens": 400393.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.825, | |
| "grad_norm": 6.53125, | |
| "learning_rate": 1.9573194975320672e-05, | |
| "loss": 0.9535, | |
| "mean_token_accuracy": 0.7550879120826721, | |
| "num_tokens": 403335.0, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 5.21875, | |
| "learning_rate": 1.9554072981599398e-05, | |
| "loss": 0.7711, | |
| "mean_token_accuracy": 0.7814031720161438, | |
| "num_tokens": 406793.0, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.835, | |
| "grad_norm": 7.5625, | |
| "learning_rate": 1.953454172319001e-05, | |
| "loss": 0.835, | |
| "mean_token_accuracy": 0.7673190832138062, | |
| "num_tokens": 410253.0, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 1.9514602036746627e-05, | |
| "loss": 0.8274, | |
| "mean_token_accuracy": 0.7780193209648132, | |
| "num_tokens": 412426.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.845, | |
| "grad_norm": 16.0, | |
| "learning_rate": 1.949425477641904e-05, | |
| "loss": 1.0199, | |
| "mean_token_accuracy": 0.7647771120071412, | |
| "num_tokens": 414613.0, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 5.375, | |
| "learning_rate": 1.9473500813816163e-05, | |
| "loss": 0.7706, | |
| "mean_token_accuracy": 0.7762330651283265, | |
| "num_tokens": 417980.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.855, | |
| "grad_norm": 24.875, | |
| "learning_rate": 1.9452341037968684e-05, | |
| "loss": 1.1174, | |
| "mean_token_accuracy": 0.6667572498321533, | |
| "num_tokens": 419696.0, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 1.943077635529097e-05, | |
| "loss": 0.869, | |
| "mean_token_accuracy": 0.7522995114326477, | |
| "num_tokens": 423247.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.865, | |
| "grad_norm": 8.375, | |
| "learning_rate": 1.9408807689542257e-05, | |
| "loss": 1.0374, | |
| "mean_token_accuracy": 0.7039300322532653, | |
| "num_tokens": 425586.0, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 23.875, | |
| "learning_rate": 1.9386435981787067e-05, | |
| "loss": 1.1326, | |
| "mean_token_accuracy": 0.6725295066833497, | |
| "num_tokens": 427221.0, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.875, | |
| "grad_norm": 7.53125, | |
| "learning_rate": 1.93636621903549e-05, | |
| "loss": 0.9602, | |
| "mean_token_accuracy": 0.7483055353164673, | |
| "num_tokens": 429868.0, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 5.28125, | |
| "learning_rate": 1.9340487290799187e-05, | |
| "loss": 0.8656, | |
| "mean_token_accuracy": 0.762619799375534, | |
| "num_tokens": 433334.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.885, | |
| "grad_norm": 5.5, | |
| "learning_rate": 1.931691227585549e-05, | |
| "loss": 0.9443, | |
| "mean_token_accuracy": 0.742293655872345, | |
| "num_tokens": 435710.0, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 1.929293815539899e-05, | |
| "loss": 1.1683, | |
| "mean_token_accuracy": 0.6682364106178283, | |
| "num_tokens": 437697.0, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.895, | |
| "grad_norm": 9.125, | |
| "learning_rate": 1.926856595640121e-05, | |
| "loss": 1.2909, | |
| "mean_token_accuracy": 0.6599106311798095, | |
| "num_tokens": 439418.0, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 5.0, | |
| "learning_rate": 1.924379672288604e-05, | |
| "loss": 0.7213, | |
| "mean_token_accuracy": 0.7874080419540406, | |
| "num_tokens": 443108.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.905, | |
| "grad_norm": 5.59375, | |
| "learning_rate": 1.9218631515885007e-05, | |
| "loss": 1.0733, | |
| "mean_token_accuracy": 0.7128746628761291, | |
| "num_tokens": 446547.0, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 5.65625, | |
| "learning_rate": 1.9193071413391823e-05, | |
| "loss": 1.1654, | |
| "mean_token_accuracy": 0.6913678884506226, | |
| "num_tokens": 448651.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.915, | |
| "grad_norm": 6.90625, | |
| "learning_rate": 1.9167117510316203e-05, | |
| "loss": 0.8036, | |
| "mean_token_accuracy": 0.7887425661087036, | |
| "num_tokens": 452294.0, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 6.78125, | |
| "learning_rate": 1.9140770918436977e-05, | |
| "loss": 0.8051, | |
| "mean_token_accuracy": 0.7639839768409729, | |
| "num_tokens": 454833.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.925, | |
| "grad_norm": 25.375, | |
| "learning_rate": 1.9114032766354453e-05, | |
| "loss": 1.4186, | |
| "mean_token_accuracy": 0.6357974767684936, | |
| "num_tokens": 455450.0, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 30.0, | |
| "learning_rate": 1.9086904199442076e-05, | |
| "loss": 1.249, | |
| "mean_token_accuracy": 0.6428504943847656, | |
| "num_tokens": 456891.0, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.935, | |
| "grad_norm": 8.5, | |
| "learning_rate": 1.905938637979736e-05, | |
| "loss": 1.0072, | |
| "mean_token_accuracy": 0.707493394613266, | |
| "num_tokens": 459398.0, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 5.34375, | |
| "learning_rate": 1.9031480486192112e-05, | |
| "loss": 0.8838, | |
| "mean_token_accuracy": 0.747218382358551, | |
| "num_tokens": 462454.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.945, | |
| "grad_norm": 20.25, | |
| "learning_rate": 1.9003187714021936e-05, | |
| "loss": 1.2989, | |
| "mean_token_accuracy": 0.6610628962516785, | |
| "num_tokens": 464222.0, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 5.28125, | |
| "learning_rate": 1.897450927525503e-05, | |
| "loss": 0.8568, | |
| "mean_token_accuracy": 0.7829653263092041, | |
| "num_tokens": 466434.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.955, | |
| "grad_norm": 6.0625, | |
| "learning_rate": 1.894544639838025e-05, | |
| "loss": 1.0098, | |
| "mean_token_accuracy": 0.726330041885376, | |
| "num_tokens": 470554.0, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 30.125, | |
| "learning_rate": 1.8916000328354527e-05, | |
| "loss": 1.0486, | |
| "mean_token_accuracy": 0.6986983299255372, | |
| "num_tokens": 473548.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.965, | |
| "grad_norm": 5.4375, | |
| "learning_rate": 1.888617232654949e-05, | |
| "loss": 1.003, | |
| "mean_token_accuracy": 0.7258612275123596, | |
| "num_tokens": 476545.0, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 6.0, | |
| "learning_rate": 1.8855963670697458e-05, | |
| "loss": 0.9224, | |
| "mean_token_accuracy": 0.7539906024932861, | |
| "num_tokens": 478267.0, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.975, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 1.8825375654836712e-05, | |
| "loss": 0.7836, | |
| "mean_token_accuracy": 0.7753713011741639, | |
| "num_tokens": 482619.0, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 5.375, | |
| "learning_rate": 1.8794409589256043e-05, | |
| "loss": 0.7844, | |
| "mean_token_accuracy": 0.7756492972373963, | |
| "num_tokens": 485731.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.985, | |
| "grad_norm": 26.625, | |
| "learning_rate": 1.8763066800438638e-05, | |
| "loss": 1.1647, | |
| "mean_token_accuracy": 0.7253339409828186, | |
| "num_tokens": 487225.0, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 4.53125, | |
| "learning_rate": 1.8731348631005254e-05, | |
| "loss": 0.8562, | |
| "mean_token_accuracy": 0.7538999199867249, | |
| "num_tokens": 490666.0, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.995, | |
| "grad_norm": 6.53125, | |
| "learning_rate": 1.8699256439656695e-05, | |
| "loss": 0.8333, | |
| "mean_token_accuracy": 0.776654314994812, | |
| "num_tokens": 493700.0, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 6.53125, | |
| "learning_rate": 1.866679160111564e-05, | |
| "loss": 0.8648, | |
| "mean_token_accuracy": 0.7611707806587219, | |
| "num_tokens": 495684.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.005, | |
| "grad_norm": 10.125, | |
| "learning_rate": 1.8633955506067717e-05, | |
| "loss": 0.7187, | |
| "mean_token_accuracy": 0.8037701487541199, | |
| "num_tokens": 497750.0, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 1.8600749561101947e-05, | |
| "loss": 0.8252, | |
| "mean_token_accuracy": 0.7683161616325378, | |
| "num_tokens": 500015.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.015, | |
| "grad_norm": 5.71875, | |
| "learning_rate": 1.85671751886505e-05, | |
| "loss": 0.7824, | |
| "mean_token_accuracy": 0.7723958611488342, | |
| "num_tokens": 502191.0, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 11.5, | |
| "learning_rate": 1.853323382692774e-05, | |
| "loss": 0.8636, | |
| "mean_token_accuracy": 0.7786948680877686, | |
| "num_tokens": 504669.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.025, | |
| "grad_norm": 6.84375, | |
| "learning_rate": 1.849892692986864e-05, | |
| "loss": 1.0297, | |
| "mean_token_accuracy": 0.7207900285720825, | |
| "num_tokens": 505672.0, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 1.8464255967066493e-05, | |
| "loss": 0.7061, | |
| "mean_token_accuracy": 0.8118877649307251, | |
| "num_tokens": 508170.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.035, | |
| "grad_norm": 6.53125, | |
| "learning_rate": 1.8429222423709946e-05, | |
| "loss": 0.7749, | |
| "mean_token_accuracy": 0.7793939590454102, | |
| "num_tokens": 511561.0, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 6.375, | |
| "learning_rate": 1.8393827800519397e-05, | |
| "loss": 0.7721, | |
| "mean_token_accuracy": 0.7689852476119995, | |
| "num_tokens": 514986.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.045, | |
| "grad_norm": 25.25, | |
| "learning_rate": 1.8358073613682705e-05, | |
| "loss": 0.8118, | |
| "mean_token_accuracy": 0.7597795128822327, | |
| "num_tokens": 517050.0, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 29.125, | |
| "learning_rate": 1.8321961394790227e-05, | |
| "loss": 0.8591, | |
| "mean_token_accuracy": 0.7560444116592407, | |
| "num_tokens": 520299.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.055, | |
| "grad_norm": 8.0, | |
| "learning_rate": 1.8285492690769237e-05, | |
| "loss": 0.6946, | |
| "mean_token_accuracy": 0.7862708926200866, | |
| "num_tokens": 523512.0, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 5.09375, | |
| "learning_rate": 1.8248669063817636e-05, | |
| "loss": 0.9663, | |
| "mean_token_accuracy": 0.7233885884284973, | |
| "num_tokens": 526506.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.065, | |
| "grad_norm": 8.625, | |
| "learning_rate": 1.821149209133704e-05, | |
| "loss": 0.7843, | |
| "mean_token_accuracy": 0.7711453795433044, | |
| "num_tokens": 529422.0, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 6.25, | |
| "learning_rate": 1.8173963365865224e-05, | |
| "loss": 0.7622, | |
| "mean_token_accuracy": 0.7758561253547669, | |
| "num_tokens": 532773.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.075, | |
| "grad_norm": 7.8125, | |
| "learning_rate": 1.8136084495007874e-05, | |
| "loss": 1.0229, | |
| "mean_token_accuracy": 0.7247954487800599, | |
| "num_tokens": 535103.0, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 25.625, | |
| "learning_rate": 1.8097857101369746e-05, | |
| "loss": 0.5869, | |
| "mean_token_accuracy": 0.8586034655570984, | |
| "num_tokens": 537072.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.085, | |
| "grad_norm": 17.75, | |
| "learning_rate": 1.805928282248516e-05, | |
| "loss": 0.8882, | |
| "mean_token_accuracy": 0.7501534223556519, | |
| "num_tokens": 538321.0, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 30.25, | |
| "learning_rate": 1.8020363310747836e-05, | |
| "loss": 0.9023, | |
| "mean_token_accuracy": 0.7529425621032715, | |
| "num_tokens": 540248.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.095, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 1.7981100233340118e-05, | |
| "loss": 1.0241, | |
| "mean_token_accuracy": 0.7149657249450684, | |
| "num_tokens": 542034.0, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 1.7941495272161566e-05, | |
| "loss": 0.5642, | |
| "mean_token_accuracy": 0.837715458869934, | |
| "num_tokens": 545512.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.105, | |
| "grad_norm": 5.09375, | |
| "learning_rate": 1.7901550123756906e-05, | |
| "loss": 0.5522, | |
| "mean_token_accuracy": 0.8279439449310303, | |
| "num_tokens": 549092.0, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 5.59375, | |
| "learning_rate": 1.7861266499243345e-05, | |
| "loss": 0.673, | |
| "mean_token_accuracy": 0.809853708744049, | |
| "num_tokens": 553001.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.115, | |
| "grad_norm": 4.78125, | |
| "learning_rate": 1.782064612423728e-05, | |
| "loss": 0.6577, | |
| "mean_token_accuracy": 0.8298884153366088, | |
| "num_tokens": 554999.0, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 5.59375, | |
| "learning_rate": 1.7779690738780386e-05, | |
| "loss": 0.6141, | |
| "mean_token_accuracy": 0.8177601218223571, | |
| "num_tokens": 558170.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.125, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 1.7738402097265063e-05, | |
| "loss": 0.8862, | |
| "mean_token_accuracy": 0.753680431842804, | |
| "num_tokens": 560695.0, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 4.09375, | |
| "learning_rate": 1.7696781968359295e-05, | |
| "loss": 0.6387, | |
| "mean_token_accuracy": 0.8356854557991028, | |
| "num_tokens": 563429.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.135, | |
| "grad_norm": 6.96875, | |
| "learning_rate": 1.7654832134930885e-05, | |
| "loss": 0.7735, | |
| "mean_token_accuracy": 0.7737359762191772, | |
| "num_tokens": 566588.0, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 1.1400000000000001, | |
| "grad_norm": 6.5625, | |
| "learning_rate": 1.7612554393971072e-05, | |
| "loss": 0.6943, | |
| "mean_token_accuracy": 0.7929964780807495, | |
| "num_tokens": 567913.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.145, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 1.7569950556517566e-05, | |
| "loss": 0.7843, | |
| "mean_token_accuracy": 0.7716325044631958, | |
| "num_tokens": 570225.0, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 6.40625, | |
| "learning_rate": 1.752702244757697e-05, | |
| "loss": 0.9425, | |
| "mean_token_accuracy": 0.7387582778930664, | |
| "num_tokens": 572694.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.155, | |
| "grad_norm": 7.8125, | |
| "learning_rate": 1.7483771906046604e-05, | |
| "loss": 0.7913, | |
| "mean_token_accuracy": 0.768131959438324, | |
| "num_tokens": 575852.0, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 19.625, | |
| "learning_rate": 1.7440200784635702e-05, | |
| "loss": 0.8175, | |
| "mean_token_accuracy": 0.7646706700325012, | |
| "num_tokens": 577552.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.165, | |
| "grad_norm": 5.46875, | |
| "learning_rate": 1.73963109497861e-05, | |
| "loss": 0.8025, | |
| "mean_token_accuracy": 0.7692142486572265, | |
| "num_tokens": 580416.0, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 1.735210428159224e-05, | |
| "loss": 0.6878, | |
| "mean_token_accuracy": 0.7846928119659424, | |
| "num_tokens": 580808.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.175, | |
| "grad_norm": 5.46875, | |
| "learning_rate": 1.7307582673720663e-05, | |
| "loss": 0.55, | |
| "mean_token_accuracy": 0.8443691372871399, | |
| "num_tokens": 584652.0, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 9.5, | |
| "learning_rate": 1.7262748033328867e-05, | |
| "loss": 0.8979, | |
| "mean_token_accuracy": 0.7402187824249268, | |
| "num_tokens": 586182.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.185, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 1.7217602280983622e-05, | |
| "loss": 0.6855, | |
| "mean_token_accuracy": 0.7872085213661194, | |
| "num_tokens": 588297.0, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 1.717214735057871e-05, | |
| "loss": 0.4546, | |
| "mean_token_accuracy": 0.8626020073890686, | |
| "num_tokens": 591413.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.195, | |
| "grad_norm": 9.375, | |
| "learning_rate": 1.7126385189252055e-05, | |
| "loss": 0.8309, | |
| "mean_token_accuracy": 0.7628986001014709, | |
| "num_tokens": 593383.0, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 11.375, | |
| "learning_rate": 1.7080317757302346e-05, | |
| "loss": 0.9022, | |
| "mean_token_accuracy": 0.7245154261589051, | |
| "num_tokens": 595420.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.205, | |
| "grad_norm": 7.53125, | |
| "learning_rate": 1.703394702810504e-05, | |
| "loss": 1.0703, | |
| "mean_token_accuracy": 0.7057002305984497, | |
| "num_tokens": 597288.0, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 5.59375, | |
| "learning_rate": 1.6987274988027844e-05, | |
| "loss": 0.7194, | |
| "mean_token_accuracy": 0.8033593416213989, | |
| "num_tokens": 601071.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.215, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 1.694030363634562e-05, | |
| "loss": 0.9448, | |
| "mean_token_accuracy": 0.7666464924812317, | |
| "num_tokens": 602400.0, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 5.96875, | |
| "learning_rate": 1.6893034985154736e-05, | |
| "loss": 0.5014, | |
| "mean_token_accuracy": 0.8425140500068664, | |
| "num_tokens": 606273.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.225, | |
| "grad_norm": 5.71875, | |
| "learning_rate": 1.684547105928689e-05, | |
| "loss": 0.8279, | |
| "mean_token_accuracy": 0.7773316502571106, | |
| "num_tokens": 607875.0, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 6.125, | |
| "learning_rate": 1.6797613896222362e-05, | |
| "loss": 0.7825, | |
| "mean_token_accuracy": 0.7865223050117492, | |
| "num_tokens": 610323.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.2349999999999999, | |
| "grad_norm": 8.5, | |
| "learning_rate": 1.6749465546002734e-05, | |
| "loss": 0.6528, | |
| "mean_token_accuracy": 0.7994057536125183, | |
| "num_tokens": 613556.0, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 5.6875, | |
| "learning_rate": 1.6701028071143078e-05, | |
| "loss": 0.6212, | |
| "mean_token_accuracy": 0.8158914804458618, | |
| "num_tokens": 616500.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.245, | |
| "grad_norm": 22.5, | |
| "learning_rate": 1.665230354654361e-05, | |
| "loss": 0.6869, | |
| "mean_token_accuracy": 0.8139790058135986, | |
| "num_tokens": 618250.0, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 21.75, | |
| "learning_rate": 1.6603294059400792e-05, | |
| "loss": 0.6546, | |
| "mean_token_accuracy": 0.8378836750984192, | |
| "num_tokens": 619652.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.255, | |
| "grad_norm": 7.65625, | |
| "learning_rate": 1.655400170911794e-05, | |
| "loss": 0.9698, | |
| "mean_token_accuracy": 0.7234090209007263, | |
| "num_tokens": 621675.0, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 27.875, | |
| "learning_rate": 1.6504428607215278e-05, | |
| "loss": 0.6869, | |
| "mean_token_accuracy": 0.7933684349060058, | |
| "num_tokens": 624935.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.2650000000000001, | |
| "grad_norm": 4.6875, | |
| "learning_rate": 1.645457687723951e-05, | |
| "loss": 0.7998, | |
| "mean_token_accuracy": 0.7805917382240295, | |
| "num_tokens": 627211.0, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 47.25, | |
| "learning_rate": 1.640444865467281e-05, | |
| "loss": 0.914, | |
| "mean_token_accuracy": 0.7634903073310852, | |
| "num_tokens": 630413.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.275, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 1.635404608684141e-05, | |
| "loss": 1.1762, | |
| "mean_token_accuracy": 0.7183934211730957, | |
| "num_tokens": 632050.0, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 23.375, | |
| "learning_rate": 1.630337133282356e-05, | |
| "loss": 0.9415, | |
| "mean_token_accuracy": 0.7469957709312439, | |
| "num_tokens": 633415.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.285, | |
| "grad_norm": 25.25, | |
| "learning_rate": 1.6252426563357054e-05, | |
| "loss": 0.8475, | |
| "mean_token_accuracy": 0.7693663716316224, | |
| "num_tokens": 635732.0, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 5.78125, | |
| "learning_rate": 1.6201213960746265e-05, | |
| "loss": 0.6118, | |
| "mean_token_accuracy": 0.8120120167732239, | |
| "num_tokens": 638295.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.295, | |
| "grad_norm": 7.84375, | |
| "learning_rate": 1.6149735718768643e-05, | |
| "loss": 0.8979, | |
| "mean_token_accuracy": 0.7418479681015014, | |
| "num_tokens": 640676.0, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 24.25, | |
| "learning_rate": 1.609799404258074e-05, | |
| "loss": 1.347, | |
| "mean_token_accuracy": 0.645957636833191, | |
| "num_tokens": 642328.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.305, | |
| "grad_norm": 5.28125, | |
| "learning_rate": 1.6045991148623752e-05, | |
| "loss": 0.7518, | |
| "mean_token_accuracy": 0.785522711277008, | |
| "num_tokens": 645217.0, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 1.5993729264528574e-05, | |
| "loss": 0.7075, | |
| "mean_token_accuracy": 0.7937332510948181, | |
| "num_tokens": 647392.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.315, | |
| "grad_norm": 8.375, | |
| "learning_rate": 1.594121062902039e-05, | |
| "loss": 0.8125, | |
| "mean_token_accuracy": 0.7603193163871765, | |
| "num_tokens": 649437.0, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 7.34375, | |
| "learning_rate": 1.5888437491822735e-05, | |
| "loss": 0.7205, | |
| "mean_token_accuracy": 0.7862048745155334, | |
| "num_tokens": 651137.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.325, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 1.5835412113561176e-05, | |
| "loss": 0.809, | |
| "mean_token_accuracy": 0.777035117149353, | |
| "num_tokens": 654612.0, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 9.0, | |
| "learning_rate": 1.578213676566643e-05, | |
| "loss": 0.6462, | |
| "mean_token_accuracy": 0.8325076103210449, | |
| "num_tokens": 656903.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.335, | |
| "grad_norm": 7.75, | |
| "learning_rate": 1.572861373027709e-05, | |
| "loss": 0.8659, | |
| "mean_token_accuracy": 0.7543999433517456, | |
| "num_tokens": 659625.0, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 7.90625, | |
| "learning_rate": 1.5674845300141853e-05, | |
| "loss": 0.7714, | |
| "mean_token_accuracy": 0.7812225937843322, | |
| "num_tokens": 661962.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.345, | |
| "grad_norm": 20.125, | |
| "learning_rate": 1.5620833778521306e-05, | |
| "loss": 0.7595, | |
| "mean_token_accuracy": 0.7850270628929138, | |
| "num_tokens": 664595.0, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 1.5566581479089278e-05, | |
| "loss": 0.5983, | |
| "mean_token_accuracy": 0.8208136677742004, | |
| "num_tokens": 668641.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.355, | |
| "grad_norm": 20.0, | |
| "learning_rate": 1.5512090725833706e-05, | |
| "loss": 0.5589, | |
| "mean_token_accuracy": 0.8399159669876098, | |
| "num_tokens": 670717.0, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 4.90625, | |
| "learning_rate": 1.54573638529571e-05, | |
| "loss": 1.1796, | |
| "mean_token_accuracy": 0.6812988758087158, | |
| "num_tokens": 673121.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.365, | |
| "grad_norm": 4.28125, | |
| "learning_rate": 1.5402403204776552e-05, | |
| "loss": 0.7475, | |
| "mean_token_accuracy": 0.7796531915664673, | |
| "num_tokens": 676221.0, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.5347211135623305e-05, | |
| "loss": 1.0734, | |
| "mean_token_accuracy": 0.7397149443626404, | |
| "num_tokens": 679178.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.375, | |
| "grad_norm": 4.6875, | |
| "learning_rate": 1.5291790009741906e-05, | |
| "loss": 0.9777, | |
| "mean_token_accuracy": 0.7575628995895386, | |
| "num_tokens": 681713.0, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 7.75, | |
| "learning_rate": 1.5236142201188937e-05, | |
| "loss": 0.9092, | |
| "mean_token_accuracy": 0.7369059562683106, | |
| "num_tokens": 684554.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.385, | |
| "grad_norm": 26.125, | |
| "learning_rate": 1.5180270093731305e-05, | |
| "loss": 0.6692, | |
| "mean_token_accuracy": 0.816079044342041, | |
| "num_tokens": 687204.0, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 1.3900000000000001, | |
| "grad_norm": 31.5, | |
| "learning_rate": 1.5124176080744133e-05, | |
| "loss": 0.8812, | |
| "mean_token_accuracy": 0.7615653157234192, | |
| "num_tokens": 688872.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.395, | |
| "grad_norm": 8.0, | |
| "learning_rate": 1.5067862565108242e-05, | |
| "loss": 0.6465, | |
| "mean_token_accuracy": 0.8142944216728211, | |
| "num_tokens": 691813.0, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 33.0, | |
| "learning_rate": 1.5011331959107218e-05, | |
| "loss": 0.9764, | |
| "mean_token_accuracy": 0.7363247156143189, | |
| "num_tokens": 693259.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.405, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 1.4954586684324077e-05, | |
| "loss": 0.6891, | |
| "mean_token_accuracy": 0.7893139243125915, | |
| "num_tokens": 695056.0, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 1.4897629171537522e-05, | |
| "loss": 1.0266, | |
| "mean_token_accuracy": 0.718493127822876, | |
| "num_tokens": 696392.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.415, | |
| "grad_norm": 8.375, | |
| "learning_rate": 1.4840461860617834e-05, | |
| "loss": 0.7586, | |
| "mean_token_accuracy": 0.7843063712120056, | |
| "num_tokens": 699830.0, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 6.0625, | |
| "learning_rate": 1.4783087200422346e-05, | |
| "loss": 0.6742, | |
| "mean_token_accuracy": 0.7995662927627564, | |
| "num_tokens": 703132.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.425, | |
| "grad_norm": 4.84375, | |
| "learning_rate": 1.4725507648690542e-05, | |
| "loss": 0.832, | |
| "mean_token_accuracy": 0.7495120525360107, | |
| "num_tokens": 705829.0, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 36.75, | |
| "learning_rate": 1.4667725671938777e-05, | |
| "loss": 0.8103, | |
| "mean_token_accuracy": 0.7845470070838928, | |
| "num_tokens": 708529.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.435, | |
| "grad_norm": 7.9375, | |
| "learning_rate": 1.4609743745354625e-05, | |
| "loss": 0.6037, | |
| "mean_token_accuracy": 0.8083573698997497, | |
| "num_tokens": 711859.0, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 4.59375, | |
| "learning_rate": 1.455156435269084e-05, | |
| "loss": 0.4976, | |
| "mean_token_accuracy": 0.8688451528549195, | |
| "num_tokens": 715456.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.445, | |
| "grad_norm": 6.1875, | |
| "learning_rate": 1.4493189986158966e-05, | |
| "loss": 0.7662, | |
| "mean_token_accuracy": 0.7678531765937805, | |
| "num_tokens": 718603.0, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 4.90625, | |
| "learning_rate": 1.4434623146322585e-05, | |
| "loss": 0.7864, | |
| "mean_token_accuracy": 0.7857390880584717, | |
| "num_tokens": 721618.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.455, | |
| "grad_norm": 4.75, | |
| "learning_rate": 1.4375866341990187e-05, | |
| "loss": 0.8562, | |
| "mean_token_accuracy": 0.7766207218170166, | |
| "num_tokens": 723473.0, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 8.625, | |
| "learning_rate": 1.4316922090107712e-05, | |
| "loss": 0.8474, | |
| "mean_token_accuracy": 0.7638363718986512, | |
| "num_tokens": 726150.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.465, | |
| "grad_norm": 18.125, | |
| "learning_rate": 1.4257792915650728e-05, | |
| "loss": 0.8561, | |
| "mean_token_accuracy": 0.7815182447433472, | |
| "num_tokens": 727068.0, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 3.78125, | |
| "learning_rate": 1.4198481351516274e-05, | |
| "loss": 0.6429, | |
| "mean_token_accuracy": 0.8231713652610779, | |
| "num_tokens": 729654.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.475, | |
| "grad_norm": 7.1875, | |
| "learning_rate": 1.413898993841435e-05, | |
| "loss": 1.1566, | |
| "mean_token_accuracy": 0.6975414037704468, | |
| "num_tokens": 731546.0, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 7.6875, | |
| "learning_rate": 1.4079321224759093e-05, | |
| "loss": 0.6556, | |
| "mean_token_accuracy": 0.8056102156639099, | |
| "num_tokens": 734532.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.4849999999999999, | |
| "grad_norm": 24.0, | |
| "learning_rate": 1.4019477766559604e-05, | |
| "loss": 0.6731, | |
| "mean_token_accuracy": 0.7824649691581727, | |
| "num_tokens": 736766.0, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 1.3959462127310455e-05, | |
| "loss": 0.863, | |
| "mean_token_accuracy": 0.7568913578987122, | |
| "num_tokens": 738963.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.495, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.3899276877881884e-05, | |
| "loss": 0.6798, | |
| "mean_token_accuracy": 0.8145083069801331, | |
| "num_tokens": 740735.0, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 21.625, | |
| "learning_rate": 1.3838924596409669e-05, | |
| "loss": 0.7044, | |
| "mean_token_accuracy": 0.788611114025116, | |
| "num_tokens": 743437.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.505, | |
| "grad_norm": 7.3125, | |
| "learning_rate": 1.3778407868184674e-05, | |
| "loss": 0.6833, | |
| "mean_token_accuracy": 0.7906283736228943, | |
| "num_tokens": 746683.0, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 5.9375, | |
| "learning_rate": 1.3717729285542123e-05, | |
| "loss": 0.8181, | |
| "mean_token_accuracy": 0.8152880191802978, | |
| "num_tokens": 748460.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.5150000000000001, | |
| "grad_norm": 7.59375, | |
| "learning_rate": 1.3656891447750544e-05, | |
| "loss": 0.6501, | |
| "mean_token_accuracy": 0.8229759573936463, | |
| "num_tokens": 750909.0, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 23.25, | |
| "learning_rate": 1.3595896960900424e-05, | |
| "loss": 0.6641, | |
| "mean_token_accuracy": 0.7920986771583557, | |
| "num_tokens": 753135.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.525, | |
| "grad_norm": 56.5, | |
| "learning_rate": 1.3534748437792573e-05, | |
| "loss": 0.9268, | |
| "mean_token_accuracy": 0.7407203435897827, | |
| "num_tokens": 754912.0, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 5.21875, | |
| "learning_rate": 1.3473448497826203e-05, | |
| "loss": 0.7019, | |
| "mean_token_accuracy": 0.8039157390594482, | |
| "num_tokens": 757369.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.5350000000000001, | |
| "grad_norm": 7.375, | |
| "learning_rate": 1.341199976688672e-05, | |
| "loss": 0.9554, | |
| "mean_token_accuracy": 0.7373032093048095, | |
| "num_tokens": 761380.0, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 26.75, | |
| "learning_rate": 1.335040487723324e-05, | |
| "loss": 0.7944, | |
| "mean_token_accuracy": 0.7538552761077881, | |
| "num_tokens": 763096.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.545, | |
| "grad_norm": 25.75, | |
| "learning_rate": 1.3288666467385834e-05, | |
| "loss": 0.7346, | |
| "mean_token_accuracy": 0.7893040895462036, | |
| "num_tokens": 766281.0, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 3.734375, | |
| "learning_rate": 1.3226787182012494e-05, | |
| "loss": 0.5305, | |
| "mean_token_accuracy": 0.8514875173568726, | |
| "num_tokens": 768598.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.5550000000000002, | |
| "grad_norm": 14.0625, | |
| "learning_rate": 1.3164769671815862e-05, | |
| "loss": 0.707, | |
| "mean_token_accuracy": 0.8167065143585205, | |
| "num_tokens": 770795.0, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 7.28125, | |
| "learning_rate": 1.310261659341966e-05, | |
| "loss": 0.6754, | |
| "mean_token_accuracy": 0.8203782916069031, | |
| "num_tokens": 774051.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.565, | |
| "grad_norm": 7.34375, | |
| "learning_rate": 1.3040330609254903e-05, | |
| "loss": 0.5884, | |
| "mean_token_accuracy": 0.8256639838218689, | |
| "num_tokens": 776466.0, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 1.5699999999999998, | |
| "grad_norm": 8.0, | |
| "learning_rate": 1.2977914387445855e-05, | |
| "loss": 0.8225, | |
| "mean_token_accuracy": 0.7328911781311035, | |
| "num_tokens": 778001.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.575, | |
| "grad_norm": 6.90625, | |
| "learning_rate": 1.2915370601695715e-05, | |
| "loss": 0.7298, | |
| "mean_token_accuracy": 0.7988103628158569, | |
| "num_tokens": 781046.0, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 6.40625, | |
| "learning_rate": 1.2852701931172105e-05, | |
| "loss": 0.934, | |
| "mean_token_accuracy": 0.7509458780288696, | |
| "num_tokens": 782879.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.585, | |
| "grad_norm": 6.8125, | |
| "learning_rate": 1.2789911060392295e-05, | |
| "loss": 0.5838, | |
| "mean_token_accuracy": 0.8262917876243592, | |
| "num_tokens": 786507.0, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 1.5899999999999999, | |
| "grad_norm": 10.25, | |
| "learning_rate": 1.2727000679108198e-05, | |
| "loss": 0.6244, | |
| "mean_token_accuracy": 0.8335184097290039, | |
| "num_tokens": 788829.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.595, | |
| "grad_norm": 4.96875, | |
| "learning_rate": 1.2663973482191177e-05, | |
| "loss": 0.5816, | |
| "mean_token_accuracy": 0.8350133657455444, | |
| "num_tokens": 792924.0, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 6.4375, | |
| "learning_rate": 1.2600832169516569e-05, | |
| "loss": 0.9038, | |
| "mean_token_accuracy": 0.7410524249076843, | |
| "num_tokens": 795525.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.605, | |
| "grad_norm": 8.0, | |
| "learning_rate": 1.2537579445848058e-05, | |
| "loss": 0.7561, | |
| "mean_token_accuracy": 0.7771415114402771, | |
| "num_tokens": 797845.0, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 1.6099999999999999, | |
| "grad_norm": 5.34375, | |
| "learning_rate": 1.2474218020721808e-05, | |
| "loss": 0.8331, | |
| "mean_token_accuracy": 0.7366581082344055, | |
| "num_tokens": 799764.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.615, | |
| "grad_norm": 8.25, | |
| "learning_rate": 1.2410750608330389e-05, | |
| "loss": 0.7238, | |
| "mean_token_accuracy": 0.805954110622406, | |
| "num_tokens": 801318.0, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 5.34375, | |
| "learning_rate": 1.234717992740651e-05, | |
| "loss": 0.5966, | |
| "mean_token_accuracy": 0.846554183959961, | |
| "num_tokens": 803034.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.625, | |
| "grad_norm": 21.75, | |
| "learning_rate": 1.2283508701106559e-05, | |
| "loss": 0.9917, | |
| "mean_token_accuracy": 0.734014344215393, | |
| "num_tokens": 804647.0, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 7.71875, | |
| "learning_rate": 1.221973965689396e-05, | |
| "loss": 0.6353, | |
| "mean_token_accuracy": 0.8209498286247253, | |
| "num_tokens": 807462.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.635, | |
| "grad_norm": 17.875, | |
| "learning_rate": 1.2155875526422332e-05, | |
| "loss": 0.6549, | |
| "mean_token_accuracy": 0.8138946652412414, | |
| "num_tokens": 810426.0, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 8.125, | |
| "learning_rate": 1.2091919045418456e-05, | |
| "loss": 1.0867, | |
| "mean_token_accuracy": 0.7125813007354737, | |
| "num_tokens": 813337.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.645, | |
| "grad_norm": 4.78125, | |
| "learning_rate": 1.2027872953565125e-05, | |
| "loss": 0.8961, | |
| "mean_token_accuracy": 0.7431066751480102, | |
| "num_tokens": 815299.0, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 1.1963739994383751e-05, | |
| "loss": 0.8284, | |
| "mean_token_accuracy": 0.7688675642013549, | |
| "num_tokens": 818263.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.655, | |
| "grad_norm": 7.84375, | |
| "learning_rate": 1.1899522915116848e-05, | |
| "loss": 0.5926, | |
| "mean_token_accuracy": 0.8393247485160827, | |
| "num_tokens": 821340.0, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 1.6600000000000001, | |
| "grad_norm": 6.375, | |
| "learning_rate": 1.1835224466610366e-05, | |
| "loss": 0.6788, | |
| "mean_token_accuracy": 0.8024531245231629, | |
| "num_tokens": 823905.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.665, | |
| "grad_norm": 5.03125, | |
| "learning_rate": 1.1770847403195836e-05, | |
| "loss": 0.7675, | |
| "mean_token_accuracy": 0.7753681302070617, | |
| "num_tokens": 825154.0, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 31.5, | |
| "learning_rate": 1.1706394482572389e-05, | |
| "loss": 0.9249, | |
| "mean_token_accuracy": 0.7288736701011658, | |
| "num_tokens": 827410.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.675, | |
| "grad_norm": 6.90625, | |
| "learning_rate": 1.164186846568863e-05, | |
| "loss": 0.7804, | |
| "mean_token_accuracy": 0.7748052835464477, | |
| "num_tokens": 829865.0, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 16.375, | |
| "learning_rate": 1.1577272116624365e-05, | |
| "loss": 0.6965, | |
| "mean_token_accuracy": 0.7901236653327942, | |
| "num_tokens": 831718.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.685, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 1.1512608202472195e-05, | |
| "loss": 0.6891, | |
| "mean_token_accuracy": 0.8099049091339111, | |
| "num_tokens": 834931.0, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 1.144787949321898e-05, | |
| "loss": 0.6237, | |
| "mean_token_accuracy": 0.8211381077766419, | |
| "num_tokens": 835848.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.6949999999999998, | |
| "grad_norm": 7.25, | |
| "learning_rate": 1.1383088761627193e-05, | |
| "loss": 0.9123, | |
| "mean_token_accuracy": 0.7527759313583374, | |
| "num_tokens": 839009.0, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 8.0, | |
| "learning_rate": 1.131823878311613e-05, | |
| "loss": 0.8947, | |
| "mean_token_accuracy": 0.7173206686973572, | |
| "num_tokens": 840767.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.705, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 1.1253332335643043e-05, | |
| "loss": 0.8703, | |
| "mean_token_accuracy": 0.7573009014129639, | |
| "num_tokens": 842188.0, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 7.53125, | |
| "learning_rate": 1.118837219958411e-05, | |
| "loss": 0.8249, | |
| "mean_token_accuracy": 0.7549278020858765, | |
| "num_tokens": 845037.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.7149999999999999, | |
| "grad_norm": 7.875, | |
| "learning_rate": 1.1123361157615355e-05, | |
| "loss": 0.5855, | |
| "mean_token_accuracy": 0.8285978078842163, | |
| "num_tokens": 847057.0, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 6.75, | |
| "learning_rate": 1.1058301994593447e-05, | |
| "loss": 0.9877, | |
| "mean_token_accuracy": 0.7238339900970459, | |
| "num_tokens": 850125.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.725, | |
| "grad_norm": 5.53125, | |
| "learning_rate": 1.0993197497436392e-05, | |
| "loss": 0.7891, | |
| "mean_token_accuracy": 0.7663105010986329, | |
| "num_tokens": 852644.0, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 21.0, | |
| "learning_rate": 1.0928050455004164e-05, | |
| "loss": 0.6967, | |
| "mean_token_accuracy": 0.8234347224235534, | |
| "num_tokens": 853429.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.7349999999999999, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 1.0862863657979237e-05, | |
| "loss": 0.9785, | |
| "mean_token_accuracy": 0.7190654397010803, | |
| "num_tokens": 855823.0, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 31.75, | |
| "learning_rate": 1.0797639898747033e-05, | |
| "loss": 0.7938, | |
| "mean_token_accuracy": 0.7673383355140686, | |
| "num_tokens": 857885.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.745, | |
| "grad_norm": 7.03125, | |
| "learning_rate": 1.0732381971276318e-05, | |
| "loss": 0.7509, | |
| "mean_token_accuracy": 0.7781156420707702, | |
| "num_tokens": 860114.0, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 4.75, | |
| "learning_rate": 1.0667092670999512e-05, | |
| "loss": 0.8932, | |
| "mean_token_accuracy": 0.7806082010269165, | |
| "num_tokens": 863726.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.755, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 1.0601774794692936e-05, | |
| "loss": 0.5887, | |
| "mean_token_accuracy": 0.8234007000923157, | |
| "num_tokens": 867574.0, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 13.125, | |
| "learning_rate": 1.0536431140357018e-05, | |
| "loss": 0.8601, | |
| "mean_token_accuracy": 0.7545054912567138, | |
| "num_tokens": 869234.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.7650000000000001, | |
| "grad_norm": 23.0, | |
| "learning_rate": 1.0471064507096427e-05, | |
| "loss": 0.8858, | |
| "mean_token_accuracy": 0.7431819677352905, | |
| "num_tokens": 871713.0, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 5.53125, | |
| "learning_rate": 1.040567769500018e-05, | |
| "loss": 0.6333, | |
| "mean_token_accuracy": 0.8191001772880554, | |
| "num_tokens": 875192.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.775, | |
| "grad_norm": 28.125, | |
| "learning_rate": 1.0340273505021675e-05, | |
| "loss": 0.7526, | |
| "mean_token_accuracy": 0.7924058437347412, | |
| "num_tokens": 878090.0, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 20.625, | |
| "learning_rate": 1.0274854738858735e-05, | |
| "loss": 0.8469, | |
| "mean_token_accuracy": 0.7690641760826111, | |
| "num_tokens": 880115.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.7850000000000001, | |
| "grad_norm": 20.5, | |
| "learning_rate": 1.0209424198833571e-05, | |
| "loss": 0.9089, | |
| "mean_token_accuracy": 0.7421629071235657, | |
| "num_tokens": 881569.0, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 30.25, | |
| "learning_rate": 1.0143984687772746e-05, | |
| "loss": 0.9561, | |
| "mean_token_accuracy": 0.7445757389068604, | |
| "num_tokens": 883754.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.795, | |
| "grad_norm": 5.09375, | |
| "learning_rate": 1.0078539008887114e-05, | |
| "loss": 0.6389, | |
| "mean_token_accuracy": 0.81376873254776, | |
| "num_tokens": 887475.0, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 7.21875, | |
| "learning_rate": 1.0013089965651743e-05, | |
| "loss": 0.7537, | |
| "mean_token_accuracy": 0.7766026139259339, | |
| "num_tokens": 890091.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.8050000000000002, | |
| "grad_norm": 17.75, | |
| "learning_rate": 9.947640361685805e-06, | |
| "loss": 0.6551, | |
| "mean_token_accuracy": 0.827604067325592, | |
| "num_tokens": 892332.0, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 5.625, | |
| "learning_rate": 9.882193000632507e-06, | |
| "loss": 0.5868, | |
| "mean_token_accuracy": 0.830898129940033, | |
| "num_tokens": 895800.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.815, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 9.816750686038974e-06, | |
| "loss": 0.8278, | |
| "mean_token_accuracy": 0.7838187098503113, | |
| "num_tokens": 897036.0, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 1.8199999999999998, | |
| "grad_norm": 5.09375, | |
| "learning_rate": 9.751316221236147e-06, | |
| "loss": 0.6766, | |
| "mean_token_accuracy": 0.7844752073287964, | |
| "num_tokens": 900013.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.825, | |
| "grad_norm": 5.15625, | |
| "learning_rate": 9.685892409218718e-06, | |
| "loss": 0.5658, | |
| "mean_token_accuracy": 0.8363893985748291, | |
| "num_tokens": 903991.0, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 11.625, | |
| "learning_rate": 9.620482052525048e-06, | |
| "loss": 0.7384, | |
| "mean_token_accuracy": 0.7787490963935852, | |
| "num_tokens": 905860.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.835, | |
| "grad_norm": 5.53125, | |
| "learning_rate": 9.555087953117114e-06, | |
| "loss": 0.8144, | |
| "mean_token_accuracy": 0.762047529220581, | |
| "num_tokens": 909342.0, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 6.53125, | |
| "learning_rate": 9.489712912260486e-06, | |
| "loss": 0.894, | |
| "mean_token_accuracy": 0.7327398538589478, | |
| "num_tokens": 911671.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.845, | |
| "grad_norm": 32.75, | |
| "learning_rate": 9.424359730404329e-06, | |
| "loss": 1.1982, | |
| "mean_token_accuracy": 0.6791368842124939, | |
| "num_tokens": 912586.0, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 7.84375, | |
| "learning_rate": 9.359031207061442e-06, | |
| "loss": 0.651, | |
| "mean_token_accuracy": 0.8372863888740539, | |
| "num_tokens": 915696.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.855, | |
| "grad_norm": 6.25, | |
| "learning_rate": 9.293730140688336e-06, | |
| "loss": 0.9395, | |
| "mean_token_accuracy": 0.7412417650222778, | |
| "num_tokens": 917503.0, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 1.8599999999999999, | |
| "grad_norm": 6.875, | |
| "learning_rate": 9.228459328565354e-06, | |
| "loss": 1.136, | |
| "mean_token_accuracy": 0.7111948132514954, | |
| "num_tokens": 918864.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.865, | |
| "grad_norm": 4.65625, | |
| "learning_rate": 9.163221566676847e-06, | |
| "loss": 0.8892, | |
| "mean_token_accuracy": 0.7210251212120056, | |
| "num_tokens": 920527.0, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 9.098019649591409e-06, | |
| "loss": 0.7635, | |
| "mean_token_accuracy": 0.7711951851844787, | |
| "num_tokens": 923776.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.875, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 9.032856370342158e-06, | |
| "loss": 0.5589, | |
| "mean_token_accuracy": 0.8322081685066223, | |
| "num_tokens": 927187.0, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 27.0, | |
| "learning_rate": 8.967734520307102e-06, | |
| "loss": 0.6424, | |
| "mean_token_accuracy": 0.8095461368560791, | |
| "num_tokens": 929400.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.885, | |
| "grad_norm": 7.875, | |
| "learning_rate": 8.902656889089548e-06, | |
| "loss": 0.7556, | |
| "mean_token_accuracy": 0.7700052261352539, | |
| "num_tokens": 932708.0, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 1.8900000000000001, | |
| "grad_norm": 18.75, | |
| "learning_rate": 8.837626264398623e-06, | |
| "loss": 0.8495, | |
| "mean_token_accuracy": 0.7746875047683716, | |
| "num_tokens": 934905.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.895, | |
| "grad_norm": 8.0, | |
| "learning_rate": 8.772645431929851e-06, | |
| "loss": 0.8468, | |
| "mean_token_accuracy": 0.7573032855987549, | |
| "num_tokens": 936953.0, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 6.15625, | |
| "learning_rate": 8.707717175245826e-06, | |
| "loss": 0.735, | |
| "mean_token_accuracy": 0.7652702808380127, | |
| "num_tokens": 939323.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.905, | |
| "grad_norm": 7.90625, | |
| "learning_rate": 8.642844275656957e-06, | |
| "loss": 0.8166, | |
| "mean_token_accuracy": 0.7702357769012451, | |
| "num_tokens": 941802.0, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 1.9100000000000001, | |
| "grad_norm": 24.375, | |
| "learning_rate": 8.578029512102357e-06, | |
| "loss": 0.6977, | |
| "mean_token_accuracy": 0.7843759894371033, | |
| "num_tokens": 944128.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.915, | |
| "grad_norm": 6.625, | |
| "learning_rate": 8.51327566103077e-06, | |
| "loss": 0.7186, | |
| "mean_token_accuracy": 0.7871297121047973, | |
| "num_tokens": 947149.0, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 7.9375, | |
| "learning_rate": 8.448585496281667e-06, | |
| "loss": 0.8985, | |
| "mean_token_accuracy": 0.7248257637023926, | |
| "num_tokens": 950716.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.925, | |
| "grad_norm": 13.8125, | |
| "learning_rate": 8.38396178896639e-06, | |
| "loss": 0.8082, | |
| "mean_token_accuracy": 0.7705564975738526, | |
| "num_tokens": 952299.0, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 1.9300000000000002, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 8.319407307349482e-06, | |
| "loss": 0.7754, | |
| "mean_token_accuracy": 0.7596949458122253, | |
| "num_tokens": 954737.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.935, | |
| "grad_norm": 5.15625, | |
| "learning_rate": 8.254924816730083e-06, | |
| "loss": 0.7946, | |
| "mean_token_accuracy": 0.7884817004203797, | |
| "num_tokens": 956701.0, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 38.25, | |
| "learning_rate": 8.190517079323472e-06, | |
| "loss": 0.9175, | |
| "mean_token_accuracy": 0.7576337218284607, | |
| "num_tokens": 959914.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.9449999999999998, | |
| "grad_norm": 27.125, | |
| "learning_rate": 8.126186854142752e-06, | |
| "loss": 0.5564, | |
| "mean_token_accuracy": 0.8395720958709717, | |
| "num_tokens": 962779.0, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 17.375, | |
| "learning_rate": 8.061936896880674e-06, | |
| "loss": 0.6506, | |
| "mean_token_accuracy": 0.8393109798431396, | |
| "num_tokens": 964649.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.955, | |
| "grad_norm": 13.75, | |
| "learning_rate": 7.997769959791554e-06, | |
| "loss": 0.8628, | |
| "mean_token_accuracy": 0.7746016979217529, | |
| "num_tokens": 966716.0, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 6.4375, | |
| "learning_rate": 7.933688791573418e-06, | |
| "loss": 0.6913, | |
| "mean_token_accuracy": 0.7971544742584229, | |
| "num_tokens": 969874.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.9649999999999999, | |
| "grad_norm": 5.1875, | |
| "learning_rate": 7.869696137250235e-06, | |
| "loss": 0.5946, | |
| "mean_token_accuracy": 0.8192303895950317, | |
| "num_tokens": 972170.0, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 18.875, | |
| "learning_rate": 7.80579473805433e-06, | |
| "loss": 0.632, | |
| "mean_token_accuracy": 0.8178293108940125, | |
| "num_tokens": 974308.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.975, | |
| "grad_norm": 7.78125, | |
| "learning_rate": 7.741987331308964e-06, | |
| "loss": 0.6573, | |
| "mean_token_accuracy": 0.8085449814796448, | |
| "num_tokens": 977606.0, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 5.1875, | |
| "learning_rate": 7.678276650311075e-06, | |
| "loss": 0.8132, | |
| "mean_token_accuracy": 0.7615883231163025, | |
| "num_tokens": 980558.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.9849999999999999, | |
| "grad_norm": 4.65625, | |
| "learning_rate": 7.6146654242141935e-06, | |
| "loss": 0.7852, | |
| "mean_token_accuracy": 0.7767473936080933, | |
| "num_tokens": 983519.0, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 24.25, | |
| "learning_rate": 7.551156377911526e-06, | |
| "loss": 0.7591, | |
| "mean_token_accuracy": 0.7873912572860717, | |
| "num_tokens": 985872.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.995, | |
| "grad_norm": 31.0, | |
| "learning_rate": 7.487752231919246e-06, | |
| "loss": 0.9672, | |
| "mean_token_accuracy": 0.7186308145523072, | |
| "num_tokens": 987339.0, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 17.125, | |
| "learning_rate": 7.4244557022599394e-06, | |
| "loss": 0.55, | |
| "mean_token_accuracy": 0.8468565583229065, | |
| "num_tokens": 991368.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.005, | |
| "grad_norm": 5.15625, | |
| "learning_rate": 7.361269500346274e-06, | |
| "loss": 0.4919, | |
| "mean_token_accuracy": 0.8586863517761231, | |
| "num_tokens": 994329.0, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 24.625, | |
| "learning_rate": 7.298196332864834e-06, | |
| "loss": 0.7438, | |
| "mean_token_accuracy": 0.827777373790741, | |
| "num_tokens": 995585.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.015, | |
| "grad_norm": 6.78125, | |
| "learning_rate": 7.235238901660195e-06, | |
| "loss": 0.6072, | |
| "mean_token_accuracy": 0.8268356323242188, | |
| "num_tokens": 998236.0, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 22.75, | |
| "learning_rate": 7.172399903619165e-06, | |
| "loss": 0.8657, | |
| "mean_token_accuracy": 0.7580430507659912, | |
| "num_tokens": 999149.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.025, | |
| "grad_norm": 7.78125, | |
| "learning_rate": 7.109682030555283e-06, | |
| "loss": 0.4715, | |
| "mean_token_accuracy": 0.8574858069419861, | |
| "num_tokens": 1003020.0, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 6.4375, | |
| "learning_rate": 7.047087969093488e-06, | |
| "loss": 0.6394, | |
| "mean_token_accuracy": 0.8059547662734985, | |
| "num_tokens": 1006359.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.035, | |
| "grad_norm": 25.0, | |
| "learning_rate": 6.984620400555044e-06, | |
| "loss": 0.7012, | |
| "mean_token_accuracy": 0.7960086703300476, | |
| "num_tokens": 1008642.0, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 7.84375, | |
| "learning_rate": 6.922282000842675e-06, | |
| "loss": 0.6689, | |
| "mean_token_accuracy": 0.8068014264106751, | |
| "num_tokens": 1011422.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.045, | |
| "grad_norm": 17.5, | |
| "learning_rate": 6.860075440325951e-06, | |
| "loss": 0.5843, | |
| "mean_token_accuracy": 0.8451680421829224, | |
| "num_tokens": 1013590.0, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 6.90625, | |
| "learning_rate": 6.798003383726883e-06, | |
| "loss": 0.6298, | |
| "mean_token_accuracy": 0.8257234930992127, | |
| "num_tokens": 1015307.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.055, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 6.736068490005784e-06, | |
| "loss": 0.791, | |
| "mean_token_accuracy": 0.7678486227989196, | |
| "num_tokens": 1017765.0, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 10.75, | |
| "learning_rate": 6.674273412247365e-06, | |
| "loss": 0.5853, | |
| "mean_token_accuracy": 0.8227458953857422, | |
| "num_tokens": 1020785.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.065, | |
| "grad_norm": 22.625, | |
| "learning_rate": 6.612620797547087e-06, | |
| "loss": 0.6217, | |
| "mean_token_accuracy": 0.8210171341896058, | |
| "num_tokens": 1023175.0, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 5.96875, | |
| "learning_rate": 6.551113286897771e-06, | |
| "loss": 0.4628, | |
| "mean_token_accuracy": 0.8576245903968811, | |
| "num_tokens": 1024847.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.075, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 6.489753515076457e-06, | |
| "loss": 0.9042, | |
| "mean_token_accuracy": 0.753120231628418, | |
| "num_tokens": 1028443.0, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 27.75, | |
| "learning_rate": 6.428544110531549e-06, | |
| "loss": 0.8369, | |
| "mean_token_accuracy": 0.7524871230125427, | |
| "num_tokens": 1030302.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.085, | |
| "grad_norm": 8.125, | |
| "learning_rate": 6.367487695270218e-06, | |
| "loss": 0.5785, | |
| "mean_token_accuracy": 0.8259750604629517, | |
| "num_tokens": 1033853.0, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 7.5625, | |
| "learning_rate": 6.306586884746085e-06, | |
| "loss": 0.5235, | |
| "mean_token_accuracy": 0.8460973501205444, | |
| "num_tokens": 1036706.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.095, | |
| "grad_norm": 25.25, | |
| "learning_rate": 6.245844287747168e-06, | |
| "loss": 0.8981, | |
| "mean_token_accuracy": 0.7195080876350403, | |
| "num_tokens": 1038871.0, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 8.625, | |
| "learning_rate": 6.185262506284171e-06, | |
| "loss": 0.7093, | |
| "mean_token_accuracy": 0.7909031867980957, | |
| "num_tokens": 1042005.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.105, | |
| "grad_norm": 23.125, | |
| "learning_rate": 6.124844135478971e-06, | |
| "loss": 0.7969, | |
| "mean_token_accuracy": 0.7591824889183044, | |
| "num_tokens": 1044314.0, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 23.125, | |
| "learning_rate": 6.0645917634534856e-06, | |
| "loss": 0.6442, | |
| "mean_token_accuracy": 0.8363109707832337, | |
| "num_tokens": 1046326.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.115, | |
| "grad_norm": 6.625, | |
| "learning_rate": 6.004507971218801e-06, | |
| "loss": 0.5353, | |
| "mean_token_accuracy": 0.8375053167343139, | |
| "num_tokens": 1049452.0, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 5.944595332564598e-06, | |
| "loss": 0.4135, | |
| "mean_token_accuracy": 0.8807172417640686, | |
| "num_tokens": 1051626.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.125, | |
| "grad_norm": 15.8125, | |
| "learning_rate": 5.884856413948913e-06, | |
| "loss": 0.4851, | |
| "mean_token_accuracy": 0.8450929164886475, | |
| "num_tokens": 1055101.0, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 8.375, | |
| "learning_rate": 5.825293774388196e-06, | |
| "loss": 0.6181, | |
| "mean_token_accuracy": 0.8257151484489441, | |
| "num_tokens": 1058263.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.135, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 5.7659099653476805e-06, | |
| "loss": 0.6252, | |
| "mean_token_accuracy": 0.8017407178878784, | |
| "num_tokens": 1060704.0, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 8.125, | |
| "learning_rate": 5.7067075306321025e-06, | |
| "loss": 0.7446, | |
| "mean_token_accuracy": 0.8047372102737427, | |
| "num_tokens": 1063001.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.145, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 5.647689006276727e-06, | |
| "loss": 0.8342, | |
| "mean_token_accuracy": 0.7604440450668335, | |
| "num_tokens": 1064965.0, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 16.375, | |
| "learning_rate": 5.588856920438706e-06, | |
| "loss": 0.4514, | |
| "mean_token_accuracy": 0.8737953782081604, | |
| "num_tokens": 1066941.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.155, | |
| "grad_norm": 7.21875, | |
| "learning_rate": 5.53021379328879e-06, | |
| "loss": 0.6148, | |
| "mean_token_accuracy": 0.8180341601371766, | |
| "num_tokens": 1069106.0, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 7.21875, | |
| "learning_rate": 5.4717621369033654e-06, | |
| "loss": 0.3871, | |
| "mean_token_accuracy": 0.8827916264533997, | |
| "num_tokens": 1071843.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.165, | |
| "grad_norm": 12.25, | |
| "learning_rate": 5.413504455156855e-06, | |
| "loss": 0.6309, | |
| "mean_token_accuracy": 0.8097968459129333, | |
| "num_tokens": 1073821.0, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 21.875, | |
| "learning_rate": 5.355443243614434e-06, | |
| "loss": 0.6844, | |
| "mean_token_accuracy": 0.8111440539360046, | |
| "num_tokens": 1075459.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.175, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 5.297580989425177e-06, | |
| "loss": 0.6962, | |
| "mean_token_accuracy": 0.7998672127723694, | |
| "num_tokens": 1077136.0, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 5.5, | |
| "learning_rate": 5.2399201712154666e-06, | |
| "loss": 0.6331, | |
| "mean_token_accuracy": 0.8542533636093139, | |
| "num_tokens": 1079897.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.185, | |
| "grad_norm": 7.0625, | |
| "learning_rate": 5.1824632589828465e-06, | |
| "loss": 0.4966, | |
| "mean_token_accuracy": 0.8605088949203491, | |
| "num_tokens": 1081679.0, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 18.0, | |
| "learning_rate": 5.125212713990207e-06, | |
| "loss": 0.5673, | |
| "mean_token_accuracy": 0.8518822908401489, | |
| "num_tokens": 1083383.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.195, | |
| "grad_norm": 4.40625, | |
| "learning_rate": 5.0681709886603454e-06, | |
| "loss": 0.72, | |
| "mean_token_accuracy": 0.789927351474762, | |
| "num_tokens": 1085659.0, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 6.09375, | |
| "learning_rate": 5.011340526470928e-06, | |
| "loss": 0.6352, | |
| "mean_token_accuracy": 0.8001546740531922, | |
| "num_tokens": 1088395.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.205, | |
| "grad_norm": 10.125, | |
| "learning_rate": 4.954723761849809e-06, | |
| "loss": 0.7239, | |
| "mean_token_accuracy": 0.7876050591468811, | |
| "num_tokens": 1090913.0, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 29.875, | |
| "learning_rate": 4.8983231200707495e-06, | |
| "loss": 0.6724, | |
| "mean_token_accuracy": 0.7935240149497986, | |
| "num_tokens": 1093568.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.215, | |
| "grad_norm": 7.28125, | |
| "learning_rate": 4.8421410171495265e-06, | |
| "loss": 0.7759, | |
| "mean_token_accuracy": 0.7690300822257996, | |
| "num_tokens": 1096199.0, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 4.786179859740442e-06, | |
| "loss": 0.5158, | |
| "mean_token_accuracy": 0.8456796884536744, | |
| "num_tokens": 1098664.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.225, | |
| "grad_norm": 5.375, | |
| "learning_rate": 4.7304420450332244e-06, | |
| "loss": 0.6302, | |
| "mean_token_accuracy": 0.8168246984481812, | |
| "num_tokens": 1102032.0, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 4.65625, | |
| "learning_rate": 4.674929960650352e-06, | |
| "loss": 0.6012, | |
| "mean_token_accuracy": 0.8399333238601685, | |
| "num_tokens": 1104825.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.235, | |
| "grad_norm": 4.5, | |
| "learning_rate": 4.619645984544752e-06, | |
| "loss": 0.3681, | |
| "mean_token_accuracy": 0.8887263059616088, | |
| "num_tokens": 1106158.0, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 9.25, | |
| "learning_rate": 4.564592484897965e-06, | |
| "loss": 0.5975, | |
| "mean_token_accuracy": 0.8158373713493348, | |
| "num_tokens": 1109858.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.245, | |
| "grad_norm": 25.25, | |
| "learning_rate": 4.509771820018682e-06, | |
| "loss": 0.7861, | |
| "mean_token_accuracy": 0.7793623328208923, | |
| "num_tokens": 1111179.0, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 8.5, | |
| "learning_rate": 4.455186338241733e-06, | |
| "loss": 0.7591, | |
| "mean_token_accuracy": 0.7832494258880616, | |
| "num_tokens": 1114073.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.255, | |
| "grad_norm": 18.75, | |
| "learning_rate": 4.4008383778274835e-06, | |
| "loss": 0.7995, | |
| "mean_token_accuracy": 0.7816387176513672, | |
| "num_tokens": 1114839.0, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 7.3125, | |
| "learning_rate": 4.346730266861673e-06, | |
| "loss": 0.6332, | |
| "mean_token_accuracy": 0.8161526322364807, | |
| "num_tokens": 1116483.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.265, | |
| "grad_norm": 7.59375, | |
| "learning_rate": 4.292864323155684e-06, | |
| "loss": 0.7247, | |
| "mean_token_accuracy": 0.7976547837257385, | |
| "num_tokens": 1118654.0, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 6.375, | |
| "learning_rate": 4.23924285414727e-06, | |
| "loss": 0.6007, | |
| "mean_token_accuracy": 0.8198469638824463, | |
| "num_tokens": 1122216.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.275, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 4.185868156801695e-06, | |
| "loss": 0.5729, | |
| "mean_token_accuracy": 0.8336104035377503, | |
| "num_tokens": 1125041.0, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 2.2800000000000002, | |
| "grad_norm": 22.25, | |
| "learning_rate": 4.13274251751335e-06, | |
| "loss": 0.6672, | |
| "mean_token_accuracy": 0.7996611833572388, | |
| "num_tokens": 1127366.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.285, | |
| "grad_norm": 4.21875, | |
| "learning_rate": 4.0798682120078046e-06, | |
| "loss": 0.4389, | |
| "mean_token_accuracy": 0.8652261853218078, | |
| "num_tokens": 1131524.0, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 18.25, | |
| "learning_rate": 4.027247505244329e-06, | |
| "loss": 0.6732, | |
| "mean_token_accuracy": 0.8089190244674682, | |
| "num_tokens": 1133566.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.295, | |
| "grad_norm": 24.125, | |
| "learning_rate": 3.974882651318869e-06, | |
| "loss": 0.5543, | |
| "mean_token_accuracy": 0.8400264263153077, | |
| "num_tokens": 1136223.0, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 6.5625, | |
| "learning_rate": 3.9227758933674756e-06, | |
| "loss": 0.7666, | |
| "mean_token_accuracy": 0.7888131022453309, | |
| "num_tokens": 1139524.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.305, | |
| "grad_norm": 14.5, | |
| "learning_rate": 3.8709294634702374e-06, | |
| "loss": 0.9221, | |
| "mean_token_accuracy": 0.7322679758071899, | |
| "num_tokens": 1141630.0, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 6.78125, | |
| "learning_rate": 3.819345582555653e-06, | |
| "loss": 0.5426, | |
| "mean_token_accuracy": 0.8481068849563599, | |
| "num_tokens": 1143585.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.315, | |
| "grad_norm": 7.9375, | |
| "learning_rate": 3.7680264603054985e-06, | |
| "loss": 0.6749, | |
| "mean_token_accuracy": 0.8034712195396423, | |
| "num_tokens": 1146169.0, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 6.6875, | |
| "learning_rate": 3.716974295060163e-06, | |
| "loss": 0.4901, | |
| "mean_token_accuracy": 0.8497132182121276, | |
| "num_tokens": 1149812.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.325, | |
| "grad_norm": 13.0, | |
| "learning_rate": 3.6661912737244996e-06, | |
| "loss": 0.5275, | |
| "mean_token_accuracy": 0.8501026272773743, | |
| "num_tokens": 1153572.0, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 6.46875, | |
| "learning_rate": 3.6156795716741146e-06, | |
| "loss": 0.536, | |
| "mean_token_accuracy": 0.8431202411651612, | |
| "num_tokens": 1156510.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.335, | |
| "grad_norm": 4.84375, | |
| "learning_rate": 3.565441352662211e-06, | |
| "loss": 0.6569, | |
| "mean_token_accuracy": 0.8402642846107483, | |
| "num_tokens": 1158458.0, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 19.5, | |
| "learning_rate": 3.5154787687268852e-06, | |
| "loss": 0.6224, | |
| "mean_token_accuracy": 0.8333223938941956, | |
| "num_tokens": 1159689.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.3449999999999998, | |
| "grad_norm": 26.125, | |
| "learning_rate": 3.4657939600989453e-06, | |
| "loss": 0.4519, | |
| "mean_token_accuracy": 0.8754086375236512, | |
| "num_tokens": 1162518.0, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 5.9375, | |
| "learning_rate": 3.4163890551102284e-06, | |
| "loss": 0.6395, | |
| "mean_token_accuracy": 0.7997304558753967, | |
| "num_tokens": 1165678.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.355, | |
| "grad_norm": 9.875, | |
| "learning_rate": 3.3672661701024324e-06, | |
| "loss": 0.9325, | |
| "mean_token_accuracy": 0.7247589707374573, | |
| "num_tokens": 1167423.0, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 6.875, | |
| "learning_rate": 3.318427409336461e-06, | |
| "loss": 0.5473, | |
| "mean_token_accuracy": 0.8480026125907898, | |
| "num_tokens": 1169693.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.365, | |
| "grad_norm": 26.375, | |
| "learning_rate": 3.2698748649022693e-06, | |
| "loss": 0.4592, | |
| "mean_token_accuracy": 0.8708458781242371, | |
| "num_tokens": 1171977.0, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 7.6875, | |
| "learning_rate": 3.2216106166292694e-06, | |
| "loss": 0.5238, | |
| "mean_token_accuracy": 0.8693408727645874, | |
| "num_tokens": 1173552.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.375, | |
| "grad_norm": 8.75, | |
| "learning_rate": 3.1736367319972216e-06, | |
| "loss": 0.7585, | |
| "mean_token_accuracy": 0.7678687095642089, | |
| "num_tokens": 1174533.0, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 17.875, | |
| "learning_rate": 3.1259552660476744e-06, | |
| "loss": 0.609, | |
| "mean_token_accuracy": 0.853853178024292, | |
| "num_tokens": 1176567.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.385, | |
| "grad_norm": 27.0, | |
| "learning_rate": 3.0785682612959334e-06, | |
| "loss": 0.6462, | |
| "mean_token_accuracy": 0.8025004386901855, | |
| "num_tokens": 1178840.0, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 3.031477747643564e-06, | |
| "loss": 0.6865, | |
| "mean_token_accuracy": 0.8314264178276062, | |
| "num_tokens": 1181017.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.395, | |
| "grad_norm": 35.75, | |
| "learning_rate": 2.9846857422914434e-06, | |
| "loss": 0.6985, | |
| "mean_token_accuracy": 0.8023651003837585, | |
| "num_tokens": 1184083.0, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 9.375, | |
| "learning_rate": 2.9381942496533443e-06, | |
| "loss": 0.6431, | |
| "mean_token_accuracy": 0.8262930393218995, | |
| "num_tokens": 1185646.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.4050000000000002, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 2.8920052612700755e-06, | |
| "loss": 0.5356, | |
| "mean_token_accuracy": 0.8470789790153503, | |
| "num_tokens": 1187614.0, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 10.5, | |
| "learning_rate": 2.846120755724171e-06, | |
| "loss": 0.5002, | |
| "mean_token_accuracy": 0.8470618605613709, | |
| "num_tokens": 1190017.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.415, | |
| "grad_norm": 22.5, | |
| "learning_rate": 2.800542698555132e-06, | |
| "loss": 0.4556, | |
| "mean_token_accuracy": 0.8644039750099182, | |
| "num_tokens": 1193266.0, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 5.28125, | |
| "learning_rate": 2.755273042175233e-06, | |
| "loss": 0.4292, | |
| "mean_token_accuracy": 0.8651724457740784, | |
| "num_tokens": 1197242.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.425, | |
| "grad_norm": 24.875, | |
| "learning_rate": 2.7103137257858867e-06, | |
| "loss": 0.5487, | |
| "mean_token_accuracy": 0.8282508969306945, | |
| "num_tokens": 1199347.0, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 28.375, | |
| "learning_rate": 2.6656666752945647e-06, | |
| "loss": 0.7351, | |
| "mean_token_accuracy": 0.7949961185455322, | |
| "num_tokens": 1201102.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.435, | |
| "grad_norm": 13.25, | |
| "learning_rate": 2.6213338032323175e-06, | |
| "loss": 0.6013, | |
| "mean_token_accuracy": 0.8310695767402649, | |
| "num_tokens": 1203274.0, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 13.25, | |
| "learning_rate": 2.5773170086718324e-06, | |
| "loss": 0.5486, | |
| "mean_token_accuracy": 0.8236384749412536, | |
| "num_tokens": 1204799.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.445, | |
| "grad_norm": 5.5, | |
| "learning_rate": 2.5336181771460865e-06, | |
| "loss": 0.6963, | |
| "mean_token_accuracy": 0.8013479828834533, | |
| "num_tokens": 1207929.0, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 2.490239180567585e-06, | |
| "loss": 0.7415, | |
| "mean_token_accuracy": 0.792452335357666, | |
| "num_tokens": 1209506.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.455, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 2.447181877148165e-06, | |
| "loss": 0.7063, | |
| "mean_token_accuracy": 0.7985927581787109, | |
| "num_tokens": 1212741.0, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 6.65625, | |
| "learning_rate": 2.4044481113193953e-06, | |
| "loss": 0.5619, | |
| "mean_token_accuracy": 0.837166678905487, | |
| "num_tokens": 1214624.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.465, | |
| "grad_norm": 29.0, | |
| "learning_rate": 2.362039713653581e-06, | |
| "loss": 0.6907, | |
| "mean_token_accuracy": 0.7909873247146606, | |
| "num_tokens": 1217176.0, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 2.4699999999999998, | |
| "grad_norm": 7.9375, | |
| "learning_rate": 2.3199585007853233e-06, | |
| "loss": 0.7487, | |
| "mean_token_accuracy": 0.7831075429916382, | |
| "num_tokens": 1218962.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.475, | |
| "grad_norm": 8.25, | |
| "learning_rate": 2.278206275333731e-06, | |
| "loss": 0.694, | |
| "mean_token_accuracy": 0.8036178708076477, | |
| "num_tokens": 1220856.0, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 7.0, | |
| "learning_rate": 2.236784825825179e-06, | |
| "loss": 0.6465, | |
| "mean_token_accuracy": 0.8076552510261535, | |
| "num_tokens": 1222310.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.485, | |
| "grad_norm": 6.28125, | |
| "learning_rate": 2.195695926616702e-06, | |
| "loss": 0.7398, | |
| "mean_token_accuracy": 0.7894096493721008, | |
| "num_tokens": 1224812.0, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 19.75, | |
| "learning_rate": 2.1549413378199912e-06, | |
| "loss": 0.7505, | |
| "mean_token_accuracy": 0.7837583899497986, | |
| "num_tokens": 1226946.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.495, | |
| "grad_norm": 8.5, | |
| "learning_rate": 2.1145228052259823e-06, | |
| "loss": 0.5331, | |
| "mean_token_accuracy": 0.8338425397872925, | |
| "num_tokens": 1229666.0, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 2.074442060230093e-06, | |
| "loss": 0.3655, | |
| "mean_token_accuracy": 0.8997077226638794, | |
| "num_tokens": 1232184.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.505, | |
| "grad_norm": 8.125, | |
| "learning_rate": 2.034700819758039e-06, | |
| "loss": 0.5048, | |
| "mean_token_accuracy": 0.8454057693481445, | |
| "num_tokens": 1235829.0, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 1.995300786192291e-06, | |
| "loss": 0.5504, | |
| "mean_token_accuracy": 0.8341328144073487, | |
| "num_tokens": 1239156.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.515, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 1.956243647299155e-06, | |
| "loss": 0.6985, | |
| "mean_token_accuracy": 0.8047675132751465, | |
| "num_tokens": 1241694.0, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 19.75, | |
| "learning_rate": 1.91753107615647e-06, | |
| "loss": 0.6351, | |
| "mean_token_accuracy": 0.8266091227531434, | |
| "num_tokens": 1242546.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.525, | |
| "grad_norm": 7.5, | |
| "learning_rate": 1.8791647310819371e-06, | |
| "loss": 0.6763, | |
| "mean_token_accuracy": 0.8041231989860534, | |
| "num_tokens": 1245217.0, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 2.5300000000000002, | |
| "grad_norm": 26.25, | |
| "learning_rate": 1.8411462555620896e-06, | |
| "loss": 0.5798, | |
| "mean_token_accuracy": 0.8182765483856201, | |
| "num_tokens": 1247501.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.535, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 1.8034772781818776e-06, | |
| "loss": 0.6065, | |
| "mean_token_accuracy": 0.8119491338729858, | |
| "num_tokens": 1249229.0, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 6.15625, | |
| "learning_rate": 1.766159412554922e-06, | |
| "loss": 0.4853, | |
| "mean_token_accuracy": 0.8598642110824585, | |
| "num_tokens": 1251011.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.545, | |
| "grad_norm": 6.375, | |
| "learning_rate": 1.7291942572543806e-06, | |
| "loss": 0.3356, | |
| "mean_token_accuracy": 0.904644763469696, | |
| "num_tokens": 1254128.0, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 6.84375, | |
| "learning_rate": 1.6925833957444826e-06, | |
| "loss": 0.4384, | |
| "mean_token_accuracy": 0.8696613907814026, | |
| "num_tokens": 1257246.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.555, | |
| "grad_norm": 5.25, | |
| "learning_rate": 1.656328396312682e-06, | |
| "loss": 0.5397, | |
| "mean_token_accuracy": 0.8403537273406982, | |
| "num_tokens": 1260577.0, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 4.0, | |
| "learning_rate": 1.6204308120024915e-06, | |
| "loss": 0.5642, | |
| "mean_token_accuracy": 0.831739854812622, | |
| "num_tokens": 1262707.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.565, | |
| "grad_norm": 5.71875, | |
| "learning_rate": 1.5848921805469396e-06, | |
| "loss": 0.782, | |
| "mean_token_accuracy": 0.7676750183105469, | |
| "num_tokens": 1266882.0, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 10.625, | |
| "learning_rate": 1.5497140243027198e-06, | |
| "loss": 0.6529, | |
| "mean_token_accuracy": 0.8205275893211365, | |
| "num_tokens": 1268811.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.575, | |
| "grad_norm": 5.46875, | |
| "learning_rate": 1.5148978501849642e-06, | |
| "loss": 0.492, | |
| "mean_token_accuracy": 0.8443620800971985, | |
| "num_tokens": 1272499.0, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 1.4804451496026928e-06, | |
| "loss": 0.5225, | |
| "mean_token_accuracy": 0.8471122026443482, | |
| "num_tokens": 1275704.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.585, | |
| "grad_norm": 5.40625, | |
| "learning_rate": 1.446357398394934e-06, | |
| "loss": 0.4996, | |
| "mean_token_accuracy": 0.8760560512542724, | |
| "num_tokens": 1277875.0, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 12.1875, | |
| "learning_rate": 1.4126360567674946e-06, | |
| "loss": 0.7877, | |
| "mean_token_accuracy": 0.7944910049438476, | |
| "num_tokens": 1280557.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.5949999999999998, | |
| "grad_norm": 10.125, | |
| "learning_rate": 1.379282569230419e-06, | |
| "loss": 0.5377, | |
| "mean_token_accuracy": 0.8431946516036988, | |
| "num_tokens": 1283379.0, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 7.09375, | |
| "learning_rate": 1.3462983645360994e-06, | |
| "loss": 0.6152, | |
| "mean_token_accuracy": 0.7838948607444763, | |
| "num_tokens": 1286317.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.605, | |
| "grad_norm": 5.75, | |
| "learning_rate": 1.3136848556180893e-06, | |
| "loss": 0.6216, | |
| "mean_token_accuracy": 0.8219085574150086, | |
| "num_tokens": 1289902.0, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 10.0, | |
| "learning_rate": 1.281443439530562e-06, | |
| "loss": 0.792, | |
| "mean_token_accuracy": 0.7688913822174073, | |
| "num_tokens": 1292298.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.615, | |
| "grad_norm": 9.625, | |
| "learning_rate": 1.2495754973884766e-06, | |
| "loss": 0.7977, | |
| "mean_token_accuracy": 0.7754817247390747, | |
| "num_tokens": 1295435.0, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 14.625, | |
| "learning_rate": 1.2180823943084076e-06, | |
| "loss": 0.7828, | |
| "mean_token_accuracy": 0.7664639234542847, | |
| "num_tokens": 1296353.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.625, | |
| "grad_norm": 7.34375, | |
| "learning_rate": 1.1869654793500784e-06, | |
| "loss": 0.7631, | |
| "mean_token_accuracy": 0.7716354012489319, | |
| "num_tokens": 1298031.0, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 5.5, | |
| "learning_rate": 1.156226085458556e-06, | |
| "loss": 0.4382, | |
| "mean_token_accuracy": 0.8708669185638428, | |
| "num_tokens": 1301255.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.635, | |
| "grad_norm": 24.0, | |
| "learning_rate": 1.1258655294071686e-06, | |
| "loss": 0.5946, | |
| "mean_token_accuracy": 0.8110897779464722, | |
| "num_tokens": 1303391.0, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 8.375, | |
| "learning_rate": 1.0958851117410874e-06, | |
| "loss": 0.5818, | |
| "mean_token_accuracy": 0.8356985807418823, | |
| "num_tokens": 1304908.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.645, | |
| "grad_norm": 16.5, | |
| "learning_rate": 1.0662861167216243e-06, | |
| "loss": 0.9185, | |
| "mean_token_accuracy": 0.7712666034698487, | |
| "num_tokens": 1306457.0, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 5.3125, | |
| "learning_rate": 1.0370698122712131e-06, | |
| "loss": 0.4105, | |
| "mean_token_accuracy": 0.8678780198097229, | |
| "num_tokens": 1310791.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.6550000000000002, | |
| "grad_norm": 18.625, | |
| "learning_rate": 1.0082374499190961e-06, | |
| "loss": 0.6534, | |
| "mean_token_accuracy": 0.826017415523529, | |
| "num_tokens": 1312521.0, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 18.125, | |
| "learning_rate": 9.797902647477175e-07, | |
| "loss": 0.5011, | |
| "mean_token_accuracy": 0.8516492247581482, | |
| "num_tokens": 1314019.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.665, | |
| "grad_norm": 19.5, | |
| "learning_rate": 9.517294753398066e-07, | |
| "loss": 0.3417, | |
| "mean_token_accuracy": 0.9054901719093322, | |
| "num_tokens": 1315720.0, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 17.875, | |
| "learning_rate": 9.240562837261891e-07, | |
| "loss": 0.6058, | |
| "mean_token_accuracy": 0.8199878454208374, | |
| "num_tokens": 1316657.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.675, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 8.967718753342902e-07, | |
| "loss": 0.7952, | |
| "mean_token_accuracy": 0.7606085181236267, | |
| "num_tokens": 1318961.0, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 7.375, | |
| "learning_rate": 8.698774189373571e-07, | |
| "loss": 0.4273, | |
| "mean_token_accuracy": 0.8727764368057251, | |
| "num_tokens": 1321286.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.685, | |
| "grad_norm": 4.875, | |
| "learning_rate": 8.433740666043899e-07, | |
| "loss": 0.5555, | |
| "mean_token_accuracy": 0.8267341732978821, | |
| "num_tokens": 1325500.0, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 7.75, | |
| "learning_rate": 8.172629536507915e-07, | |
| "loss": 0.459, | |
| "mean_token_accuracy": 0.8654767513275147, | |
| "num_tokens": 1327274.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.695, | |
| "grad_norm": 27.0, | |
| "learning_rate": 7.915451985897382e-07, | |
| "loss": 1.0706, | |
| "mean_token_accuracy": 0.6949961304664611, | |
| "num_tokens": 1328978.0, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 6.875, | |
| "learning_rate": 7.662219030842588e-07, | |
| "loss": 0.5883, | |
| "mean_token_accuracy": 0.8402994990348815, | |
| "num_tokens": 1331301.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.705, | |
| "grad_norm": 7.4375, | |
| "learning_rate": 7.412941519000527e-07, | |
| "loss": 0.7718, | |
| "mean_token_accuracy": 0.7737884640693664, | |
| "num_tokens": 1334311.0, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 20.375, | |
| "learning_rate": 7.167630128590131e-07, | |
| "loss": 0.6655, | |
| "mean_token_accuracy": 0.8034143686294556, | |
| "num_tokens": 1335913.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.715, | |
| "grad_norm": 5.875, | |
| "learning_rate": 6.92629536793491e-07, | |
| "loss": 0.6347, | |
| "mean_token_accuracy": 0.8110605478286743, | |
| "num_tokens": 1338621.0, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 22.375, | |
| "learning_rate": 6.688947575012794e-07, | |
| "loss": 0.5813, | |
| "mean_token_accuracy": 0.8388025999069214, | |
| "num_tokens": 1341731.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.725, | |
| "grad_norm": 25.0, | |
| "learning_rate": 6.455596917013274e-07, | |
| "loss": 0.6901, | |
| "mean_token_accuracy": 0.8210200190544128, | |
| "num_tokens": 1343282.0, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 6.226253389901882e-07, | |
| "loss": 0.7357, | |
| "mean_token_accuracy": 0.7805762529373169, | |
| "num_tokens": 1346733.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.735, | |
| "grad_norm": 7.53125, | |
| "learning_rate": 6.000926817991992e-07, | |
| "loss": 0.715, | |
| "mean_token_accuracy": 0.7933832883834839, | |
| "num_tokens": 1349395.0, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 8.1875, | |
| "learning_rate": 5.779626853524023e-07, | |
| "loss": 0.4749, | |
| "mean_token_accuracy": 0.8691505193710327, | |
| "num_tokens": 1351191.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.745, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 5.562362976251901e-07, | |
| "loss": 0.556, | |
| "mean_token_accuracy": 0.8356801629066467, | |
| "num_tokens": 1353846.0, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 11.1875, | |
| "learning_rate": 5.349144493037017e-07, | |
| "loss": 0.5487, | |
| "mean_token_accuracy": 0.8493734836578369, | |
| "num_tokens": 1355697.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.755, | |
| "grad_norm": 23.75, | |
| "learning_rate": 5.13998053744954e-07, | |
| "loss": 0.7255, | |
| "mean_token_accuracy": 0.8080898642539978, | |
| "num_tokens": 1357081.0, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 5.65625, | |
| "learning_rate": 4.934880069377179e-07, | |
| "loss": 0.6731, | |
| "mean_token_accuracy": 0.8096045136451722, | |
| "num_tokens": 1360537.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.765, | |
| "grad_norm": 7.5, | |
| "learning_rate": 4.733851874641382e-07, | |
| "loss": 0.7519, | |
| "mean_token_accuracy": 0.7882884621620179, | |
| "num_tokens": 1363841.0, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 4.536904564620892e-07, | |
| "loss": 0.5781, | |
| "mean_token_accuracy": 0.8222225427627563, | |
| "num_tokens": 1367406.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.775, | |
| "grad_norm": 15.0625, | |
| "learning_rate": 4.344046575883021e-07, | |
| "loss": 0.6944, | |
| "mean_token_accuracy": 0.7947175741195679, | |
| "num_tokens": 1369306.0, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 2.7800000000000002, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 4.1552861698220927e-07, | |
| "loss": 0.5703, | |
| "mean_token_accuracy": 0.8330890297889709, | |
| "num_tokens": 1372042.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.785, | |
| "grad_norm": 14.625, | |
| "learning_rate": 3.9706314323056936e-07, | |
| "loss": 0.5702, | |
| "mean_token_accuracy": 0.8367876529693603, | |
| "num_tokens": 1373975.0, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 26.875, | |
| "learning_rate": 3.7900902733281843e-07, | |
| "loss": 0.7193, | |
| "mean_token_accuracy": 0.8044403076171875, | |
| "num_tokens": 1376187.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.795, | |
| "grad_norm": 17.5, | |
| "learning_rate": 3.6136704266719115e-07, | |
| "loss": 0.3683, | |
| "mean_token_accuracy": 0.8847111463546753, | |
| "num_tokens": 1378803.0, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 5.78125, | |
| "learning_rate": 3.441379449575943e-07, | |
| "loss": 0.5358, | |
| "mean_token_accuracy": 0.8456088662147522, | |
| "num_tokens": 1381997.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.805, | |
| "grad_norm": 5.96875, | |
| "learning_rate": 3.273224722412327e-07, | |
| "loss": 0.6415, | |
| "mean_token_accuracy": 0.8274478316307068, | |
| "num_tokens": 1384359.0, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 3.1092134483698966e-07, | |
| "loss": 0.6165, | |
| "mean_token_accuracy": 0.8140155911445618, | |
| "num_tokens": 1385707.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.815, | |
| "grad_norm": 22.875, | |
| "learning_rate": 2.949352653145754e-07, | |
| "loss": 0.4145, | |
| "mean_token_accuracy": 0.8786633729934692, | |
| "num_tokens": 1388446.0, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 4.78125, | |
| "learning_rate": 2.793649184644331e-07, | |
| "loss": 0.5485, | |
| "mean_token_accuracy": 0.8479639649391174, | |
| "num_tokens": 1390738.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.825, | |
| "grad_norm": 6.65625, | |
| "learning_rate": 2.6421097126839714e-07, | |
| "loss": 0.6989, | |
| "mean_token_accuracy": 0.7954578876495362, | |
| "num_tokens": 1394232.0, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 5.3125, | |
| "learning_rate": 2.49474072871132e-07, | |
| "loss": 0.6715, | |
| "mean_token_accuracy": 0.8026228666305542, | |
| "num_tokens": 1396421.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.835, | |
| "grad_norm": 40.25, | |
| "learning_rate": 2.3515485455231412e-07, | |
| "loss": 0.6224, | |
| "mean_token_accuracy": 0.8251531958580017, | |
| "num_tokens": 1399233.0, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 39.0, | |
| "learning_rate": 2.212539296995997e-07, | |
| "loss": 0.6011, | |
| "mean_token_accuracy": 0.8397654533386231, | |
| "num_tokens": 1401387.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.8449999999999998, | |
| "grad_norm": 8.125, | |
| "learning_rate": 2.077718937823414e-07, | |
| "loss": 0.4666, | |
| "mean_token_accuracy": 0.8580923438072204, | |
| "num_tokens": 1404363.0, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 26.25, | |
| "learning_rate": 1.9470932432609001e-07, | |
| "loss": 0.5946, | |
| "mean_token_accuracy": 0.8188249826431274, | |
| "num_tokens": 1406182.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.855, | |
| "grad_norm": 13.625, | |
| "learning_rate": 1.820667808878429e-07, | |
| "loss": 0.5073, | |
| "mean_token_accuracy": 0.8545669674873352, | |
| "num_tokens": 1409661.0, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 22.625, | |
| "learning_rate": 1.6984480503208445e-07, | |
| "loss": 0.5162, | |
| "mean_token_accuracy": 0.8625428080558777, | |
| "num_tokens": 1412766.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.865, | |
| "grad_norm": 23.375, | |
| "learning_rate": 1.580439203075812e-07, | |
| "loss": 0.6426, | |
| "mean_token_accuracy": 0.8121456384658814, | |
| "num_tokens": 1414551.0, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 14.625, | |
| "learning_rate": 1.4666463222495875e-07, | |
| "loss": 0.5375, | |
| "mean_token_accuracy": 0.8589481472969055, | |
| "num_tokens": 1417753.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.875, | |
| "grad_norm": 6.71875, | |
| "learning_rate": 1.357074282350457e-07, | |
| "loss": 0.7809, | |
| "mean_token_accuracy": 0.7673912167549133, | |
| "num_tokens": 1420667.0, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 7.375, | |
| "learning_rate": 1.2517277770799142e-07, | |
| "loss": 0.6226, | |
| "mean_token_accuracy": 0.8142449855804443, | |
| "num_tokens": 1423042.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.885, | |
| "grad_norm": 102.5, | |
| "learning_rate": 1.1506113191316447e-07, | |
| "loss": 0.5121, | |
| "mean_token_accuracy": 0.8586719989776611, | |
| "num_tokens": 1425381.0, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 16.125, | |
| "learning_rate": 1.053729239998158e-07, | |
| "loss": 0.4504, | |
| "mean_token_accuracy": 0.887079381942749, | |
| "num_tokens": 1427853.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.895, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 9.610856897852683e-08, | |
| "loss": 0.6264, | |
| "mean_token_accuracy": 0.832252037525177, | |
| "num_tokens": 1429454.0, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 18.375, | |
| "learning_rate": 8.726846370343267e-08, | |
| "loss": 0.7333, | |
| "mean_token_accuracy": 0.8016393065452576, | |
| "num_tokens": 1431278.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.9050000000000002, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 7.885298685522235e-08, | |
| "loss": 0.634, | |
| "mean_token_accuracy": 0.8164662957191468, | |
| "num_tokens": 1434290.0, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 4.5, | |
| "learning_rate": 7.086249892491292e-08, | |
| "loss": 0.6649, | |
| "mean_token_accuracy": 0.8010025978088379, | |
| "num_tokens": 1437234.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.915, | |
| "grad_norm": 23.625, | |
| "learning_rate": 6.32973421984151e-08, | |
| "loss": 0.4797, | |
| "mean_token_accuracy": 0.8689961910247803, | |
| "num_tokens": 1440016.0, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 7.59375, | |
| "learning_rate": 5.615784074186281e-08, | |
| "loss": 0.6124, | |
| "mean_token_accuracy": 0.8135050177574158, | |
| "num_tokens": 1444074.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.925, | |
| "grad_norm": 18.5, | |
| "learning_rate": 4.944430038773762e-08, | |
| "loss": 0.4905, | |
| "mean_token_accuracy": 0.8594370007514953, | |
| "num_tokens": 1446622.0, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 34.75, | |
| "learning_rate": 4.315700872176254e-08, | |
| "loss": 0.5769, | |
| "mean_token_accuracy": 0.8277747988700866, | |
| "num_tokens": 1449915.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.935, | |
| "grad_norm": 7.6875, | |
| "learning_rate": 3.729623507058744e-08, | |
| "loss": 0.4319, | |
| "mean_token_accuracy": 0.8636555314064026, | |
| "num_tokens": 1453832.0, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 16.25, | |
| "learning_rate": 3.1862230490249394e-08, | |
| "loss": 0.4257, | |
| "mean_token_accuracy": 0.8717685580253601, | |
| "num_tokens": 1456670.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.945, | |
| "grad_norm": 6.90625, | |
| "learning_rate": 2.6855227755419046e-08, | |
| "loss": 0.6175, | |
| "mean_token_accuracy": 0.8333387851715088, | |
| "num_tokens": 1459182.0, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 16.375, | |
| "learning_rate": 2.2275441349428607e-08, | |
| "loss": 0.6274, | |
| "mean_token_accuracy": 0.8260146617889405, | |
| "num_tokens": 1460880.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.955, | |
| "grad_norm": 4.90625, | |
| "learning_rate": 1.812306745508474e-08, | |
| "loss": 0.4596, | |
| "mean_token_accuracy": 0.8644664764404297, | |
| "num_tokens": 1464261.0, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 6.84375, | |
| "learning_rate": 1.439828394626641e-08, | |
| "loss": 0.8734, | |
| "mean_token_accuracy": 0.7530173778533935, | |
| "num_tokens": 1465974.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.965, | |
| "grad_norm": 25.25, | |
| "learning_rate": 1.1101250380300965e-08, | |
| "loss": 0.6501, | |
| "mean_token_accuracy": 0.8143182754516601, | |
| "num_tokens": 1468379.0, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 2.9699999999999998, | |
| "grad_norm": 6.03125, | |
| "learning_rate": 8.232107991131833e-09, | |
| "loss": 0.565, | |
| "mean_token_accuracy": 0.831293773651123, | |
| "num_tokens": 1471515.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.975, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 5.790979683271136e-09, | |
| "loss": 0.6732, | |
| "mean_token_accuracy": 0.8122606992721557, | |
| "num_tokens": 1473375.0, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 20.125, | |
| "learning_rate": 3.777970026531685e-09, | |
| "loss": 0.4793, | |
| "mean_token_accuracy": 0.8602708101272583, | |
| "num_tokens": 1476487.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.985, | |
| "grad_norm": 32.25, | |
| "learning_rate": 2.193165251545004e-09, | |
| "loss": 0.7171, | |
| "mean_token_accuracy": 0.790663480758667, | |
| "num_tokens": 1478477.0, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 4.75, | |
| "learning_rate": 1.0366332460753913e-09, | |
| "loss": 0.6874, | |
| "mean_token_accuracy": 0.8349518656730652, | |
| "num_tokens": 1480618.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.995, | |
| "grad_norm": 8.625, | |
| "learning_rate": 3.0842355210336515e-10, | |
| "loss": 0.9018, | |
| "mean_token_accuracy": 0.7254812955856323, | |
| "num_tokens": 1483656.0, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 9.125, | |
| "learning_rate": 8.567363708467114e-12, | |
| "loss": 0.5487, | |
| "mean_token_accuracy": 0.8382766008377075, | |
| "num_tokens": 1487052.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 3000, | |
| "total_flos": 2.8742560290791424e+16, | |
| "train_loss": 0.8452948161760966, | |
| "train_runtime": 60165.202, | |
| "train_samples_per_second": 0.05, | |
| "train_steps_per_second": 0.05 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 3000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.8742560290791424e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |