{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 420,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03571428571428571,
      "grad_norm": 18.0,
      "learning_rate": 2.9963385849481866e-05,
      "loss": 1.9707,
      "step": 5
    },
    {
      "epoch": 0.07142857142857142,
      "grad_norm": 0.59375,
      "learning_rate": 6.74176181613342e-05,
      "loss": 0.1802,
      "step": 10
    },
    {
      "epoch": 0.10714285714285714,
      "grad_norm": 0.16796875,
      "learning_rate": 0.00010487185047318654,
      "loss": 0.0095,
      "step": 15
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 0.1689453125,
      "learning_rate": 0.00014232608278503885,
      "loss": 0.0048,
      "step": 20
    },
    {
      "epoch": 0.17857142857142858,
      "grad_norm": 0.1376953125,
      "learning_rate": 0.0001797803150968912,
      "loss": 0.0021,
      "step": 25
    },
    {
      "epoch": 0.21428571428571427,
      "grad_norm": 2.09375,
      "learning_rate": 0.00021723454740874357,
      "loss": 0.0031,
      "step": 30
    },
    {
      "epoch": 0.25,
      "grad_norm": 4.3125,
      "learning_rate": 0.00025468877972059585,
      "loss": 0.1383,
      "step": 35
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 1.6171875,
      "learning_rate": 0.00026212725891418574,
      "loss": 0.3519,
      "step": 40
    },
    {
      "epoch": 0.32142857142857145,
      "grad_norm": 0.486328125,
      "learning_rate": 0.000261914612489722,
      "loss": 0.1419,
      "step": 45
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 0.35546875,
      "learning_rate": 0.0002615387676118368,
      "loss": 0.0417,
      "step": 50
    },
    {
      "epoch": 0.39285714285714285,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.00026100034983723763,
      "loss": 0.0181,
      "step": 55
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.11865234375,
      "learning_rate": 0.0002603002553091743,
      "loss": 0.0084,
      "step": 60
    },
    {
      "epoch": 0.4642857142857143,
      "grad_norm": 46.75,
      "learning_rate": 0.0002594396492658969,
      "loss": 0.0052,
      "step": 65
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.1572265625,
      "learning_rate": 0.00025841996410123214,
      "loss": 0.0115,
      "step": 70
    },
    {
      "epoch": 0.5357142857142857,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.0002572428969805046,
      "loss": 0.0064,
      "step": 75
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.1494140625,
      "learning_rate": 0.000255910407015773,
      "loss": 0.0044,
      "step": 80
    },
    {
      "epoch": 0.6071428571428571,
      "grad_norm": 0.5859375,
      "learning_rate": 0.00025442471200508136,
      "loss": 0.0047,
      "step": 85
    },
    {
      "epoch": 0.6428571428571429,
      "grad_norm": 0.423828125,
      "learning_rate": 0.00025278828474115296,
      "loss": 0.0058,
      "step": 90
    },
    {
      "epoch": 0.6785714285714286,
      "grad_norm": 0.37109375,
      "learning_rate": 0.00025100384889567096,
      "loss": 0.0874,
      "step": 95
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.1416015625,
      "learning_rate": 0.00024907437448599555,
      "loss": 0.0046,
      "step": 100
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.07177734375,
      "learning_rate": 0.0002470030729318636,
      "loss": 0.0041,
      "step": 105
    },
    {
      "epoch": 0.7857142857142857,
      "grad_norm": 0.1884765625,
      "learning_rate": 0.0002447933917102969,
      "loss": 0.0021,
      "step": 110
    },
    {
      "epoch": 0.8214285714285714,
      "grad_norm": 0.0947265625,
      "learning_rate": 0.00024244900861761706,
      "loss": 0.004,
      "step": 115
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.1767578125,
      "learning_rate": 0.00023997382564811678,
      "loss": 0.0025,
      "step": 120
    },
    {
      "epoch": 0.8928571428571429,
      "grad_norm": 0.171875,
      "learning_rate": 0.00023737196249957492,
      "loss": 0.0057,
      "step": 125
    },
    {
      "epoch": 0.9285714285714286,
      "grad_norm": 0.158203125,
      "learning_rate": 0.00023464774971642655,
      "loss": 0.004,
      "step": 130
    },
    {
      "epoch": 0.9642857142857143,
      "grad_norm": 0.1982421875,
      "learning_rate": 0.00023180572148199898,
      "loss": 0.0057,
      "step": 135
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.0966796875,
      "learning_rate": 0.00022885060807181098,
      "loss": 0.0176,
      "step": 140
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.0011085456935688853,
      "eval_runtime": 0.4827,
      "eval_samples_per_second": 6.215,
      "eval_steps_per_second": 6.215,
      "step": 140
    },
    {
      "epoch": 1.0357142857142858,
      "grad_norm": 0.23046875,
      "learning_rate": 0.00022578732798049614,
      "loss": 0.0055,
      "step": 145
    },
    {
      "epoch": 1.0714285714285714,
      "grad_norm": 0.07958984375,
      "learning_rate": 0.00022262097973545393,
      "loss": 0.002,
      "step": 150
    },
    {
      "epoch": 1.1071428571428572,
      "grad_norm": 0.060791015625,
      "learning_rate": 0.00021935683341085392,
      "loss": 0.002,
      "step": 155
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.216796875,
      "learning_rate": 0.00021600032185611751,
      "loss": 0.0019,
      "step": 160
    },
    {
      "epoch": 1.1785714285714286,
      "grad_norm": 0.072265625,
      "learning_rate": 0.00021255703165347625,
      "loss": 0.0014,
      "step": 165
    },
    {
      "epoch": 1.2142857142857142,
      "grad_norm": 0.04541015625,
      "learning_rate": 0.00020903269381965703,
      "loss": 0.0016,
      "step": 170
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.05126953125,
      "learning_rate": 0.00020543317426717028,
      "loss": 0.0019,
      "step": 175
    },
    {
      "epoch": 1.2857142857142856,
      "grad_norm": 0.091796875,
      "learning_rate": 0.00020176446404107748,
      "loss": 0.0015,
      "step": 180
    },
    {
      "epoch": 1.3214285714285714,
      "grad_norm": 0.0927734375,
      "learning_rate": 0.00019803266934748806,
      "loss": 0.0043,
      "step": 185
    },
    {
      "epoch": 1.3571428571428572,
      "grad_norm": 0.83203125,
      "learning_rate": 0.00019424400139038178,
      "loss": 0.0091,
      "step": 190
    },
    {
      "epoch": 1.3928571428571428,
      "grad_norm": 0.1083984375,
      "learning_rate": 0.00019040476603367275,
      "loss": 0.0028,
      "step": 195
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.0966796875,
      "learning_rate": 0.0001865213533057213,
      "loss": 0.002,
      "step": 200
    },
    {
      "epoch": 1.4642857142857144,
      "grad_norm": 0.051025390625,
      "learning_rate": 0.00018260022676376233,
      "loss": 0.0016,
      "step": 205
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.0400390625,
      "learning_rate": 0.00017864791273595223,
      "loss": 0.0015,
      "step": 210
    },
    {
      "epoch": 1.5357142857142856,
      "grad_norm": 0.2392578125,
      "learning_rate": 0.00017467098945893973,
      "loss": 0.0007,
      "step": 215
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 0.0234375,
      "learning_rate": 0.00017067607612904011,
      "loss": 0.001,
      "step": 220
    },
    {
      "epoch": 1.6071428571428572,
      "grad_norm": 0.02294921875,
      "learning_rate": 0.00016666982188523637,
      "loss": 0.0012,
      "step": 225
    },
    {
      "epoch": 1.6428571428571428,
      "grad_norm": 0.140625,
      "learning_rate": 0.0001626588947423435,
      "loss": 0.0011,
      "step": 230
    },
    {
      "epoch": 1.6785714285714286,
      "grad_norm": 0.0220947265625,
      "learning_rate": 0.00015864997049275565,
      "loss": 0.0006,
      "step": 235
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.041748046875,
      "learning_rate": 0.00015464972159524894,
      "loss": 0.0009,
      "step": 240
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.01904296875,
      "learning_rate": 0.0001506648060693321,
      "loss": 0.0008,
      "step": 245
    },
    {
      "epoch": 1.7857142857142856,
      "grad_norm": 0.0194091796875,
      "learning_rate": 0.00014670185641362996,
      "loss": 0.0005,
      "step": 250
    },
    {
      "epoch": 1.8214285714285714,
      "grad_norm": 0.033203125,
      "learning_rate": 0.00014276746856674427,
      "loss": 0.0009,
      "step": 255
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 0.026611328125,
      "learning_rate": 0.0001388681909289647,
      "loss": 0.0005,
      "step": 260
    },
    {
      "epoch": 1.8928571428571428,
      "grad_norm": 0.03369140625,
      "learning_rate": 0.00013501051346310284,
      "loss": 0.0007,
      "step": 265
    },
    {
      "epoch": 1.9285714285714286,
      "grad_norm": 0.0238037109375,
      "learning_rate": 0.00013120085689258973,
      "loss": 0.0009,
      "step": 270
    },
    {
      "epoch": 1.9642857142857144,
      "grad_norm": 0.03369140625,
      "learning_rate": 0.0001274455620148151,
      "loss": 0.0005,
      "step": 275
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.01348876953125,
      "learning_rate": 0.00012375087914749612,
      "loss": 0.0005,
      "step": 280
    },
    {
      "epoch": 2.0,
      "eval_loss": 5.337938273441978e-05,
      "eval_runtime": 0.3375,
      "eval_samples_per_second": 8.888,
      "eval_steps_per_second": 8.888,
      "step": 280
    },
    {
      "epoch": 2.0357142857142856,
      "grad_norm": 0.0233154296875,
      "learning_rate": 0.00012012295772564025,
      "loss": 0.0002,
      "step": 285
    },
    {
      "epoch": 2.0714285714285716,
      "grad_norm": 0.007293701171875,
      "learning_rate": 0.00011656783606641763,
      "loss": 0.0004,
      "step": 290
    },
    {
      "epoch": 2.107142857142857,
      "grad_norm": 0.0274658203125,
      "learning_rate": 0.00011309143131897807,
      "loss": 0.0002,
      "step": 295
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 0.0130615234375,
      "learning_rate": 0.00010969952961594018,
      "loss": 0.0006,
      "step": 300
    },
    {
      "epoch": 2.1785714285714284,
      "grad_norm": 0.01116943359375,
      "learning_rate": 0.00010639777644294472,
      "loss": 0.0002,
      "step": 305
    },
    {
      "epoch": 2.2142857142857144,
      "grad_norm": 0.01190185546875,
      "learning_rate": 0.00010319166724230088,
      "loss": 0.0006,
      "step": 310
    },
    {
      "epoch": 2.25,
      "grad_norm": 0.0020904541015625,
      "learning_rate": 0.00010008653826636486,
      "loss": 0.0002,
      "step": 315
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.0498046875,
      "learning_rate": 9.708755769587456e-05,
      "loss": 0.0002,
      "step": 320
    },
    {
      "epoch": 2.3214285714285716,
      "grad_norm": 0.0026397705078125,
      "learning_rate": 9.419971703802265e-05,
      "loss": 0.0005,
      "step": 325
    },
    {
      "epoch": 2.357142857142857,
      "grad_norm": 0.00628662109375,
      "learning_rate": 9.142782281858553e-05,
      "loss": 0.0002,
      "step": 330
    },
    {
      "epoch": 2.392857142857143,
      "grad_norm": 0.0036468505859375,
      "learning_rate": 8.877648858193555e-05,
      "loss": 0.0002,
      "step": 335
    },
    {
      "epoch": 2.4285714285714284,
      "grad_norm": 0.1787109375,
      "learning_rate": 8.625012721225163e-05,
      "loss": 0.0006,
      "step": 340
    },
    {
      "epoch": 2.4642857142857144,
      "grad_norm": 0.0076904296875,
      "learning_rate": 8.385294358870913e-05,
      "loss": 0.0002,
      "step": 345
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.0250244140625,
      "learning_rate": 8.15889275868734e-05,
      "loss": 0.0001,
      "step": 350
    },
    {
      "epoch": 2.5357142857142856,
      "grad_norm": 0.00335693359375,
      "learning_rate": 7.946184743794559e-05,
      "loss": 0.0002,
      "step": 355
    },
    {
      "epoch": 2.571428571428571,
      "grad_norm": 0.005584716796875,
      "learning_rate": 7.747524345691386e-05,
      "loss": 0.0002,
      "step": 360
    },
    {
      "epoch": 2.607142857142857,
      "grad_norm": 0.011474609375,
      "learning_rate": 7.563242215004834e-05,
      "loss": 0.0004,
      "step": 365
    },
    {
      "epoch": 2.642857142857143,
      "grad_norm": 0.005126953125,
      "learning_rate": 7.39364507115478e-05,
      "loss": 0.0002,
      "step": 370
    },
    {
      "epoch": 2.678571428571429,
      "grad_norm": 0.00164031982421875,
      "learning_rate": 7.239015191849763e-05,
      "loss": 0.0001,
      "step": 375
    },
    {
      "epoch": 2.7142857142857144,
      "grad_norm": 2.265625,
      "learning_rate": 7.099609943263594e-05,
      "loss": 0.0003,
      "step": 380
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.024658203125,
      "learning_rate": 6.975661351674754e-05,
      "loss": 0.0001,
      "step": 385
    },
    {
      "epoch": 2.7857142857142856,
      "grad_norm": 0.0120849609375,
      "learning_rate": 6.86737571728156e-05,
      "loss": 0.0002,
      "step": 390
    },
    {
      "epoch": 2.821428571428571,
      "grad_norm": 0.004791259765625,
      "learning_rate": 6.774933270835844e-05,
      "loss": 0.0003,
      "step": 395
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.00946044921875,
      "learning_rate": 6.698487873666642e-05,
      "loss": 0.0001,
      "step": 400
    },
    {
      "epoch": 2.892857142857143,
      "grad_norm": 0.003082275390625,
      "learning_rate": 6.638166761593213e-05,
      "loss": 0.0002,
      "step": 405
    },
    {
      "epoch": 2.928571428571429,
      "grad_norm": 0.0189208984375,
      "learning_rate": 6.594070333153561e-05,
      "loss": 0.0002,
      "step": 410
    },
    {
      "epoch": 2.9642857142857144,
      "grad_norm": 0.0224609375,
      "learning_rate": 6.566271982500987e-05,
      "loss": 0.0005,
      "step": 415
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.0037841796875,
      "learning_rate": 6.554817977246755e-05,
      "loss": 0.0002,
      "step": 420
    },
    {
      "epoch": 3.0,
      "eval_loss": 5.2707153372466564e-05,
      "eval_runtime": 0.343,
      "eval_samples_per_second": 8.746,
      "eval_steps_per_second": 8.746,
      "step": 420
    }
  ],
  "logging_steps": 5,
  "max_steps": 420,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.221798892876595e+17,
  "train_batch_size": 48,
  "trial_name": null,
  "trial_params": null
}