{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 420, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03571428571428571, "grad_norm": 18.0, "learning_rate": 2.9963385849481866e-05, "loss": 1.9707, "step": 5 }, { "epoch": 0.07142857142857142, "grad_norm": 0.59375, "learning_rate": 6.74176181613342e-05, "loss": 0.1802, "step": 10 }, { "epoch": 0.10714285714285714, "grad_norm": 0.16796875, "learning_rate": 0.00010487185047318654, "loss": 0.0095, "step": 15 }, { "epoch": 0.14285714285714285, "grad_norm": 0.1689453125, "learning_rate": 0.00014232608278503885, "loss": 0.0048, "step": 20 }, { "epoch": 0.17857142857142858, "grad_norm": 0.1376953125, "learning_rate": 0.0001797803150968912, "loss": 0.0021, "step": 25 }, { "epoch": 0.21428571428571427, "grad_norm": 2.09375, "learning_rate": 0.00021723454740874357, "loss": 0.0031, "step": 30 }, { "epoch": 0.25, "grad_norm": 4.3125, "learning_rate": 0.00025468877972059585, "loss": 0.1383, "step": 35 }, { "epoch": 0.2857142857142857, "grad_norm": 1.6171875, "learning_rate": 0.00026212725891418574, "loss": 0.3519, "step": 40 }, { "epoch": 0.32142857142857145, "grad_norm": 0.486328125, "learning_rate": 0.000261914612489722, "loss": 0.1419, "step": 45 }, { "epoch": 0.35714285714285715, "grad_norm": 0.35546875, "learning_rate": 0.0002615387676118368, "loss": 0.0417, "step": 50 }, { "epoch": 0.39285714285714285, "grad_norm": 0.1455078125, "learning_rate": 0.00026100034983723763, "loss": 0.0181, "step": 55 }, { "epoch": 0.42857142857142855, "grad_norm": 0.11865234375, "learning_rate": 0.0002603002553091743, "loss": 0.0084, "step": 60 }, { "epoch": 0.4642857142857143, "grad_norm": 46.75, "learning_rate": 0.0002594396492658969, "loss": 0.0052, "step": 65 }, { "epoch": 0.5, "grad_norm": 0.1572265625, "learning_rate": 0.00025841996410123214, "loss": 0.0115, "step": 70 }, { "epoch": 0.5357142857142857, "grad_norm": 0.2294921875, "learning_rate": 0.0002572428969805046, "loss": 0.0064, "step": 75 }, { "epoch": 0.5714285714285714, "grad_norm": 0.1494140625, "learning_rate": 0.000255910407015773, "loss": 0.0044, "step": 80 }, { "epoch": 0.6071428571428571, "grad_norm": 0.5859375, "learning_rate": 0.00025442471200508136, "loss": 0.0047, "step": 85 }, { "epoch": 0.6428571428571429, "grad_norm": 0.423828125, "learning_rate": 0.00025278828474115296, "loss": 0.0058, "step": 90 }, { "epoch": 0.6785714285714286, "grad_norm": 0.37109375, "learning_rate": 0.00025100384889567096, "loss": 0.0874, "step": 95 }, { "epoch": 0.7142857142857143, "grad_norm": 0.1416015625, "learning_rate": 0.00024907437448599555, "loss": 0.0046, "step": 100 }, { "epoch": 0.75, "grad_norm": 0.07177734375, "learning_rate": 0.0002470030729318636, "loss": 0.0041, "step": 105 }, { "epoch": 0.7857142857142857, "grad_norm": 0.1884765625, "learning_rate": 0.0002447933917102969, "loss": 0.0021, "step": 110 }, { "epoch": 0.8214285714285714, "grad_norm": 0.0947265625, "learning_rate": 0.00024244900861761706, "loss": 0.004, "step": 115 }, { "epoch": 0.8571428571428571, "grad_norm": 0.1767578125, "learning_rate": 0.00023997382564811678, "loss": 0.0025, "step": 120 }, { "epoch": 0.8928571428571429, "grad_norm": 0.171875, "learning_rate": 0.00023737196249957492, "loss": 0.0057, "step": 125 }, { "epoch": 0.9285714285714286, "grad_norm": 0.158203125, "learning_rate": 0.00023464774971642655, "loss": 0.004, "step": 130 }, { "epoch": 0.9642857142857143, "grad_norm": 0.1982421875, "learning_rate": 0.00023180572148199898, "loss": 0.0057, "step": 135 }, { "epoch": 1.0, "grad_norm": 0.0966796875, "learning_rate": 0.00022885060807181098, "loss": 0.0176, "step": 140 }, { "epoch": 1.0, "eval_loss": 0.0011085456935688853, "eval_runtime": 0.4827, "eval_samples_per_second": 6.215, "eval_steps_per_second": 6.215, "step": 140 }, { "epoch": 1.0357142857142858, "grad_norm": 0.23046875, "learning_rate": 0.00022578732798049614, "loss": 0.0055, "step": 145 }, { "epoch": 1.0714285714285714, "grad_norm": 0.07958984375, "learning_rate": 0.00022262097973545393, "loss": 0.002, "step": 150 }, { "epoch": 1.1071428571428572, "grad_norm": 0.060791015625, "learning_rate": 0.00021935683341085392, "loss": 0.002, "step": 155 }, { "epoch": 1.1428571428571428, "grad_norm": 0.216796875, "learning_rate": 0.00021600032185611751, "loss": 0.0019, "step": 160 }, { "epoch": 1.1785714285714286, "grad_norm": 0.072265625, "learning_rate": 0.00021255703165347625, "loss": 0.0014, "step": 165 }, { "epoch": 1.2142857142857142, "grad_norm": 0.04541015625, "learning_rate": 0.00020903269381965703, "loss": 0.0016, "step": 170 }, { "epoch": 1.25, "grad_norm": 0.05126953125, "learning_rate": 0.00020543317426717028, "loss": 0.0019, "step": 175 }, { "epoch": 1.2857142857142856, "grad_norm": 0.091796875, "learning_rate": 0.00020176446404107748, "loss": 0.0015, "step": 180 }, { "epoch": 1.3214285714285714, "grad_norm": 0.0927734375, "learning_rate": 0.00019803266934748806, "loss": 0.0043, "step": 185 }, { "epoch": 1.3571428571428572, "grad_norm": 0.83203125, "learning_rate": 0.00019424400139038178, "loss": 0.0091, "step": 190 }, { "epoch": 1.3928571428571428, "grad_norm": 0.1083984375, "learning_rate": 0.00019040476603367275, "loss": 0.0028, "step": 195 }, { "epoch": 1.4285714285714286, "grad_norm": 0.0966796875, "learning_rate": 0.0001865213533057213, "loss": 0.002, "step": 200 }, { "epoch": 1.4642857142857144, "grad_norm": 0.051025390625, "learning_rate": 0.00018260022676376233, "loss": 0.0016, "step": 205 }, { "epoch": 1.5, "grad_norm": 0.0400390625, "learning_rate": 0.00017864791273595223, "loss": 0.0015, "step": 210 }, { "epoch": 1.5357142857142856, "grad_norm": 0.2392578125, "learning_rate": 0.00017467098945893973, "loss": 0.0007, "step": 215 }, { "epoch": 1.5714285714285714, "grad_norm": 0.0234375, "learning_rate": 0.00017067607612904011, "loss": 0.001, "step": 220 }, { "epoch": 1.6071428571428572, "grad_norm": 0.02294921875, "learning_rate": 0.00016666982188523637, "loss": 0.0012, "step": 225 }, { "epoch": 1.6428571428571428, "grad_norm": 0.140625, "learning_rate": 0.0001626588947423435, "loss": 0.0011, "step": 230 }, { "epoch": 1.6785714285714286, "grad_norm": 0.0220947265625, "learning_rate": 0.00015864997049275565, "loss": 0.0006, "step": 235 }, { "epoch": 1.7142857142857144, "grad_norm": 0.041748046875, "learning_rate": 0.00015464972159524894, "loss": 0.0009, "step": 240 }, { "epoch": 1.75, "grad_norm": 0.01904296875, "learning_rate": 0.0001506648060693321, "loss": 0.0008, "step": 245 }, { "epoch": 1.7857142857142856, "grad_norm": 0.0194091796875, "learning_rate": 0.00014670185641362996, "loss": 0.0005, "step": 250 }, { "epoch": 1.8214285714285714, "grad_norm": 0.033203125, "learning_rate": 0.00014276746856674427, "loss": 0.0009, "step": 255 }, { "epoch": 1.8571428571428572, "grad_norm": 0.026611328125, "learning_rate": 0.0001388681909289647, "loss": 0.0005, "step": 260 }, { "epoch": 1.8928571428571428, "grad_norm": 0.03369140625, "learning_rate": 0.00013501051346310284, "loss": 0.0007, "step": 265 }, { "epoch": 1.9285714285714286, "grad_norm": 0.0238037109375, "learning_rate": 0.00013120085689258973, "loss": 0.0009, "step": 270 }, { "epoch": 1.9642857142857144, "grad_norm": 0.03369140625, "learning_rate": 0.0001274455620148151, "loss": 0.0005, "step": 275 }, { "epoch": 2.0, "grad_norm": 0.01348876953125, "learning_rate": 0.00012375087914749612, "loss": 0.0005, "step": 280 }, { "epoch": 2.0, "eval_loss": 5.337938273441978e-05, "eval_runtime": 0.3375, "eval_samples_per_second": 8.888, "eval_steps_per_second": 8.888, "step": 280 }, { "epoch": 2.0357142857142856, "grad_norm": 0.0233154296875, "learning_rate": 0.00012012295772564025, "loss": 0.0002, "step": 285 }, { "epoch": 2.0714285714285716, "grad_norm": 0.007293701171875, "learning_rate": 0.00011656783606641763, "loss": 0.0004, "step": 290 }, { "epoch": 2.107142857142857, "grad_norm": 0.0274658203125, "learning_rate": 0.00011309143131897807, "loss": 0.0002, "step": 295 }, { "epoch": 2.142857142857143, "grad_norm": 0.0130615234375, "learning_rate": 0.00010969952961594018, "loss": 0.0006, "step": 300 }, { "epoch": 2.1785714285714284, "grad_norm": 0.01116943359375, "learning_rate": 0.00010639777644294472, "loss": 0.0002, "step": 305 }, { "epoch": 2.2142857142857144, "grad_norm": 0.01190185546875, "learning_rate": 0.00010319166724230088, "loss": 0.0006, "step": 310 }, { "epoch": 2.25, "grad_norm": 0.0020904541015625, "learning_rate": 0.00010008653826636486, "loss": 0.0002, "step": 315 }, { "epoch": 2.2857142857142856, "grad_norm": 0.0498046875, "learning_rate": 9.708755769587456e-05, "loss": 0.0002, "step": 320 }, { "epoch": 2.3214285714285716, "grad_norm": 0.0026397705078125, "learning_rate": 9.419971703802265e-05, "loss": 0.0005, "step": 325 }, { "epoch": 2.357142857142857, "grad_norm": 0.00628662109375, "learning_rate": 9.142782281858553e-05, "loss": 0.0002, "step": 330 }, { "epoch": 2.392857142857143, "grad_norm": 0.0036468505859375, "learning_rate": 8.877648858193555e-05, "loss": 0.0002, "step": 335 }, { "epoch": 2.4285714285714284, "grad_norm": 0.1787109375, "learning_rate": 8.625012721225163e-05, "loss": 0.0006, "step": 340 }, { "epoch": 2.4642857142857144, "grad_norm": 0.0076904296875, "learning_rate": 8.385294358870913e-05, "loss": 0.0002, "step": 345 }, { "epoch": 2.5, "grad_norm": 0.0250244140625, "learning_rate": 8.15889275868734e-05, "loss": 0.0001, "step": 350 }, { "epoch": 2.5357142857142856, "grad_norm": 0.00335693359375, "learning_rate": 7.946184743794559e-05, "loss": 0.0002, "step": 355 }, { "epoch": 2.571428571428571, "grad_norm": 0.005584716796875, "learning_rate": 7.747524345691386e-05, "loss": 0.0002, "step": 360 }, { "epoch": 2.607142857142857, "grad_norm": 0.011474609375, "learning_rate": 7.563242215004834e-05, "loss": 0.0004, "step": 365 }, { "epoch": 2.642857142857143, "grad_norm": 0.005126953125, "learning_rate": 7.39364507115478e-05, "loss": 0.0002, "step": 370 }, { "epoch": 2.678571428571429, "grad_norm": 0.00164031982421875, "learning_rate": 7.239015191849763e-05, "loss": 0.0001, "step": 375 }, { "epoch": 2.7142857142857144, "grad_norm": 2.265625, "learning_rate": 7.099609943263594e-05, "loss": 0.0003, "step": 380 }, { "epoch": 2.75, "grad_norm": 0.024658203125, "learning_rate": 6.975661351674754e-05, "loss": 0.0001, "step": 385 }, { "epoch": 2.7857142857142856, "grad_norm": 0.0120849609375, "learning_rate": 6.86737571728156e-05, "loss": 0.0002, "step": 390 }, { "epoch": 2.821428571428571, "grad_norm": 0.004791259765625, "learning_rate": 6.774933270835844e-05, "loss": 0.0003, "step": 395 }, { "epoch": 2.857142857142857, "grad_norm": 0.00946044921875, "learning_rate": 6.698487873666642e-05, "loss": 0.0001, "step": 400 }, { "epoch": 2.892857142857143, "grad_norm": 0.003082275390625, "learning_rate": 6.638166761593213e-05, "loss": 0.0002, "step": 405 }, { "epoch": 2.928571428571429, "grad_norm": 0.0189208984375, "learning_rate": 6.594070333153561e-05, "loss": 0.0002, "step": 410 }, { "epoch": 2.9642857142857144, "grad_norm": 0.0224609375, "learning_rate": 6.566271982500987e-05, "loss": 0.0005, "step": 415 }, { "epoch": 3.0, "grad_norm": 0.0037841796875, "learning_rate": 6.554817977246755e-05, "loss": 0.0002, "step": 420 }, { "epoch": 3.0, "eval_loss": 5.2707153372466564e-05, "eval_runtime": 0.343, "eval_samples_per_second": 8.746, "eval_steps_per_second": 8.746, "step": 420 } ], "logging_steps": 5, "max_steps": 420, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.221798892876595e+17, "train_batch_size": 48, "trial_name": null, "trial_params": null }