{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9955156950672646, "eval_steps": 500, "global_step": 74, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013452914798206279, "grad_norm": 95.4926986694336, "learning_rate": 5.0000000000000004e-08, "loss": 12.2856, "step": 1 }, { "epoch": 0.026905829596412557, "grad_norm": 93.69285583496094, "learning_rate": 1.0000000000000001e-07, "loss": 12.2383, "step": 2 }, { "epoch": 0.04035874439461883, "grad_norm": 95.09840393066406, "learning_rate": 1.5000000000000002e-07, "loss": 12.1293, "step": 3 }, { "epoch": 0.053811659192825115, "grad_norm": 95.04216766357422, "learning_rate": 2.0000000000000002e-07, "loss": 12.1453, "step": 4 }, { "epoch": 0.06726457399103139, "grad_norm": 93.44210052490234, "learning_rate": 2.5000000000000004e-07, "loss": 12.165, "step": 5 }, { "epoch": 0.08071748878923767, "grad_norm": 93.28514862060547, "learning_rate": 3.0000000000000004e-07, "loss": 12.063, "step": 6 }, { "epoch": 0.09417040358744394, "grad_norm": 95.3654556274414, "learning_rate": 3.5000000000000004e-07, "loss": 11.9676, "step": 7 }, { "epoch": 0.10762331838565023, "grad_norm": 96.05154418945312, "learning_rate": 4.0000000000000003e-07, "loss": 12.0911, "step": 8 }, { "epoch": 0.1210762331838565, "grad_norm": 97.69881439208984, "learning_rate": 4.5000000000000003e-07, "loss": 12.0717, "step": 9 }, { "epoch": 0.13452914798206278, "grad_norm": 95.55254364013672, "learning_rate": 5.000000000000001e-07, "loss": 11.9752, "step": 10 }, { "epoch": 0.14798206278026907, "grad_norm": 95.9182357788086, "learning_rate": 5.5e-07, "loss": 11.9413, "step": 11 }, { "epoch": 0.16143497757847533, "grad_norm": 95.40771484375, "learning_rate": 6.000000000000001e-07, "loss": 11.7523, "step": 12 }, { "epoch": 0.17488789237668162, "grad_norm": 94.40055847167969, "learning_rate": 6.5e-07, "loss": 11.6384, "step": 13 }, { "epoch": 0.18834080717488788, "grad_norm": 93.58352661132812, "learning_rate": 7.000000000000001e-07, "loss": 11.4293, "step": 14 }, { "epoch": 0.20179372197309417, "grad_norm": 94.48737335205078, "learning_rate": 7.5e-07, "loss": 11.1445, "step": 15 }, { "epoch": 0.21524663677130046, "grad_norm": 92.44265747070312, "learning_rate": 8.000000000000001e-07, "loss": 10.7705, "step": 16 }, { "epoch": 0.22869955156950672, "grad_norm": 90.97422790527344, "learning_rate": 8.500000000000001e-07, "loss": 10.3754, "step": 17 }, { "epoch": 0.242152466367713, "grad_norm": 88.54856872558594, "learning_rate": 9.000000000000001e-07, "loss": 10.0019, "step": 18 }, { "epoch": 0.2556053811659193, "grad_norm": 88.39138793945312, "learning_rate": 9.500000000000001e-07, "loss": 9.5093, "step": 19 }, { "epoch": 0.26905829596412556, "grad_norm": 86.55109405517578, "learning_rate": 1.0000000000000002e-06, "loss": 9.2342, "step": 20 }, { "epoch": 0.2825112107623318, "grad_norm": 80.62335205078125, "learning_rate": 1.0500000000000001e-06, "loss": 8.597, "step": 21 }, { "epoch": 0.29596412556053814, "grad_norm": 73.67768859863281, "learning_rate": 1.1e-06, "loss": 8.1122, "step": 22 }, { "epoch": 0.3094170403587444, "grad_norm": 64.57353210449219, "learning_rate": 1.1500000000000002e-06, "loss": 7.6455, "step": 23 }, { "epoch": 0.32286995515695066, "grad_norm": 55.2818603515625, "learning_rate": 1.2000000000000002e-06, "loss": 7.2493, "step": 24 }, { "epoch": 0.336322869955157, "grad_norm": 48.274452209472656, "learning_rate": 1.25e-06, "loss": 7.0377, "step": 25 }, { "epoch": 0.34977578475336324, "grad_norm": 42.7370491027832, "learning_rate": 1.3e-06, "loss": 6.5782, "step": 26 }, { "epoch": 0.3632286995515695, "grad_norm": 39.297462463378906, "learning_rate": 1.3500000000000002e-06, "loss": 6.2558, "step": 27 }, { "epoch": 0.37668161434977576, "grad_norm": 37.91667938232422, "learning_rate": 1.4000000000000001e-06, "loss": 5.9809, "step": 28 }, { "epoch": 0.3901345291479821, "grad_norm": 37.87322998046875, "learning_rate": 1.45e-06, "loss": 5.7268, "step": 29 }, { "epoch": 0.40358744394618834, "grad_norm": 36.48906707763672, "learning_rate": 1.5e-06, "loss": 5.449, "step": 30 }, { "epoch": 0.4170403587443946, "grad_norm": 36.38510513305664, "learning_rate": 1.5500000000000002e-06, "loss": 5.1884, "step": 31 }, { "epoch": 0.4304932735426009, "grad_norm": 35.656829833984375, "learning_rate": 1.6000000000000001e-06, "loss": 4.899, "step": 32 }, { "epoch": 0.4439461883408072, "grad_norm": 34.09960174560547, "learning_rate": 1.6500000000000003e-06, "loss": 4.5842, "step": 33 }, { "epoch": 0.45739910313901344, "grad_norm": 32.74240493774414, "learning_rate": 1.7000000000000002e-06, "loss": 4.3009, "step": 34 }, { "epoch": 0.47085201793721976, "grad_norm": 31.867507934570312, "learning_rate": 1.75e-06, "loss": 3.9865, "step": 35 }, { "epoch": 0.484304932735426, "grad_norm": 30.741374969482422, "learning_rate": 1.8000000000000001e-06, "loss": 3.6916, "step": 36 }, { "epoch": 0.4977578475336323, "grad_norm": 27.8775577545166, "learning_rate": 1.85e-06, "loss": 3.3719, "step": 37 }, { "epoch": 0.5112107623318386, "grad_norm": 25.97083282470703, "learning_rate": 1.9000000000000002e-06, "loss": 3.0907, "step": 38 }, { "epoch": 0.5246636771300448, "grad_norm": 23.62006950378418, "learning_rate": 1.9500000000000004e-06, "loss": 2.8336, "step": 39 }, { "epoch": 0.5381165919282511, "grad_norm": 23.80520248413086, "learning_rate": 2.0000000000000003e-06, "loss": 2.5717, "step": 40 }, { "epoch": 0.5515695067264574, "grad_norm": 25.32924461364746, "learning_rate": 2.05e-06, "loss": 2.3658, "step": 41 }, { "epoch": 0.5650224215246636, "grad_norm": 26.20570182800293, "learning_rate": 2.1000000000000002e-06, "loss": 2.2443, "step": 42 }, { "epoch": 0.57847533632287, "grad_norm": 24.581693649291992, "learning_rate": 2.15e-06, "loss": 1.926, "step": 43 }, { "epoch": 0.5919282511210763, "grad_norm": 24.414310455322266, "learning_rate": 2.2e-06, "loss": 1.7034, "step": 44 }, { "epoch": 0.6053811659192825, "grad_norm": 22.691083908081055, "learning_rate": 2.25e-06, "loss": 1.4857, "step": 45 }, { "epoch": 0.6188340807174888, "grad_norm": 20.669803619384766, "learning_rate": 2.3000000000000004e-06, "loss": 1.2415, "step": 46 }, { "epoch": 0.6322869955156951, "grad_norm": 20.149641036987305, "learning_rate": 2.35e-06, "loss": 0.997, "step": 47 }, { "epoch": 0.6457399103139013, "grad_norm": 18.632596969604492, "learning_rate": 2.4000000000000003e-06, "loss": 0.7552, "step": 48 }, { "epoch": 0.6591928251121076, "grad_norm": 16.93793296813965, "learning_rate": 2.4500000000000003e-06, "loss": 0.5883, "step": 49 }, { "epoch": 0.672645739910314, "grad_norm": 14.432519912719727, "learning_rate": 2.5e-06, "loss": 0.4382, "step": 50 }, { "epoch": 0.6860986547085202, "grad_norm": 11.829660415649414, "learning_rate": 2.55e-06, "loss": 0.2983, "step": 51 }, { "epoch": 0.6995515695067265, "grad_norm": 8.680500030517578, "learning_rate": 2.6e-06, "loss": 0.1988, "step": 52 }, { "epoch": 0.7130044843049327, "grad_norm": 6.53156852722168, "learning_rate": 2.6500000000000005e-06, "loss": 0.1589, "step": 53 }, { "epoch": 0.726457399103139, "grad_norm": 2.9756624698638916, "learning_rate": 2.7000000000000004e-06, "loss": 0.0686, "step": 54 }, { "epoch": 0.7399103139013453, "grad_norm": 5.545580863952637, "learning_rate": 2.7500000000000004e-06, "loss": 0.0865, "step": 55 }, { "epoch": 0.7533632286995515, "grad_norm": 4.045405387878418, "learning_rate": 2.8000000000000003e-06, "loss": 0.0949, "step": 56 }, { "epoch": 0.7668161434977578, "grad_norm": 1.6688120365142822, "learning_rate": 2.85e-06, "loss": 0.0396, "step": 57 }, { "epoch": 0.7802690582959642, "grad_norm": 2.4520657062530518, "learning_rate": 2.9e-06, "loss": 0.0439, "step": 58 }, { "epoch": 0.7937219730941704, "grad_norm": 2.608729600906372, "learning_rate": 2.95e-06, "loss": 0.057, "step": 59 }, { "epoch": 0.8071748878923767, "grad_norm": 2.365234851837158, "learning_rate": 3e-06, "loss": 0.0547, "step": 60 }, { "epoch": 0.820627802690583, "grad_norm": 0.787550687789917, "learning_rate": 3.05e-06, "loss": 0.0209, "step": 61 }, { "epoch": 0.8340807174887892, "grad_norm": 0.7686442732810974, "learning_rate": 3.1000000000000004e-06, "loss": 0.0221, "step": 62 }, { "epoch": 0.8475336322869955, "grad_norm": 1.2510555982589722, "learning_rate": 3.1500000000000003e-06, "loss": 0.0165, "step": 63 }, { "epoch": 0.8609865470852018, "grad_norm": 0.8923770189285278, "learning_rate": 3.2000000000000003e-06, "loss": 0.0187, "step": 64 }, { "epoch": 0.874439461883408, "grad_norm": 0.8052615523338318, "learning_rate": 3.2500000000000002e-06, "loss": 0.0266, "step": 65 }, { "epoch": 0.8878923766816144, "grad_norm": 0.6710303425788879, "learning_rate": 3.3000000000000006e-06, "loss": 0.0154, "step": 66 }, { "epoch": 0.9013452914798207, "grad_norm": 0.5213025212287903, "learning_rate": 3.3500000000000005e-06, "loss": 0.0085, "step": 67 }, { "epoch": 0.9147982062780269, "grad_norm": 0.5758580565452576, "learning_rate": 3.4000000000000005e-06, "loss": 0.0133, "step": 68 }, { "epoch": 0.9282511210762332, "grad_norm": 0.6828752160072327, "learning_rate": 3.45e-06, "loss": 0.0186, "step": 69 }, { "epoch": 0.9417040358744395, "grad_norm": 0.6814988255500793, "learning_rate": 3.5e-06, "loss": 0.0215, "step": 70 }, { "epoch": 0.9551569506726457, "grad_norm": 0.718296229839325, "learning_rate": 3.5500000000000003e-06, "loss": 0.0204, "step": 71 }, { "epoch": 0.968609865470852, "grad_norm": 0.7816944122314453, "learning_rate": 3.6000000000000003e-06, "loss": 0.0184, "step": 72 }, { "epoch": 0.9820627802690582, "grad_norm": 0.6058817505836487, "learning_rate": 3.65e-06, "loss": 0.0179, "step": 73 }, { "epoch": 0.9955156950672646, "grad_norm": 1.0496101379394531, "learning_rate": 3.7e-06, "loss": 0.032, "step": 74 } ], "logging_steps": 1, "max_steps": 444, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 74, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6577058229662515e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }