{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9955156950672646,
  "eval_steps": 500,
  "global_step": 74,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013452914798206279,
      "grad_norm": 95.4926986694336,
      "learning_rate": 5.0000000000000004e-08,
      "loss": 12.2856,
      "step": 1
    },
    {
      "epoch": 0.026905829596412557,
      "grad_norm": 93.69285583496094,
      "learning_rate": 1.0000000000000001e-07,
      "loss": 12.2383,
      "step": 2
    },
    {
      "epoch": 0.04035874439461883,
      "grad_norm": 95.09840393066406,
      "learning_rate": 1.5000000000000002e-07,
      "loss": 12.1293,
      "step": 3
    },
    {
      "epoch": 0.053811659192825115,
      "grad_norm": 95.04216766357422,
      "learning_rate": 2.0000000000000002e-07,
      "loss": 12.1453,
      "step": 4
    },
    {
      "epoch": 0.06726457399103139,
      "grad_norm": 93.44210052490234,
      "learning_rate": 2.5000000000000004e-07,
      "loss": 12.165,
      "step": 5
    },
    {
      "epoch": 0.08071748878923767,
      "grad_norm": 93.28514862060547,
      "learning_rate": 3.0000000000000004e-07,
      "loss": 12.063,
      "step": 6
    },
    {
      "epoch": 0.09417040358744394,
      "grad_norm": 95.3654556274414,
      "learning_rate": 3.5000000000000004e-07,
      "loss": 11.9676,
      "step": 7
    },
    {
      "epoch": 0.10762331838565023,
      "grad_norm": 96.05154418945312,
      "learning_rate": 4.0000000000000003e-07,
      "loss": 12.0911,
      "step": 8
    },
    {
      "epoch": 0.1210762331838565,
      "grad_norm": 97.69881439208984,
      "learning_rate": 4.5000000000000003e-07,
      "loss": 12.0717,
      "step": 9
    },
    {
      "epoch": 0.13452914798206278,
      "grad_norm": 95.55254364013672,
      "learning_rate": 5.000000000000001e-07,
      "loss": 11.9752,
      "step": 10
    },
    {
      "epoch": 0.14798206278026907,
      "grad_norm": 95.9182357788086,
      "learning_rate": 5.5e-07,
      "loss": 11.9413,
      "step": 11
    },
    {
      "epoch": 0.16143497757847533,
      "grad_norm": 95.40771484375,
      "learning_rate": 6.000000000000001e-07,
      "loss": 11.7523,
      "step": 12
    },
    {
      "epoch": 0.17488789237668162,
      "grad_norm": 94.40055847167969,
      "learning_rate": 6.5e-07,
      "loss": 11.6384,
      "step": 13
    },
    {
      "epoch": 0.18834080717488788,
      "grad_norm": 93.58352661132812,
      "learning_rate": 7.000000000000001e-07,
      "loss": 11.4293,
      "step": 14
    },
    {
      "epoch": 0.20179372197309417,
      "grad_norm": 94.48737335205078,
      "learning_rate": 7.5e-07,
      "loss": 11.1445,
      "step": 15
    },
    {
      "epoch": 0.21524663677130046,
      "grad_norm": 92.44265747070312,
      "learning_rate": 8.000000000000001e-07,
      "loss": 10.7705,
      "step": 16
    },
    {
      "epoch": 0.22869955156950672,
      "grad_norm": 90.97422790527344,
      "learning_rate": 8.500000000000001e-07,
      "loss": 10.3754,
      "step": 17
    },
    {
      "epoch": 0.242152466367713,
      "grad_norm": 88.54856872558594,
      "learning_rate": 9.000000000000001e-07,
      "loss": 10.0019,
      "step": 18
    },
    {
      "epoch": 0.2556053811659193,
      "grad_norm": 88.39138793945312,
      "learning_rate": 9.500000000000001e-07,
      "loss": 9.5093,
      "step": 19
    },
    {
      "epoch": 0.26905829596412556,
      "grad_norm": 86.55109405517578,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 9.2342,
      "step": 20
    },
    {
      "epoch": 0.2825112107623318,
      "grad_norm": 80.62335205078125,
      "learning_rate": 1.0500000000000001e-06,
      "loss": 8.597,
      "step": 21
    },
    {
      "epoch": 0.29596412556053814,
      "grad_norm": 73.67768859863281,
      "learning_rate": 1.1e-06,
      "loss": 8.1122,
      "step": 22
    },
    {
      "epoch": 0.3094170403587444,
      "grad_norm": 64.57353210449219,
      "learning_rate": 1.1500000000000002e-06,
      "loss": 7.6455,
      "step": 23
    },
    {
      "epoch": 0.32286995515695066,
      "grad_norm": 55.2818603515625,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 7.2493,
      "step": 24
    },
    {
      "epoch": 0.336322869955157,
      "grad_norm": 48.274452209472656,
      "learning_rate": 1.25e-06,
      "loss": 7.0377,
      "step": 25
    },
    {
      "epoch": 0.34977578475336324,
      "grad_norm": 42.7370491027832,
      "learning_rate": 1.3e-06,
      "loss": 6.5782,
      "step": 26
    },
    {
      "epoch": 0.3632286995515695,
      "grad_norm": 39.297462463378906,
      "learning_rate": 1.3500000000000002e-06,
      "loss": 6.2558,
      "step": 27
    },
    {
      "epoch": 0.37668161434977576,
      "grad_norm": 37.91667938232422,
      "learning_rate": 1.4000000000000001e-06,
      "loss": 5.9809,
      "step": 28
    },
    {
      "epoch": 0.3901345291479821,
      "grad_norm": 37.87322998046875,
      "learning_rate": 1.45e-06,
      "loss": 5.7268,
      "step": 29
    },
    {
      "epoch": 0.40358744394618834,
      "grad_norm": 36.48906707763672,
      "learning_rate": 1.5e-06,
      "loss": 5.449,
      "step": 30
    },
    {
      "epoch": 0.4170403587443946,
      "grad_norm": 36.38510513305664,
      "learning_rate": 1.5500000000000002e-06,
      "loss": 5.1884,
      "step": 31
    },
    {
      "epoch": 0.4304932735426009,
      "grad_norm": 35.656829833984375,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 4.899,
      "step": 32
    },
    {
      "epoch": 0.4439461883408072,
      "grad_norm": 34.09960174560547,
      "learning_rate": 1.6500000000000003e-06,
      "loss": 4.5842,
      "step": 33
    },
    {
      "epoch": 0.45739910313901344,
      "grad_norm": 32.74240493774414,
      "learning_rate": 1.7000000000000002e-06,
      "loss": 4.3009,
      "step": 34
    },
    {
      "epoch": 0.47085201793721976,
      "grad_norm": 31.867507934570312,
      "learning_rate": 1.75e-06,
      "loss": 3.9865,
      "step": 35
    },
    {
      "epoch": 0.484304932735426,
      "grad_norm": 30.741374969482422,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 3.6916,
      "step": 36
    },
    {
      "epoch": 0.4977578475336323,
      "grad_norm": 27.8775577545166,
      "learning_rate": 1.85e-06,
      "loss": 3.3719,
      "step": 37
    },
    {
      "epoch": 0.5112107623318386,
      "grad_norm": 25.97083282470703,
      "learning_rate": 1.9000000000000002e-06,
      "loss": 3.0907,
      "step": 38
    },
    {
      "epoch": 0.5246636771300448,
      "grad_norm": 23.62006950378418,
      "learning_rate": 1.9500000000000004e-06,
      "loss": 2.8336,
      "step": 39
    },
    {
      "epoch": 0.5381165919282511,
      "grad_norm": 23.80520248413086,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 2.5717,
      "step": 40
    },
    {
      "epoch": 0.5515695067264574,
      "grad_norm": 25.32924461364746,
      "learning_rate": 2.05e-06,
      "loss": 2.3658,
      "step": 41
    },
    {
      "epoch": 0.5650224215246636,
      "grad_norm": 26.20570182800293,
      "learning_rate": 2.1000000000000002e-06,
      "loss": 2.2443,
      "step": 42
    },
    {
      "epoch": 0.57847533632287,
      "grad_norm": 24.581693649291992,
      "learning_rate": 2.15e-06,
      "loss": 1.926,
      "step": 43
    },
    {
      "epoch": 0.5919282511210763,
      "grad_norm": 24.414310455322266,
      "learning_rate": 2.2e-06,
      "loss": 1.7034,
      "step": 44
    },
    {
      "epoch": 0.6053811659192825,
      "grad_norm": 22.691083908081055,
      "learning_rate": 2.25e-06,
      "loss": 1.4857,
      "step": 45
    },
    {
      "epoch": 0.6188340807174888,
      "grad_norm": 20.669803619384766,
      "learning_rate": 2.3000000000000004e-06,
      "loss": 1.2415,
      "step": 46
    },
    {
      "epoch": 0.6322869955156951,
      "grad_norm": 20.149641036987305,
      "learning_rate": 2.35e-06,
      "loss": 0.997,
      "step": 47
    },
    {
      "epoch": 0.6457399103139013,
      "grad_norm": 18.632596969604492,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 0.7552,
      "step": 48
    },
    {
      "epoch": 0.6591928251121076,
      "grad_norm": 16.93793296813965,
      "learning_rate": 2.4500000000000003e-06,
      "loss": 0.5883,
      "step": 49
    },
    {
      "epoch": 0.672645739910314,
      "grad_norm": 14.432519912719727,
      "learning_rate": 2.5e-06,
      "loss": 0.4382,
      "step": 50
    },
    {
      "epoch": 0.6860986547085202,
      "grad_norm": 11.829660415649414,
      "learning_rate": 2.55e-06,
      "loss": 0.2983,
      "step": 51
    },
    {
      "epoch": 0.6995515695067265,
      "grad_norm": 8.680500030517578,
      "learning_rate": 2.6e-06,
      "loss": 0.1988,
      "step": 52
    },
    {
      "epoch": 0.7130044843049327,
      "grad_norm": 6.53156852722168,
      "learning_rate": 2.6500000000000005e-06,
      "loss": 0.1589,
      "step": 53
    },
    {
      "epoch": 0.726457399103139,
      "grad_norm": 2.9756624698638916,
      "learning_rate": 2.7000000000000004e-06,
      "loss": 0.0686,
      "step": 54
    },
    {
      "epoch": 0.7399103139013453,
      "grad_norm": 5.545580863952637,
      "learning_rate": 2.7500000000000004e-06,
      "loss": 0.0865,
      "step": 55
    },
    {
      "epoch": 0.7533632286995515,
      "grad_norm": 4.045405387878418,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 0.0949,
      "step": 56
    },
    {
      "epoch": 0.7668161434977578,
      "grad_norm": 1.6688120365142822,
      "learning_rate": 2.85e-06,
      "loss": 0.0396,
      "step": 57
    },
    {
      "epoch": 0.7802690582959642,
      "grad_norm": 2.4520657062530518,
      "learning_rate": 2.9e-06,
      "loss": 0.0439,
      "step": 58
    },
    {
      "epoch": 0.7937219730941704,
      "grad_norm": 2.608729600906372,
      "learning_rate": 2.95e-06,
      "loss": 0.057,
      "step": 59
    },
    {
      "epoch": 0.8071748878923767,
      "grad_norm": 2.365234851837158,
      "learning_rate": 3e-06,
      "loss": 0.0547,
      "step": 60
    },
    {
      "epoch": 0.820627802690583,
      "grad_norm": 0.787550687789917,
      "learning_rate": 3.05e-06,
      "loss": 0.0209,
      "step": 61
    },
    {
      "epoch": 0.8340807174887892,
      "grad_norm": 0.7686442732810974,
      "learning_rate": 3.1000000000000004e-06,
      "loss": 0.0221,
      "step": 62
    },
    {
      "epoch": 0.8475336322869955,
      "grad_norm": 1.2510555982589722,
      "learning_rate": 3.1500000000000003e-06,
      "loss": 0.0165,
      "step": 63
    },
    {
      "epoch": 0.8609865470852018,
      "grad_norm": 0.8923770189285278,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 0.0187,
      "step": 64
    },
    {
      "epoch": 0.874439461883408,
      "grad_norm": 0.8052615523338318,
      "learning_rate": 3.2500000000000002e-06,
      "loss": 0.0266,
      "step": 65
    },
    {
      "epoch": 0.8878923766816144,
      "grad_norm": 0.6710303425788879,
      "learning_rate": 3.3000000000000006e-06,
      "loss": 0.0154,
      "step": 66
    },
    {
      "epoch": 0.9013452914798207,
      "grad_norm": 0.5213025212287903,
      "learning_rate": 3.3500000000000005e-06,
      "loss": 0.0085,
      "step": 67
    },
    {
      "epoch": 0.9147982062780269,
      "grad_norm": 0.5758580565452576,
      "learning_rate": 3.4000000000000005e-06,
      "loss": 0.0133,
      "step": 68
    },
    {
      "epoch": 0.9282511210762332,
      "grad_norm": 0.6828752160072327,
      "learning_rate": 3.45e-06,
      "loss": 0.0186,
      "step": 69
    },
    {
      "epoch": 0.9417040358744395,
      "grad_norm": 0.6814988255500793,
      "learning_rate": 3.5e-06,
      "loss": 0.0215,
      "step": 70
    },
    {
      "epoch": 0.9551569506726457,
      "grad_norm": 0.718296229839325,
      "learning_rate": 3.5500000000000003e-06,
      "loss": 0.0204,
      "step": 71
    },
    {
      "epoch": 0.968609865470852,
      "grad_norm": 0.7816944122314453,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 0.0184,
      "step": 72
    },
    {
      "epoch": 0.9820627802690582,
      "grad_norm": 0.6058817505836487,
      "learning_rate": 3.65e-06,
      "loss": 0.0179,
      "step": 73
    },
    {
      "epoch": 0.9955156950672646,
      "grad_norm": 1.0496101379394531,
      "learning_rate": 3.7e-06,
      "loss": 0.032,
      "step": 74
    }
  ],
  "logging_steps": 1,
  "max_steps": 444,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 74,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.6577058229662515e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
|
|