{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9551569506726456, "eval_steps": 500, "global_step": 296, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013452914798206279, "grad_norm": 95.4926986694336, "learning_rate": 5.0000000000000004e-08, "loss": 12.2856, "step": 1 }, { "epoch": 0.026905829596412557, "grad_norm": 93.69285583496094, "learning_rate": 1.0000000000000001e-07, "loss": 12.2383, "step": 2 }, { "epoch": 0.04035874439461883, "grad_norm": 95.09840393066406, "learning_rate": 1.5000000000000002e-07, "loss": 12.1293, "step": 3 }, { "epoch": 0.053811659192825115, "grad_norm": 95.04216766357422, "learning_rate": 2.0000000000000002e-07, "loss": 12.1453, "step": 4 }, { "epoch": 0.06726457399103139, "grad_norm": 93.44210052490234, "learning_rate": 2.5000000000000004e-07, "loss": 12.165, "step": 5 }, { "epoch": 0.08071748878923767, "grad_norm": 93.28514862060547, "learning_rate": 3.0000000000000004e-07, "loss": 12.063, "step": 6 }, { "epoch": 0.09417040358744394, "grad_norm": 95.3654556274414, "learning_rate": 3.5000000000000004e-07, "loss": 11.9676, "step": 7 }, { "epoch": 0.10762331838565023, "grad_norm": 96.05154418945312, "learning_rate": 4.0000000000000003e-07, "loss": 12.0911, "step": 8 }, { "epoch": 0.1210762331838565, "grad_norm": 97.69881439208984, "learning_rate": 4.5000000000000003e-07, "loss": 12.0717, "step": 9 }, { "epoch": 0.13452914798206278, "grad_norm": 95.55254364013672, "learning_rate": 5.000000000000001e-07, "loss": 11.9752, "step": 10 }, { "epoch": 0.14798206278026907, "grad_norm": 95.9182357788086, "learning_rate": 5.5e-07, "loss": 11.9413, "step": 11 }, { "epoch": 0.16143497757847533, "grad_norm": 95.40771484375, "learning_rate": 6.000000000000001e-07, "loss": 11.7523, "step": 12 }, { "epoch": 0.17488789237668162, "grad_norm": 94.40055847167969, "learning_rate": 6.5e-07, "loss": 11.6384, "step": 13 }, { "epoch": 0.18834080717488788, "grad_norm": 93.58352661132812, "learning_rate": 7.000000000000001e-07, "loss": 11.4293, "step": 14 }, { "epoch": 0.20179372197309417, "grad_norm": 94.48737335205078, "learning_rate": 7.5e-07, "loss": 11.1445, "step": 15 }, { "epoch": 0.21524663677130046, "grad_norm": 92.44265747070312, "learning_rate": 8.000000000000001e-07, "loss": 10.7705, "step": 16 }, { "epoch": 0.22869955156950672, "grad_norm": 90.97422790527344, "learning_rate": 8.500000000000001e-07, "loss": 10.3754, "step": 17 }, { "epoch": 0.242152466367713, "grad_norm": 88.54856872558594, "learning_rate": 9.000000000000001e-07, "loss": 10.0019, "step": 18 }, { "epoch": 0.2556053811659193, "grad_norm": 88.39138793945312, "learning_rate": 9.500000000000001e-07, "loss": 9.5093, "step": 19 }, { "epoch": 0.26905829596412556, "grad_norm": 86.55109405517578, "learning_rate": 1.0000000000000002e-06, "loss": 9.2342, "step": 20 }, { "epoch": 0.2825112107623318, "grad_norm": 80.62335205078125, "learning_rate": 1.0500000000000001e-06, "loss": 8.597, "step": 21 }, { "epoch": 0.29596412556053814, "grad_norm": 73.67768859863281, "learning_rate": 1.1e-06, "loss": 8.1122, "step": 22 }, { "epoch": 0.3094170403587444, "grad_norm": 64.57353210449219, "learning_rate": 1.1500000000000002e-06, "loss": 7.6455, "step": 23 }, { "epoch": 0.32286995515695066, "grad_norm": 55.2818603515625, "learning_rate": 1.2000000000000002e-06, "loss": 7.2493, "step": 24 }, { "epoch": 0.336322869955157, "grad_norm": 48.274452209472656, "learning_rate": 1.25e-06, "loss": 7.0377, "step": 25 }, { "epoch": 0.34977578475336324, "grad_norm": 42.7370491027832, "learning_rate": 1.3e-06, "loss": 6.5782, "step": 26 }, { "epoch": 0.3632286995515695, "grad_norm": 39.297462463378906, "learning_rate": 1.3500000000000002e-06, "loss": 6.2558, "step": 27 }, { "epoch": 0.37668161434977576, "grad_norm": 37.91667938232422, "learning_rate": 1.4000000000000001e-06, "loss": 5.9809, "step": 28 }, { "epoch": 0.3901345291479821, "grad_norm": 37.87322998046875, "learning_rate": 1.45e-06, "loss": 5.7268, "step": 29 }, { "epoch": 0.40358744394618834, "grad_norm": 36.48906707763672, "learning_rate": 1.5e-06, "loss": 5.449, "step": 30 }, { "epoch": 0.4170403587443946, "grad_norm": 36.38510513305664, "learning_rate": 1.5500000000000002e-06, "loss": 5.1884, "step": 31 }, { "epoch": 0.4304932735426009, "grad_norm": 35.656829833984375, "learning_rate": 1.6000000000000001e-06, "loss": 4.899, "step": 32 }, { "epoch": 0.4439461883408072, "grad_norm": 34.09960174560547, "learning_rate": 1.6500000000000003e-06, "loss": 4.5842, "step": 33 }, { "epoch": 0.45739910313901344, "grad_norm": 32.74240493774414, "learning_rate": 1.7000000000000002e-06, "loss": 4.3009, "step": 34 }, { "epoch": 0.47085201793721976, "grad_norm": 31.867507934570312, "learning_rate": 1.75e-06, "loss": 3.9865, "step": 35 }, { "epoch": 0.484304932735426, "grad_norm": 30.741374969482422, "learning_rate": 1.8000000000000001e-06, "loss": 3.6916, "step": 36 }, { "epoch": 0.4977578475336323, "grad_norm": 27.8775577545166, "learning_rate": 1.85e-06, "loss": 3.3719, "step": 37 }, { "epoch": 0.5112107623318386, "grad_norm": 25.97083282470703, "learning_rate": 1.9000000000000002e-06, "loss": 3.0907, "step": 38 }, { "epoch": 0.5246636771300448, "grad_norm": 23.62006950378418, "learning_rate": 1.9500000000000004e-06, "loss": 2.8336, "step": 39 }, { "epoch": 0.5381165919282511, "grad_norm": 23.80520248413086, "learning_rate": 2.0000000000000003e-06, "loss": 2.5717, "step": 40 }, { "epoch": 0.5515695067264574, "grad_norm": 25.32924461364746, "learning_rate": 2.05e-06, "loss": 2.3658, "step": 41 }, { "epoch": 0.5650224215246636, "grad_norm": 26.20570182800293, "learning_rate": 2.1000000000000002e-06, "loss": 2.2443, "step": 42 }, { "epoch": 0.57847533632287, "grad_norm": 24.581693649291992, "learning_rate": 2.15e-06, "loss": 1.926, "step": 43 }, { "epoch": 0.5919282511210763, "grad_norm": 24.414310455322266, "learning_rate": 2.2e-06, "loss": 1.7034, "step": 44 }, { "epoch": 0.6053811659192825, "grad_norm": 22.691083908081055, "learning_rate": 2.25e-06, "loss": 1.4857, "step": 45 }, { "epoch": 0.6188340807174888, "grad_norm": 20.669803619384766, "learning_rate": 2.3000000000000004e-06, "loss": 1.2415, "step": 46 }, { "epoch": 0.6322869955156951, "grad_norm": 20.149641036987305, "learning_rate": 2.35e-06, "loss": 0.997, "step": 47 }, { "epoch": 0.6457399103139013, "grad_norm": 18.632596969604492, "learning_rate": 2.4000000000000003e-06, "loss": 0.7552, "step": 48 }, { "epoch": 0.6591928251121076, "grad_norm": 16.93793296813965, "learning_rate": 2.4500000000000003e-06, "loss": 0.5883, "step": 49 }, { "epoch": 0.672645739910314, "grad_norm": 14.432519912719727, "learning_rate": 2.5e-06, "loss": 0.4382, "step": 50 }, { "epoch": 0.6860986547085202, "grad_norm": 11.829660415649414, "learning_rate": 2.55e-06, "loss": 0.2983, "step": 51 }, { "epoch": 0.6995515695067265, "grad_norm": 8.680500030517578, "learning_rate": 2.6e-06, "loss": 0.1988, "step": 52 }, { "epoch": 0.7130044843049327, "grad_norm": 6.53156852722168, "learning_rate": 2.6500000000000005e-06, "loss": 0.1589, "step": 53 }, { "epoch": 0.726457399103139, "grad_norm": 2.9756624698638916, "learning_rate": 2.7000000000000004e-06, "loss": 0.0686, "step": 54 }, { "epoch": 0.7399103139013453, "grad_norm": 5.545580863952637, "learning_rate": 2.7500000000000004e-06, "loss": 0.0865, "step": 55 }, { "epoch": 0.7533632286995515, "grad_norm": 4.045405387878418, "learning_rate": 2.8000000000000003e-06, "loss": 0.0949, "step": 56 }, { "epoch": 0.7668161434977578, "grad_norm": 1.6688120365142822, "learning_rate": 2.85e-06, "loss": 0.0396, "step": 57 }, { "epoch": 0.7802690582959642, "grad_norm": 2.4520657062530518, "learning_rate": 2.9e-06, "loss": 0.0439, "step": 58 }, { "epoch": 0.7937219730941704, "grad_norm": 2.608729600906372, "learning_rate": 2.95e-06, "loss": 0.057, "step": 59 }, { "epoch": 0.8071748878923767, "grad_norm": 2.365234851837158, "learning_rate": 3e-06, "loss": 0.0547, "step": 60 }, { "epoch": 0.820627802690583, "grad_norm": 0.787550687789917, "learning_rate": 3.05e-06, "loss": 0.0209, "step": 61 }, { "epoch": 0.8340807174887892, "grad_norm": 0.7686442732810974, "learning_rate": 3.1000000000000004e-06, "loss": 0.0221, "step": 62 }, { "epoch": 0.8475336322869955, "grad_norm": 1.2510555982589722, "learning_rate": 3.1500000000000003e-06, "loss": 0.0165, "step": 63 }, { "epoch": 0.8609865470852018, "grad_norm": 0.8923770189285278, "learning_rate": 3.2000000000000003e-06, "loss": 0.0187, "step": 64 }, { "epoch": 0.874439461883408, "grad_norm": 0.8052615523338318, "learning_rate": 3.2500000000000002e-06, "loss": 0.0266, "step": 65 }, { "epoch": 0.8878923766816144, "grad_norm": 0.6710303425788879, "learning_rate": 3.3000000000000006e-06, "loss": 0.0154, "step": 66 }, { "epoch": 0.9013452914798207, "grad_norm": 0.5213025212287903, "learning_rate": 3.3500000000000005e-06, "loss": 0.0085, "step": 67 }, { "epoch": 0.9147982062780269, "grad_norm": 0.5758580565452576, "learning_rate": 3.4000000000000005e-06, "loss": 0.0133, "step": 68 }, { "epoch": 0.9282511210762332, "grad_norm": 0.6828752160072327, "learning_rate": 3.45e-06, "loss": 0.0186, "step": 69 }, { "epoch": 0.9417040358744395, "grad_norm": 0.6814988255500793, "learning_rate": 3.5e-06, "loss": 0.0215, "step": 70 }, { "epoch": 0.9551569506726457, "grad_norm": 0.718296229839325, "learning_rate": 3.5500000000000003e-06, "loss": 0.0204, "step": 71 }, { "epoch": 0.968609865470852, "grad_norm": 0.7816944122314453, "learning_rate": 3.6000000000000003e-06, "loss": 0.0184, "step": 72 }, { "epoch": 0.9820627802690582, "grad_norm": 0.6058817505836487, "learning_rate": 3.65e-06, "loss": 0.0179, "step": 73 }, { "epoch": 0.9955156950672646, "grad_norm": 1.0496101379394531, "learning_rate": 3.7e-06, "loss": 0.032, "step": 74 }, { "epoch": 1.0, "grad_norm": 1.0496101379394531, "learning_rate": 3.7500000000000005e-06, "loss": 0.0137, "step": 75 }, { "epoch": 1.0134529147982063, "grad_norm": 1.598720669746399, "learning_rate": 3.8000000000000005e-06, "loss": 0.0211, "step": 76 }, { "epoch": 1.0269058295964126, "grad_norm": 0.7792187333106995, "learning_rate": 3.85e-06, "loss": 0.0213, "step": 77 }, { "epoch": 1.0403587443946187, "grad_norm": 0.7717252373695374, "learning_rate": 3.900000000000001e-06, "loss": 0.0151, "step": 78 }, { "epoch": 1.053811659192825, "grad_norm": 0.39334648847579956, "learning_rate": 3.95e-06, "loss": 0.0186, "step": 79 }, { "epoch": 1.0672645739910314, "grad_norm": 0.9775457382202148, "learning_rate": 4.000000000000001e-06, "loss": 0.0043, "step": 80 }, { "epoch": 1.0807174887892377, "grad_norm": 1.2425150871276855, "learning_rate": 4.05e-06, "loss": 0.0179, "step": 81 }, { "epoch": 1.094170403587444, "grad_norm": 0.3884654939174652, "learning_rate": 4.1e-06, "loss": 0.008, "step": 82 }, { "epoch": 1.1076233183856503, "grad_norm": 0.4746466875076294, "learning_rate": 4.15e-06, "loss": 0.0144, "step": 83 }, { "epoch": 1.1210762331838564, "grad_norm": 0.6812214255332947, "learning_rate": 4.2000000000000004e-06, "loss": 0.0072, "step": 84 }, { "epoch": 1.1345291479820627, "grad_norm": 0.5414469838142395, "learning_rate": 4.25e-06, "loss": 0.016, "step": 85 }, { "epoch": 1.147982062780269, "grad_norm": 0.8709024786949158, "learning_rate": 4.3e-06, "loss": 0.0154, "step": 86 }, { "epoch": 1.1614349775784754, "grad_norm": 0.3181096613407135, "learning_rate": 4.350000000000001e-06, "loss": 0.0039, "step": 87 }, { "epoch": 1.1748878923766817, "grad_norm": 0.2581265866756439, "learning_rate": 4.4e-06, "loss": 0.0014, "step": 88 }, { "epoch": 1.188340807174888, "grad_norm": 0.10870776325464249, "learning_rate": 4.450000000000001e-06, "loss": 0.0088, "step": 89 }, { "epoch": 1.201793721973094, "grad_norm": 0.45314452052116394, "learning_rate": 4.5e-06, "loss": 0.0019, "step": 90 }, { "epoch": 1.2152466367713004, "grad_norm": 0.7328381538391113, "learning_rate": 4.5500000000000005e-06, "loss": 0.0082, "step": 91 }, { "epoch": 1.2286995515695067, "grad_norm": 0.6641069650650024, "learning_rate": 4.600000000000001e-06, "loss": 0.0075, "step": 92 }, { "epoch": 1.242152466367713, "grad_norm": 0.27984222769737244, "learning_rate": 4.65e-06, "loss": 0.0009, "step": 93 }, { "epoch": 1.2556053811659194, "grad_norm": 0.8341127634048462, "learning_rate": 4.7e-06, "loss": 0.0204, "step": 94 }, { "epoch": 1.2690582959641254, "grad_norm": 1.0140557289123535, "learning_rate": 4.75e-06, "loss": 0.0228, "step": 95 }, { "epoch": 1.2825112107623318, "grad_norm": 0.9439787268638611, "learning_rate": 4.800000000000001e-06, "loss": 0.027, "step": 96 }, { "epoch": 1.295964125560538, "grad_norm": 1.3867762088775635, "learning_rate": 4.85e-06, "loss": 0.0223, "step": 97 }, { "epoch": 1.3094170403587444, "grad_norm": 0.4747941493988037, "learning_rate": 4.9000000000000005e-06, "loss": 0.0048, "step": 98 }, { "epoch": 1.3228699551569507, "grad_norm": 0.5673424601554871, "learning_rate": 4.95e-06, "loss": 0.0033, "step": 99 }, { "epoch": 1.336322869955157, "grad_norm": 0.24510182440280914, "learning_rate": 5e-06, "loss": 0.0048, "step": 100 }, { "epoch": 1.3497757847533634, "grad_norm": 0.6084151268005371, "learning_rate": 4.99989574668946e-06, "loss": 0.032, "step": 101 }, { "epoch": 1.3632286995515694, "grad_norm": 0.606272280216217, "learning_rate": 4.999582995452842e-06, "loss": 0.0058, "step": 102 }, { "epoch": 1.3766816143497758, "grad_norm": 0.2270481288433075, "learning_rate": 4.999061772374426e-06, "loss": 0.0043, "step": 103 }, { "epoch": 1.390134529147982, "grad_norm": 0.2832431495189667, "learning_rate": 4.998332120925598e-06, "loss": 0.0079, "step": 104 }, { "epoch": 1.4035874439461884, "grad_norm": 0.44565466046333313, "learning_rate": 4.9973941019612235e-06, "loss": 0.017, "step": 105 }, { "epoch": 1.4170403587443947, "grad_norm": 0.3518397808074951, "learning_rate": 4.996247793714565e-06, "loss": 0.0057, "step": 106 }, { "epoch": 1.4304932735426008, "grad_norm": 0.41977858543395996, "learning_rate": 4.994893291790768e-06, "loss": 0.0056, "step": 107 }, { "epoch": 1.4439461883408071, "grad_norm": 0.22567600011825562, "learning_rate": 4.993330709158879e-06, "loss": 0.0067, "step": 108 }, { "epoch": 1.4573991031390134, "grad_norm": 0.2184637039899826, "learning_rate": 4.9915601761424304e-06, "loss": 0.0043, "step": 109 }, { "epoch": 1.4708520179372198, "grad_norm": 0.24834735691547394, "learning_rate": 4.989581840408562e-06, "loss": 0.0034, "step": 110 }, { "epoch": 1.484304932735426, "grad_norm": 0.13165591657161713, "learning_rate": 4.987395866955716e-06, "loss": 0.0041, "step": 111 }, { "epoch": 1.4977578475336322, "grad_norm": 0.27152982354164124, "learning_rate": 4.9850024380998655e-06, "loss": 0.0034, "step": 112 }, { "epoch": 1.5112107623318387, "grad_norm": 0.2793160676956177, "learning_rate": 4.982401753459317e-06, "loss": 0.0049, "step": 113 }, { "epoch": 1.5246636771300448, "grad_norm": 0.06674113124608994, "learning_rate": 4.979594029938058e-06, "loss": 0.0034, "step": 114 }, { "epoch": 1.5381165919282511, "grad_norm": 0.2601087689399719, "learning_rate": 4.976579501707665e-06, "loss": 0.0025, "step": 115 }, { "epoch": 1.5515695067264574, "grad_norm": 0.17075951397418976, "learning_rate": 4.973358420187776e-06, "loss": 0.0005, "step": 116 }, { "epoch": 1.5650224215246635, "grad_norm": 0.11211276799440384, "learning_rate": 4.969931054025122e-06, "loss": 0.0059, "step": 117 }, { "epoch": 1.57847533632287, "grad_norm": 0.5140838027000427, "learning_rate": 4.966297689071117e-06, "loss": 0.018, "step": 118 }, { "epoch": 1.5919282511210762, "grad_norm": 0.5356995463371277, "learning_rate": 4.962458628358021e-06, "loss": 0.0007, "step": 119 }, { "epoch": 1.6053811659192825, "grad_norm": 0.3967442214488983, "learning_rate": 4.958414192073665e-06, "loss": 0.014, "step": 120 }, { "epoch": 1.6188340807174888, "grad_norm": 0.4567921757698059, "learning_rate": 4.954164717534748e-06, "loss": 0.0027, "step": 121 }, { "epoch": 1.6322869955156951, "grad_norm": 0.4405613839626312, "learning_rate": 4.949710559158699e-06, "loss": 0.0135, "step": 122 }, { "epoch": 1.6457399103139014, "grad_norm": 0.7164422869682312, "learning_rate": 4.945052088434123e-06, "loss": 0.0207, "step": 123 }, { "epoch": 1.6591928251121075, "grad_norm": 0.47173142433166504, "learning_rate": 4.940189693889819e-06, "loss": 0.0019, "step": 124 }, { "epoch": 1.672645739910314, "grad_norm": 0.4606887996196747, "learning_rate": 4.9351237810623655e-06, "loss": 0.0125, "step": 125 }, { "epoch": 1.6860986547085202, "grad_norm": 0.7209401726722717, "learning_rate": 4.929854772462312e-06, "loss": 0.012, "step": 126 }, { "epoch": 1.6995515695067265, "grad_norm": 1.7617985010147095, "learning_rate": 4.924383107538929e-06, "loss": 0.0079, "step": 127 }, { "epoch": 1.7130044843049328, "grad_norm": 1.0861084461212158, "learning_rate": 4.918709242643563e-06, "loss": 0.0091, "step": 128 }, { "epoch": 1.726457399103139, "grad_norm": 0.11263061314821243, "learning_rate": 4.9128336509915746e-06, "loss": 0.0006, "step": 129 }, { "epoch": 1.7399103139013454, "grad_norm": 0.3411642909049988, "learning_rate": 4.906756822622865e-06, "loss": 0.0037, "step": 130 }, { "epoch": 1.7533632286995515, "grad_norm": 0.1550491899251938, "learning_rate": 4.900479264361017e-06, "loss": 0.0056, "step": 131 }, { "epoch": 1.7668161434977578, "grad_norm": 0.8448930382728577, "learning_rate": 4.894001499771015e-06, "loss": 0.0135, "step": 132 }, { "epoch": 1.7802690582959642, "grad_norm": 0.5105615258216858, "learning_rate": 4.887324069115582e-06, "loss": 0.009, "step": 133 }, { "epoch": 1.7937219730941703, "grad_norm": 0.5042226910591125, "learning_rate": 4.880447529310118e-06, "loss": 0.0022, "step": 134 }, { "epoch": 1.8071748878923768, "grad_norm": 0.15797697007656097, "learning_rate": 4.873372453876255e-06, "loss": 0.0073, "step": 135 }, { "epoch": 1.8206278026905829, "grad_norm": 0.47805944085121155, "learning_rate": 4.866099432894023e-06, "loss": 0.0084, "step": 136 }, { "epoch": 1.8340807174887892, "grad_norm": 0.3370952904224396, "learning_rate": 4.858629072952635e-06, "loss": 0.0037, "step": 137 }, { "epoch": 1.8475336322869955, "grad_norm": 0.30983835458755493, "learning_rate": 4.850961997099892e-06, "loss": 0.0024, "step": 138 }, { "epoch": 1.8609865470852018, "grad_norm": 0.2800588011741638, "learning_rate": 4.843098844790228e-06, "loss": 0.0032, "step": 139 }, { "epoch": 1.8744394618834082, "grad_norm": 0.2037343531847, "learning_rate": 4.835040271831371e-06, "loss": 0.0016, "step": 140 }, { "epoch": 1.8878923766816142, "grad_norm": 0.45981553196907043, "learning_rate": 4.826786950329646e-06, "loss": 0.0035, "step": 141 }, { "epoch": 1.9013452914798208, "grad_norm": 0.17092454433441162, "learning_rate": 4.818339568633926e-06, "loss": 0.0068, "step": 142 }, { "epoch": 1.9147982062780269, "grad_norm": 0.5339077711105347, "learning_rate": 4.809698831278217e-06, "loss": 0.0026, "step": 143 }, { "epoch": 1.9282511210762332, "grad_norm": 0.5208529829978943, "learning_rate": 4.800865458922899e-06, "loss": 0.0047, "step": 144 }, { "epoch": 1.9417040358744395, "grad_norm": 0.7488933801651001, "learning_rate": 4.79184018829462e-06, "loss": 0.0096, "step": 145 }, { "epoch": 1.9551569506726456, "grad_norm": 0.5727106332778931, "learning_rate": 4.782623772124854e-06, "loss": 0.001, "step": 146 }, { "epoch": 1.9686098654708521, "grad_norm": 0.3813195526599884, "learning_rate": 4.77321697908712e-06, "loss": 0.0085, "step": 147 }, { "epoch": 1.9820627802690582, "grad_norm": 0.5406109094619751, "learning_rate": 4.763620593732867e-06, "loss": 0.0017, "step": 148 }, { "epoch": 1.9955156950672646, "grad_norm": 0.391985148191452, "learning_rate": 4.7538354164260515e-06, "loss": 0.0019, "step": 149 }, { "epoch": 2.0, "grad_norm": 0.391985148191452, "learning_rate": 4.743862263276376e-06, "loss": 0.0003, "step": 150 }, { "epoch": 2.013452914798206, "grad_norm": 0.10133524239063263, "learning_rate": 4.733701966071226e-06, "loss": 0.0012, "step": 151 }, { "epoch": 2.0269058295964126, "grad_norm": 0.2032414674758911, "learning_rate": 4.723355372206297e-06, "loss": 0.0001, "step": 152 }, { "epoch": 2.0403587443946187, "grad_norm": 0.00796876847743988, "learning_rate": 4.712823344614921e-06, "loss": 0.0025, "step": 153 }, { "epoch": 2.0538116591928253, "grad_norm": 0.36200040578842163, "learning_rate": 4.702106761696091e-06, "loss": 0.0003, "step": 154 }, { "epoch": 2.0672645739910314, "grad_norm": 0.19646115601062775, "learning_rate": 4.691206517241205e-06, "loss": 0.0009, "step": 155 }, { "epoch": 2.0807174887892375, "grad_norm": 0.06766581535339355, "learning_rate": 4.68012352035952e-06, "loss": 0.0001, "step": 156 }, { "epoch": 2.094170403587444, "grad_norm": 0.00797713827341795, "learning_rate": 4.668858695402326e-06, "loss": 0.0001, "step": 157 }, { "epoch": 2.10762331838565, "grad_norm": 0.020394539460539818, "learning_rate": 4.657412981885862e-06, "loss": 0.0002, "step": 158 }, { "epoch": 2.1210762331838566, "grad_norm": 0.017584379762411118, "learning_rate": 4.645787334412945e-06, "loss": 0.0002, "step": 159 }, { "epoch": 2.1345291479820627, "grad_norm": 0.07967082411050797, "learning_rate": 4.633982722593367e-06, "loss": 0.0003, "step": 160 }, { "epoch": 2.1479820627802693, "grad_norm": 0.01606675237417221, "learning_rate": 4.622000130963015e-06, "loss": 0.0003, "step": 161 }, { "epoch": 2.1614349775784754, "grad_norm": 0.06052660569548607, "learning_rate": 4.6098405589017685e-06, "loss": 0.0007, "step": 162 }, { "epoch": 2.1748878923766815, "grad_norm": 0.07991409301757812, "learning_rate": 4.597505020550138e-06, "loss": 0.0002, "step": 163 }, { "epoch": 2.188340807174888, "grad_norm": 0.04991272836923599, "learning_rate": 4.584994544724695e-06, "loss": 0.0001, "step": 164 }, { "epoch": 2.201793721973094, "grad_norm": 0.027111921459436417, "learning_rate": 4.572310174832255e-06, "loss": 0.0001, "step": 165 }, { "epoch": 2.2152466367713006, "grad_norm": 0.006566982250660658, "learning_rate": 4.5594529687828615e-06, "loss": 0.0, "step": 166 }, { "epoch": 2.2286995515695067, "grad_norm": 0.002437079790979624, "learning_rate": 4.546423998901549e-06, "loss": 0.0004, "step": 167 }, { "epoch": 2.242152466367713, "grad_norm": 0.08434150367975235, "learning_rate": 4.533224351838914e-06, "loss": 0.0001, "step": 168 }, { "epoch": 2.2556053811659194, "grad_norm": 0.013094129040837288, "learning_rate": 4.519855128480478e-06, "loss": 0.0001, "step": 169 }, { "epoch": 2.2690582959641254, "grad_norm": 0.002770340768620372, "learning_rate": 4.5063174438548775e-06, "loss": 0.0001, "step": 170 }, { "epoch": 2.282511210762332, "grad_norm": 0.021480072289705276, "learning_rate": 4.492612427040864e-06, "loss": 0.0001, "step": 171 }, { "epoch": 2.295964125560538, "grad_norm": 0.0028862387407571077, "learning_rate": 4.478741221073136e-06, "loss": 0.0001, "step": 172 }, { "epoch": 2.3094170403587446, "grad_norm": 0.01969303749501705, "learning_rate": 4.464704982847008e-06, "loss": 0.0, "step": 173 }, { "epoch": 2.3228699551569507, "grad_norm": 0.00244798487983644, "learning_rate": 4.450504883021923e-06, "loss": 0.0, "step": 174 }, { "epoch": 2.336322869955157, "grad_norm": 0.0036638586316257715, "learning_rate": 4.436142105923814e-06, "loss": 0.0009, "step": 175 }, { "epoch": 2.3497757847533634, "grad_norm": 0.4378701448440552, "learning_rate": 4.4216178494463305e-06, "loss": 0.0003, "step": 176 }, { "epoch": 2.3632286995515694, "grad_norm": 0.004919757135212421, "learning_rate": 4.406933324950929e-06, "loss": 0.0, "step": 177 }, { "epoch": 2.376681614349776, "grad_norm": 0.0061010573990643024, "learning_rate": 4.392089757165841e-06, "loss": 0.0001, "step": 178 }, { "epoch": 2.390134529147982, "grad_norm": 0.00750540429726243, "learning_rate": 4.377088384083935e-06, "loss": 0.0004, "step": 179 }, { "epoch": 2.403587443946188, "grad_norm": 0.18055735528469086, "learning_rate": 4.361930456859455e-06, "loss": 0.0001, "step": 180 }, { "epoch": 2.4170403587443947, "grad_norm": 0.00540179992094636, "learning_rate": 4.346617239703676e-06, "loss": 0.0017, "step": 181 }, { "epoch": 2.430493273542601, "grad_norm": 0.6051300764083862, "learning_rate": 4.3311500097794655e-06, "loss": 0.0002, "step": 182 }, { "epoch": 2.4439461883408073, "grad_norm": 0.04465539753437042, "learning_rate": 4.315530057094763e-06, "loss": 0.0001, "step": 183 }, { "epoch": 2.4573991031390134, "grad_norm": 0.009125534445047379, "learning_rate": 4.2997586843949905e-06, "loss": 0.0001, "step": 184 }, { "epoch": 2.4708520179372195, "grad_norm": 0.009754106402397156, "learning_rate": 4.2838372070544e-06, "loss": 0.0001, "step": 185 }, { "epoch": 2.484304932735426, "grad_norm": 0.03795509785413742, "learning_rate": 4.267766952966369e-06, "loss": 0.0006, "step": 186 }, { "epoch": 2.497757847533632, "grad_norm": 0.19477951526641846, "learning_rate": 4.25154926243265e-06, "loss": 0.0013, "step": 187 }, { "epoch": 2.5112107623318387, "grad_norm": 0.27783000469207764, "learning_rate": 4.2351854880515856e-06, "loss": 0.0001, "step": 188 }, { "epoch": 2.524663677130045, "grad_norm": 0.030597640201449394, "learning_rate": 4.218676994605295e-06, "loss": 0.0, "step": 189 }, { "epoch": 2.538116591928251, "grad_norm": 0.019212787970900536, "learning_rate": 4.202025158945855e-06, "loss": 0.0001, "step": 190 }, { "epoch": 2.5515695067264574, "grad_norm": 0.017139675095677376, "learning_rate": 4.185231369880461e-06, "loss": 0.0001, "step": 191 }, { "epoch": 2.5650224215246635, "grad_norm": 0.0013709627091884613, "learning_rate": 4.168297028055599e-06, "loss": 0.0002, "step": 192 }, { "epoch": 2.57847533632287, "grad_norm": 0.07277967780828476, "learning_rate": 4.151223545840225e-06, "loss": 0.0, "step": 193 }, { "epoch": 2.591928251121076, "grad_norm": 0.0014003290561959147, "learning_rate": 4.134012347207974e-06, "loss": 0.0001, "step": 194 }, { "epoch": 2.6053811659192823, "grad_norm": 0.04370618611574173, "learning_rate": 4.116664867618395e-06, "loss": 0.0004, "step": 195 }, { "epoch": 2.618834080717489, "grad_norm": 0.24697266519069672, "learning_rate": 4.099182553897228e-06, "loss": 0.0, "step": 196 }, { "epoch": 2.6322869955156953, "grad_norm": 0.0013013904681429267, "learning_rate": 4.081566864115741e-06, "loss": 0.0, "step": 197 }, { "epoch": 2.6457399103139014, "grad_norm": 0.001239327946677804, "learning_rate": 4.063819267469114e-06, "loss": 0.0, "step": 198 }, { "epoch": 2.6591928251121075, "grad_norm": 0.007082940544933081, "learning_rate": 4.04594124415391e-06, "loss": 0.0, "step": 199 }, { "epoch": 2.672645739910314, "grad_norm": 0.00556205864995718, "learning_rate": 4.027934285244624e-06, "loss": 0.0001, "step": 200 }, { "epoch": 2.68609865470852, "grad_norm": 0.025378312915563583, "learning_rate": 4.009799892569317e-06, "loss": 0.0001, "step": 201 }, { "epoch": 2.6995515695067267, "grad_norm": 0.006344004534184933, "learning_rate": 3.991539578584368e-06, "loss": 0.0, "step": 202 }, { "epoch": 2.713004484304933, "grad_norm": 0.0016575426561757922, "learning_rate": 3.973154866248323e-06, "loss": 0.0, "step": 203 }, { "epoch": 2.726457399103139, "grad_norm": 0.0020909749437123537, "learning_rate": 3.9546472888948825e-06, "loss": 0.0, "step": 204 }, { "epoch": 2.7399103139013454, "grad_norm": 0.004855802282691002, "learning_rate": 3.936018390105013e-06, "loss": 0.0, "step": 205 }, { "epoch": 2.7533632286995515, "grad_norm": 0.0082467095926404, "learning_rate": 3.917269723578212e-06, "loss": 0.0, "step": 206 }, { "epoch": 2.766816143497758, "grad_norm": 0.0013344286708161235, "learning_rate": 3.898402853002921e-06, "loss": 0.0, "step": 207 }, { "epoch": 2.780269058295964, "grad_norm": 0.0034061160404235125, "learning_rate": 3.879419351926115e-06, "loss": 0.0, "step": 208 }, { "epoch": 2.7937219730941703, "grad_norm": 0.011912211775779724, "learning_rate": 3.86032080362206e-06, "loss": 0.0, "step": 209 }, { "epoch": 2.807174887892377, "grad_norm": 0.00134057376999408, "learning_rate": 3.841108800960264e-06, "loss": 0.0, "step": 210 }, { "epoch": 2.820627802690583, "grad_norm": 0.0016560767544433475, "learning_rate": 3.8217849462726334e-06, "loss": 0.0, "step": 211 }, { "epoch": 2.8340807174887894, "grad_norm": 0.006285225041210651, "learning_rate": 3.802350851219826e-06, "loss": 0.0, "step": 212 }, { "epoch": 2.8475336322869955, "grad_norm": 0.0018534021219238639, "learning_rate": 3.7828081366568388e-06, "loss": 0.0, "step": 213 }, { "epoch": 2.8609865470852016, "grad_norm": 0.0041048116981983185, "learning_rate": 3.763158432497824e-06, "loss": 0.0, "step": 214 }, { "epoch": 2.874439461883408, "grad_norm": 0.0023216214030981064, "learning_rate": 3.743403377580149e-06, "loss": 0.0, "step": 215 }, { "epoch": 2.8878923766816142, "grad_norm": 0.000994804548099637, "learning_rate": 3.723544619527714e-06, "loss": 0.0, "step": 216 }, { "epoch": 2.901345291479821, "grad_norm": 0.003168656025081873, "learning_rate": 3.703583814613536e-06, "loss": 0.0, "step": 217 }, { "epoch": 2.914798206278027, "grad_norm": 0.0013909138506278396, "learning_rate": 3.6835226276216087e-06, "loss": 0.0, "step": 218 }, { "epoch": 2.928251121076233, "grad_norm": 0.008506865240633488, "learning_rate": 3.663362731708059e-06, "loss": 0.0, "step": 219 }, { "epoch": 2.9417040358744395, "grad_norm": 0.0017819993663579226, "learning_rate": 3.6431058082615966e-06, "loss": 0.0, "step": 220 }, { "epoch": 2.9551569506726456, "grad_norm": 0.0014212332898750901, "learning_rate": 3.6227535467632873e-06, "loss": 0.0, "step": 221 }, { "epoch": 2.968609865470852, "grad_norm": 0.003146026050671935, "learning_rate": 3.6023076446456415e-06, "loss": 0.0, "step": 222 }, { "epoch": 2.9820627802690582, "grad_norm": 0.0012437553377822042, "learning_rate": 3.581769807151044e-06, "loss": 0.0, "step": 223 }, { "epoch": 2.9955156950672643, "grad_norm": 0.017311880365014076, "learning_rate": 3.561141747189538e-06, "loss": 0.0001, "step": 224 }, { "epoch": 3.0, "grad_norm": 0.012313771061599255, "learning_rate": 3.5404251851959537e-06, "loss": 0.0, "step": 225 }, { "epoch": 3.013452914798206, "grad_norm": 0.0043451120145618916, "learning_rate": 3.519621848986428e-06, "loss": 0.0, "step": 226 }, { "epoch": 3.0269058295964126, "grad_norm": 0.002464739605784416, "learning_rate": 3.498733473614298e-06, "loss": 0.0, "step": 227 }, { "epoch": 3.0403587443946187, "grad_norm": 0.0019529856508597732, "learning_rate": 3.47776180122539e-06, "loss": 0.0, "step": 228 }, { "epoch": 3.0538116591928253, "grad_norm": 0.0013189928140491247, "learning_rate": 3.4567085809127247e-06, "loss": 0.0, "step": 229 }, { "epoch": 3.0672645739910314, "grad_norm": 0.0021529668010771275, "learning_rate": 3.435575568570633e-06, "loss": 0.0, "step": 230 }, { "epoch": 3.0807174887892375, "grad_norm": 0.007290184032171965, "learning_rate": 3.4143645267483144e-06, "loss": 0.0, "step": 231 }, { "epoch": 3.094170403587444, "grad_norm": 0.001345694880001247, "learning_rate": 3.393077224502832e-06, "loss": 0.0, "step": 232 }, { "epoch": 3.10762331838565, "grad_norm": 0.004343180451542139, "learning_rate": 3.3717154372515716e-06, "loss": 0.0, "step": 233 }, { "epoch": 3.1210762331838566, "grad_norm": 0.0010816323338076472, "learning_rate": 3.350280946624166e-06, "loss": 0.0, "step": 234 }, { "epoch": 3.1345291479820627, "grad_norm": 0.003842687699943781, "learning_rate": 3.3287755403139007e-06, "loss": 0.0, "step": 235 }, { "epoch": 3.1479820627802693, "grad_norm": 0.0013102421071380377, "learning_rate": 3.3072010119286156e-06, "loss": 0.0, "step": 236 }, { "epoch": 3.1614349775784754, "grad_norm": 0.0014620574656873941, "learning_rate": 3.2855591608411203e-06, "loss": 0.0, "step": 237 }, { "epoch": 3.1748878923766815, "grad_norm": 0.0007613594643771648, "learning_rate": 3.2638517920391095e-06, "loss": 0.0, "step": 238 }, { "epoch": 3.188340807174888, "grad_norm": 0.001927120960317552, "learning_rate": 3.2420807159746333e-06, "loss": 0.0, "step": 239 }, { "epoch": 3.201793721973094, "grad_norm": 0.0047145108692348, "learning_rate": 3.2202477484130947e-06, "loss": 0.0, "step": 240 }, { "epoch": 3.2152466367713006, "grad_norm": 0.001525243278592825, "learning_rate": 3.1983547102818104e-06, "loss": 0.0, "step": 241 }, { "epoch": 3.2286995515695067, "grad_norm": 0.0008274471038021147, "learning_rate": 3.1764034275181436e-06, "loss": 0.0, "step": 242 }, { "epoch": 3.242152466367713, "grad_norm": 0.0009357924573123455, "learning_rate": 3.1543957309172136e-06, "loss": 0.0, "step": 243 }, { "epoch": 3.2556053811659194, "grad_norm": 0.0016225146828219295, "learning_rate": 3.132333455979202e-06, "loss": 0.0, "step": 244 }, { "epoch": 3.2690582959641254, "grad_norm": 0.013506707735359669, "learning_rate": 3.1102184427562696e-06, "loss": 0.0001, "step": 245 }, { "epoch": 3.282511210762332, "grad_norm": 0.0010258476249873638, "learning_rate": 3.0880525356990898e-06, "loss": 0.0, "step": 246 }, { "epoch": 3.295964125560538, "grad_norm": 0.006462691817432642, "learning_rate": 3.0658375835030148e-06, "loss": 0.0, "step": 247 }, { "epoch": 3.3094170403587446, "grad_norm": 0.0011441456153988838, "learning_rate": 3.043575438953893e-06, "loss": 0.0, "step": 248 }, { "epoch": 3.3228699551569507, "grad_norm": 0.000751970277633518, "learning_rate": 3.02126795877354e-06, "loss": 0.0, "step": 249 }, { "epoch": 3.336322869955157, "grad_norm": 0.006623424123972654, "learning_rate": 2.9989170034648823e-06, "loss": 0.0, "step": 250 }, { "epoch": 3.3497757847533634, "grad_norm": 0.0021056546829640865, "learning_rate": 2.9765244371567873e-06, "loss": 0.0, "step": 251 }, { "epoch": 3.3632286995515694, "grad_norm": 0.003343602642416954, "learning_rate": 2.9540921274485913e-06, "loss": 0.0, "step": 252 }, { "epoch": 3.376681614349776, "grad_norm": 0.0032404386438429356, "learning_rate": 2.9316219452543342e-06, "loss": 0.0, "step": 253 }, { "epoch": 3.390134529147982, "grad_norm": 0.02200383134186268, "learning_rate": 2.9091157646467205e-06, "loss": 0.0001, "step": 254 }, { "epoch": 3.403587443946188, "grad_norm": 0.0015861240681260824, "learning_rate": 2.886575462700821e-06, "loss": 0.0, "step": 255 }, { "epoch": 3.4170403587443947, "grad_norm": 0.0023104625288397074, "learning_rate": 2.864002919337513e-06, "loss": 0.0, "step": 256 }, { "epoch": 3.430493273542601, "grad_norm": 0.0009963945485651493, "learning_rate": 2.8414000171666952e-06, "loss": 0.0, "step": 257 }, { "epoch": 3.4439461883408073, "grad_norm": 0.006071928422898054, "learning_rate": 2.81876864133027e-06, "loss": 0.0, "step": 258 }, { "epoch": 3.4573991031390134, "grad_norm": 0.003969075623899698, "learning_rate": 2.7961106793449217e-06, "loss": 0.0, "step": 259 }, { "epoch": 3.4708520179372195, "grad_norm": 0.0010353871621191502, "learning_rate": 2.773428020944687e-06, "loss": 0.0, "step": 260 }, { "epoch": 3.484304932735426, "grad_norm": 0.003665305208414793, "learning_rate": 2.7507225579233487e-06, "loss": 0.0, "step": 261 }, { "epoch": 3.497757847533632, "grad_norm": 0.003311133710667491, "learning_rate": 2.727996183976659e-06, "loss": 0.0, "step": 262 }, { "epoch": 3.5112107623318387, "grad_norm": 0.004625072702765465, "learning_rate": 2.705250794544393e-06, "loss": 0.0, "step": 263 }, { "epoch": 3.524663677130045, "grad_norm": 0.0023010042496025562, "learning_rate": 2.682488286652269e-06, "loss": 0.0, "step": 264 }, { "epoch": 3.538116591928251, "grad_norm": 0.0008537416579201818, "learning_rate": 2.6597105587537307e-06, "loss": 0.0, "step": 265 }, { "epoch": 3.5515695067264574, "grad_norm": 0.0014414336765184999, "learning_rate": 2.6369195105716087e-06, "loss": 0.0, "step": 266 }, { "epoch": 3.5650224215246635, "grad_norm": 0.003060834715142846, "learning_rate": 2.614117042939685e-06, "loss": 0.0, "step": 267 }, { "epoch": 3.57847533632287, "grad_norm": 0.002594274003058672, "learning_rate": 2.591305057644148e-06, "loss": 0.0, "step": 268 }, { "epoch": 3.591928251121076, "grad_norm": 0.0009433673694729805, "learning_rate": 2.5684854572649876e-06, "loss": 0.0, "step": 269 }, { "epoch": 3.6053811659192823, "grad_norm": 0.000922717503271997, "learning_rate": 2.5456601450173123e-06, "loss": 0.0, "step": 270 }, { "epoch": 3.618834080717489, "grad_norm": 0.0013827934162691236, "learning_rate": 2.522831024592615e-06, "loss": 0.0, "step": 271 }, { "epoch": 3.6322869955156953, "grad_norm": 0.0007690931670367718, "learning_rate": 2.5e-06, "loss": 0.0, "step": 272 }, { "epoch": 3.6457399103139014, "grad_norm": 0.002565343165770173, "learning_rate": 2.4771689754073856e-06, "loss": 0.0, "step": 273 }, { "epoch": 3.6591928251121075, "grad_norm": 0.0013943302910774946, "learning_rate": 2.454339854982688e-06, "loss": 0.0, "step": 274 }, { "epoch": 3.672645739910314, "grad_norm": 0.0006668591522611678, "learning_rate": 2.4315145427350132e-06, "loss": 0.0, "step": 275 }, { "epoch": 3.68609865470852, "grad_norm": 0.0025589216966181993, "learning_rate": 2.408694942355853e-06, "loss": 0.0, "step": 276 }, { "epoch": 3.6995515695067267, "grad_norm": 0.001420054235495627, "learning_rate": 2.3858829570603157e-06, "loss": 0.0, "step": 277 }, { "epoch": 3.713004484304933, "grad_norm": 0.0014220779994502664, "learning_rate": 2.363080489428391e-06, "loss": 0.0, "step": 278 }, { "epoch": 3.726457399103139, "grad_norm": 0.0009400748531334102, "learning_rate": 2.3402894412462697e-06, "loss": 0.0, "step": 279 }, { "epoch": 3.7399103139013454, "grad_norm": 0.0006919830339029431, "learning_rate": 2.317511713347731e-06, "loss": 0.0, "step": 280 }, { "epoch": 3.7533632286995515, "grad_norm": 0.0011200032895430923, "learning_rate": 2.2947492054556075e-06, "loss": 0.0, "step": 281 }, { "epoch": 3.766816143497758, "grad_norm": 0.0008829529979266226, "learning_rate": 2.272003816023341e-06, "loss": 0.0, "step": 282 }, { "epoch": 3.780269058295964, "grad_norm": 0.0009032113594003022, "learning_rate": 2.2492774420766517e-06, "loss": 0.0, "step": 283 }, { "epoch": 3.7937219730941703, "grad_norm": 0.0012500348966568708, "learning_rate": 2.2265719790553147e-06, "loss": 0.0, "step": 284 }, { "epoch": 3.807174887892377, "grad_norm": 0.0008029688615351915, "learning_rate": 2.20388932065508e-06, "loss": 0.0, "step": 285 }, { "epoch": 3.820627802690583, "grad_norm": 0.0015973382396623492, "learning_rate": 2.1812313586697307e-06, "loss": 0.0, "step": 286 }, { "epoch": 3.8340807174887894, "grad_norm": 0.0021750489249825478, "learning_rate": 2.1585999828333065e-06, "loss": 0.0, "step": 287 }, { "epoch": 3.8475336322869955, "grad_norm": 0.004780885297805071, "learning_rate": 2.1359970806624886e-06, "loss": 0.0, "step": 288 }, { "epoch": 3.8609865470852016, "grad_norm": 0.0007556549389846623, "learning_rate": 2.11342453729918e-06, "loss": 0.0, "step": 289 }, { "epoch": 3.874439461883408, "grad_norm": 0.0006669393624179065, "learning_rate": 2.0908842353532803e-06, "loss": 0.0, "step": 290 }, { "epoch": 3.8878923766816142, "grad_norm": 0.001046078628860414, "learning_rate": 2.0683780547456666e-06, "loss": 0.0, "step": 291 }, { "epoch": 3.901345291479821, "grad_norm": 0.004534538835287094, "learning_rate": 2.045907872551409e-06, "loss": 0.0, "step": 292 }, { "epoch": 3.914798206278027, "grad_norm": 0.0008524219738319516, "learning_rate": 2.0234755628432135e-06, "loss": 0.0, "step": 293 }, { "epoch": 3.928251121076233, "grad_norm": 0.0006630662246607244, "learning_rate": 2.0010829965351185e-06, "loss": 0.0, "step": 294 }, { "epoch": 3.9417040358744395, "grad_norm": 0.008820832706987858, "learning_rate": 1.978732041226461e-06, "loss": 0.0, "step": 295 }, { "epoch": 3.9551569506726456, "grad_norm": 0.0007411285769194365, "learning_rate": 1.956424561046108e-06, "loss": 0.0, "step": 296 } ], "logging_steps": 1, "max_steps": 444, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 74, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.586020432455926e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }