{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.499302649930265, "eval_steps": 45, "global_step": 179, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002789400278940028, "grad_norm": 2.008944315843832, "learning_rate": 0.0, "loss": 1.7653, "step": 1 }, { "epoch": 0.002789400278940028, "eval_loss": 1.7864996194839478, "eval_runtime": 155.6922, "eval_samples_per_second": 2.042, "eval_steps_per_second": 0.128, "step": 1 }, { "epoch": 0.005578800557880056, "grad_norm": 1.5986956798562522, "learning_rate": 5.000000000000001e-07, "loss": 1.3721, "step": 2 }, { "epoch": 0.008368200836820083, "grad_norm": 2.487093074085383, "learning_rate": 1.0000000000000002e-06, "loss": 1.4468, "step": 3 }, { "epoch": 0.011157601115760111, "grad_norm": 1.9408908369837041, "learning_rate": 1.5e-06, "loss": 1.8555, "step": 4 }, { "epoch": 0.01394700139470014, "grad_norm": 1.855397723025267, "learning_rate": 2.0000000000000003e-06, "loss": 1.5173, "step": 5 }, { "epoch": 0.016736401673640166, "grad_norm": 1.6798170163537172, "learning_rate": 2.5e-06, "loss": 1.3812, "step": 6 }, { "epoch": 0.019525801952580194, "grad_norm": 2.0478066370088133, "learning_rate": 3e-06, "loss": 1.475, "step": 7 }, { "epoch": 0.022315202231520222, "grad_norm": 1.6541684745930123, "learning_rate": 3.5e-06, "loss": 1.6239, "step": 8 }, { "epoch": 0.02510460251046025, "grad_norm": 1.5940373225900581, "learning_rate": 4.000000000000001e-06, "loss": 1.4474, "step": 9 }, { "epoch": 0.02789400278940028, "grad_norm": 1.3436762031717087, "learning_rate": 4.5e-06, "loss": 1.5277, "step": 10 }, { "epoch": 0.030683403068340307, "grad_norm": 1.4820988105255548, "learning_rate": 5e-06, "loss": 1.3905, "step": 11 }, { "epoch": 0.03347280334728033, "grad_norm": 1.3026823080341792, "learning_rate": 5.500000000000001e-06, "loss": 1.2716, "step": 12 }, { "epoch": 0.03626220362622036, "grad_norm": 1.2786618249049673, "learning_rate": 6e-06, "loss": 1.4357, "step": 13 }, { "epoch": 0.03905160390516039, "grad_norm": 1.4401448629219828, "learning_rate": 6.5000000000000004e-06, "loss": 1.5418, "step": 14 }, { "epoch": 0.04184100418410042, "grad_norm": 1.4108691585319235, "learning_rate": 7e-06, "loss": 1.419, "step": 15 }, { "epoch": 0.044630404463040445, "grad_norm": 1.434665713671276, "learning_rate": 7.500000000000001e-06, "loss": 1.4042, "step": 16 }, { "epoch": 0.04741980474198047, "grad_norm": 1.4864898541989087, "learning_rate": 8.000000000000001e-06, "loss": 1.4338, "step": 17 }, { "epoch": 0.0502092050209205, "grad_norm": 1.7497952919365851, "learning_rate": 8.5e-06, "loss": 1.5988, "step": 18 }, { "epoch": 0.05299860529986053, "grad_norm": 1.5825124800137, "learning_rate": 9e-06, "loss": 1.4817, "step": 19 }, { "epoch": 0.05578800557880056, "grad_norm": 1.5639582937511203, "learning_rate": 9.5e-06, "loss": 1.3795, "step": 20 }, { "epoch": 0.058577405857740586, "grad_norm": 1.5512627729747626, "learning_rate": 1e-05, "loss": 1.4833, "step": 21 }, { "epoch": 0.061366806136680614, "grad_norm": 1.4462678614801439, "learning_rate": 1.0500000000000001e-05, "loss": 1.3781, "step": 22 }, { "epoch": 0.06415620641562064, "grad_norm": 1.5995776178618928, "learning_rate": 1.1000000000000001e-05, "loss": 1.4862, "step": 23 }, { "epoch": 0.06694560669456066, "grad_norm": 1.2623925872690254, "learning_rate": 1.15e-05, "loss": 1.4222, "step": 24 }, { "epoch": 0.0697350069735007, "grad_norm": 1.2816346894508914, "learning_rate": 1.2e-05, "loss": 1.3473, "step": 25 }, { "epoch": 0.07252440725244072, "grad_norm": 1.4528035335148701, "learning_rate": 1.25e-05, "loss": 1.5278, "step": 26 }, { "epoch": 0.07531380753138076, "grad_norm": 1.3415339044507064, "learning_rate": 1.3000000000000001e-05, "loss": 1.3939, "step": 27 }, { "epoch": 0.07810320781032078, "grad_norm": 1.6072610009319113, "learning_rate": 1.3500000000000001e-05, "loss": 1.5401, "step": 28 }, { "epoch": 0.08089260808926081, "grad_norm": 1.5988215139386146, "learning_rate": 1.4e-05, "loss": 1.3995, "step": 29 }, { "epoch": 0.08368200836820083, "grad_norm": 1.4507828208303803, "learning_rate": 1.45e-05, "loss": 1.4584, "step": 30 }, { "epoch": 0.08647140864714087, "grad_norm": 1.4975002404633069, "learning_rate": 1.5000000000000002e-05, "loss": 1.3834, "step": 31 }, { "epoch": 0.08926080892608089, "grad_norm": 1.4128570529331854, "learning_rate": 1.55e-05, "loss": 1.4216, "step": 32 }, { "epoch": 0.09205020920502092, "grad_norm": 1.3616495178863832, "learning_rate": 1.6000000000000003e-05, "loss": 1.5167, "step": 33 }, { "epoch": 0.09483960948396095, "grad_norm": 1.1988790310990909, "learning_rate": 1.65e-05, "loss": 1.4525, "step": 34 }, { "epoch": 0.09762900976290098, "grad_norm": 1.4483022749153813, "learning_rate": 1.7e-05, "loss": 1.5741, "step": 35 }, { "epoch": 0.100418410041841, "grad_norm": 1.394589777246842, "learning_rate": 1.7500000000000002e-05, "loss": 1.6437, "step": 36 }, { "epoch": 0.10320781032078104, "grad_norm": 1.4186049044929165, "learning_rate": 1.8e-05, "loss": 1.5162, "step": 37 }, { "epoch": 0.10599721059972106, "grad_norm": 1.4241876007622027, "learning_rate": 1.8500000000000002e-05, "loss": 1.2395, "step": 38 }, { "epoch": 0.1087866108786611, "grad_norm": 1.3743075911034852, "learning_rate": 1.9e-05, "loss": 1.4631, "step": 39 }, { "epoch": 0.11157601115760112, "grad_norm": 1.2863497247985725, "learning_rate": 1.95e-05, "loss": 1.3611, "step": 40 }, { "epoch": 0.11436541143654114, "grad_norm": 1.3054075793205129, "learning_rate": 2e-05, "loss": 1.2375, "step": 41 }, { "epoch": 0.11715481171548117, "grad_norm": 1.447249035073335, "learning_rate": 1.9999892011980525e-05, "loss": 1.4044, "step": 42 }, { "epoch": 0.1199442119944212, "grad_norm": 1.3461748028257197, "learning_rate": 1.9999568050254373e-05, "loss": 1.4081, "step": 43 }, { "epoch": 0.12273361227336123, "grad_norm": 1.6035229132022362, "learning_rate": 1.999902812181835e-05, "loss": 1.4167, "step": 44 }, { "epoch": 0.12552301255230125, "grad_norm": 1.7750306011999208, "learning_rate": 1.9998272238333606e-05, "loss": 1.3533, "step": 45 }, { "epoch": 0.12552301255230125, "eval_loss": 1.6828460693359375, "eval_runtime": 158.4193, "eval_samples_per_second": 2.007, "eval_steps_per_second": 0.126, "step": 45 }, { "epoch": 0.12831241283124128, "grad_norm": 1.354470787639867, "learning_rate": 1.9997300416125426e-05, "loss": 1.3901, "step": 46 }, { "epoch": 0.13110181311018132, "grad_norm": 1.23248602443203, "learning_rate": 1.999611267618283e-05, "loss": 1.3485, "step": 47 }, { "epoch": 0.13389121338912133, "grad_norm": 1.314827114627873, "learning_rate": 1.9994709044158157e-05, "loss": 1.3195, "step": 48 }, { "epoch": 0.13668061366806136, "grad_norm": 1.3741001015735983, "learning_rate": 1.99930895503665e-05, "loss": 1.4394, "step": 49 }, { "epoch": 0.1394700139470014, "grad_norm": 1.1624406309888304, "learning_rate": 1.9991254229785043e-05, "loss": 1.2943, "step": 50 }, { "epoch": 0.14225941422594143, "grad_norm": 1.431188759221728, "learning_rate": 1.998920312205231e-05, "loss": 1.4614, "step": 51 }, { "epoch": 0.14504881450488144, "grad_norm": 1.2513371987994648, "learning_rate": 1.9986936271467316e-05, "loss": 1.328, "step": 52 }, { "epoch": 0.14783821478382148, "grad_norm": 1.6084481811469094, "learning_rate": 1.99844537269886e-05, "loss": 1.4579, "step": 53 }, { "epoch": 0.1506276150627615, "grad_norm": 1.3643143080926854, "learning_rate": 1.9981755542233175e-05, "loss": 1.4471, "step": 54 }, { "epoch": 0.15341701534170155, "grad_norm": 1.443453372098446, "learning_rate": 1.9978841775475368e-05, "loss": 1.3931, "step": 55 }, { "epoch": 0.15620641562064155, "grad_norm": 1.186174044031903, "learning_rate": 1.997571248964556e-05, "loss": 1.2935, "step": 56 }, { "epoch": 0.1589958158995816, "grad_norm": 1.3967845728055575, "learning_rate": 1.9972367752328824e-05, "loss": 1.3819, "step": 57 }, { "epoch": 0.16178521617852162, "grad_norm": 1.585201984076838, "learning_rate": 1.9968807635763472e-05, "loss": 1.5391, "step": 58 }, { "epoch": 0.16457461645746166, "grad_norm": 1.515327450581085, "learning_rate": 1.9965032216839493e-05, "loss": 1.3969, "step": 59 }, { "epoch": 0.16736401673640167, "grad_norm": 1.3595894447160555, "learning_rate": 1.996104157709689e-05, "loss": 1.3598, "step": 60 }, { "epoch": 0.1701534170153417, "grad_norm": 1.4252798961269173, "learning_rate": 1.9956835802723916e-05, "loss": 1.447, "step": 61 }, { "epoch": 0.17294281729428174, "grad_norm": 1.4269613254937963, "learning_rate": 1.9952414984555225e-05, "loss": 1.4354, "step": 62 }, { "epoch": 0.17573221757322174, "grad_norm": 1.5984564978735487, "learning_rate": 1.994777921806989e-05, "loss": 1.5493, "step": 63 }, { "epoch": 0.17852161785216178, "grad_norm": 1.5396475514798837, "learning_rate": 1.9942928603389366e-05, "loss": 1.4095, "step": 64 }, { "epoch": 0.18131101813110181, "grad_norm": 1.381818615640278, "learning_rate": 1.9937863245275303e-05, "loss": 1.381, "step": 65 }, { "epoch": 0.18410041841004185, "grad_norm": 1.3411933307262995, "learning_rate": 1.9932583253127302e-05, "loss": 1.4514, "step": 66 }, { "epoch": 0.18688981868898186, "grad_norm": 1.4666636741675445, "learning_rate": 1.992708874098054e-05, "loss": 1.378, "step": 67 }, { "epoch": 0.1896792189679219, "grad_norm": 1.321297415086869, "learning_rate": 1.9921379827503316e-05, "loss": 1.3916, "step": 68 }, { "epoch": 0.19246861924686193, "grad_norm": 1.3031514888132292, "learning_rate": 1.991545663599448e-05, "loss": 1.3568, "step": 69 }, { "epoch": 0.19525801952580196, "grad_norm": 1.3252501098302665, "learning_rate": 1.990931929438078e-05, "loss": 1.3691, "step": 70 }, { "epoch": 0.19804741980474197, "grad_norm": 1.3190300203004819, "learning_rate": 1.990296793521408e-05, "loss": 1.425, "step": 71 }, { "epoch": 0.200836820083682, "grad_norm": 1.221728638105595, "learning_rate": 1.989640269566853e-05, "loss": 1.3669, "step": 72 }, { "epoch": 0.20362622036262204, "grad_norm": 1.2519446160682244, "learning_rate": 1.9889623717537564e-05, "loss": 1.5065, "step": 73 }, { "epoch": 0.20641562064156208, "grad_norm": 1.3192555240060455, "learning_rate": 1.9882631147230874e-05, "loss": 1.3077, "step": 74 }, { "epoch": 0.20920502092050208, "grad_norm": 1.811281511932427, "learning_rate": 1.987542513577122e-05, "loss": 1.4737, "step": 75 }, { "epoch": 0.21199442119944212, "grad_norm": 1.2107466822843787, "learning_rate": 1.9868005838791185e-05, "loss": 1.3632, "step": 76 }, { "epoch": 0.21478382147838215, "grad_norm": 1.4506886695952612, "learning_rate": 1.9860373416529804e-05, "loss": 1.3707, "step": 77 }, { "epoch": 0.2175732217573222, "grad_norm": 1.129475911714987, "learning_rate": 1.985252803382911e-05, "loss": 1.2879, "step": 78 }, { "epoch": 0.2203626220362622, "grad_norm": 1.2378410441125478, "learning_rate": 1.984446986013057e-05, "loss": 1.4352, "step": 79 }, { "epoch": 0.22315202231520223, "grad_norm": 1.2497274548196218, "learning_rate": 1.983619906947144e-05, "loss": 1.2929, "step": 80 }, { "epoch": 0.22594142259414227, "grad_norm": 1.2860272115066647, "learning_rate": 1.9827715840480962e-05, "loss": 1.2595, "step": 81 }, { "epoch": 0.22873082287308227, "grad_norm": 1.3259601590310435, "learning_rate": 1.9819020356376562e-05, "loss": 1.5421, "step": 82 }, { "epoch": 0.2315202231520223, "grad_norm": 1.3823289295025785, "learning_rate": 1.9810112804959867e-05, "loss": 1.4379, "step": 83 }, { "epoch": 0.23430962343096234, "grad_norm": 1.2790636696723037, "learning_rate": 1.980099337861264e-05, "loss": 1.4045, "step": 84 }, { "epoch": 0.23709902370990238, "grad_norm": 1.2621634213682735, "learning_rate": 1.9791662274292638e-05, "loss": 1.3433, "step": 85 }, { "epoch": 0.2398884239888424, "grad_norm": 1.4041226411604562, "learning_rate": 1.9782119693529358e-05, "loss": 1.6292, "step": 86 }, { "epoch": 0.24267782426778242, "grad_norm": 1.4238393471976662, "learning_rate": 1.977236584241968e-05, "loss": 1.398, "step": 87 }, { "epoch": 0.24546722454672246, "grad_norm": 1.3721757997449184, "learning_rate": 1.9762400931623413e-05, "loss": 1.372, "step": 88 }, { "epoch": 0.2482566248256625, "grad_norm": 1.3049726438204359, "learning_rate": 1.9752225176358757e-05, "loss": 1.4366, "step": 89 }, { "epoch": 0.2510460251046025, "grad_norm": 1.319316702022308, "learning_rate": 1.9741838796397638e-05, "loss": 1.2807, "step": 90 }, { "epoch": 0.2510460251046025, "eval_loss": 1.6544907093048096, "eval_runtime": 157.4295, "eval_samples_per_second": 2.02, "eval_steps_per_second": 0.127, "step": 90 }, { "epoch": 0.25383542538354253, "grad_norm": 1.3259568867589984, "learning_rate": 1.9731242016060985e-05, "loss": 1.3566, "step": 91 }, { "epoch": 0.25662482566248257, "grad_norm": 1.3318698765554446, "learning_rate": 1.972043506421386e-05, "loss": 1.3304, "step": 92 }, { "epoch": 0.2594142259414226, "grad_norm": 1.210762982291795, "learning_rate": 1.9709418174260523e-05, "loss": 1.3663, "step": 93 }, { "epoch": 0.26220362622036264, "grad_norm": 1.322366639833671, "learning_rate": 1.9698191584139402e-05, "loss": 1.4099, "step": 94 }, { "epoch": 0.2649930264993027, "grad_norm": 1.286954377653034, "learning_rate": 1.9686755536317945e-05, "loss": 1.3539, "step": 95 }, { "epoch": 0.26778242677824265, "grad_norm": 1.1590651579123286, "learning_rate": 1.967511027778738e-05, "loss": 1.2147, "step": 96 }, { "epoch": 0.2705718270571827, "grad_norm": 1.3100829080105303, "learning_rate": 1.9663256060057395e-05, "loss": 1.4093, "step": 97 }, { "epoch": 0.2733612273361227, "grad_norm": 1.346265752760783, "learning_rate": 1.965119313915068e-05, "loss": 1.491, "step": 98 }, { "epoch": 0.27615062761506276, "grad_norm": 1.378168477135006, "learning_rate": 1.9638921775597428e-05, "loss": 1.5049, "step": 99 }, { "epoch": 0.2789400278940028, "grad_norm": 1.4182107155054933, "learning_rate": 1.9626442234429684e-05, "loss": 1.5013, "step": 100 }, { "epoch": 0.28172942817294283, "grad_norm": 1.3165711654218712, "learning_rate": 1.961375478517564e-05, "loss": 1.5105, "step": 101 }, { "epoch": 0.28451882845188287, "grad_norm": 1.364780296172199, "learning_rate": 1.9600859701853796e-05, "loss": 1.5207, "step": 102 }, { "epoch": 0.28730822873082285, "grad_norm": 1.2988922679076684, "learning_rate": 1.958775726296706e-05, "loss": 1.2881, "step": 103 }, { "epoch": 0.2900976290097629, "grad_norm": 1.3450528168801636, "learning_rate": 1.9574447751496706e-05, "loss": 1.2877, "step": 104 }, { "epoch": 0.2928870292887029, "grad_norm": 1.4142239844479165, "learning_rate": 1.95609314548963e-05, "loss": 1.5604, "step": 105 }, { "epoch": 0.29567642956764295, "grad_norm": 1.3452993293539794, "learning_rate": 1.954720866508546e-05, "loss": 1.3447, "step": 106 }, { "epoch": 0.298465829846583, "grad_norm": 1.4534491830895881, "learning_rate": 1.953327967844356e-05, "loss": 1.3643, "step": 107 }, { "epoch": 0.301255230125523, "grad_norm": 1.350966370835706, "learning_rate": 1.9519144795803342e-05, "loss": 1.5196, "step": 108 }, { "epoch": 0.30404463040446306, "grad_norm": 1.3487778511447879, "learning_rate": 1.95048043224444e-05, "loss": 1.4243, "step": 109 }, { "epoch": 0.3068340306834031, "grad_norm": 1.3479508491141639, "learning_rate": 1.94902585680866e-05, "loss": 1.2869, "step": 110 }, { "epoch": 0.30962343096234307, "grad_norm": 1.5555627133101146, "learning_rate": 1.9475507846883377e-05, "loss": 1.4626, "step": 111 }, { "epoch": 0.3124128312412831, "grad_norm": 1.264890715784298, "learning_rate": 1.9460552477414972e-05, "loss": 1.2826, "step": 112 }, { "epoch": 0.31520223152022314, "grad_norm": 1.2628748380778831, "learning_rate": 1.9445392782681523e-05, "loss": 1.232, "step": 113 }, { "epoch": 0.3179916317991632, "grad_norm": 1.3550404859387195, "learning_rate": 1.9430029090096118e-05, "loss": 1.3034, "step": 114 }, { "epoch": 0.3207810320781032, "grad_norm": 1.1496127251687267, "learning_rate": 1.94144617314777e-05, "loss": 1.3215, "step": 115 }, { "epoch": 0.32357043235704325, "grad_norm": 1.3116085375581363, "learning_rate": 1.939869104304392e-05, "loss": 1.4154, "step": 116 }, { "epoch": 0.3263598326359833, "grad_norm": 1.2589366985022925, "learning_rate": 1.9382717365403854e-05, "loss": 1.517, "step": 117 }, { "epoch": 0.3291492329149233, "grad_norm": 1.3051575397158521, "learning_rate": 1.9366541043550667e-05, "loss": 1.3795, "step": 118 }, { "epoch": 0.3319386331938633, "grad_norm": 1.514184766493585, "learning_rate": 1.9350162426854152e-05, "loss": 1.5974, "step": 119 }, { "epoch": 0.33472803347280333, "grad_norm": 1.3773144705212048, "learning_rate": 1.933358186905318e-05, "loss": 1.3696, "step": 120 }, { "epoch": 0.33751743375174337, "grad_norm": 1.070700275666401, "learning_rate": 1.9316799728248074e-05, "loss": 1.1661, "step": 121 }, { "epoch": 0.3403068340306834, "grad_norm": 1.4181760004495874, "learning_rate": 1.9299816366892865e-05, "loss": 1.4376, "step": 122 }, { "epoch": 0.34309623430962344, "grad_norm": 1.6034791315185046, "learning_rate": 1.9282632151787462e-05, "loss": 1.5891, "step": 123 }, { "epoch": 0.3458856345885635, "grad_norm": 1.1884570183288778, "learning_rate": 1.9265247454069736e-05, "loss": 1.3432, "step": 124 }, { "epoch": 0.3486750348675035, "grad_norm": 1.477555714731358, "learning_rate": 1.924766264920751e-05, "loss": 1.3683, "step": 125 }, { "epoch": 0.3514644351464435, "grad_norm": 1.2504857458807423, "learning_rate": 1.922987811699042e-05, "loss": 1.2643, "step": 126 }, { "epoch": 0.3542538354253835, "grad_norm": 1.3679090790056112, "learning_rate": 1.9211894241521757e-05, "loss": 1.5962, "step": 127 }, { "epoch": 0.35704323570432356, "grad_norm": 1.2309836816544464, "learning_rate": 1.9193711411210138e-05, "loss": 1.3361, "step": 128 }, { "epoch": 0.3598326359832636, "grad_norm": 1.1714321663378802, "learning_rate": 1.917533001876113e-05, "loss": 1.3019, "step": 129 }, { "epoch": 0.36262203626220363, "grad_norm": 1.2813447334304535, "learning_rate": 1.9156750461168768e-05, "loss": 1.4319, "step": 130 }, { "epoch": 0.36541143654114366, "grad_norm": 1.427767209588848, "learning_rate": 1.9137973139706973e-05, "loss": 1.4712, "step": 131 }, { "epoch": 0.3682008368200837, "grad_norm": 1.3426640935974619, "learning_rate": 1.91189984599209e-05, "loss": 1.4397, "step": 132 }, { "epoch": 0.37099023709902373, "grad_norm": 1.3173830250699687, "learning_rate": 1.9099826831618168e-05, "loss": 1.4799, "step": 133 }, { "epoch": 0.3737796373779637, "grad_norm": 1.5260053920559482, "learning_rate": 1.908045866886001e-05, "loss": 1.5167, "step": 134 }, { "epoch": 0.37656903765690375, "grad_norm": 1.2838799058228236, "learning_rate": 1.9060894389952328e-05, "loss": 1.3957, "step": 135 }, { "epoch": 0.37656903765690375, "eval_loss": 1.629968523979187, "eval_runtime": 155.8796, "eval_samples_per_second": 2.04, "eval_steps_per_second": 0.128, "step": 135 }, { "epoch": 0.3793584379358438, "grad_norm": 1.2595723963522658, "learning_rate": 1.9041134417436674e-05, "loss": 1.3773, "step": 136 }, { "epoch": 0.3821478382147838, "grad_norm": 1.380977078216302, "learning_rate": 1.9021179178081107e-05, "loss": 1.5358, "step": 137 }, { "epoch": 0.38493723849372385, "grad_norm": 1.3245015694120374, "learning_rate": 1.9001029102870982e-05, "loss": 1.506, "step": 138 }, { "epoch": 0.3877266387726639, "grad_norm": 1.2602456347612, "learning_rate": 1.898068462699964e-05, "loss": 1.4246, "step": 139 }, { "epoch": 0.3905160390516039, "grad_norm": 1.3989105118625398, "learning_rate": 1.8960146189859014e-05, "loss": 1.4692, "step": 140 }, { "epoch": 0.39330543933054396, "grad_norm": 1.2616111800783727, "learning_rate": 1.8939414235030137e-05, "loss": 1.3755, "step": 141 }, { "epoch": 0.39609483960948394, "grad_norm": 1.2686523296470786, "learning_rate": 1.891848921027355e-05, "loss": 1.3442, "step": 142 }, { "epoch": 0.398884239888424, "grad_norm": 1.37417212405681, "learning_rate": 1.889737156751965e-05, "loss": 1.3651, "step": 143 }, { "epoch": 0.401673640167364, "grad_norm": 1.373244156828418, "learning_rate": 1.887606176285893e-05, "loss": 1.4961, "step": 144 }, { "epoch": 0.40446304044630405, "grad_norm": 1.1872961109156253, "learning_rate": 1.8854560256532098e-05, "loss": 1.2587, "step": 145 }, { "epoch": 0.4072524407252441, "grad_norm": 1.313036403643509, "learning_rate": 1.883286751292018e-05, "loss": 1.4575, "step": 146 }, { "epoch": 0.4100418410041841, "grad_norm": 1.390974479025736, "learning_rate": 1.8810984000534457e-05, "loss": 1.3891, "step": 147 }, { "epoch": 0.41283124128312415, "grad_norm": 1.3368020257438076, "learning_rate": 1.8788910192006363e-05, "loss": 1.3415, "step": 148 }, { "epoch": 0.41562064156206413, "grad_norm": 1.2329515511987332, "learning_rate": 1.8766646564077265e-05, "loss": 1.3339, "step": 149 }, { "epoch": 0.41841004184100417, "grad_norm": 1.276208090352136, "learning_rate": 1.8744193597588185e-05, "loss": 1.4495, "step": 150 }, { "epoch": 0.4211994421199442, "grad_norm": 1.1548092334953926, "learning_rate": 1.8721551777469397e-05, "loss": 1.2099, "step": 151 }, { "epoch": 0.42398884239888424, "grad_norm": 1.3184986062317001, "learning_rate": 1.869872159272997e-05, "loss": 1.2136, "step": 152 }, { "epoch": 0.42677824267782427, "grad_norm": 1.1962726760049636, "learning_rate": 1.8675703536447178e-05, "loss": 1.2598, "step": 153 }, { "epoch": 0.4295676429567643, "grad_norm": 1.3302312487017707, "learning_rate": 1.8652498105755898e-05, "loss": 1.3719, "step": 154 }, { "epoch": 0.43235704323570434, "grad_norm": 1.55872330252289, "learning_rate": 1.862910580183782e-05, "loss": 1.4867, "step": 155 }, { "epoch": 0.4351464435146444, "grad_norm": 1.3004017172912832, "learning_rate": 1.8605527129910663e-05, "loss": 1.3089, "step": 156 }, { "epoch": 0.43793584379358436, "grad_norm": 1.2650294480346138, "learning_rate": 1.858176259921724e-05, "loss": 1.2856, "step": 157 }, { "epoch": 0.4407252440725244, "grad_norm": 1.414189467122214, "learning_rate": 1.8557812723014476e-05, "loss": 1.4827, "step": 158 }, { "epoch": 0.4435146443514644, "grad_norm": 1.2634959665686347, "learning_rate": 1.853367801856231e-05, "loss": 1.3568, "step": 159 }, { "epoch": 0.44630404463040446, "grad_norm": 1.3164047095667806, "learning_rate": 1.8509359007112523e-05, "loss": 1.5306, "step": 160 }, { "epoch": 0.4490934449093445, "grad_norm": 1.3232600329575266, "learning_rate": 1.8484856213897496e-05, "loss": 1.4279, "step": 161 }, { "epoch": 0.45188284518828453, "grad_norm": 1.286194256752095, "learning_rate": 1.8460170168118857e-05, "loss": 1.2656, "step": 162 }, { "epoch": 0.45467224546722457, "grad_norm": 1.393553888811773, "learning_rate": 1.843530140293603e-05, "loss": 1.4003, "step": 163 }, { "epoch": 0.45746164574616455, "grad_norm": 1.2952418440030533, "learning_rate": 1.841025045545477e-05, "loss": 1.365, "step": 164 }, { "epoch": 0.4602510460251046, "grad_norm": 1.2556167978994803, "learning_rate": 1.8385017866715507e-05, "loss": 1.3313, "step": 165 }, { "epoch": 0.4630404463040446, "grad_norm": 1.3067564204278406, "learning_rate": 1.8359604181681703e-05, "loss": 1.4114, "step": 166 }, { "epoch": 0.46582984658298465, "grad_norm": 1.2088590174928828, "learning_rate": 1.833400994922806e-05, "loss": 1.1866, "step": 167 }, { "epoch": 0.4686192468619247, "grad_norm": 1.334753865529686, "learning_rate": 1.8308235722128674e-05, "loss": 1.2744, "step": 168 }, { "epoch": 0.4714086471408647, "grad_norm": 1.3612556693462265, "learning_rate": 1.8282282057045087e-05, "loss": 1.3558, "step": 169 }, { "epoch": 0.47419804741980476, "grad_norm": 1.2660924285869366, "learning_rate": 1.8256149514514284e-05, "loss": 1.2804, "step": 170 }, { "epoch": 0.4769874476987448, "grad_norm": 1.3192403133250346, "learning_rate": 1.8229838658936566e-05, "loss": 1.3874, "step": 171 }, { "epoch": 0.4797768479776848, "grad_norm": 1.3924722985838183, "learning_rate": 1.8203350058563366e-05, "loss": 1.5013, "step": 172 }, { "epoch": 0.4825662482566248, "grad_norm": 1.2874077296032287, "learning_rate": 1.8176684285484985e-05, "loss": 1.3261, "step": 173 }, { "epoch": 0.48535564853556484, "grad_norm": 1.309597924994118, "learning_rate": 1.814984191561823e-05, "loss": 1.4157, "step": 174 }, { "epoch": 0.4881450488145049, "grad_norm": 1.1912282212302885, "learning_rate": 1.8122823528693966e-05, "loss": 1.2585, "step": 175 }, { "epoch": 0.4909344490934449, "grad_norm": 1.2765007106705224, "learning_rate": 1.809562970824462e-05, "loss": 1.3437, "step": 176 }, { "epoch": 0.49372384937238495, "grad_norm": 1.2571675750543854, "learning_rate": 1.8068261041591548e-05, "loss": 1.4383, "step": 177 }, { "epoch": 0.496513249651325, "grad_norm": 1.1735759325774886, "learning_rate": 1.8040718119832378e-05, "loss": 1.3234, "step": 178 }, { "epoch": 0.499302649930265, "grad_norm": 1.1634243266804525, "learning_rate": 1.8013001537828213e-05, "loss": 1.2169, "step": 179 } ], "logging_steps": 1, "max_steps": 716, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 179, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 116435221217280.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }