| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.997, |
| "eval_steps": 500, |
| "global_step": 375, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.008008008008008008, |
| "grad_norm": 1.6445057392120361, |
| "learning_rate": 2e-05, |
| "loss": 2.3547, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.016016016016016016, |
| "grad_norm": 1.63363778591156, |
| "learning_rate": 4e-05, |
| "loss": 2.3812, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.024024024024024024, |
| "grad_norm": 1.6492197513580322, |
| "learning_rate": 6e-05, |
| "loss": 2.3399, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.03203203203203203, |
| "grad_norm": 1.6518611907958984, |
| "learning_rate": 8e-05, |
| "loss": 2.3172, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.04004004004004004, |
| "grad_norm": 1.7173571586608887, |
| "learning_rate": 0.0001, |
| "loss": 2.2563, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.04804804804804805, |
| "grad_norm": 1.563859224319458, |
| "learning_rate": 0.00012, |
| "loss": 2.0256, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.056056056056056056, |
| "grad_norm": 1.5590581893920898, |
| "learning_rate": 0.00014, |
| "loss": 1.8324, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.06406406406406406, |
| "grad_norm": 1.5127277374267578, |
| "learning_rate": 0.00016, |
| "loss": 1.5787, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.07207207207207207, |
| "grad_norm": 1.5447226762771606, |
| "learning_rate": 0.00018, |
| "loss": 1.3826, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.08008008008008008, |
| "grad_norm": 4.600811004638672, |
| "learning_rate": 0.0002, |
| "loss": 1.2388, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.08808808808808809, |
| "grad_norm": 1.6333264112472534, |
| "learning_rate": 0.00019999629591162656, |
| "loss": 1.0977, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.0960960960960961, |
| "grad_norm": 1.5325253009796143, |
| "learning_rate": 0.00019998518392091164, |
| "loss": 1.0178, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.1041041041041041, |
| "grad_norm": 1.866473913192749, |
| "learning_rate": 0.00019996666485105113, |
| "loss": 0.9454, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.11211211211211211, |
| "grad_norm": 1.450692892074585, |
| "learning_rate": 0.0001999407400739705, |
| "loss": 0.8514, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.12012012012012012, |
| "grad_norm": 5.149086952209473, |
| "learning_rate": 0.00019990741151022301, |
| "loss": 0.9136, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.12812812812812813, |
| "grad_norm": 0.7399551272392273, |
| "learning_rate": 0.00019986668162884762, |
| "loss": 0.8742, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.13613613613613615, |
| "grad_norm": 0.6142033934593201, |
| "learning_rate": 0.00019981855344718588, |
| "loss": 0.8082, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.14414414414414414, |
| "grad_norm": 0.47605425119400024, |
| "learning_rate": 0.00019976303053065859, |
| "loss": 0.8019, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.15215215215215216, |
| "grad_norm": 0.3993614614009857, |
| "learning_rate": 0.00019970011699250152, |
| "loss": 0.7625, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.16016016016016016, |
| "grad_norm": 0.4947199821472168, |
| "learning_rate": 0.00019962981749346078, |
| "loss": 0.7419, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.16816816816816818, |
| "grad_norm": 0.549526572227478, |
| "learning_rate": 0.00019955213724144754, |
| "loss": 0.7468, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.17617617617617617, |
| "grad_norm": 0.34314435720443726, |
| "learning_rate": 0.00019946708199115211, |
| "loss": 0.7482, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.1841841841841842, |
| "grad_norm": 0.38283613324165344, |
| "learning_rate": 0.00019937465804361783, |
| "loss": 0.7304, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.1921921921921922, |
| "grad_norm": 0.28871795535087585, |
| "learning_rate": 0.00019927487224577402, |
| "loss": 0.746, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.2002002002002002, |
| "grad_norm": 0.321494996547699, |
| "learning_rate": 0.000199167731989929, |
| "loss": 0.7461, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.2082082082082082, |
| "grad_norm": 0.315449982881546, |
| "learning_rate": 0.0001990532452132223, |
| "loss": 0.7286, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.21621621621621623, |
| "grad_norm": 0.2904318571090698, |
| "learning_rate": 0.00019893142039703664, |
| "loss": 0.7119, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.22422422422422422, |
| "grad_norm": 0.27874529361724854, |
| "learning_rate": 0.00019880226656636977, |
| "loss": 0.7105, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.23223223223223224, |
| "grad_norm": 0.2948579490184784, |
| "learning_rate": 0.0001986657932891657, |
| "loss": 0.6976, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.24024024024024024, |
| "grad_norm": 0.2542964220046997, |
| "learning_rate": 0.00019852201067560606, |
| "loss": 0.7351, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.24824824824824826, |
| "grad_norm": 0.2960706353187561, |
| "learning_rate": 0.000198370929377361, |
| "loss": 0.7179, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.25625625625625625, |
| "grad_norm": 0.24776384234428406, |
| "learning_rate": 0.00019821256058680006, |
| "loss": 0.7134, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.26426426426426425, |
| "grad_norm": 0.33054184913635254, |
| "learning_rate": 0.00019804691603616324, |
| "loss": 0.6995, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.2722722722722723, |
| "grad_norm": 0.2543237805366516, |
| "learning_rate": 0.00019787400799669154, |
| "loss": 0.7081, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.2802802802802803, |
| "grad_norm": 0.25240710377693176, |
| "learning_rate": 0.0001976938492777182, |
| "loss": 0.6928, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.2882882882882883, |
| "grad_norm": 0.35880276560783386, |
| "learning_rate": 0.0001975064532257195, |
| "loss": 0.7177, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.2962962962962963, |
| "grad_norm": 0.3675362467765808, |
| "learning_rate": 0.0001973118337233262, |
| "loss": 0.6865, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.30430430430430433, |
| "grad_norm": 0.3688451051712036, |
| "learning_rate": 0.00019711000518829507, |
| "loss": 0.6724, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.3123123123123123, |
| "grad_norm": 0.2982208728790283, |
| "learning_rate": 0.00019690098257244064, |
| "loss": 0.671, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.3203203203203203, |
| "grad_norm": 0.24197936058044434, |
| "learning_rate": 0.00019668478136052774, |
| "loss": 0.6777, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.3283283283283283, |
| "grad_norm": 0.748349130153656, |
| "learning_rate": 0.00019646141756912434, |
| "loss": 0.6641, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.33633633633633636, |
| "grad_norm": 0.5585939288139343, |
| "learning_rate": 0.00019623090774541487, |
| "loss": 0.6988, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.34434434434434436, |
| "grad_norm": 0.40285471081733704, |
| "learning_rate": 0.00019599326896597448, |
| "loss": 0.6811, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.35235235235235235, |
| "grad_norm": 0.25714346766471863, |
| "learning_rate": 0.00019574851883550395, |
| "loss": 0.6913, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.36036036036036034, |
| "grad_norm": 0.4926215708255768, |
| "learning_rate": 0.00019549667548552556, |
| "loss": 0.6707, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.3683683683683684, |
| "grad_norm": 0.3760850429534912, |
| "learning_rate": 0.00019523775757303974, |
| "loss": 0.6809, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.3763763763763764, |
| "grad_norm": 0.3734811842441559, |
| "learning_rate": 0.0001949717842791432, |
| "loss": 0.6386, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.3843843843843844, |
| "grad_norm": 0.3447561264038086, |
| "learning_rate": 0.00019469877530760754, |
| "loss": 0.6955, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.3923923923923924, |
| "grad_norm": 0.2680707573890686, |
| "learning_rate": 0.00019441875088341997, |
| "loss": 0.6625, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.4004004004004004, |
| "grad_norm": 0.2692941725254059, |
| "learning_rate": 0.00019413173175128473, |
| "loss": 0.66, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.4084084084084084, |
| "grad_norm": 0.32329630851745605, |
| "learning_rate": 0.00019383773917408642, |
| "loss": 0.6612, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.4164164164164164, |
| "grad_norm": 0.281435489654541, |
| "learning_rate": 0.00019353679493131485, |
| "loss": 0.6621, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.4244244244244244, |
| "grad_norm": 0.22186556458473206, |
| "learning_rate": 0.00019322892131745135, |
| "loss": 0.6465, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.43243243243243246, |
| "grad_norm": 0.2902645468711853, |
| "learning_rate": 0.00019291414114031743, |
| "loss": 0.6693, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.44044044044044045, |
| "grad_norm": 0.2899124324321747, |
| "learning_rate": 0.000192592477719385, |
| "loss": 0.6568, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.44844844844844844, |
| "grad_norm": 0.2124062478542328, |
| "learning_rate": 0.00019226395488404876, |
| "loss": 0.6724, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.45645645645645644, |
| "grad_norm": 0.23896393179893494, |
| "learning_rate": 0.00019192859697186106, |
| "loss": 0.6459, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.4644644644644645, |
| "grad_norm": 0.2762405574321747, |
| "learning_rate": 0.00019158642882672873, |
| "loss": 0.6498, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.4724724724724725, |
| "grad_norm": 0.2079935222864151, |
| "learning_rate": 0.00019123747579707275, |
| "loss": 0.6604, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.4804804804804805, |
| "grad_norm": 0.23864208161830902, |
| "learning_rate": 0.0001908817637339503, |
| "loss": 0.6378, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.48848848848848847, |
| "grad_norm": 0.21718506515026093, |
| "learning_rate": 0.00019051931898913976, |
| "loss": 0.6424, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.4964964964964965, |
| "grad_norm": 0.2773915231227875, |
| "learning_rate": 0.0001901501684131884, |
| "loss": 0.6474, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.5045045045045045, |
| "grad_norm": 0.23982493579387665, |
| "learning_rate": 0.0001897743393534234, |
| "loss": 0.6256, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.5125125125125125, |
| "grad_norm": 0.23621873557567596, |
| "learning_rate": 0.0001893918596519257, |
| "loss": 0.6403, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.5205205205205206, |
| "grad_norm": 0.22759953141212463, |
| "learning_rate": 0.00018900275764346768, |
| "loss": 0.6484, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.5285285285285285, |
| "grad_norm": 0.26695549488067627, |
| "learning_rate": 0.00018860706215341382, |
| "loss": 0.609, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.5365365365365365, |
| "grad_norm": 0.24594709277153015, |
| "learning_rate": 0.00018820480249558537, |
| "loss": 0.6338, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.5445445445445446, |
| "grad_norm": 0.22960062325000763, |
| "learning_rate": 0.00018779600847008884, |
| "loss": 0.6166, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.5525525525525525, |
| "grad_norm": 0.25302109122276306, |
| "learning_rate": 0.00018738071036110808, |
| "loss": 0.6422, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.5605605605605606, |
| "grad_norm": 0.3339892327785492, |
| "learning_rate": 0.0001869589389346611, |
| "loss": 0.6558, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.5685685685685685, |
| "grad_norm": 0.21397258341312408, |
| "learning_rate": 0.00018653072543632062, |
| "loss": 0.6323, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.5765765765765766, |
| "grad_norm": 0.2514493465423584, |
| "learning_rate": 0.00018609610158889942, |
| "loss": 0.657, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.5845845845845846, |
| "grad_norm": 0.25317835807800293, |
| "learning_rate": 0.00018565509959010036, |
| "loss": 0.641, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.5925925925925926, |
| "grad_norm": 0.22669494152069092, |
| "learning_rate": 0.00018520775211013093, |
| "loss": 0.6369, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.6006006006006006, |
| "grad_norm": 0.2214743047952652, |
| "learning_rate": 0.00018475409228928312, |
| "loss": 0.6307, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.6086086086086087, |
| "grad_norm": 0.24376747012138367, |
| "learning_rate": 0.00018429415373547828, |
| "loss": 0.6557, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.6166166166166166, |
| "grad_norm": 0.2158333659172058, |
| "learning_rate": 0.00018382797052177746, |
| "loss": 0.655, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.6246246246246246, |
| "grad_norm": 0.25565382838249207, |
| "learning_rate": 0.000183355577183857, |
| "loss": 0.6299, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.6326326326326326, |
| "grad_norm": 0.20636747777462006, |
| "learning_rate": 0.00018287700871745036, |
| "loss": 0.6283, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.6406406406406406, |
| "grad_norm": 0.21258121728897095, |
| "learning_rate": 0.00018239230057575542, |
| "loss": 0.6174, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.6486486486486487, |
| "grad_norm": 0.2861458957195282, |
| "learning_rate": 0.00018190148866680802, |
| "loss": 0.6547, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.6566566566566566, |
| "grad_norm": 0.23667441308498383, |
| "learning_rate": 0.0001814046093508218, |
| "loss": 0.6416, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.6646646646646647, |
| "grad_norm": 0.23191799223423004, |
| "learning_rate": 0.00018090169943749476, |
| "loss": 0.642, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.6726726726726727, |
| "grad_norm": 0.2622171938419342, |
| "learning_rate": 0.00018039279618328212, |
| "loss": 0.6241, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.6806806806806807, |
| "grad_norm": 0.2891266345977783, |
| "learning_rate": 0.00017987793728863651, |
| "loss": 0.6284, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.6886886886886887, |
| "grad_norm": 0.26767420768737793, |
| "learning_rate": 0.00017935716089521474, |
| "loss": 0.627, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.6966966966966966, |
| "grad_norm": 0.2828672230243683, |
| "learning_rate": 0.00017883050558305255, |
| "loss": 0.6418, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.7047047047047047, |
| "grad_norm": 0.32730573415756226, |
| "learning_rate": 0.00017829801036770628, |
| "loss": 0.6629, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.7127127127127127, |
| "grad_norm": 0.24029900133609772, |
| "learning_rate": 0.0001777597146973627, |
| "loss": 0.614, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.7207207207207207, |
| "grad_norm": 0.2929212152957916, |
| "learning_rate": 0.00017721565844991643, |
| "loss": 0.632, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.7287287287287287, |
| "grad_norm": 0.2860666513442993, |
| "learning_rate": 0.00017666588193001595, |
| "loss": 0.6289, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.7367367367367368, |
| "grad_norm": 0.23325330018997192, |
| "learning_rate": 0.00017611042586607748, |
| "loss": 0.6392, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.7447447447447447, |
| "grad_norm": 0.3126169443130493, |
| "learning_rate": 0.00017554933140726802, |
| "loss": 0.6422, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.7527527527527528, |
| "grad_norm": 0.26704883575439453, |
| "learning_rate": 0.00017498264012045687, |
| "loss": 0.6166, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.7607607607607607, |
| "grad_norm": 0.2184283286333084, |
| "learning_rate": 0.00017441039398713608, |
| "loss": 0.6235, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.7687687687687688, |
| "grad_norm": 0.23906390368938446, |
| "learning_rate": 0.00017383263540031067, |
| "loss": 0.6643, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.7767767767767768, |
| "grad_norm": 0.26839691400527954, |
| "learning_rate": 0.0001732494071613579, |
| "loss": 0.6514, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.7847847847847848, |
| "grad_norm": 0.2805701494216919, |
| "learning_rate": 0.00017266075247685656, |
| "loss": 0.6168, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.7927927927927928, |
| "grad_norm": 0.21650992333889008, |
| "learning_rate": 0.00017206671495538612, |
| "loss": 0.5983, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.8008008008008008, |
| "grad_norm": 0.2302800416946411, |
| "learning_rate": 0.00017146733860429612, |
| "loss": 0.6301, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.8088088088088088, |
| "grad_norm": 0.29078415036201477, |
| "learning_rate": 0.000170862667826446, |
| "loss": 0.616, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.8168168168168168, |
| "grad_norm": 0.24860034883022308, |
| "learning_rate": 0.0001702527474169157, |
| "loss": 0.6352, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.8248248248248248, |
| "grad_norm": 0.26281973719596863, |
| "learning_rate": 0.00016963762255968722, |
| "loss": 0.6218, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.8328328328328328, |
| "grad_norm": 0.29051998257637024, |
| "learning_rate": 0.0001690173388242972, |
| "loss": 0.6233, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.8408408408408409, |
| "grad_norm": 0.2471507042646408, |
| "learning_rate": 0.00016839194216246108, |
| "loss": 0.6147, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.8488488488488488, |
| "grad_norm": 0.2574704587459564, |
| "learning_rate": 0.0001677614789046689, |
| "loss": 0.6174, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.8568568568568569, |
| "grad_norm": 0.2551233172416687, |
| "learning_rate": 0.00016712599575675316, |
| "loss": 0.5989, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.8648648648648649, |
| "grad_norm": 0.2901318371295929, |
| "learning_rate": 0.00016648553979642868, |
| "loss": 0.6241, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.8728728728728729, |
| "grad_norm": 0.23769080638885498, |
| "learning_rate": 0.0001658401584698049, |
| "loss": 0.6044, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.8808808808808809, |
| "grad_norm": 0.2580976188182831, |
| "learning_rate": 0.00016518989958787126, |
| "loss": 0.622, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "grad_norm": 0.24077744781970978, |
| "learning_rate": 0.00016453481132295506, |
| "loss": 0.6047, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.8968968968968969, |
| "grad_norm": 0.228902667760849, |
| "learning_rate": 0.00016387494220515274, |
| "loss": 0.6138, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.9049049049049049, |
| "grad_norm": 0.2607581317424774, |
| "learning_rate": 0.00016321034111873488, |
| "loss": 0.6307, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.9129129129129129, |
| "grad_norm": 0.2575569450855255, |
| "learning_rate": 0.00016254105729852464, |
| "loss": 0.6008, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.9209209209209209, |
| "grad_norm": 0.231553852558136, |
| "learning_rate": 0.00016186714032625035, |
| "loss": 0.617, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.928928928928929, |
| "grad_norm": 0.24820354580879211, |
| "learning_rate": 0.00016118864012687245, |
| "loss": 0.5991, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.9369369369369369, |
| "grad_norm": 0.2364109754562378, |
| "learning_rate": 0.00016050560696488492, |
| "loss": 0.6094, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.944944944944945, |
| "grad_norm": 0.2492029368877411, |
| "learning_rate": 0.00015981809144059166, |
| "loss": 0.6143, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.9529529529529529, |
| "grad_norm": 0.27745717763900757, |
| "learning_rate": 0.00015912614448635782, |
| "loss": 0.6203, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.960960960960961, |
| "grad_norm": 0.2555610239505768, |
| "learning_rate": 0.00015842981736283686, |
| "loss": 0.6314, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.968968968968969, |
| "grad_norm": 0.2268420308828354, |
| "learning_rate": 0.00015772916165517273, |
| "loss": 0.6155, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.9769769769769769, |
| "grad_norm": 0.250041127204895, |
| "learning_rate": 0.00015702422926917872, |
| "loss": 0.6226, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.984984984984985, |
| "grad_norm": 0.2596072554588318, |
| "learning_rate": 0.00015631507242749187, |
| "loss": 0.6086, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.992992992992993, |
| "grad_norm": 0.2280743271112442, |
| "learning_rate": 0.00015560174366570446, |
| "loss": 0.5994, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.001001001001001, |
| "grad_norm": 0.23362237215042114, |
| "learning_rate": 0.00015488429582847192, |
| "loss": 0.616, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.006, |
| "grad_norm": 0.2956937849521637, |
| "learning_rate": 0.00015416278206559816, |
| "loss": 0.6038, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.014, |
| "grad_norm": 0.250629723072052, |
| "learning_rate": 0.0001534372558280979, |
| "loss": 0.5991, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.022, |
| "grad_norm": 0.231906458735466, |
| "learning_rate": 0.00015270777086423722, |
| "loss": 0.6088, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.03, |
| "grad_norm": 0.2888093590736389, |
| "learning_rate": 0.0001519743812155516, |
| "loss": 0.5892, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.038, |
| "grad_norm": 0.24940524995326996, |
| "learning_rate": 0.0001512371412128424, |
| "loss": 0.5982, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.046, |
| "grad_norm": 0.24017778038978577, |
| "learning_rate": 0.00015049610547215205, |
| "loss": 0.5608, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.054, |
| "grad_norm": 0.2334035485982895, |
| "learning_rate": 0.00014975132889071807, |
| "loss": 0.6034, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.062, |
| "grad_norm": 0.2773897349834442, |
| "learning_rate": 0.00014900286664290592, |
| "loss": 0.6387, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.07, |
| "grad_norm": 0.24266445636749268, |
| "learning_rate": 0.00014825077417612186, |
| "loss": 0.5612, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.078, |
| "grad_norm": 0.22919470071792603, |
| "learning_rate": 0.00014749510720670506, |
| "loss": 0.599, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.086, |
| "grad_norm": 0.23829148709774017, |
| "learning_rate": 0.00014673592171580025, |
| "loss": 0.6066, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.094, |
| "grad_norm": 0.31981223821640015, |
| "learning_rate": 0.00014597327394521044, |
| "loss": 0.5692, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.102, |
| "grad_norm": 0.2747564911842346, |
| "learning_rate": 0.00014520722039323045, |
| "loss": 0.62, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.11, |
| "grad_norm": 0.2592499852180481, |
| "learning_rate": 0.00014443781781046136, |
| "loss": 0.5937, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.1179999999999999, |
| "grad_norm": 0.31891530752182007, |
| "learning_rate": 0.0001436651231956064, |
| "loss": 0.5973, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.126, |
| "grad_norm": 0.2743702232837677, |
| "learning_rate": 0.00014288919379124837, |
| "loss": 0.6045, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.134, |
| "grad_norm": 0.2665708661079407, |
| "learning_rate": 0.00014211008707960897, |
| "loss": 0.5898, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.142, |
| "grad_norm": 0.33267003297805786, |
| "learning_rate": 0.00014132786077829043, |
| "loss": 0.5945, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 0.28636589646339417, |
| "learning_rate": 0.00014054257283599973, |
| "loss": 0.5914, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.158, |
| "grad_norm": 0.27305400371551514, |
| "learning_rate": 0.0001397542814282556, |
| "loss": 0.6093, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.166, |
| "grad_norm": 0.2839919924736023, |
| "learning_rate": 0.0001389630449530788, |
| "loss": 0.6074, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.174, |
| "grad_norm": 0.25652188062667847, |
| "learning_rate": 0.0001381689220266659, |
| "loss": 0.6059, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.182, |
| "grad_norm": 0.2549704909324646, |
| "learning_rate": 0.0001373719714790469, |
| "loss": 0.5568, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.19, |
| "grad_norm": 0.2509196400642395, |
| "learning_rate": 0.00013657225234972695, |
| "loss": 0.5968, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.198, |
| "grad_norm": 0.25909289717674255, |
| "learning_rate": 0.0001357698238833126, |
| "loss": 0.5902, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.206, |
| "grad_norm": 0.26162394881248474, |
| "learning_rate": 0.00013496474552512287, |
| "loss": 0.5763, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.214, |
| "grad_norm": 0.2721655070781708, |
| "learning_rate": 0.00013415707691678556, |
| "loss": 0.6037, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.222, |
| "grad_norm": 0.28691592812538147, |
| "learning_rate": 0.0001333468778918187, |
| "loss": 0.625, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.23, |
| "grad_norm": 0.25801119208335876, |
| "learning_rate": 0.00013253420847119803, |
| "loss": 0.5994, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.238, |
| "grad_norm": 0.2609824538230896, |
| "learning_rate": 0.00013171912885891063, |
| "loss": 0.5999, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.246, |
| "grad_norm": 0.2631840407848358, |
| "learning_rate": 0.00013090169943749476, |
| "loss": 0.5844, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.254, |
| "grad_norm": 0.2862647473812103, |
| "learning_rate": 0.00013008198076356676, |
| "loss": 0.5653, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.262, |
| "grad_norm": 0.27327024936676025, |
| "learning_rate": 0.00012926003356333488, |
| "loss": 0.5933, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.27, |
| "grad_norm": 0.2684379518032074, |
| "learning_rate": 0.0001284359187281004, |
| "loss": 0.5842, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.278, |
| "grad_norm": 0.2620231807231903, |
| "learning_rate": 0.00012760969730974694, |
| "loss": 0.6079, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.286, |
| "grad_norm": 0.2584543526172638, |
| "learning_rate": 0.00012678143051621742, |
| "loss": 0.5941, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.294, |
| "grad_norm": 0.26279789209365845, |
| "learning_rate": 0.00012595117970697997, |
| "loss": 0.6086, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.302, |
| "grad_norm": 0.2541520893573761, |
| "learning_rate": 0.00012511900638848195, |
| "loss": 0.5907, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.31, |
| "grad_norm": 0.27384746074676514, |
| "learning_rate": 0.0001242849722095936, |
| "loss": 0.6055, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.318, |
| "grad_norm": 0.25682300329208374, |
| "learning_rate": 0.00012344913895704097, |
| "loss": 0.609, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.326, |
| "grad_norm": 0.25404492020606995, |
| "learning_rate": 0.00012261156855082882, |
| "loss": 0.6121, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.334, |
| "grad_norm": 0.2649850845336914, |
| "learning_rate": 0.0001217723230396532, |
| "loss": 0.5695, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.342, |
| "grad_norm": 0.28349268436431885, |
| "learning_rate": 0.00012093146459630487, |
| "loss": 0.5883, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.35, |
| "grad_norm": 0.27068182826042175, |
| "learning_rate": 0.00012008905551306356, |
| "loss": 0.6147, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.358, |
| "grad_norm": 0.29566338658332825, |
| "learning_rate": 0.000119245158197083, |
| "loss": 0.5901, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.366, |
| "grad_norm": 0.27946797013282776, |
| "learning_rate": 0.00011839983516576802, |
| "loss": 0.5831, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.374, |
| "grad_norm": 0.25005409121513367, |
| "learning_rate": 0.00011755314904214284, |
| "loss": 0.5656, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.3820000000000001, |
| "grad_norm": 0.2621053159236908, |
| "learning_rate": 0.00011670516255021193, |
| "loss": 0.5769, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.3900000000000001, |
| "grad_norm": 0.2531629502773285, |
| "learning_rate": 0.00011585593851031347, |
| "loss": 0.5641, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.3980000000000001, |
| "grad_norm": 0.3026553690433502, |
| "learning_rate": 0.00011500553983446527, |
| "loss": 0.6038, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.4060000000000001, |
| "grad_norm": 0.2748616337776184, |
| "learning_rate": 0.00011415402952170433, |
| "loss": 0.5808, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.414, |
| "grad_norm": 0.27345311641693115, |
| "learning_rate": 0.0001133014706534196, |
| "loss": 0.5848, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.422, |
| "grad_norm": 0.273357629776001, |
| "learning_rate": 0.00011244792638867893, |
| "loss": 0.5906, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.43, |
| "grad_norm": 0.26230183243751526, |
| "learning_rate": 0.00011159345995955006, |
| "loss": 0.5886, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.438, |
| "grad_norm": 0.2515832483768463, |
| "learning_rate": 0.00011073813466641632, |
| "loss": 0.5876, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.446, |
| "grad_norm": 0.27451092004776, |
| "learning_rate": 0.00010988201387328717, |
| "loss": 0.5841, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.454, |
| "grad_norm": 0.2588571012020111, |
| "learning_rate": 0.00010902516100310411, |
| "loss": 0.5674, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.462, |
| "grad_norm": 0.2440604865550995, |
| "learning_rate": 0.00010816763953304227, |
| "loss": 0.5646, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.47, |
| "grad_norm": 0.2681560516357422, |
| "learning_rate": 0.00010730951298980776, |
| "loss": 0.5671, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.478, |
| "grad_norm": 0.297048419713974, |
| "learning_rate": 0.00010645084494493165, |
| "loss": 0.583, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.486, |
| "grad_norm": 0.29275089502334595, |
| "learning_rate": 0.00010559169901006034, |
| "loss": 0.6007, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.494, |
| "grad_norm": 0.2599998414516449, |
| "learning_rate": 0.0001047321388322432, |
| "loss": 0.5692, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.502, |
| "grad_norm": 0.2714841663837433, |
| "learning_rate": 0.00010387222808921746, |
| "loss": 0.5996, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.51, |
| "grad_norm": 0.2618089020252228, |
| "learning_rate": 0.00010301203048469083, |
| "loss": 0.5861, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.518, |
| "grad_norm": 0.27270275354385376, |
| "learning_rate": 0.00010215160974362223, |
| "loss": 0.5771, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.526, |
| "grad_norm": 0.2583703398704529, |
| "learning_rate": 0.00010129102960750092, |
| "loss": 0.5897, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.534, |
| "grad_norm": 0.25854265689849854, |
| "learning_rate": 0.00010043035382962443, |
| "loss": 0.5778, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.542, |
| "grad_norm": 0.27234703302383423, |
| "learning_rate": 9.956964617037558e-05, |
| "loss": 0.5659, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.55, |
| "grad_norm": 0.2672337293624878, |
| "learning_rate": 9.870897039249911e-05, |
| "loss": 0.5792, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.558, |
| "grad_norm": 0.2644350528717041, |
| "learning_rate": 9.784839025637778e-05, |
| "loss": 0.5798, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.5659999999999998, |
| "grad_norm": 0.2572389543056488, |
| "learning_rate": 9.698796951530919e-05, |
| "loss": 0.6008, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.5739999999999998, |
| "grad_norm": 0.2693156898021698, |
| "learning_rate": 9.612777191078258e-05, |
| "loss": 0.5903, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.5819999999999999, |
| "grad_norm": 0.26244068145751953, |
| "learning_rate": 9.526786116775682e-05, |
| "loss": 0.569, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.5899999999999999, |
| "grad_norm": 0.25875815749168396, |
| "learning_rate": 9.440830098993969e-05, |
| "loss": 0.6042, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.5979999999999999, |
| "grad_norm": 0.27209436893463135, |
| "learning_rate": 9.354915505506839e-05, |
| "loss": 0.5846, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.6059999999999999, |
| "grad_norm": 0.270780473947525, |
| "learning_rate": 9.269048701019226e-05, |
| "loss": 0.5957, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.6139999999999999, |
| "grad_norm": 0.25013670325279236, |
| "learning_rate": 9.183236046695777e-05, |
| "loss": 0.5845, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.6219999999999999, |
| "grad_norm": 0.2716957628726959, |
| "learning_rate": 9.09748389968959e-05, |
| "loss": 0.5584, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.63, |
| "grad_norm": 0.2737436890602112, |
| "learning_rate": 9.011798612671286e-05, |
| "loss": 0.5836, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.638, |
| "grad_norm": 0.2748481333255768, |
| "learning_rate": 8.92618653335837e-05, |
| "loss": 0.5927, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.646, |
| "grad_norm": 0.2642996609210968, |
| "learning_rate": 8.840654004044996e-05, |
| "loss": 0.6088, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.654, |
| "grad_norm": 0.26341068744659424, |
| "learning_rate": 8.755207361132108e-05, |
| "loss": 0.5841, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.662, |
| "grad_norm": 0.30515289306640625, |
| "learning_rate": 8.669852934658042e-05, |
| "loss": 0.5525, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.67, |
| "grad_norm": 0.2801468074321747, |
| "learning_rate": 8.58459704782957e-05, |
| "loss": 0.5547, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.678, |
| "grad_norm": 0.27618443965911865, |
| "learning_rate": 8.499446016553474e-05, |
| "loss": 0.5613, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.686, |
| "grad_norm": 0.26961931586265564, |
| "learning_rate": 8.414406148968657e-05, |
| "loss": 0.5639, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.694, |
| "grad_norm": 0.29023805260658264, |
| "learning_rate": 8.32948374497881e-05, |
| "loss": 0.5878, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.702, |
| "grad_norm": 0.2671249508857727, |
| "learning_rate": 8.244685095785719e-05, |
| "loss": 0.5743, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.71, |
| "grad_norm": 0.26124948263168335, |
| "learning_rate": 8.160016483423199e-05, |
| "loss": 0.5801, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.718, |
| "grad_norm": 0.2721916437149048, |
| "learning_rate": 8.075484180291701e-05, |
| "loss": 0.5975, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.726, |
| "grad_norm": 0.2630290687084198, |
| "learning_rate": 7.991094448693648e-05, |
| "loss": 0.5714, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.734, |
| "grad_norm": 0.25962400436401367, |
| "learning_rate": 7.906853540369514e-05, |
| "loss": 0.5912, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.742, |
| "grad_norm": 0.26142629981040955, |
| "learning_rate": 7.822767696034682e-05, |
| "loss": 0.577, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 0.26839005947113037, |
| "learning_rate": 7.738843144917119e-05, |
| "loss": 0.5747, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.758, |
| "grad_norm": 0.28053659200668335, |
| "learning_rate": 7.655086104295904e-05, |
| "loss": 0.612, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.766, |
| "grad_norm": 0.2728564143180847, |
| "learning_rate": 7.571502779040645e-05, |
| "loss": 0.5895, |
| "step": 221 |
| }, |
| { |
| "epoch": 1.774, |
| "grad_norm": 0.26337721943855286, |
| "learning_rate": 7.48809936115181e-05, |
| "loss": 0.5666, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.782, |
| "grad_norm": 0.2544887661933899, |
| "learning_rate": 7.404882029302003e-05, |
| "loss": 0.5552, |
| "step": 223 |
| }, |
| { |
| "epoch": 1.79, |
| "grad_norm": 0.2690827250480652, |
| "learning_rate": 7.321856948378259e-05, |
| "loss": 0.5559, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.798, |
| "grad_norm": 0.27959364652633667, |
| "learning_rate": 7.239030269025311e-05, |
| "loss": 0.5738, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.806, |
| "grad_norm": 0.29118430614471436, |
| "learning_rate": 7.156408127189965e-05, |
| "loss": 0.5753, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.814, |
| "grad_norm": 0.2585116922855377, |
| "learning_rate": 7.073996643666517e-05, |
| "loss": 0.5499, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.822, |
| "grad_norm": 0.24918395280838013, |
| "learning_rate": 6.991801923643324e-05, |
| "loss": 0.5792, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.83, |
| "grad_norm": 0.2664697766304016, |
| "learning_rate": 6.909830056250527e-05, |
| "loss": 0.582, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.838, |
| "grad_norm": 0.2679463326931, |
| "learning_rate": 6.82808711410894e-05, |
| "loss": 0.5919, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.846, |
| "grad_norm": 0.26800620555877686, |
| "learning_rate": 6.746579152880201e-05, |
| "loss": 0.5774, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.854, |
| "grad_norm": 0.25649094581604004, |
| "learning_rate": 6.665312210818131e-05, |
| "loss": 0.5569, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.862, |
| "grad_norm": 0.2661426067352295, |
| "learning_rate": 6.584292308321445e-05, |
| "loss": 0.5759, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.87, |
| "grad_norm": 0.27011772990226746, |
| "learning_rate": 6.503525447487715e-05, |
| "loss": 0.5752, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.8780000000000001, |
| "grad_norm": 0.28006768226623535, |
| "learning_rate": 6.423017611668745e-05, |
| "loss": 0.5873, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.8860000000000001, |
| "grad_norm": 0.281974196434021, |
| "learning_rate": 6.342774765027309e-05, |
| "loss": 0.5868, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.8940000000000001, |
| "grad_norm": 0.28022536635398865, |
| "learning_rate": 6.262802852095311e-05, |
| "loss": 0.5505, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.9020000000000001, |
| "grad_norm": 0.2971389889717102, |
| "learning_rate": 6.18310779733341e-05, |
| "loss": 0.5879, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.9100000000000001, |
| "grad_norm": 0.2779372036457062, |
| "learning_rate": 6.103695504692122e-05, |
| "loss": 0.5648, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.9180000000000001, |
| "grad_norm": 0.3032248616218567, |
| "learning_rate": 6.024571857174443e-05, |
| "loss": 0.5884, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.9260000000000002, |
| "grad_norm": 0.26035642623901367, |
| "learning_rate": 5.94574271640003e-05, |
| "loss": 0.5837, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.9340000000000002, |
| "grad_norm": 0.2789236009120941, |
| "learning_rate": 5.8672139221709577e-05, |
| "loss": 0.5745, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.942, |
| "grad_norm": 0.27459922432899475, |
| "learning_rate": 5.788991292039103e-05, |
| "loss": 0.568, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.95, |
| "grad_norm": 0.2857699394226074, |
| "learning_rate": 5.7110806208751655e-05, |
| "loss": 0.5619, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.958, |
| "grad_norm": 0.2715800106525421, |
| "learning_rate": 5.633487680439361e-05, |
| "loss": 0.5763, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.966, |
| "grad_norm": 0.27073079347610474, |
| "learning_rate": 5.556218218953868e-05, |
| "loss": 0.5815, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.974, |
| "grad_norm": 0.2630924880504608, |
| "learning_rate": 5.479277960676958e-05, |
| "loss": 0.5735, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.982, |
| "grad_norm": 0.2807694971561432, |
| "learning_rate": 5.40267260547896e-05, |
| "loss": 0.5611, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.99, |
| "grad_norm": 0.28481224179267883, |
| "learning_rate": 5.326407828419979e-05, |
| "loss": 0.5671, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.998, |
| "grad_norm": 0.27675607800483704, |
| "learning_rate": 5.2504892793295e-05, |
| "loss": 0.5825, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.005, |
| "grad_norm": 0.27290546894073486, |
| "learning_rate": 5.174922582387819e-05, |
| "loss": 0.5671, |
| "step": 251 |
| }, |
| { |
| "epoch": 2.013, |
| "grad_norm": 0.264244019985199, |
| "learning_rate": 5.0997133357094085e-05, |
| "loss": 0.5697, |
| "step": 252 |
| }, |
| { |
| "epoch": 2.021, |
| "grad_norm": 0.28058966994285583, |
| "learning_rate": 5.0248671109281934e-05, |
| "loss": 0.548, |
| "step": 253 |
| }, |
| { |
| "epoch": 2.029, |
| "grad_norm": 0.28881585597991943, |
| "learning_rate": 4.9503894527847964e-05, |
| "loss": 0.5448, |
| "step": 254 |
| }, |
| { |
| "epoch": 2.037, |
| "grad_norm": 0.2921096980571747, |
| "learning_rate": 4.876285878715764e-05, |
| "loss": 0.5538, |
| "step": 255 |
| }, |
| { |
| "epoch": 2.045, |
| "grad_norm": 0.28465935587882996, |
| "learning_rate": 4.802561878444845e-05, |
| "loss": 0.5271, |
| "step": 256 |
| }, |
| { |
| "epoch": 2.053, |
| "grad_norm": 0.2935147285461426, |
| "learning_rate": 4.729222913576279e-05, |
| "loss": 0.5654, |
| "step": 257 |
| }, |
| { |
| "epoch": 2.061, |
| "grad_norm": 0.28425726294517517, |
| "learning_rate": 4.656274417190214e-05, |
| "loss": 0.5303, |
| "step": 258 |
| }, |
| { |
| "epoch": 2.069, |
| "grad_norm": 0.3496411144733429, |
| "learning_rate": 4.583721793440188e-05, |
| "loss": 0.5462, |
| "step": 259 |
| }, |
| { |
| "epoch": 2.077, |
| "grad_norm": 0.29614442586898804, |
| "learning_rate": 4.5115704171528105e-05, |
| "loss": 0.5528, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.085, |
| "grad_norm": 0.3171357810497284, |
| "learning_rate": 4.439825633429557e-05, |
| "loss": 0.5296, |
| "step": 261 |
| }, |
| { |
| "epoch": 2.093, |
| "grad_norm": 0.3029578626155853, |
| "learning_rate": 4.368492757250814e-05, |
| "loss": 0.5499, |
| "step": 262 |
| }, |
| { |
| "epoch": 2.101, |
| "grad_norm": 0.30544060468673706, |
| "learning_rate": 4.297577073082129e-05, |
| "loss": 0.565, |
| "step": 263 |
| }, |
| { |
| "epoch": 2.109, |
| "grad_norm": 0.3022664785385132, |
| "learning_rate": 4.227083834482728e-05, |
| "loss": 0.5317, |
| "step": 264 |
| }, |
| { |
| "epoch": 2.117, |
| "grad_norm": 0.29141902923583984, |
| "learning_rate": 4.1570182637163155e-05, |
| "loss": 0.5365, |
| "step": 265 |
| }, |
| { |
| "epoch": 2.125, |
| "grad_norm": 0.2986874282360077, |
| "learning_rate": 4.087385551364219e-05, |
| "loss": 0.5617, |
| "step": 266 |
| }, |
| { |
| "epoch": 2.133, |
| "grad_norm": 0.2957478165626526, |
| "learning_rate": 4.0181908559408366e-05, |
| "loss": 0.5285, |
| "step": 267 |
| }, |
| { |
| "epoch": 2.141, |
| "grad_norm": 0.3051709830760956, |
| "learning_rate": 3.949439303511512e-05, |
| "loss": 0.5388, |
| "step": 268 |
| }, |
| { |
| "epoch": 2.149, |
| "grad_norm": 0.30644309520721436, |
| "learning_rate": 3.881135987312757e-05, |
| "loss": 0.556, |
| "step": 269 |
| }, |
| { |
| "epoch": 2.157, |
| "grad_norm": 0.3261376619338989, |
| "learning_rate": 3.813285967374969e-05, |
| "loss": 0.5568, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.165, |
| "grad_norm": 0.29734110832214355, |
| "learning_rate": 3.745894270147539e-05, |
| "loss": 0.5369, |
| "step": 271 |
| }, |
| { |
| "epoch": 2.173, |
| "grad_norm": 0.28361955285072327, |
| "learning_rate": 3.678965888126513e-05, |
| "loss": 0.5395, |
| "step": 272 |
| }, |
| { |
| "epoch": 2.181, |
| "grad_norm": 0.31067171692848206, |
| "learning_rate": 3.612505779484728e-05, |
| "loss": 0.5634, |
| "step": 273 |
| }, |
| { |
| "epoch": 2.189, |
| "grad_norm": 0.31847238540649414, |
| "learning_rate": 3.546518867704499e-05, |
| "loss": 0.5435, |
| "step": 274 |
| }, |
| { |
| "epoch": 2.197, |
| "grad_norm": 0.3154855966567993, |
| "learning_rate": 3.4810100412128747e-05, |
| "loss": 0.545, |
| "step": 275 |
| }, |
| { |
| "epoch": 2.205, |
| "grad_norm": 0.2924310564994812, |
| "learning_rate": 3.415984153019513e-05, |
| "loss": 0.5512, |
| "step": 276 |
| }, |
| { |
| "epoch": 2.213, |
| "grad_norm": 0.301763117313385, |
| "learning_rate": 3.351446020357136e-05, |
| "loss": 0.5619, |
| "step": 277 |
| }, |
| { |
| "epoch": 2.221, |
| "grad_norm": 0.2811594307422638, |
| "learning_rate": 3.287400424324687e-05, |
| "loss": 0.5421, |
| "step": 278 |
| }, |
| { |
| "epoch": 2.229, |
| "grad_norm": 0.2934137284755707, |
| "learning_rate": 3.223852109533112e-05, |
| "loss": 0.5517, |
| "step": 279 |
| }, |
| { |
| "epoch": 2.237, |
| "grad_norm": 0.3074370324611664, |
| "learning_rate": 3.160805783753897e-05, |
| "loss": 0.5655, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.245, |
| "grad_norm": 0.2928076386451721, |
| "learning_rate": 3.098266117570282e-05, |
| "loss": 0.5519, |
| "step": 281 |
| }, |
| { |
| "epoch": 2.253, |
| "grad_norm": 0.3051488995552063, |
| "learning_rate": 3.0362377440312784e-05, |
| "loss": 0.5493, |
| "step": 282 |
| }, |
| { |
| "epoch": 2.261, |
| "grad_norm": 0.30227115750312805, |
| "learning_rate": 2.9747252583084295e-05, |
| "loss": 0.5501, |
| "step": 283 |
| }, |
| { |
| "epoch": 2.269, |
| "grad_norm": 0.29634636640548706, |
| "learning_rate": 2.9137332173554043e-05, |
| "loss": 0.5264, |
| "step": 284 |
| }, |
| { |
| "epoch": 2.277, |
| "grad_norm": 0.2818604111671448, |
| "learning_rate": 2.853266139570391e-05, |
| "loss": 0.5434, |
| "step": 285 |
| }, |
| { |
| "epoch": 2.285, |
| "grad_norm": 0.2941757142543793, |
| "learning_rate": 2.793328504461391e-05, |
| "loss": 0.5477, |
| "step": 286 |
| }, |
| { |
| "epoch": 2.293, |
| "grad_norm": 0.2820718288421631, |
| "learning_rate": 2.733924752314345e-05, |
| "loss": 0.545, |
| "step": 287 |
| }, |
| { |
| "epoch": 2.301, |
| "grad_norm": 0.29931187629699707, |
| "learning_rate": 2.675059283864214e-05, |
| "loss": 0.5372, |
| "step": 288 |
| }, |
| { |
| "epoch": 2.309, |
| "grad_norm": 0.3119406998157501, |
| "learning_rate": 2.616736459968936e-05, |
| "loss": 0.562, |
| "step": 289 |
| }, |
| { |
| "epoch": 2.317, |
| "grad_norm": 0.3057345747947693, |
| "learning_rate": 2.5589606012863963e-05, |
| "loss": 0.5648, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.325, |
| "grad_norm": 0.29526305198669434, |
| "learning_rate": 2.5017359879543166e-05, |
| "loss": 0.5448, |
| "step": 291 |
| }, |
| { |
| "epoch": 2.333, |
| "grad_norm": 0.32076817750930786, |
| "learning_rate": 2.4450668592731974e-05, |
| "loss": 0.5537, |
| "step": 292 |
| }, |
| { |
| "epoch": 2.341, |
| "grad_norm": 0.3103630542755127, |
| "learning_rate": 2.388957413392253e-05, |
| "loss": 0.5746, |
| "step": 293 |
| }, |
| { |
| "epoch": 2.349, |
| "grad_norm": 0.2959766089916229, |
| "learning_rate": 2.33341180699841e-05, |
| "loss": 0.5532, |
| "step": 294 |
| }, |
| { |
| "epoch": 2.357, |
| "grad_norm": 0.28677570819854736, |
| "learning_rate": 2.2784341550083576e-05, |
| "loss": 0.5439, |
| "step": 295 |
| }, |
| { |
| "epoch": 2.365, |
| "grad_norm": 0.29818570613861084, |
| "learning_rate": 2.224028530263733e-05, |
| "loss": 0.5453, |
| "step": 296 |
| }, |
| { |
| "epoch": 2.373, |
| "grad_norm": 0.2917187809944153, |
| "learning_rate": 2.1701989632293717e-05, |
| "loss": 0.5335, |
| "step": 297 |
| }, |
| { |
| "epoch": 2.3810000000000002, |
| "grad_norm": 0.31331866979599, |
| "learning_rate": 2.1169494416947477e-05, |
| "loss": 0.5663, |
| "step": 298 |
| }, |
| { |
| "epoch": 2.3890000000000002, |
| "grad_norm": 0.2955493927001953, |
| "learning_rate": 2.0642839104785272e-05, |
| "loss": 0.5509, |
| "step": 299 |
| }, |
| { |
| "epoch": 2.3970000000000002, |
| "grad_norm": 0.3018077313899994, |
| "learning_rate": 2.0122062711363532e-05, |
| "loss": 0.5435, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.4050000000000002, |
| "grad_norm": 0.3038371801376343, |
| "learning_rate": 1.9607203816717888e-05, |
| "loss": 0.5544, |
| "step": 301 |
| }, |
| { |
| "epoch": 2.413, |
| "grad_norm": 0.29873090982437134, |
| "learning_rate": 1.9098300562505266e-05, |
| "loss": 0.5606, |
| "step": 302 |
| }, |
| { |
| "epoch": 2.421, |
| "grad_norm": 0.29808226227760315, |
| "learning_rate": 1.859539064917821e-05, |
| "loss": 0.5479, |
| "step": 303 |
| }, |
| { |
| "epoch": 2.429, |
| "grad_norm": 0.3080662190914154, |
| "learning_rate": 1.8098511333192024e-05, |
| "loss": 0.5587, |
| "step": 304 |
| }, |
| { |
| "epoch": 2.437, |
| "grad_norm": 0.29491138458251953, |
| "learning_rate": 1.7607699424244585e-05, |
| "loss": 0.5482, |
| "step": 305 |
| }, |
| { |
| "epoch": 2.445, |
| "grad_norm": 0.3016619384288788, |
| "learning_rate": 1.712299128254965e-05, |
| "loss": 0.5539, |
| "step": 306 |
| }, |
| { |
| "epoch": 2.453, |
| "grad_norm": 0.31340616941452026, |
| "learning_rate": 1.6644422816143024e-05, |
| "loss": 0.5559, |
| "step": 307 |
| }, |
| { |
| "epoch": 2.461, |
| "grad_norm": 0.3533984422683716, |
| "learning_rate": 1.6172029478222594e-05, |
| "loss": 0.5288, |
| "step": 308 |
| }, |
| { |
| "epoch": 2.469, |
| "grad_norm": 0.2987024188041687, |
| "learning_rate": 1.570584626452173e-05, |
| "loss": 0.5405, |
| "step": 309 |
| }, |
| { |
| "epoch": 2.477, |
| "grad_norm": 0.312209814786911, |
| "learning_rate": 1.5245907710716911e-05, |
| "loss": 0.544, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.485, |
| "grad_norm": 0.30217549204826355, |
| "learning_rate": 1.4792247889869071e-05, |
| "loss": 0.5483, |
| "step": 311 |
| }, |
| { |
| "epoch": 2.493, |
| "grad_norm": 0.2916695475578308, |
| "learning_rate": 1.4344900409899642e-05, |
| "loss": 0.5461, |
| "step": 312 |
| }, |
| { |
| "epoch": 2.501, |
| "grad_norm": 0.3004852831363678, |
| "learning_rate": 1.3903898411100568e-05, |
| "loss": 0.5427, |
| "step": 313 |
| }, |
| { |
| "epoch": 2.509, |
| "grad_norm": 0.3007405400276184, |
| "learning_rate": 1.3469274563679402e-05, |
| "loss": 0.5666, |
| "step": 314 |
| }, |
| { |
| "epoch": 2.517, |
| "grad_norm": 0.2975088655948639, |
| "learning_rate": 1.30410610653389e-05, |
| "loss": 0.5528, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.525, |
| "grad_norm": 0.31289756298065186, |
| "learning_rate": 1.261928963889194e-05, |
| "loss": 0.5757, |
| "step": 316 |
| }, |
| { |
| "epoch": 2.533, |
| "grad_norm": 0.32117587327957153, |
| "learning_rate": 1.2203991529911197e-05, |
| "loss": 0.5182, |
| "step": 317 |
| }, |
| { |
| "epoch": 2.541, |
| "grad_norm": 0.31478121876716614, |
| "learning_rate": 1.1795197504414656e-05, |
| "loss": 0.542, |
| "step": 318 |
| }, |
| { |
| "epoch": 2.549, |
| "grad_norm": 0.302287220954895, |
| "learning_rate": 1.1392937846586215e-05, |
| "loss": 0.5275, |
| "step": 319 |
| }, |
| { |
| "epoch": 2.557, |
| "grad_norm": 0.3095117509365082, |
| "learning_rate": 1.0997242356532334e-05, |
| "loss": 0.5534, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.565, |
| "grad_norm": 0.3028477728366852, |
| "learning_rate": 1.0608140348074292e-05, |
| "loss": 0.5463, |
| "step": 321 |
| }, |
| { |
| "epoch": 2.573, |
| "grad_norm": 0.30029499530792236, |
| "learning_rate": 1.0225660646576629e-05, |
| "loss": 0.5449, |
| "step": 322 |
| }, |
| { |
| "epoch": 2.581, |
| "grad_norm": 0.30782079696655273, |
| "learning_rate": 9.849831586811598e-06, |
| "loss": 0.5508, |
| "step": 323 |
| }, |
| { |
| "epoch": 2.589, |
| "grad_norm": 0.2926454544067383, |
| "learning_rate": 9.48068101086026e-06, |
| "loss": 0.5467, |
| "step": 324 |
| }, |
| { |
| "epoch": 2.597, |
| "grad_norm": 0.3094619810581207, |
| "learning_rate": 9.118236266049707e-06, |
| "loss": 0.5496, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.605, |
| "grad_norm": 0.30689892172813416, |
| "learning_rate": 8.76252420292728e-06, |
| "loss": 0.565, |
| "step": 326 |
| }, |
| { |
| "epoch": 2.613, |
| "grad_norm": 0.29666024446487427, |
| "learning_rate": 8.413571173271295e-06, |
| "loss": 0.5264, |
| "step": 327 |
| }, |
| { |
| "epoch": 2.621, |
| "grad_norm": 0.2976740598678589, |
| "learning_rate": 8.071403028138968e-06, |
| "loss": 0.5544, |
| "step": 328 |
| }, |
| { |
| "epoch": 2.629, |
| "grad_norm": 0.2889856994152069, |
| "learning_rate": 7.736045115951251e-06, |
| "loss": 0.5363, |
| "step": 329 |
| }, |
| { |
| "epoch": 2.637, |
| "grad_norm": 0.3127500116825104, |
| "learning_rate": 7.40752228061502e-06, |
| "loss": 0.5636, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.645, |
| "grad_norm": 0.31198227405548096, |
| "learning_rate": 7.085858859682571e-06, |
| "loss": 0.5518, |
| "step": 331 |
| }, |
| { |
| "epoch": 2.653, |
| "grad_norm": 0.318718284368515, |
| "learning_rate": 6.7710786825486705e-06, |
| "loss": 0.5456, |
| "step": 332 |
| }, |
| { |
| "epoch": 2.661, |
| "grad_norm": 0.3112807869911194, |
| "learning_rate": 6.463205068685174e-06, |
| "loss": 0.5297, |
| "step": 333 |
| }, |
| { |
| "epoch": 2.669, |
| "grad_norm": 0.30573782324790955, |
| "learning_rate": 6.16226082591359e-06, |
| "loss": 0.5353, |
| "step": 334 |
| }, |
| { |
| "epoch": 2.677, |
| "grad_norm": 0.3077561855316162, |
| "learning_rate": 5.868268248715292e-06, |
| "loss": 0.5449, |
| "step": 335 |
| }, |
| { |
| "epoch": 2.685, |
| "grad_norm": 0.3065972626209259, |
| "learning_rate": 5.5812491165800675e-06, |
| "loss": 0.5467, |
| "step": 336 |
| }, |
| { |
| "epoch": 2.693, |
| "grad_norm": 0.3004700541496277, |
| "learning_rate": 5.3012246923924816e-06, |
| "loss": 0.5394, |
| "step": 337 |
| }, |
| { |
| "epoch": 2.701, |
| "grad_norm": 0.30005961656570435, |
| "learning_rate": 5.028215720856821e-06, |
| "loss": 0.5605, |
| "step": 338 |
| }, |
| { |
| "epoch": 2.709, |
| "grad_norm": 0.3056250810623169, |
| "learning_rate": 4.762242426960262e-06, |
| "loss": 0.5388, |
| "step": 339 |
| }, |
| { |
| "epoch": 2.717, |
| "grad_norm": 0.3118292987346649, |
| "learning_rate": 4.503324514474483e-06, |
| "loss": 0.5206, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.725, |
| "grad_norm": 0.29597511887550354, |
| "learning_rate": 4.251481164496074e-06, |
| "loss": 0.5595, |
| "step": 341 |
| }, |
| { |
| "epoch": 2.733, |
| "grad_norm": 0.29819923639297485, |
| "learning_rate": 4.006731034025546e-06, |
| "loss": 0.5391, |
| "step": 342 |
| }, |
| { |
| "epoch": 2.741, |
| "grad_norm": 0.3268795907497406, |
| "learning_rate": 3.769092254585138e-06, |
| "loss": 0.5552, |
| "step": 343 |
| }, |
| { |
| "epoch": 2.749, |
| "grad_norm": 0.3218269646167755, |
| "learning_rate": 3.5385824308756587e-06, |
| "loss": 0.5511, |
| "step": 344 |
| }, |
| { |
| "epoch": 2.757, |
| "grad_norm": 0.29198867082595825, |
| "learning_rate": 3.3152186394722505e-06, |
| "loss": 0.5419, |
| "step": 345 |
| }, |
| { |
| "epoch": 2.765, |
| "grad_norm": 0.30343398451805115, |
| "learning_rate": 3.099017427559392e-06, |
| "loss": 0.5283, |
| "step": 346 |
| }, |
| { |
| "epoch": 2.773, |
| "grad_norm": 0.30696964263916016, |
| "learning_rate": 2.889994811704966e-06, |
| "loss": 0.5488, |
| "step": 347 |
| }, |
| { |
| "epoch": 2.781, |
| "grad_norm": 0.30184078216552734, |
| "learning_rate": 2.688166276673809e-06, |
| "loss": 0.5294, |
| "step": 348 |
| }, |
| { |
| "epoch": 2.789, |
| "grad_norm": 0.29285117983818054, |
| "learning_rate": 2.493546774280531e-06, |
| "loss": 0.5586, |
| "step": 349 |
| }, |
| { |
| "epoch": 2.797, |
| "grad_norm": 0.3032819926738739, |
| "learning_rate": 2.30615072228183e-06, |
| "loss": 0.5252, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.805, |
| "grad_norm": 0.2997422516345978, |
| "learning_rate": 2.1259920033084745e-06, |
| "loss": 0.5515, |
| "step": 351 |
| }, |
| { |
| "epoch": 2.8129999999999997, |
| "grad_norm": 0.29675498604774475, |
| "learning_rate": 1.9530839638367995e-06, |
| "loss": 0.5636, |
| "step": 352 |
| }, |
| { |
| "epoch": 2.8209999999999997, |
| "grad_norm": 0.30580922961235046, |
| "learning_rate": 1.7874394131999427e-06, |
| "loss": 0.55, |
| "step": 353 |
| }, |
| { |
| "epoch": 2.8289999999999997, |
| "grad_norm": 0.298286497592926, |
| "learning_rate": 1.6290706226390285e-06, |
| "loss": 0.5306, |
| "step": 354 |
| }, |
| { |
| "epoch": 2.8369999999999997, |
| "grad_norm": 0.30674898624420166, |
| "learning_rate": 1.4779893243939359e-06, |
| "loss": 0.565, |
| "step": 355 |
| }, |
| { |
| "epoch": 2.8449999999999998, |
| "grad_norm": 0.30078810453414917, |
| "learning_rate": 1.334206710834296e-06, |
| "loss": 0.5152, |
| "step": 356 |
| }, |
| { |
| "epoch": 2.8529999999999998, |
| "grad_norm": 0.2936237156391144, |
| "learning_rate": 1.1977334336302438e-06, |
| "loss": 0.5393, |
| "step": 357 |
| }, |
| { |
| "epoch": 2.8609999999999998, |
| "grad_norm": 0.30025407671928406, |
| "learning_rate": 1.068579602963371e-06, |
| "loss": 0.5522, |
| "step": 358 |
| }, |
| { |
| "epoch": 2.8689999999999998, |
| "grad_norm": 0.31208300590515137, |
| "learning_rate": 9.46754786777726e-07, |
| "loss": 0.5616, |
| "step": 359 |
| }, |
| { |
| "epoch": 2.877, |
| "grad_norm": 0.29706162214279175, |
| "learning_rate": 8.322680100710023e-07, |
| "loss": 0.5437, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.885, |
| "grad_norm": 0.30245542526245117, |
| "learning_rate": 7.251277542259849e-07, |
| "loss": 0.5608, |
| "step": 361 |
| }, |
| { |
| "epoch": 2.893, |
| "grad_norm": 0.2977691888809204, |
| "learning_rate": 6.253419563821972e-07, |
| "loss": 0.554, |
| "step": 362 |
| }, |
| { |
| "epoch": 2.901, |
| "grad_norm": 0.32129979133605957, |
| "learning_rate": 5.329180088478935e-07, |
| "loss": 0.5557, |
| "step": 363 |
| }, |
| { |
| "epoch": 2.909, |
| "grad_norm": 0.3130888044834137, |
| "learning_rate": 4.4786275855247527e-07, |
| "loss": 0.5433, |
| "step": 364 |
| }, |
| { |
| "epoch": 2.917, |
| "grad_norm": 0.2962971329689026, |
| "learning_rate": 3.701825065392184e-07, |
| "loss": 0.5362, |
| "step": 365 |
| }, |
| { |
| "epoch": 2.925, |
| "grad_norm": 0.29684919118881226, |
| "learning_rate": 2.998830074984915e-07, |
| "loss": 0.5295, |
| "step": 366 |
| }, |
| { |
| "epoch": 2.933, |
| "grad_norm": 0.3092200756072998, |
| "learning_rate": 2.369694693414304e-07, |
| "loss": 0.5645, |
| "step": 367 |
| }, |
| { |
| "epoch": 2.941, |
| "grad_norm": 0.2915026843547821, |
| "learning_rate": 1.8144655281413513e-07, |
| "loss": 0.5491, |
| "step": 368 |
| }, |
| { |
| "epoch": 2.949, |
| "grad_norm": 0.310531347990036, |
| "learning_rate": 1.333183711524133e-07, |
| "loss": 0.5447, |
| "step": 369 |
| }, |
| { |
| "epoch": 2.957, |
| "grad_norm": 0.28913000226020813, |
| "learning_rate": 9.258848977700129e-08, |
| "loss": 0.5251, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.965, |
| "grad_norm": 0.28818029165267944, |
| "learning_rate": 5.925992602952013e-08, |
| "loss": 0.5324, |
| "step": 371 |
| }, |
| { |
| "epoch": 2.973, |
| "grad_norm": 0.31184566020965576, |
| "learning_rate": 3.333514894887646e-08, |
| "loss": 0.5557, |
| "step": 372 |
| }, |
| { |
| "epoch": 2.981, |
| "grad_norm": 0.29842570424079895, |
| "learning_rate": 1.4816079088375567e-08, |
| "loss": 0.5384, |
| "step": 373 |
| }, |
| { |
| "epoch": 2.989, |
| "grad_norm": 0.3020179867744446, |
| "learning_rate": 3.7040883734462683e-09, |
| "loss": 0.5352, |
| "step": 374 |
| }, |
| { |
| "epoch": 2.997, |
| "grad_norm": 0.28850436210632324, |
| "learning_rate": 0.0, |
| "loss": 0.5472, |
| "step": 375 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 375, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 125, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.219477097775104e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|