| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.21416142417347075, |
| "eval_steps": 500, |
| "global_step": 200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0010708071208673537, |
| "grad_norm": 3.1792166233062744, |
| "learning_rate": 5e-05, |
| "loss": 2.9696, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0021416142417347074, |
| "grad_norm": 3.353651285171509, |
| "learning_rate": 0.0001, |
| "loss": 3.0758, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0032124213626020613, |
| "grad_norm": 1.886090874671936, |
| "learning_rate": 0.00015, |
| "loss": 2.7586, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.004283228483469415, |
| "grad_norm": 1.451682686805725, |
| "learning_rate": 0.0002, |
| "loss": 2.5324, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.005354035604336769, |
| "grad_norm": 1.175742268562317, |
| "learning_rate": 0.00025, |
| "loss": 2.2141, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0064248427252041225, |
| "grad_norm": 0.868193507194519, |
| "learning_rate": 0.0003, |
| "loss": 2.0835, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.007495649846071476, |
| "grad_norm": 1.0772305727005005, |
| "learning_rate": 0.00035, |
| "loss": 1.9921, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.00856645696693883, |
| "grad_norm": 1.069272518157959, |
| "learning_rate": 0.0004, |
| "loss": 1.9016, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.009637264087806184, |
| "grad_norm": 0.7301461100578308, |
| "learning_rate": 0.00045000000000000004, |
| "loss": 1.8262, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.010708071208673538, |
| "grad_norm": 0.49968260526657104, |
| "learning_rate": 0.0005, |
| "loss": 1.6998, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.01177887832954089, |
| "grad_norm": 0.42115330696105957, |
| "learning_rate": 0.0004994582881906825, |
| "loss": 1.6768, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.012849685450408245, |
| "grad_norm": 0.5151969790458679, |
| "learning_rate": 0.0004989165763813651, |
| "loss": 1.7301, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.0139204925712756, |
| "grad_norm": 0.604058563709259, |
| "learning_rate": 0.0004983748645720476, |
| "loss": 1.6961, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.014991299692142952, |
| "grad_norm": 0.4526136815547943, |
| "learning_rate": 0.0004978331527627302, |
| "loss": 1.6385, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.016062106813010308, |
| "grad_norm": 0.3732638657093048, |
| "learning_rate": 0.0004972914409534127, |
| "loss": 1.6927, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.01713291393387766, |
| "grad_norm": 0.39037632942199707, |
| "learning_rate": 0.0004967497291440954, |
| "loss": 1.6063, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.018203721054745013, |
| "grad_norm": 0.41541412472724915, |
| "learning_rate": 0.0004962080173347779, |
| "loss": 1.6535, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.019274528175612368, |
| "grad_norm": 0.33182990550994873, |
| "learning_rate": 0.0004956663055254605, |
| "loss": 1.5564, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.020345335296479722, |
| "grad_norm": 0.3516808748245239, |
| "learning_rate": 0.0004951245937161431, |
| "loss": 1.6012, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.021416142417347076, |
| "grad_norm": 0.3928525447845459, |
| "learning_rate": 0.0004945828819068256, |
| "loss": 1.6524, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.02248694953821443, |
| "grad_norm": 0.3181082308292389, |
| "learning_rate": 0.0004940411700975082, |
| "loss": 1.6055, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.02355775665908178, |
| "grad_norm": 0.30989620089530945, |
| "learning_rate": 0.0004934994582881907, |
| "loss": 1.6236, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.024628563779949136, |
| "grad_norm": 0.3335777521133423, |
| "learning_rate": 0.0004929577464788732, |
| "loss": 1.6403, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.02569937090081649, |
| "grad_norm": 0.36894136667251587, |
| "learning_rate": 0.0004924160346695558, |
| "loss": 1.6778, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.026770178021683844, |
| "grad_norm": 0.3191300928592682, |
| "learning_rate": 0.0004918743228602383, |
| "loss": 1.5897, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.0278409851425512, |
| "grad_norm": 0.3290117681026459, |
| "learning_rate": 0.0004913326110509209, |
| "loss": 1.6285, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.028911792263418553, |
| "grad_norm": 0.307182252407074, |
| "learning_rate": 0.0004907908992416034, |
| "loss": 1.5576, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.029982599384285904, |
| "grad_norm": 0.28709110617637634, |
| "learning_rate": 0.0004902491874322861, |
| "loss": 1.6744, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.031053406505153258, |
| "grad_norm": 0.33125609159469604, |
| "learning_rate": 0.0004897074756229686, |
| "loss": 1.6106, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.032124213626020616, |
| "grad_norm": 0.31909990310668945, |
| "learning_rate": 0.0004891657638136512, |
| "loss": 1.5368, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.03319502074688797, |
| "grad_norm": 0.34221193194389343, |
| "learning_rate": 0.0004886240520043337, |
| "loss": 1.6336, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.03426582786775532, |
| "grad_norm": 0.34219980239868164, |
| "learning_rate": 0.00048808234019501623, |
| "loss": 1.6243, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.035336634988622675, |
| "grad_norm": 0.29287898540496826, |
| "learning_rate": 0.0004875406283856988, |
| "loss": 1.5441, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.036407442109490026, |
| "grad_norm": 0.29403921961784363, |
| "learning_rate": 0.0004869989165763814, |
| "loss": 1.651, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.037478249230357384, |
| "grad_norm": 0.3238803446292877, |
| "learning_rate": 0.00048645720476706396, |
| "loss": 1.6178, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.038549056351224735, |
| "grad_norm": 0.3332749903202057, |
| "learning_rate": 0.0004859154929577465, |
| "loss": 1.5395, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.03961986347209209, |
| "grad_norm": 0.33042415976524353, |
| "learning_rate": 0.0004853737811484291, |
| "loss": 1.5116, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.040690670592959444, |
| "grad_norm": 0.32300877571105957, |
| "learning_rate": 0.00048483206933911164, |
| "loss": 1.5697, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.041761477713826795, |
| "grad_norm": 0.35760653018951416, |
| "learning_rate": 0.00048429035752979414, |
| "loss": 1.629, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.04283228483469415, |
| "grad_norm": 0.3095184564590454, |
| "learning_rate": 0.0004837486457204767, |
| "loss": 1.571, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0439030919555615, |
| "grad_norm": 0.30683574080467224, |
| "learning_rate": 0.00048320693391115926, |
| "loss": 1.5357, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.04497389907642886, |
| "grad_norm": 0.33406275510787964, |
| "learning_rate": 0.0004826652221018418, |
| "loss": 1.6077, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.04604470619729621, |
| "grad_norm": 0.42627573013305664, |
| "learning_rate": 0.0004821235102925244, |
| "loss": 1.5662, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.04711551331816356, |
| "grad_norm": 0.3232003152370453, |
| "learning_rate": 0.00048158179848320693, |
| "loss": 1.6063, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.04818632043903092, |
| "grad_norm": 0.4828573763370514, |
| "learning_rate": 0.0004810400866738895, |
| "loss": 1.523, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.04925712755989827, |
| "grad_norm": 0.39869874715805054, |
| "learning_rate": 0.00048049837486457205, |
| "loss": 1.5844, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.05032793468076563, |
| "grad_norm": 0.36061400175094604, |
| "learning_rate": 0.0004799566630552546, |
| "loss": 1.589, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.05139874180163298, |
| "grad_norm": 0.3593485951423645, |
| "learning_rate": 0.00047941495124593716, |
| "loss": 1.5149, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.05246954892250034, |
| "grad_norm": 0.3493165373802185, |
| "learning_rate": 0.0004788732394366197, |
| "loss": 1.586, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.05354035604336769, |
| "grad_norm": 0.3129478394985199, |
| "learning_rate": 0.00047833152762730233, |
| "loss": 1.5374, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.05461116316423504, |
| "grad_norm": 0.3232264816761017, |
| "learning_rate": 0.00047778981581798484, |
| "loss": 1.5473, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.0556819702851024, |
| "grad_norm": 0.3314213752746582, |
| "learning_rate": 0.0004772481040086674, |
| "loss": 1.5624, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.05675277740596975, |
| "grad_norm": 0.3443197011947632, |
| "learning_rate": 0.00047670639219934995, |
| "loss": 1.523, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.057823584526837106, |
| "grad_norm": 0.3222476840019226, |
| "learning_rate": 0.0004761646803900325, |
| "loss": 1.6094, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.05889439164770446, |
| "grad_norm": 0.30979102849960327, |
| "learning_rate": 0.00047562296858071507, |
| "loss": 1.6053, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.05996519876857181, |
| "grad_norm": 0.3003416061401367, |
| "learning_rate": 0.00047508125677139763, |
| "loss": 1.4889, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.061036005889439165, |
| "grad_norm": 0.3053031861782074, |
| "learning_rate": 0.0004745395449620802, |
| "loss": 1.5641, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.062106813010306516, |
| "grad_norm": 0.31200629472732544, |
| "learning_rate": 0.00047399783315276275, |
| "loss": 1.5857, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.06317762013117387, |
| "grad_norm": 0.3085310757160187, |
| "learning_rate": 0.0004734561213434453, |
| "loss": 1.5795, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.06424842725204123, |
| "grad_norm": 0.3053343892097473, |
| "learning_rate": 0.00047291440953412786, |
| "loss": 1.48, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.06531923437290858, |
| "grad_norm": 0.31742650270462036, |
| "learning_rate": 0.0004723726977248104, |
| "loss": 1.5267, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.06639004149377593, |
| "grad_norm": 0.302557110786438, |
| "learning_rate": 0.0004718309859154929, |
| "loss": 1.4835, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.06746084861464328, |
| "grad_norm": 0.3269102871417999, |
| "learning_rate": 0.0004712892741061755, |
| "loss": 1.6023, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.06853165573551064, |
| "grad_norm": 0.3242720365524292, |
| "learning_rate": 0.00047074756229685804, |
| "loss": 1.6019, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.069602462856378, |
| "grad_norm": 0.3117155134677887, |
| "learning_rate": 0.00047020585048754065, |
| "loss": 1.5719, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.07067326997724535, |
| "grad_norm": 0.31575411558151245, |
| "learning_rate": 0.0004696641386782232, |
| "loss": 1.5588, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.0717440770981127, |
| "grad_norm": 0.3055570125579834, |
| "learning_rate": 0.00046912242686890577, |
| "loss": 1.54, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.07281488421898005, |
| "grad_norm": 0.30278709530830383, |
| "learning_rate": 0.0004685807150595883, |
| "loss": 1.4943, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.0738856913398474, |
| "grad_norm": 0.31028270721435547, |
| "learning_rate": 0.0004680390032502709, |
| "loss": 1.4901, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.07495649846071477, |
| "grad_norm": 0.3005111515522003, |
| "learning_rate": 0.00046749729144095344, |
| "loss": 1.4811, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.07602730558158212, |
| "grad_norm": 0.31970301270484924, |
| "learning_rate": 0.000466955579631636, |
| "loss": 1.5812, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.07709811270244947, |
| "grad_norm": 0.31910890340805054, |
| "learning_rate": 0.00046641386782231856, |
| "loss": 1.5398, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.07816891982331682, |
| "grad_norm": 0.34352612495422363, |
| "learning_rate": 0.0004658721560130011, |
| "loss": 1.6016, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.07923972694418419, |
| "grad_norm": 0.3307402729988098, |
| "learning_rate": 0.0004653304442036836, |
| "loss": 1.5357, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.08031053406505154, |
| "grad_norm": 0.31802475452423096, |
| "learning_rate": 0.0004647887323943662, |
| "loss": 1.5463, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.08138134118591889, |
| "grad_norm": 0.3045582175254822, |
| "learning_rate": 0.00046424702058504874, |
| "loss": 1.4936, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.08245214830678624, |
| "grad_norm": 0.3408415913581848, |
| "learning_rate": 0.0004637053087757313, |
| "loss": 1.526, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.08352295542765359, |
| "grad_norm": 0.3176616430282593, |
| "learning_rate": 0.00046316359696641385, |
| "loss": 1.5581, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.08459376254852095, |
| "grad_norm": 0.3179102838039398, |
| "learning_rate": 0.0004626218851570964, |
| "loss": 1.5525, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.0856645696693883, |
| "grad_norm": 0.3425735831260681, |
| "learning_rate": 0.00046208017334777897, |
| "loss": 1.4914, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.08673537679025566, |
| "grad_norm": 0.36185234785079956, |
| "learning_rate": 0.0004615384615384616, |
| "loss": 1.5293, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.087806183911123, |
| "grad_norm": 0.3470607399940491, |
| "learning_rate": 0.00046099674972914414, |
| "loss": 1.5388, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.08887699103199036, |
| "grad_norm": 0.3171769976615906, |
| "learning_rate": 0.0004604550379198267, |
| "loss": 1.4932, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.08994779815285772, |
| "grad_norm": 0.3396613895893097, |
| "learning_rate": 0.00045991332611050926, |
| "loss": 1.5367, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.09101860527372507, |
| "grad_norm": 0.3147753179073334, |
| "learning_rate": 0.0004593716143011918, |
| "loss": 1.5413, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.09208941239459242, |
| "grad_norm": 0.3213801383972168, |
| "learning_rate": 0.0004588299024918743, |
| "loss": 1.4544, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.09316021951545977, |
| "grad_norm": 0.3900924623012543, |
| "learning_rate": 0.0004582881906825569, |
| "loss": 1.5155, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.09423102663632713, |
| "grad_norm": 0.34930315613746643, |
| "learning_rate": 0.00045774647887323943, |
| "loss": 1.5323, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.09530183375719449, |
| "grad_norm": 0.32511013746261597, |
| "learning_rate": 0.000457204767063922, |
| "loss": 1.484, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.09637264087806184, |
| "grad_norm": 0.3209106922149658, |
| "learning_rate": 0.00045666305525460455, |
| "loss": 1.4659, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.09744344799892919, |
| "grad_norm": 0.3438887298107147, |
| "learning_rate": 0.0004561213434452871, |
| "loss": 1.522, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.09851425511979654, |
| "grad_norm": 0.5644230842590332, |
| "learning_rate": 0.00045557963163596967, |
| "loss": 1.5703, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.0995850622406639, |
| "grad_norm": 0.35866114497184753, |
| "learning_rate": 0.0004550379198266522, |
| "loss": 1.5637, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.10065586936153126, |
| "grad_norm": 0.3141271770000458, |
| "learning_rate": 0.0004544962080173348, |
| "loss": 1.5275, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.10172667648239861, |
| "grad_norm": 0.3229062557220459, |
| "learning_rate": 0.00045395449620801734, |
| "loss": 1.509, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.10279748360326596, |
| "grad_norm": 0.3184738755226135, |
| "learning_rate": 0.0004534127843986999, |
| "loss": 1.5243, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.10386829072413331, |
| "grad_norm": 0.33315855264663696, |
| "learning_rate": 0.00045287107258938246, |
| "loss": 1.4969, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.10493909784500068, |
| "grad_norm": 0.37624651193618774, |
| "learning_rate": 0.000452329360780065, |
| "loss": 1.5713, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.10600990496586803, |
| "grad_norm": 0.3466942608356476, |
| "learning_rate": 0.0004517876489707476, |
| "loss": 1.4497, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.10708071208673538, |
| "grad_norm": 0.3428940773010254, |
| "learning_rate": 0.00045124593716143013, |
| "loss": 1.5272, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.10815151920760273, |
| "grad_norm": 0.32997605204582214, |
| "learning_rate": 0.0004507042253521127, |
| "loss": 1.5664, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.10922232632847008, |
| "grad_norm": 0.35048359632492065, |
| "learning_rate": 0.00045016251354279525, |
| "loss": 1.4883, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.11029313344933744, |
| "grad_norm": 0.3379492461681366, |
| "learning_rate": 0.0004496208017334778, |
| "loss": 1.4706, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.1113639405702048, |
| "grad_norm": 0.36966028809547424, |
| "learning_rate": 0.00044907908992416036, |
| "loss": 1.5116, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.11243474769107215, |
| "grad_norm": 0.3487953245639801, |
| "learning_rate": 0.0004485373781148429, |
| "loss": 1.5147, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.1135055548119395, |
| "grad_norm": 0.3422049582004547, |
| "learning_rate": 0.0004479956663055255, |
| "loss": 1.4782, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.11457636193280685, |
| "grad_norm": 0.3196428716182709, |
| "learning_rate": 0.00044745395449620804, |
| "loss": 1.4375, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.11564716905367421, |
| "grad_norm": 0.3369114398956299, |
| "learning_rate": 0.00044691224268689054, |
| "loss": 1.5261, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.11671797617454156, |
| "grad_norm": 0.35993748903274536, |
| "learning_rate": 0.0004463705308775731, |
| "loss": 1.5136, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.11778878329540891, |
| "grad_norm": 0.3427882790565491, |
| "learning_rate": 0.00044582881906825566, |
| "loss": 1.5352, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.11885959041627626, |
| "grad_norm": 0.3308979570865631, |
| "learning_rate": 0.0004452871072589382, |
| "loss": 1.4979, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.11993039753714362, |
| "grad_norm": 0.3407396376132965, |
| "learning_rate": 0.00044474539544962083, |
| "loss": 1.5055, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.12100120465801098, |
| "grad_norm": 0.34919309616088867, |
| "learning_rate": 0.0004442036836403034, |
| "loss": 1.5032, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.12207201177887833, |
| "grad_norm": 0.34088361263275146, |
| "learning_rate": 0.00044366197183098594, |
| "loss": 1.5489, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.12314281889974568, |
| "grad_norm": 0.3275073766708374, |
| "learning_rate": 0.0004431202600216685, |
| "loss": 1.4882, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.12421362602061303, |
| "grad_norm": 0.35690388083457947, |
| "learning_rate": 0.00044257854821235106, |
| "loss": 1.4762, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.12528443314148038, |
| "grad_norm": 0.668167233467102, |
| "learning_rate": 0.0004420368364030336, |
| "loss": 1.5231, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.12635524026234773, |
| "grad_norm": 0.3807876408100128, |
| "learning_rate": 0.0004414951245937162, |
| "loss": 1.5125, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.12742604738321509, |
| "grad_norm": 0.32847508788108826, |
| "learning_rate": 0.00044095341278439874, |
| "loss": 1.4791, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.12849685450408246, |
| "grad_norm": 0.34058675169944763, |
| "learning_rate": 0.00044041170097508124, |
| "loss": 1.4917, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.12956766162494981, |
| "grad_norm": 0.3316013216972351, |
| "learning_rate": 0.0004398699891657638, |
| "loss": 1.5397, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.13063846874581717, |
| "grad_norm": 0.32970407605171204, |
| "learning_rate": 0.00043932827735644636, |
| "loss": 1.56, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.13170927586668452, |
| "grad_norm": 0.3216981887817383, |
| "learning_rate": 0.0004387865655471289, |
| "loss": 1.4856, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.13278008298755187, |
| "grad_norm": 0.3492419421672821, |
| "learning_rate": 0.00043824485373781147, |
| "loss": 1.4941, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.13385089010841922, |
| "grad_norm": 0.3463359475135803, |
| "learning_rate": 0.00043770314192849403, |
| "loss": 1.5003, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.13492169722928657, |
| "grad_norm": 0.3727024793624878, |
| "learning_rate": 0.0004371614301191766, |
| "loss": 1.4981, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.13599250435015392, |
| "grad_norm": 0.5523554086685181, |
| "learning_rate": 0.00043661971830985915, |
| "loss": 1.5786, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.13706331147102127, |
| "grad_norm": 0.32683220505714417, |
| "learning_rate": 0.00043607800650054176, |
| "loss": 1.4902, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.13813411859188865, |
| "grad_norm": 0.3415539562702179, |
| "learning_rate": 0.0004355362946912243, |
| "loss": 1.4875, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.139204925712756, |
| "grad_norm": 0.3191353976726532, |
| "learning_rate": 0.0004349945828819069, |
| "loss": 1.4759, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.14027573283362335, |
| "grad_norm": 0.35508468747138977, |
| "learning_rate": 0.00043445287107258943, |
| "loss": 1.5611, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.1413465399544907, |
| "grad_norm": 0.33212971687316895, |
| "learning_rate": 0.00043391115926327194, |
| "loss": 1.4522, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.14241734707535805, |
| "grad_norm": 0.3219762146472931, |
| "learning_rate": 0.0004333694474539545, |
| "loss": 1.4582, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.1434881541962254, |
| "grad_norm": 0.36882877349853516, |
| "learning_rate": 0.00043282773564463705, |
| "loss": 1.5347, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.14455896131709275, |
| "grad_norm": 0.33573803305625916, |
| "learning_rate": 0.0004322860238353196, |
| "loss": 1.4876, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.1456297684379601, |
| "grad_norm": 0.33557966351509094, |
| "learning_rate": 0.00043174431202600217, |
| "loss": 1.4536, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.14670057555882746, |
| "grad_norm": 0.3364240527153015, |
| "learning_rate": 0.0004312026002166847, |
| "loss": 1.5241, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.1477713826796948, |
| "grad_norm": 0.31000298261642456, |
| "learning_rate": 0.0004306608884073673, |
| "loss": 1.4427, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.14884218980056219, |
| "grad_norm": 0.31178000569343567, |
| "learning_rate": 0.00043011917659804984, |
| "loss": 1.5455, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.14991299692142954, |
| "grad_norm": 0.3283156752586365, |
| "learning_rate": 0.0004295774647887324, |
| "loss": 1.5277, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.1509838040422969, |
| "grad_norm": 0.34077680110931396, |
| "learning_rate": 0.00042903575297941496, |
| "loss": 1.5203, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.15205461116316424, |
| "grad_norm": 0.3414633870124817, |
| "learning_rate": 0.0004284940411700975, |
| "loss": 1.5143, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.1531254182840316, |
| "grad_norm": 0.3262156844139099, |
| "learning_rate": 0.0004279523293607801, |
| "loss": 1.492, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.15419622540489894, |
| "grad_norm": 0.3537783920764923, |
| "learning_rate": 0.00042741061755146263, |
| "loss": 1.5223, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.1552670325257663, |
| "grad_norm": 0.339911550283432, |
| "learning_rate": 0.0004268689057421452, |
| "loss": 1.5162, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.15633783964663364, |
| "grad_norm": 0.36946552991867065, |
| "learning_rate": 0.00042632719393282775, |
| "loss": 1.4668, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.157408646767501, |
| "grad_norm": 0.33070170879364014, |
| "learning_rate": 0.0004257854821235103, |
| "loss": 1.4606, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.15847945388836837, |
| "grad_norm": 0.33413979411125183, |
| "learning_rate": 0.00042524377031419287, |
| "loss": 1.5032, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.15955026100923572, |
| "grad_norm": 0.3402380049228668, |
| "learning_rate": 0.0004247020585048754, |
| "loss": 1.52, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.16062106813010307, |
| "grad_norm": 0.3602783679962158, |
| "learning_rate": 0.000424160346695558, |
| "loss": 1.5349, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.16169187525097042, |
| "grad_norm": 0.32968804240226746, |
| "learning_rate": 0.00042361863488624054, |
| "loss": 1.4369, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.16276268237183777, |
| "grad_norm": 0.3444564938545227, |
| "learning_rate": 0.0004230769230769231, |
| "loss": 1.4565, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.16383348949270513, |
| "grad_norm": 0.37572184205055237, |
| "learning_rate": 0.00042253521126760566, |
| "loss": 1.4921, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.16490429661357248, |
| "grad_norm": 0.3675267994403839, |
| "learning_rate": 0.0004219934994582882, |
| "loss": 1.5345, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.16597510373443983, |
| "grad_norm": 0.34972381591796875, |
| "learning_rate": 0.0004214517876489707, |
| "loss": 1.4759, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.16704591085530718, |
| "grad_norm": 0.35719773173332214, |
| "learning_rate": 0.0004209100758396533, |
| "loss": 1.5401, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.16811671797617453, |
| "grad_norm": 0.3391767144203186, |
| "learning_rate": 0.00042036836403033583, |
| "loss": 1.5129, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.1691875250970419, |
| "grad_norm": 0.34171062707901, |
| "learning_rate": 0.0004198266522210184, |
| "loss": 1.5304, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.17025833221790926, |
| "grad_norm": 0.3329889476299286, |
| "learning_rate": 0.000419284940411701, |
| "loss": 1.4794, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.1713291393387766, |
| "grad_norm": 0.329875111579895, |
| "learning_rate": 0.00041874322860238356, |
| "loss": 1.4658, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.17239994645964396, |
| "grad_norm": 0.36654773354530334, |
| "learning_rate": 0.0004182015167930661, |
| "loss": 1.5079, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.1734707535805113, |
| "grad_norm": 0.3587745130062103, |
| "learning_rate": 0.0004176598049837487, |
| "loss": 1.4352, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.17454156070137866, |
| "grad_norm": 0.32216113805770874, |
| "learning_rate": 0.00041711809317443124, |
| "loss": 1.4214, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.175612367822246, |
| "grad_norm": 0.34425267577171326, |
| "learning_rate": 0.0004165763813651138, |
| "loss": 1.5408, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.17668317494311336, |
| "grad_norm": 0.34980979561805725, |
| "learning_rate": 0.00041603466955579635, |
| "loss": 1.4995, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.17775398206398071, |
| "grad_norm": 0.33706167340278625, |
| "learning_rate": 0.00041549295774647886, |
| "loss": 1.4966, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.1788247891848481, |
| "grad_norm": 0.3577290177345276, |
| "learning_rate": 0.0004149512459371614, |
| "loss": 1.5051, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.17989559630571544, |
| "grad_norm": 0.33480167388916016, |
| "learning_rate": 0.000414409534127844, |
| "loss": 1.4846, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.1809664034265828, |
| "grad_norm": 0.3389778137207031, |
| "learning_rate": 0.00041386782231852653, |
| "loss": 1.4659, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.18203721054745015, |
| "grad_norm": 0.34035906195640564, |
| "learning_rate": 0.0004133261105092091, |
| "loss": 1.5269, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.1831080176683175, |
| "grad_norm": 0.33953285217285156, |
| "learning_rate": 0.00041278439869989165, |
| "loss": 1.5608, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.18417882478918485, |
| "grad_norm": 0.331253319978714, |
| "learning_rate": 0.0004122426868905742, |
| "loss": 1.4238, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.1852496319100522, |
| "grad_norm": 0.3417370915412903, |
| "learning_rate": 0.00041170097508125676, |
| "loss": 1.5335, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.18632043903091955, |
| "grad_norm": 0.3459537923336029, |
| "learning_rate": 0.0004111592632719393, |
| "loss": 1.5405, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.1873912461517869, |
| "grad_norm": 0.34250974655151367, |
| "learning_rate": 0.00041061755146262193, |
| "loss": 1.5451, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.18846205327265425, |
| "grad_norm": 0.35121142864227295, |
| "learning_rate": 0.0004100758396533045, |
| "loss": 1.4584, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.18953286039352163, |
| "grad_norm": 0.3343502879142761, |
| "learning_rate": 0.00040953412784398705, |
| "loss": 1.4967, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.19060366751438898, |
| "grad_norm": 0.3440572917461395, |
| "learning_rate": 0.00040899241603466955, |
| "loss": 1.5322, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.19167447463525633, |
| "grad_norm": 0.3478721082210541, |
| "learning_rate": 0.0004084507042253521, |
| "loss": 1.4887, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.19274528175612368, |
| "grad_norm": 0.3297663927078247, |
| "learning_rate": 0.00040790899241603467, |
| "loss": 1.4321, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.19381608887699103, |
| "grad_norm": 0.3527899384498596, |
| "learning_rate": 0.00040736728060671723, |
| "loss": 1.5411, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.19488689599785838, |
| "grad_norm": 0.3361954987049103, |
| "learning_rate": 0.0004068255687973998, |
| "loss": 1.4383, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.19595770311872573, |
| "grad_norm": 0.35988926887512207, |
| "learning_rate": 0.00040628385698808235, |
| "loss": 1.4807, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.19702851023959309, |
| "grad_norm": 0.35412025451660156, |
| "learning_rate": 0.0004057421451787649, |
| "loss": 1.5432, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.19809931736046044, |
| "grad_norm": 0.3374565541744232, |
| "learning_rate": 0.00040520043336944746, |
| "loss": 1.4895, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.1991701244813278, |
| "grad_norm": 0.35347357392311096, |
| "learning_rate": 0.00040465872156013, |
| "loss": 1.4761, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.20024093160219517, |
| "grad_norm": 0.34612298011779785, |
| "learning_rate": 0.0004041170097508126, |
| "loss": 1.4867, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.20131173872306252, |
| "grad_norm": 0.36123159527778625, |
| "learning_rate": 0.00040357529794149514, |
| "loss": 1.4753, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.20238254584392987, |
| "grad_norm": 0.37735962867736816, |
| "learning_rate": 0.00040303358613217764, |
| "loss": 1.5158, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.20345335296479722, |
| "grad_norm": 0.365067720413208, |
| "learning_rate": 0.00040249187432286025, |
| "loss": 1.5493, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.20452416008566457, |
| "grad_norm": 0.33235374093055725, |
| "learning_rate": 0.0004019501625135428, |
| "loss": 1.495, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.20559496720653192, |
| "grad_norm": 0.35279738903045654, |
| "learning_rate": 0.00040140845070422537, |
| "loss": 1.4681, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.20666577432739927, |
| "grad_norm": 0.342896968126297, |
| "learning_rate": 0.0004008667388949079, |
| "loss": 1.5163, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.20773658144826662, |
| "grad_norm": 0.34132811427116394, |
| "learning_rate": 0.0004003250270855905, |
| "loss": 1.4822, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.20880738856913397, |
| "grad_norm": 0.34202563762664795, |
| "learning_rate": 0.00039978331527627304, |
| "loss": 1.44, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.20987819569000135, |
| "grad_norm": 0.3383086919784546, |
| "learning_rate": 0.0003992416034669556, |
| "loss": 1.4993, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.2109490028108687, |
| "grad_norm": 0.35314062237739563, |
| "learning_rate": 0.00039869989165763816, |
| "loss": 1.5139, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.21201980993173605, |
| "grad_norm": 0.3365531265735626, |
| "learning_rate": 0.0003981581798483207, |
| "loss": 1.429, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.2130906170526034, |
| "grad_norm": 0.33675894141197205, |
| "learning_rate": 0.0003976164680390033, |
| "loss": 1.4568, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.21416142417347075, |
| "grad_norm": 0.340620219707489, |
| "learning_rate": 0.00039707475622968583, |
| "loss": 1.4935, |
| "step": 200 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 933, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.528726582329344e+17, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|