{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.49952696310312206,
  "eval_steps": 500,
  "global_step": 99,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005045726900031536,
      "grad_norm": 0.16816571847556824,
      "learning_rate": 2.9999839160139495e-06,
      "loss": 0.7782,
      "step": 1
    },
    {
      "epoch": 0.010091453800063072,
      "grad_norm": 0.1469143977253523,
      "learning_rate": 2.9999356645057024e-06,
      "loss": 0.6817,
      "step": 2
    },
    {
      "epoch": 0.015137180700094607,
      "grad_norm": 0.07996774677933757,
      "learning_rate": 2.9998552468249567e-06,
      "loss": 0.6735,
      "step": 3
    },
    {
      "epoch": 0.020182907600126143,
      "grad_norm": 0.0800127664777818,
      "learning_rate": 2.999742665221167e-06,
      "loss": 0.6569,
      "step": 4
    },
    {
      "epoch": 0.02522863450015768,
      "grad_norm": 0.08070188267575489,
      "learning_rate": 2.999597922843484e-06,
      "loss": 0.6283,
      "step": 5
    },
    {
      "epoch": 0.030274361400189215,
      "grad_norm": 0.06839180655145351,
      "learning_rate": 2.999421023740663e-06,
      "loss": 0.6446,
      "step": 6
    },
    {
      "epoch": 0.03532008830022075,
      "grad_norm": 0.05534188923028301,
      "learning_rate": 2.9992119728609516e-06,
      "loss": 0.6371,
      "step": 7
    },
    {
      "epoch": 0.040365815200252286,
      "grad_norm": 0.07094943987370793,
      "learning_rate": 2.9989707760519526e-06,
      "loss": 0.6111,
      "step": 8
    },
    {
      "epoch": 0.04541154210028382,
      "grad_norm": 0.06436005389698786,
      "learning_rate": 2.9986974400604593e-06,
      "loss": 0.588,
      "step": 9
    },
    {
      "epoch": 0.05045726900031536,
      "grad_norm": 0.05699223365274786,
      "learning_rate": 2.9983919725322667e-06,
      "loss": 0.6101,
      "step": 10
    },
    {
      "epoch": 0.055502995900346894,
      "grad_norm": 0.058843386030182285,
      "learning_rate": 2.9980543820119585e-06,
      "loss": 0.6047,
      "step": 11
    },
    {
      "epoch": 0.06054872280037843,
      "grad_norm": 0.047228554008764044,
      "learning_rate": 2.997684677942667e-06,
      "loss": 0.5937,
      "step": 12
    },
    {
      "epoch": 0.06559444970040997,
      "grad_norm": 0.04830399085917525,
      "learning_rate": 2.9972828706658102e-06,
      "loss": 0.6448,
      "step": 13
    },
    {
      "epoch": 0.0706401766004415,
      "grad_norm": 0.04469640349332499,
      "learning_rate": 2.996848971420801e-06,
      "loss": 0.6145,
      "step": 14
    },
    {
      "epoch": 0.07568590350047304,
      "grad_norm": 0.048907003957727534,
      "learning_rate": 2.996382992344734e-06,
      "loss": 0.5755,
      "step": 15
    },
    {
      "epoch": 0.08073163040050457,
      "grad_norm": 0.04502223888969105,
      "learning_rate": 2.9958849464720457e-06,
      "loss": 0.5765,
      "step": 16
    },
    {
      "epoch": 0.08577735730053611,
      "grad_norm": 0.04485565875842678,
      "learning_rate": 2.9953548477341497e-06,
      "loss": 0.6364,
      "step": 17
    },
    {
      "epoch": 0.09082308420056764,
      "grad_norm": 0.04319237430058616,
      "learning_rate": 2.9947927109590477e-06,
      "loss": 0.568,
      "step": 18
    },
    {
      "epoch": 0.09586881110059918,
      "grad_norm": 0.042093297202993624,
      "learning_rate": 2.994198551870913e-06,
      "loss": 0.6184,
      "step": 19
    },
    {
      "epoch": 0.10091453800063072,
      "grad_norm": 0.04087623899598573,
      "learning_rate": 2.993572387089653e-06,
      "loss": 0.5822,
      "step": 20
    },
    {
      "epoch": 0.10596026490066225,
      "grad_norm": 0.042619877493329586,
      "learning_rate": 2.992914234130442e-06,
      "loss": 0.5983,
      "step": 21
    },
    {
      "epoch": 0.11100599180069379,
      "grad_norm": 0.04314774114784986,
      "learning_rate": 2.9922241114032345e-06,
      "loss": 0.6058,
      "step": 22
    },
    {
      "epoch": 0.11605171870072532,
      "grad_norm": 0.04125496902035363,
      "learning_rate": 2.9915020382122458e-06,
      "loss": 0.5741,
      "step": 23
    },
    {
      "epoch": 0.12109744560075686,
      "grad_norm": 0.03985368427853683,
      "learning_rate": 2.990748034755415e-06,
      "loss": 0.6002,
      "step": 24
    },
    {
      "epoch": 0.1261431725007884,
      "grad_norm": 0.04603566805698703,
      "learning_rate": 2.9899621221238394e-06,
      "loss": 0.5616,
      "step": 25
    },
    {
      "epoch": 0.13118889940081993,
      "grad_norm": 0.033944581121186666,
      "learning_rate": 2.989144322301186e-06,
      "loss": 0.591,
      "step": 26
    },
    {
      "epoch": 0.13623462630085148,
      "grad_norm": 0.0352127486146018,
      "learning_rate": 2.988294658163073e-06,
      "loss": 0.575,
      "step": 27
    },
    {
      "epoch": 0.141280353200883,
      "grad_norm": 0.04026082684896548,
      "learning_rate": 2.9874131534764325e-06,
      "loss": 0.5783,
      "step": 28
    },
    {
      "epoch": 0.14632608010091455,
      "grad_norm": 0.038584952671910096,
      "learning_rate": 2.9864998328988463e-06,
      "loss": 0.5814,
      "step": 29
    },
    {
      "epoch": 0.15137180700094607,
      "grad_norm": 0.03294755370363045,
      "learning_rate": 2.985554721977853e-06,
      "loss": 0.5688,
      "step": 30
    },
    {
      "epoch": 0.15641753390097762,
      "grad_norm": 0.035774614388450525,
      "learning_rate": 2.984577847150239e-06,
      "loss": 0.5914,
      "step": 31
    },
    {
      "epoch": 0.16146326080100915,
      "grad_norm": 0.04512017281393784,
      "learning_rate": 2.983569235741291e-06,
      "loss": 0.557,
      "step": 32
    },
    {
      "epoch": 0.1665089877010407,
      "grad_norm": 0.03447545680264101,
      "learning_rate": 2.9825289159640397e-06,
      "loss": 0.568,
      "step": 33
    },
    {
      "epoch": 0.17155471460107222,
      "grad_norm": 0.033658505681229516,
      "learning_rate": 2.9814569169184642e-06,
      "loss": 0.5868,
      "step": 34
    },
    {
      "epoch": 0.17660044150110377,
      "grad_norm": 0.03071546221735757,
      "learning_rate": 2.980353268590683e-06,
      "loss": 0.5487,
      "step": 35
    },
    {
      "epoch": 0.1816461684011353,
      "grad_norm": 0.07417860742940319,
      "learning_rate": 2.9792180018521128e-06,
      "loss": 0.6099,
      "step": 36
    },
    {
      "epoch": 0.18669189530116684,
      "grad_norm": 0.032180325133544276,
      "learning_rate": 2.978051148458604e-06,
      "loss": 0.5939,
      "step": 37
    },
    {
      "epoch": 0.19173762220119836,
      "grad_norm": 0.031347752245340116,
      "learning_rate": 2.976852741049554e-06,
      "loss": 0.5764,
      "step": 38
    },
    {
      "epoch": 0.1967833491012299,
      "grad_norm": 0.035873383222778825,
      "learning_rate": 2.975622813146996e-06,
      "loss": 0.57,
      "step": 39
    },
    {
      "epoch": 0.20182907600126143,
      "grad_norm": 0.03130302787258777,
      "learning_rate": 2.9743613991546548e-06,
      "loss": 0.5503,
      "step": 40
    },
    {
      "epoch": 0.20687480290129298,
      "grad_norm": 0.04111552220221803,
      "learning_rate": 2.9730685343569934e-06,
      "loss": 0.6028,
      "step": 41
    },
    {
      "epoch": 0.2119205298013245,
      "grad_norm": 0.031561335436647305,
      "learning_rate": 2.971744254918218e-06,
      "loss": 0.5682,
      "step": 42
    },
    {
      "epoch": 0.21696625670135605,
      "grad_norm": 0.03466870924962832,
      "learning_rate": 2.9703885978812726e-06,
      "loss": 0.55,
      "step": 43
    },
    {
      "epoch": 0.22201198360138757,
      "grad_norm": 0.03396258277418921,
      "learning_rate": 2.9690016011667974e-06,
      "loss": 0.5953,
      "step": 44
    },
    {
      "epoch": 0.22705771050141912,
      "grad_norm": 0.033463552224919146,
      "learning_rate": 2.967583303572073e-06,
      "loss": 0.6231,
      "step": 45
    },
    {
      "epoch": 0.23210343740145065,
      "grad_norm": 0.03747113039368738,
      "learning_rate": 2.9661337447699316e-06,
      "loss": 0.5742,
      "step": 46
    },
    {
      "epoch": 0.2371491643014822,
      "grad_norm": 0.04159182229285405,
      "learning_rate": 2.9646529653076493e-06,
      "loss": 0.5681,
      "step": 47
    },
    {
      "epoch": 0.24219489120151372,
      "grad_norm": 0.032311171301265075,
      "learning_rate": 2.9631410066058098e-06,
      "loss": 0.5464,
      "step": 48
    },
    {
      "epoch": 0.24724061810154527,
      "grad_norm": 0.035494911254562625,
      "learning_rate": 2.9615979109571493e-06,
      "loss": 0.5377,
      "step": 49
    },
    {
      "epoch": 0.2522863450015768,
      "grad_norm": 0.032401750473671755,
      "learning_rate": 2.9600237215253696e-06,
      "loss": 0.6043,
      "step": 50
    },
    {
      "epoch": 0.25733207190160834,
      "grad_norm": 0.03477400444401883,
      "learning_rate": 2.9584184823439337e-06,
      "loss": 0.6078,
      "step": 51
    },
    {
      "epoch": 0.26237779880163986,
      "grad_norm": 0.03586539534553979,
      "learning_rate": 2.9567822383148315e-06,
      "loss": 0.5857,
      "step": 52
    },
    {
      "epoch": 0.2674235257016714,
      "grad_norm": 0.034776366845092124,
      "learning_rate": 2.955115035207326e-06,
      "loss": 0.5652,
      "step": 53
    },
    {
      "epoch": 0.27246925260170296,
      "grad_norm": 0.047916672806890825,
      "learning_rate": 2.953416919656672e-06,
      "loss": 0.529,
      "step": 54
    },
    {
      "epoch": 0.2775149795017345,
      "grad_norm": 0.035512253032401846,
      "learning_rate": 2.9516879391628125e-06,
      "loss": 0.6018,
      "step": 55
    },
    {
      "epoch": 0.282560706401766,
      "grad_norm": 0.0669654595534551,
      "learning_rate": 2.9499281420890474e-06,
      "loss": 0.5832,
      "step": 56
    },
    {
      "epoch": 0.2876064333017975,
      "grad_norm": 0.04009377576904152,
      "learning_rate": 2.948137577660685e-06,
      "loss": 0.5376,
      "step": 57
    },
    {
      "epoch": 0.2926521602018291,
      "grad_norm": 0.05517678453435375,
      "learning_rate": 2.946316295963661e-06,
      "loss": 0.5725,
      "step": 58
    },
    {
      "epoch": 0.2976978871018606,
      "grad_norm": 0.040682905696082294,
      "learning_rate": 2.9444643479431393e-06,
      "loss": 0.5887,
      "step": 59
    },
    {
      "epoch": 0.30274361400189215,
      "grad_norm": 0.044774454426992336,
      "learning_rate": 2.9425817854020873e-06,
      "loss": 0.5756,
      "step": 60
    },
    {
      "epoch": 0.30778934090192367,
      "grad_norm": 0.03263011910584542,
      "learning_rate": 2.940668660999826e-06,
      "loss": 0.5693,
      "step": 61
    },
    {
      "epoch": 0.31283506780195525,
      "grad_norm": 0.032496230797448664,
      "learning_rate": 2.9387250282505583e-06,
      "loss": 0.586,
      "step": 62
    },
    {
      "epoch": 0.31788079470198677,
      "grad_norm": 0.03310861754959189,
      "learning_rate": 2.9367509415218687e-06,
      "loss": 0.5548,
      "step": 63
    },
    {
      "epoch": 0.3229265216020183,
      "grad_norm": 0.031816229512850104,
      "learning_rate": 2.9347464560332084e-06,
      "loss": 0.6,
      "step": 64
    },
    {
      "epoch": 0.3279722485020498,
      "grad_norm": 0.036465675122600016,
      "learning_rate": 2.932711627854344e-06,
      "loss": 0.5613,
      "step": 65
    },
    {
      "epoch": 0.3330179754020814,
      "grad_norm": 0.03107217546123426,
      "learning_rate": 2.9306465139037947e-06,
      "loss": 0.5421,
      "step": 66
    },
    {
      "epoch": 0.3380637023021129,
      "grad_norm": 0.031633841290591734,
      "learning_rate": 2.9285511719472367e-06,
      "loss": 0.58,
      "step": 67
    },
    {
      "epoch": 0.34310942920214443,
      "grad_norm": 0.030853855883844275,
      "learning_rate": 2.9264256605958885e-06,
      "loss": 0.5496,
      "step": 68
    },
    {
      "epoch": 0.34815515610217596,
      "grad_norm": 0.036265638918391636,
      "learning_rate": 2.924270039304873e-06,
      "loss": 0.5939,
      "step": 69
    },
    {
      "epoch": 0.35320088300220753,
      "grad_norm": 0.03703293289195566,
      "learning_rate": 2.9220843683715497e-06,
      "loss": 0.5311,
      "step": 70
    },
    {
      "epoch": 0.35824660990223905,
      "grad_norm": 0.07682301940151573,
      "learning_rate": 2.9198687089338345e-06,
      "loss": 0.5655,
      "step": 71
    },
    {
      "epoch": 0.3632923368022706,
      "grad_norm": 0.03240731857642153,
      "learning_rate": 2.9176231229684835e-06,
      "loss": 0.5436,
      "step": 72
    },
    {
      "epoch": 0.3683380637023021,
      "grad_norm": 0.03550209155305971,
      "learning_rate": 2.9153476732893646e-06,
      "loss": 0.5529,
      "step": 73
    },
    {
      "epoch": 0.3733837906023337,
      "grad_norm": 0.03572110732287988,
      "learning_rate": 2.913042423545696e-06,
      "loss": 0.5601,
      "step": 74
    },
    {
      "epoch": 0.3784295175023652,
      "grad_norm": 0.030318267187340705,
      "learning_rate": 2.910707438220269e-06,
      "loss": 0.5827,
      "step": 75
    },
    {
      "epoch": 0.3834752444023967,
      "grad_norm": 0.030779462298936085,
      "learning_rate": 2.9083427826276414e-06,
      "loss": 0.5366,
      "step": 76
    },
    {
      "epoch": 0.38852097130242824,
      "grad_norm": 0.033078267956613755,
      "learning_rate": 2.905948522912315e-06,
      "loss": 0.5769,
      "step": 77
    },
    {
      "epoch": 0.3935666982024598,
      "grad_norm": 0.032022182515529865,
      "learning_rate": 2.90352472604688e-06,
      "loss": 0.6059,
      "step": 78
    },
    {
      "epoch": 0.39861242510249134,
      "grad_norm": 0.032572741826790486,
      "learning_rate": 2.901071459830145e-06,
      "loss": 0.5325,
      "step": 79
    },
    {
      "epoch": 0.40365815200252286,
      "grad_norm": 0.03322477222052267,
      "learning_rate": 2.89858879288524e-06,
      "loss": 0.6102,
      "step": 80
    },
    {
      "epoch": 0.4087038789025544,
      "grad_norm": 0.03290381540817977,
      "learning_rate": 2.896076794657696e-06,
      "loss": 0.5297,
      "step": 81
    },
    {
      "epoch": 0.41374960580258596,
      "grad_norm": 0.02986308712893659,
      "learning_rate": 2.893535535413504e-06,
      "loss": 0.6016,
      "step": 82
    },
    {
      "epoch": 0.4187953327026175,
      "grad_norm": 0.03708195442407903,
      "learning_rate": 2.8909650862371465e-06,
      "loss": 0.5644,
      "step": 83
    },
    {
      "epoch": 0.423841059602649,
      "grad_norm": 0.05585756602200335,
      "learning_rate": 2.888365519029615e-06,
      "loss": 0.5645,
      "step": 84
    },
    {
      "epoch": 0.42888678650268053,
      "grad_norm": 0.03232081336561299,
      "learning_rate": 2.8857369065063893e-06,
      "loss": 0.5492,
      "step": 85
    },
    {
      "epoch": 0.4339325134027121,
      "grad_norm": 0.03807439954613977,
      "learning_rate": 2.883079322195415e-06,
      "loss": 0.5694,
      "step": 86
    },
    {
      "epoch": 0.4389782403027436,
      "grad_norm": 0.03718669059491054,
      "learning_rate": 2.880392840435036e-06,
      "loss": 0.5603,
      "step": 87
    },
    {
      "epoch": 0.44402396720277515,
      "grad_norm": 0.02993361885707706,
      "learning_rate": 2.8776775363719244e-06,
      "loss": 0.5193,
      "step": 88
    },
    {
      "epoch": 0.44906969410280667,
      "grad_norm": 0.03471859873605538,
      "learning_rate": 2.8749334859589696e-06,
      "loss": 0.5195,
      "step": 89
    },
    {
      "epoch": 0.45411542100283825,
      "grad_norm": 0.03468783264087511,
      "learning_rate": 2.872160765953162e-06,
      "loss": 0.5685,
      "step": 90
    },
    {
      "epoch": 0.45916114790286977,
      "grad_norm": 0.06028818099354049,
      "learning_rate": 2.86935945391344e-06,
      "loss": 0.5875,
      "step": 91
    },
    {
      "epoch": 0.4642068748029013,
      "grad_norm": 0.03189592063110031,
      "learning_rate": 2.8665296281985232e-06,
      "loss": 0.5627,
      "step": 92
    },
    {
      "epoch": 0.4692526017029328,
      "grad_norm": 0.032489964455779306,
      "learning_rate": 2.8636713679647195e-06,
      "loss": 0.5398,
      "step": 93
    },
    {
      "epoch": 0.4742983286029644,
      "grad_norm": 0.03300509062363307,
      "learning_rate": 2.8607847531637127e-06,
      "loss": 0.5675,
      "step": 94
    },
    {
      "epoch": 0.4793440555029959,
      "grad_norm": 0.034805575232998785,
      "learning_rate": 2.857869864540323e-06,
      "loss": 0.5526,
      "step": 95
    },
    {
      "epoch": 0.48438978240302744,
      "grad_norm": 0.03304213276713402,
      "learning_rate": 2.854926783630253e-06,
      "loss": 0.5475,
      "step": 96
    },
    {
      "epoch": 0.48943550930305896,
      "grad_norm": 0.03753659611183512,
      "learning_rate": 2.851955592757801e-06,
      "loss": 0.5511,
      "step": 97
    },
    {
      "epoch": 0.49448123620309054,
      "grad_norm": 0.033892234979303396,
      "learning_rate": 2.848956375033562e-06,
      "loss": 0.5232,
      "step": 98
    },
    {
      "epoch": 0.49952696310312206,
      "grad_norm": 0.037074509268233,
      "learning_rate": 2.845929214352105e-06,
      "loss": 0.5655,
      "step": 99
    }
  ],
  "logging_steps": 1,
  "max_steps": 594,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 99,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 410485593735168.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}