| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.49966599866399464, |
| "eval_steps": 94, |
| "global_step": 374, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0013360053440213762, |
| "grad_norm": 7.838815885074719, |
| "learning_rate": 0.0, |
| "loss": 1.0128, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0013360053440213762, |
| "eval_loss": 1.2741531133651733, |
| "eval_runtime": 82.257, |
| "eval_samples_per_second": 4.437, |
| "eval_steps_per_second": 0.28, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0026720106880427524, |
| "grad_norm": 13.513347544874192, |
| "learning_rate": 5.000000000000001e-07, |
| "loss": 0.9623, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.004008016032064128, |
| "grad_norm": 4.696802485434519, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.9976, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.005344021376085505, |
| "grad_norm": 20.250593855934195, |
| "learning_rate": 1.5e-06, |
| "loss": 1.063, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.006680026720106881, |
| "grad_norm": 7.758735151646052, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 1.0372, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.008016032064128256, |
| "grad_norm": 5.267050839288624, |
| "learning_rate": 2.5e-06, |
| "loss": 1.0005, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.009352037408149633, |
| "grad_norm": 25.654993356050678, |
| "learning_rate": 3e-06, |
| "loss": 0.9773, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.01068804275217101, |
| "grad_norm": 7.870143712033602, |
| "learning_rate": 3.5e-06, |
| "loss": 0.9931, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.012024048096192385, |
| "grad_norm": 12.411677067737418, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.9719, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.013360053440213761, |
| "grad_norm": 6.932691521768928, |
| "learning_rate": 4.5e-06, |
| "loss": 0.9457, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.014696058784235137, |
| "grad_norm": 3.425585752873044, |
| "learning_rate": 5e-06, |
| "loss": 1.0424, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.01603206412825651, |
| "grad_norm": 6.62068237768944, |
| "learning_rate": 5.500000000000001e-06, |
| "loss": 1.027, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.01736806947227789, |
| "grad_norm": 2.3119985338398776, |
| "learning_rate": 6e-06, |
| "loss": 0.9872, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.018704074816299265, |
| "grad_norm": 2.0363065443264556, |
| "learning_rate": 6.5000000000000004e-06, |
| "loss": 0.9368, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.02004008016032064, |
| "grad_norm": 9.068553592976267, |
| "learning_rate": 7e-06, |
| "loss": 0.9194, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.02137608550434202, |
| "grad_norm": 18.99312133304054, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 1.0318, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.022712090848363394, |
| "grad_norm": 4.437766969350746, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.9095, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.02404809619238477, |
| "grad_norm": 3.103382175266206, |
| "learning_rate": 8.5e-06, |
| "loss": 0.9669, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.025384101536406144, |
| "grad_norm": 2.2535559949166273, |
| "learning_rate": 9e-06, |
| "loss": 0.8783, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.026720106880427523, |
| "grad_norm": 3.7787530859276792, |
| "learning_rate": 9.5e-06, |
| "loss": 0.9223, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.028056112224448898, |
| "grad_norm": 2.3401878471471087, |
| "learning_rate": 1e-05, |
| "loss": 0.8604, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.029392117568470273, |
| "grad_norm": 2.9746836397620795, |
| "learning_rate": 1.0500000000000001e-05, |
| "loss": 0.8979, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.03072812291249165, |
| "grad_norm": 2.094893661934687, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 0.8892, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.03206412825651302, |
| "grad_norm": 2.711962464105192, |
| "learning_rate": 1.15e-05, |
| "loss": 0.895, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.033400133600534405, |
| "grad_norm": 3.275372297190117, |
| "learning_rate": 1.2e-05, |
| "loss": 0.8783, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.03473613894455578, |
| "grad_norm": 1.7038474605206169, |
| "learning_rate": 1.25e-05, |
| "loss": 0.9042, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.036072144288577156, |
| "grad_norm": 1.7422379866759963, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 0.8441, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.03740814963259853, |
| "grad_norm": 2.874145219891766, |
| "learning_rate": 1.3500000000000001e-05, |
| "loss": 0.9384, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.038744154976619906, |
| "grad_norm": 1.3906236959628462, |
| "learning_rate": 1.4e-05, |
| "loss": 0.8592, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.04008016032064128, |
| "grad_norm": 2.3468367038994966, |
| "learning_rate": 1.45e-05, |
| "loss": 0.845, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.041416165664662656, |
| "grad_norm": 2.1599884588230434, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 0.8403, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.04275217100868404, |
| "grad_norm": 3.5854699631424833, |
| "learning_rate": 1.55e-05, |
| "loss": 0.921, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.04408817635270541, |
| "grad_norm": 2.3960698187143747, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.8634, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.04542418169672679, |
| "grad_norm": 2.016236271888516, |
| "learning_rate": 1.65e-05, |
| "loss": 0.9352, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.04676018704074816, |
| "grad_norm": 3.1420515274166667, |
| "learning_rate": 1.7e-05, |
| "loss": 0.9432, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.04809619238476954, |
| "grad_norm": 1.3347031470281001, |
| "learning_rate": 1.7500000000000002e-05, |
| "loss": 0.8401, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.049432197728790914, |
| "grad_norm": 2.673287639892654, |
| "learning_rate": 1.8e-05, |
| "loss": 0.8963, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.05076820307281229, |
| "grad_norm": 3.512551568017944, |
| "learning_rate": 1.8500000000000002e-05, |
| "loss": 0.8508, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.052104208416833664, |
| "grad_norm": 6.506587434743008, |
| "learning_rate": 1.9e-05, |
| "loss": 0.9253, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.053440213760855046, |
| "grad_norm": 2.7596036679009774, |
| "learning_rate": 1.95e-05, |
| "loss": 0.8189, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.05477621910487642, |
| "grad_norm": 6.362299838220405, |
| "learning_rate": 2e-05, |
| "loss": 0.8844, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.056112224448897796, |
| "grad_norm": 2.327166199025258, |
| "learning_rate": 1.999997672193743e-05, |
| "loss": 0.887, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.05744822979291917, |
| "grad_norm": 4.914513889015775, |
| "learning_rate": 1.999990688785808e-05, |
| "loss": 0.805, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.058784235136940546, |
| "grad_norm": 3.3238995149327253, |
| "learning_rate": 1.9999790498087083e-05, |
| "loss": 0.8774, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.06012024048096192, |
| "grad_norm": 5.7830116735089145, |
| "learning_rate": 1.9999627553166296e-05, |
| "loss": 0.8875, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.0614562458249833, |
| "grad_norm": 10.360950284098827, |
| "learning_rate": 1.9999418053854324e-05, |
| "loss": 0.7691, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.06279225116900468, |
| "grad_norm": 4.517803823252036, |
| "learning_rate": 1.999916200112653e-05, |
| "loss": 0.8755, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.06412825651302605, |
| "grad_norm": 2.105033837937751, |
| "learning_rate": 1.9998859396174982e-05, |
| "loss": 0.8625, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.06546426185704743, |
| "grad_norm": 1.611864645136641, |
| "learning_rate": 1.9998510240408495e-05, |
| "loss": 0.8586, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.06680026720106881, |
| "grad_norm": 1.6245776488500119, |
| "learning_rate": 1.999811453545261e-05, |
| "loss": 0.8312, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.06813627254509018, |
| "grad_norm": 1.1829706372961784, |
| "learning_rate": 1.9997672283149562e-05, |
| "loss": 0.9061, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.06947227788911156, |
| "grad_norm": 1.0888397967769048, |
| "learning_rate": 1.999718348555832e-05, |
| "loss": 0.8127, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.07080828323313293, |
| "grad_norm": 3.8154657880304463, |
| "learning_rate": 1.9996648144954533e-05, |
| "loss": 0.8687, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.07214428857715431, |
| "grad_norm": 1.9650485024559405, |
| "learning_rate": 1.9996066263830533e-05, |
| "loss": 0.878, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.07348029392117568, |
| "grad_norm": 2.7080828300193605, |
| "learning_rate": 1.9995437844895337e-05, |
| "loss": 0.8529, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.07481629926519706, |
| "grad_norm": 1.012499430721715, |
| "learning_rate": 1.9994762891074618e-05, |
| "loss": 0.8675, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.07615230460921844, |
| "grad_norm": 1.8405991625510976, |
| "learning_rate": 1.9994041405510705e-05, |
| "loss": 0.8577, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.07748830995323981, |
| "grad_norm": 1.1627266766718507, |
| "learning_rate": 1.9993273391562552e-05, |
| "loss": 0.8405, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.0788243152972612, |
| "grad_norm": 1.484248853527955, |
| "learning_rate": 1.9992458852805735e-05, |
| "loss": 0.8888, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.08016032064128256, |
| "grad_norm": 1.4396350788962005, |
| "learning_rate": 1.999159779303243e-05, |
| "loss": 0.7598, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.08149632598530394, |
| "grad_norm": 5.89254331534054, |
| "learning_rate": 1.9990690216251395e-05, |
| "loss": 0.873, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.08283233132932531, |
| "grad_norm": 2.2487358287985186, |
| "learning_rate": 1.998973612668796e-05, |
| "loss": 0.8007, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.0841683366733467, |
| "grad_norm": 7.577612372306987, |
| "learning_rate": 1.9988735528783997e-05, |
| "loss": 0.8588, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.08550434201736808, |
| "grad_norm": 1.4044808214777407, |
| "learning_rate": 1.9987688427197898e-05, |
| "loss": 0.8747, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.08684034736138944, |
| "grad_norm": 3.496311818475767, |
| "learning_rate": 1.9986594826804563e-05, |
| "loss": 0.8296, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.08817635270541083, |
| "grad_norm": 1.9533191536516312, |
| "learning_rate": 1.9985454732695376e-05, |
| "loss": 0.7744, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.0895123580494322, |
| "grad_norm": 1.3921457875302905, |
| "learning_rate": 1.998426815017817e-05, |
| "loss": 0.8162, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.09084836339345358, |
| "grad_norm": 1.615858326043177, |
| "learning_rate": 1.998303508477721e-05, |
| "loss": 0.7986, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.09218436873747494, |
| "grad_norm": 3.090131944452677, |
| "learning_rate": 1.9981755542233175e-05, |
| "loss": 0.7497, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.09352037408149633, |
| "grad_norm": 2.624485840021933, |
| "learning_rate": 1.998042952850312e-05, |
| "loss": 0.9075, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.09485637942551771, |
| "grad_norm": 29.147628078759166, |
| "learning_rate": 1.997905704976045e-05, |
| "loss": 0.8228, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.09619238476953908, |
| "grad_norm": 4.901460027051236, |
| "learning_rate": 1.9977638112394896e-05, |
| "loss": 0.8469, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.09752839011356046, |
| "grad_norm": 4.148739124098703, |
| "learning_rate": 1.997617272301248e-05, |
| "loss": 0.8836, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.09886439545758183, |
| "grad_norm": 3.63419157744923, |
| "learning_rate": 1.9974660888435478e-05, |
| "loss": 0.8133, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.10020040080160321, |
| "grad_norm": 3.2509902982703194, |
| "learning_rate": 1.997310261570242e-05, |
| "loss": 0.8146, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.10153640614562458, |
| "grad_norm": 2.4273198087241648, |
| "learning_rate": 1.9971497912068014e-05, |
| "loss": 0.8396, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.10287241148964596, |
| "grad_norm": 1.2510833308372302, |
| "learning_rate": 1.9969846785003134e-05, |
| "loss": 0.835, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.10420841683366733, |
| "grad_norm": 7.439551703518052, |
| "learning_rate": 1.9968149242194794e-05, |
| "loss": 0.7915, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.10554442217768871, |
| "grad_norm": 1.3805440550868535, |
| "learning_rate": 1.9966405291546097e-05, |
| "loss": 0.7975, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.10688042752171009, |
| "grad_norm": 3.3630722318781885, |
| "learning_rate": 1.9964614941176194e-05, |
| "loss": 0.7848, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.10821643286573146, |
| "grad_norm": 1.950197889850579, |
| "learning_rate": 1.9962778199420265e-05, |
| "loss": 0.823, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.10955243820975284, |
| "grad_norm": 2.9463664043639803, |
| "learning_rate": 1.9960895074829473e-05, |
| "loss": 0.8431, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.11088844355377421, |
| "grad_norm": 1.160417795917114, |
| "learning_rate": 1.995896557617091e-05, |
| "loss": 0.82, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.11222444889779559, |
| "grad_norm": 1.8393928529177772, |
| "learning_rate": 1.995698971242758e-05, |
| "loss": 0.8382, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.11356045424181696, |
| "grad_norm": 3.5850882231132126, |
| "learning_rate": 1.9954967492798335e-05, |
| "loss": 0.8537, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.11489645958583834, |
| "grad_norm": 3.56601815812619, |
| "learning_rate": 1.9952898926697847e-05, |
| "loss": 0.8409, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.11623246492985972, |
| "grad_norm": 1.815693680865375, |
| "learning_rate": 1.9950784023756555e-05, |
| "loss": 0.8442, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.11756847027388109, |
| "grad_norm": 1.6650068897211114, |
| "learning_rate": 1.9948622793820634e-05, |
| "loss": 0.8283, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.11890447561790247, |
| "grad_norm": 1.2227903165679364, |
| "learning_rate": 1.9946415246951928e-05, |
| "loss": 0.8537, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.12024048096192384, |
| "grad_norm": 1.2877652464931537, |
| "learning_rate": 1.9944161393427923e-05, |
| "loss": 0.8481, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.12157648630594522, |
| "grad_norm": 1.255602772129579, |
| "learning_rate": 1.9941861243741685e-05, |
| "loss": 0.8276, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.1229124916499666, |
| "grad_norm": 1.0687078498979825, |
| "learning_rate": 1.9939514808601822e-05, |
| "loss": 0.7938, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.12424849699398798, |
| "grad_norm": 0.8831821630415814, |
| "learning_rate": 1.9937122098932428e-05, |
| "loss": 0.7918, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.12558450233800936, |
| "grad_norm": 1.1452965835019118, |
| "learning_rate": 1.993468312587303e-05, |
| "loss": 0.8925, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.12558450233800936, |
| "eval_loss": 0.8576086163520813, |
| "eval_runtime": 82.9601, |
| "eval_samples_per_second": 4.4, |
| "eval_steps_per_second": 0.277, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.12692050768203073, |
| "grad_norm": 1.0503480557927924, |
| "learning_rate": 1.9932197900778537e-05, |
| "loss": 0.8418, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.1282565130260521, |
| "grad_norm": 1.8004937271151245, |
| "learning_rate": 1.99296664352192e-05, |
| "loss": 0.8069, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.1295925183700735, |
| "grad_norm": 3.3208429527012284, |
| "learning_rate": 1.992708874098054e-05, |
| "loss": 0.8033, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.13092852371409486, |
| "grad_norm": 0.9342473073020796, |
| "learning_rate": 1.9924464830063306e-05, |
| "loss": 0.8272, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.13226452905811623, |
| "grad_norm": 1.0410510106148712, |
| "learning_rate": 1.9921794714683405e-05, |
| "loss": 0.8617, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.13360053440213762, |
| "grad_norm": 1.0293402901251099, |
| "learning_rate": 1.9919078407271863e-05, |
| "loss": 0.8811, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.134936539746159, |
| "grad_norm": 1.1832507510854497, |
| "learning_rate": 1.991631592047475e-05, |
| "loss": 0.7543, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.13627254509018036, |
| "grad_norm": 0.8921424978973641, |
| "learning_rate": 1.9913507267153142e-05, |
| "loss": 0.7957, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.13760855043420173, |
| "grad_norm": 1.7741188159612133, |
| "learning_rate": 1.9910652460383035e-05, |
| "loss": 0.8273, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.13894455577822312, |
| "grad_norm": 1.0819650935789644, |
| "learning_rate": 1.99077515134553e-05, |
| "loss": 0.7759, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.1402805611222445, |
| "grad_norm": 1.1355967600471175, |
| "learning_rate": 1.9904804439875635e-05, |
| "loss": 0.8574, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.14161656646626586, |
| "grad_norm": 0.8387962140937252, |
| "learning_rate": 1.9901811253364458e-05, |
| "loss": 0.7492, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.14295257181028725, |
| "grad_norm": 1.0475304332346644, |
| "learning_rate": 1.9898771967856892e-05, |
| "loss": 0.8223, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.14428857715430862, |
| "grad_norm": 1.3179392693422012, |
| "learning_rate": 1.9895686597502674e-05, |
| "loss": 0.8049, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.14562458249833, |
| "grad_norm": 0.9125891873253087, |
| "learning_rate": 1.989255515666609e-05, |
| "loss": 0.7906, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.14696058784235136, |
| "grad_norm": 1.0290434623202205, |
| "learning_rate": 1.9889377659925914e-05, |
| "loss": 0.8018, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.14829659318637275, |
| "grad_norm": 1.161083347912507, |
| "learning_rate": 1.9886154122075344e-05, |
| "loss": 0.7791, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.14963259853039412, |
| "grad_norm": 1.122905860463976, |
| "learning_rate": 1.988288455812192e-05, |
| "loss": 0.8264, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.1509686038744155, |
| "grad_norm": 1.0437395565153096, |
| "learning_rate": 1.9879568983287468e-05, |
| "loss": 0.8083, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.1523046092184369, |
| "grad_norm": 1.0536128682952364, |
| "learning_rate": 1.9876207413008014e-05, |
| "loss": 0.817, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.15364061456245826, |
| "grad_norm": 1.0296406603687045, |
| "learning_rate": 1.9872799862933732e-05, |
| "loss": 0.7662, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.15497661990647962, |
| "grad_norm": 0.8084463393273238, |
| "learning_rate": 1.9869346348928852e-05, |
| "loss": 0.7362, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.156312625250501, |
| "grad_norm": 1.095651529367155, |
| "learning_rate": 1.9865846887071596e-05, |
| "loss": 0.8116, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.1576486305945224, |
| "grad_norm": 1.0257956601123124, |
| "learning_rate": 1.986230149365411e-05, |
| "loss": 0.8687, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.15898463593854376, |
| "grad_norm": 1.1138783663460037, |
| "learning_rate": 1.985871018518236e-05, |
| "loss": 0.8911, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.16032064128256512, |
| "grad_norm": 1.0457893052192304, |
| "learning_rate": 1.9855072978376094e-05, |
| "loss": 0.8383, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.16165664662658652, |
| "grad_norm": 1.0909752730036382, |
| "learning_rate": 1.9851389890168738e-05, |
| "loss": 0.8183, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.1629926519706079, |
| "grad_norm": 1.1006276069625966, |
| "learning_rate": 1.9847660937707323e-05, |
| "loss": 0.8093, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.16432865731462926, |
| "grad_norm": 0.9558427841930492, |
| "learning_rate": 1.9843886138352407e-05, |
| "loss": 0.7989, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.16566466265865062, |
| "grad_norm": 0.980480148514281, |
| "learning_rate": 1.9840065509677987e-05, |
| "loss": 0.8211, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.16700066800267202, |
| "grad_norm": 1.0386631545303169, |
| "learning_rate": 1.983619906947144e-05, |
| "loss": 0.8901, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.1683366733466934, |
| "grad_norm": 1.0953366835172111, |
| "learning_rate": 1.9832286835733404e-05, |
| "loss": 0.8454, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.16967267869071476, |
| "grad_norm": 0.9485618080916154, |
| "learning_rate": 1.9828328826677727e-05, |
| "loss": 0.8441, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.17100868403473615, |
| "grad_norm": 0.9475907913103896, |
| "learning_rate": 1.9824325060731365e-05, |
| "loss": 0.7982, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.17234468937875752, |
| "grad_norm": 1.0588971445151802, |
| "learning_rate": 1.9820275556534306e-05, |
| "loss": 0.8405, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.1736806947227789, |
| "grad_norm": 0.9012400317041451, |
| "learning_rate": 1.9816180332939467e-05, |
| "loss": 0.8509, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.17501670006680026, |
| "grad_norm": 1.1447201198562984, |
| "learning_rate": 1.981203940901262e-05, |
| "loss": 0.7491, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.17635270541082165, |
| "grad_norm": 1.1314843702169148, |
| "learning_rate": 1.9807852804032306e-05, |
| "loss": 0.821, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.17768871075484302, |
| "grad_norm": 1.0056061869208837, |
| "learning_rate": 1.9803620537489737e-05, |
| "loss": 0.76, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.1790247160988644, |
| "grad_norm": 1.2881569614514319, |
| "learning_rate": 1.9799342629088704e-05, |
| "loss": 0.7805, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.18036072144288579, |
| "grad_norm": 0.9598066752642362, |
| "learning_rate": 1.979501909874549e-05, |
| "loss": 0.7892, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.18169672678690715, |
| "grad_norm": 0.9800080926146378, |
| "learning_rate": 1.979064996658878e-05, |
| "loss": 0.8085, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.18303273213092852, |
| "grad_norm": 1.5299218474264311, |
| "learning_rate": 1.9786235252959555e-05, |
| "loss": 0.8288, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.1843687374749499, |
| "grad_norm": 1.0723101834400854, |
| "learning_rate": 1.9781774978411013e-05, |
| "loss": 0.8241, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.18570474281897129, |
| "grad_norm": 1.0482768158101567, |
| "learning_rate": 1.977726916370847e-05, |
| "loss": 0.8199, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.18704074816299265, |
| "grad_norm": 1.148738107893617, |
| "learning_rate": 1.977271782982925e-05, |
| "loss": 0.8266, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.18837675350701402, |
| "grad_norm": 1.181792040344075, |
| "learning_rate": 1.9768120997962593e-05, |
| "loss": 0.844, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.18971275885103542, |
| "grad_norm": 1.941515189777966, |
| "learning_rate": 1.9763478689509577e-05, |
| "loss": 0.7836, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.19104876419505679, |
| "grad_norm": 0.8740037429654163, |
| "learning_rate": 1.9758790926082985e-05, |
| "loss": 0.756, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.19238476953907815, |
| "grad_norm": 1.9266810490671247, |
| "learning_rate": 1.9754057729507228e-05, |
| "loss": 0.8425, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.19372077488309952, |
| "grad_norm": 2.208894954466593, |
| "learning_rate": 1.9749279121818235e-05, |
| "loss": 0.8276, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.19505678022712092, |
| "grad_norm": 1.135992127758442, |
| "learning_rate": 1.974445512526336e-05, |
| "loss": 0.8478, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.1963927855711423, |
| "grad_norm": 1.51153729496298, |
| "learning_rate": 1.973958576230125e-05, |
| "loss": 0.7465, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.19772879091516365, |
| "grad_norm": 2.1893490141350704, |
| "learning_rate": 1.9734671055601774e-05, |
| "loss": 0.8419, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.19906479625918505, |
| "grad_norm": 2.8020736131622286, |
| "learning_rate": 1.972971102804591e-05, |
| "loss": 0.6921, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.20040080160320642, |
| "grad_norm": 2.8028825010709806, |
| "learning_rate": 1.9724705702725616e-05, |
| "loss": 0.7741, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.2017368069472278, |
| "grad_norm": 1.5226693954322967, |
| "learning_rate": 1.9719655102943753e-05, |
| "loss": 0.8288, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.20307281229124916, |
| "grad_norm": 1.4265300604657116, |
| "learning_rate": 1.971455925221395e-05, |
| "loss": 0.843, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.20440881763527055, |
| "grad_norm": 2.606777021361483, |
| "learning_rate": 1.9709418174260523e-05, |
| "loss": 0.7961, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.20574482297929192, |
| "grad_norm": 3.0811840978934764, |
| "learning_rate": 1.9704231893018327e-05, |
| "loss": 0.8261, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.2070808283233133, |
| "grad_norm": 1.0258508230958459, |
| "learning_rate": 1.9699000432632692e-05, |
| "loss": 0.778, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.20841683366733466, |
| "grad_norm": 1.1056341674177685, |
| "learning_rate": 1.9693723817459257e-05, |
| "loss": 0.8217, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.20975283901135605, |
| "grad_norm": 1.0100255351570648, |
| "learning_rate": 1.9688402072063905e-05, |
| "loss": 0.8561, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.21108884435537742, |
| "grad_norm": 0.8412501156302337, |
| "learning_rate": 1.9683035221222617e-05, |
| "loss": 0.7966, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.2124248496993988, |
| "grad_norm": 1.0507509064131302, |
| "learning_rate": 1.9677623289921372e-05, |
| "loss": 0.7748, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.21376085504342018, |
| "grad_norm": 0.9999960266478757, |
| "learning_rate": 1.967216630335603e-05, |
| "loss": 0.8412, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.21509686038744155, |
| "grad_norm": 1.0233504970895435, |
| "learning_rate": 1.9666664286932198e-05, |
| "loss": 0.8259, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.21643286573146292, |
| "grad_norm": 0.9182296950978539, |
| "learning_rate": 1.9661117266265136e-05, |
| "loss": 0.7698, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.2177688710754843, |
| "grad_norm": 1.357551418488895, |
| "learning_rate": 1.9655525267179626e-05, |
| "loss": 0.8406, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.21910487641950568, |
| "grad_norm": 0.9357420201075286, |
| "learning_rate": 1.964988831570984e-05, |
| "loss": 0.7759, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.22044088176352705, |
| "grad_norm": 0.995184899662308, |
| "learning_rate": 1.964420643809925e-05, |
| "loss": 0.7795, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.22177688710754842, |
| "grad_norm": 1.00019448701564, |
| "learning_rate": 1.9638479660800476e-05, |
| "loss": 0.7784, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.22311289245156982, |
| "grad_norm": 0.9348560211142481, |
| "learning_rate": 1.9632708010475166e-05, |
| "loss": 0.7796, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.22444889779559118, |
| "grad_norm": 1.0037478540613847, |
| "learning_rate": 1.9626891513993892e-05, |
| "loss": 0.8329, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.22578490313961255, |
| "grad_norm": 0.992355622463503, |
| "learning_rate": 1.9621030198436007e-05, |
| "loss": 0.8448, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.22712090848363392, |
| "grad_norm": 0.9054972050706259, |
| "learning_rate": 1.9615124091089527e-05, |
| "loss": 0.7982, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.22845691382765532, |
| "grad_norm": 1.0239079943868206, |
| "learning_rate": 1.9609173219450998e-05, |
| "loss": 0.8299, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.22979291917167669, |
| "grad_norm": 0.9752447525429634, |
| "learning_rate": 1.960317761122537e-05, |
| "loss": 0.77, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.23112892451569805, |
| "grad_norm": 1.0466078122253917, |
| "learning_rate": 1.9597137294325877e-05, |
| "loss": 0.8168, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.23246492985971945, |
| "grad_norm": 0.9939421815514726, |
| "learning_rate": 1.959105229687389e-05, |
| "loss": 0.7993, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.23380093520374082, |
| "grad_norm": 0.9644915360110909, |
| "learning_rate": 1.95849226471988e-05, |
| "loss": 0.8316, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.23513694054776219, |
| "grad_norm": 0.9864431951061833, |
| "learning_rate": 1.957874837383788e-05, |
| "loss": 0.8068, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.23647294589178355, |
| "grad_norm": 1.0374945065399837, |
| "learning_rate": 1.957252950553616e-05, |
| "loss": 0.8107, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.23780895123580495, |
| "grad_norm": 0.9925326283569903, |
| "learning_rate": 1.9566266071246272e-05, |
| "loss": 0.8439, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.23914495657982632, |
| "grad_norm": 0.9360168462739733, |
| "learning_rate": 1.955995810012835e-05, |
| "loss": 0.7771, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.24048096192384769, |
| "grad_norm": 0.9621311579196008, |
| "learning_rate": 1.9553605621549848e-05, |
| "loss": 0.7755, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.24181696726786908, |
| "grad_norm": 0.9646231648282781, |
| "learning_rate": 1.954720866508546e-05, |
| "loss": 0.7841, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.24315297261189045, |
| "grad_norm": 0.9635787946451111, |
| "learning_rate": 1.9540767260516927e-05, |
| "loss": 0.7911, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.24448897795591182, |
| "grad_norm": 0.9379046362465309, |
| "learning_rate": 1.9534281437832935e-05, |
| "loss": 0.7759, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.2458249832999332, |
| "grad_norm": 1.0236027725335752, |
| "learning_rate": 1.9527751227228964e-05, |
| "loss": 0.7837, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.24716098864395458, |
| "grad_norm": 0.9247593802498467, |
| "learning_rate": 1.952117665910714e-05, |
| "loss": 0.7647, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.24849699398797595, |
| "grad_norm": 0.9791881807441684, |
| "learning_rate": 1.9514557764076113e-05, |
| "loss": 0.8284, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.24983299933199732, |
| "grad_norm": 0.8885222676669997, |
| "learning_rate": 1.9507894572950884e-05, |
| "loss": 0.7756, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.2511690046760187, |
| "grad_norm": 0.994170275189555, |
| "learning_rate": 1.9501187116752694e-05, |
| "loss": 0.8065, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.2511690046760187, |
| "eval_loss": 0.8102058172225952, |
| "eval_runtime": 83.8633, |
| "eval_samples_per_second": 4.352, |
| "eval_steps_per_second": 0.274, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.25250501002004005, |
| "grad_norm": 1.0253443247527654, |
| "learning_rate": 1.9494435426708856e-05, |
| "loss": 0.7708, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.25384101536406145, |
| "grad_norm": 1.0827366453761385, |
| "learning_rate": 1.9487639534252624e-05, |
| "loss": 0.8002, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.25517702070808285, |
| "grad_norm": 0.9974130987822876, |
| "learning_rate": 1.9480799471023047e-05, |
| "loss": 0.7695, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.2565130260521042, |
| "grad_norm": 1.0882225879465166, |
| "learning_rate": 1.9473915268864796e-05, |
| "loss": 0.8349, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.2578490313961256, |
| "grad_norm": 0.9485443541330834, |
| "learning_rate": 1.9466986959828063e-05, |
| "loss": 0.8247, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.259185036740147, |
| "grad_norm": 1.057201202652337, |
| "learning_rate": 1.9460014576168357e-05, |
| "loss": 0.7823, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.2605210420841683, |
| "grad_norm": 1.23788381138827, |
| "learning_rate": 1.9452998150346403e-05, |
| "loss": 0.7897, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.2618570474281897, |
| "grad_norm": 0.8790913786456861, |
| "learning_rate": 1.944593771502796e-05, |
| "loss": 0.7959, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.2631930527722111, |
| "grad_norm": 0.9866039641517707, |
| "learning_rate": 1.9438833303083677e-05, |
| "loss": 0.8159, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.26452905811623245, |
| "grad_norm": 0.9203841261082187, |
| "learning_rate": 1.9431684947588943e-05, |
| "loss": 0.7594, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.26586506346025385, |
| "grad_norm": 0.9489214674271, |
| "learning_rate": 1.9424492681823733e-05, |
| "loss": 0.7282, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.26720106880427524, |
| "grad_norm": 0.8607449162985858, |
| "learning_rate": 1.9417256539272448e-05, |
| "loss": 0.7458, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.2685370741482966, |
| "grad_norm": 0.8376898409765609, |
| "learning_rate": 1.9409976553623767e-05, |
| "loss": 0.7248, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.269873079492318, |
| "grad_norm": 1.0515394700271572, |
| "learning_rate": 1.9402652758770476e-05, |
| "loss": 0.8728, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.2712090848363393, |
| "grad_norm": 0.9051594498491204, |
| "learning_rate": 1.9395285188809332e-05, |
| "loss": 0.7454, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.2725450901803607, |
| "grad_norm": 0.9514293332943959, |
| "learning_rate": 1.9387873878040883e-05, |
| "loss": 0.7698, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.2738810955243821, |
| "grad_norm": 0.9023759712398705, |
| "learning_rate": 1.938041886096932e-05, |
| "loss": 0.7775, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.27521710086840345, |
| "grad_norm": 0.942069813638029, |
| "learning_rate": 1.9372920172302317e-05, |
| "loss": 0.7931, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.27655310621242485, |
| "grad_norm": 0.9872428267317357, |
| "learning_rate": 1.936537784695086e-05, |
| "loss": 0.7873, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.27788911155644624, |
| "grad_norm": 0.9417817757642352, |
| "learning_rate": 1.935779192002909e-05, |
| "loss": 0.8134, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.2792251169004676, |
| "grad_norm": 0.9506920786651124, |
| "learning_rate": 1.9350162426854152e-05, |
| "loss": 0.799, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.280561122244489, |
| "grad_norm": 0.9253573158145543, |
| "learning_rate": 1.9342489402945997e-05, |
| "loss": 0.8272, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.2818971275885104, |
| "grad_norm": 1.0101186531631632, |
| "learning_rate": 1.9334772884027267e-05, |
| "loss": 0.8004, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.2832331329325317, |
| "grad_norm": 0.9529057709596102, |
| "learning_rate": 1.9327012906023076e-05, |
| "loss": 0.8037, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.2845691382765531, |
| "grad_norm": 0.982053999462765, |
| "learning_rate": 1.931920950506087e-05, |
| "loss": 0.8476, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.2859051436205745, |
| "grad_norm": 0.8337384559208328, |
| "learning_rate": 1.9311362717470268e-05, |
| "loss": 0.7417, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.28724114896459585, |
| "grad_norm": 1.06385116799924, |
| "learning_rate": 1.9303472579782867e-05, |
| "loss": 0.7823, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.28857715430861725, |
| "grad_norm": 0.9786270127071084, |
| "learning_rate": 1.9295539128732096e-05, |
| "loss": 0.8085, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.2899131596526386, |
| "grad_norm": 0.9018629565375761, |
| "learning_rate": 1.9287562401253023e-05, |
| "loss": 0.7778, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.29124916499666, |
| "grad_norm": 0.8958595594334638, |
| "learning_rate": 1.927954243448221e-05, |
| "loss": 0.8099, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.2925851703406814, |
| "grad_norm": 0.8848672708903511, |
| "learning_rate": 1.92714792657575e-05, |
| "loss": 0.7769, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.2939211756847027, |
| "grad_norm": 0.9216638745149088, |
| "learning_rate": 1.9263372932617894e-05, |
| "loss": 0.7872, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.2952571810287241, |
| "grad_norm": 0.8519065954619743, |
| "learning_rate": 1.9255223472803337e-05, |
| "loss": 0.7571, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.2965931863727455, |
| "grad_norm": 0.9227915163127393, |
| "learning_rate": 1.924703092425455e-05, |
| "loss": 0.8012, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.29792919171676685, |
| "grad_norm": 0.9396592428655529, |
| "learning_rate": 1.9238795325112867e-05, |
| "loss": 0.8265, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.29926519706078825, |
| "grad_norm": 0.8767299687574203, |
| "learning_rate": 1.9230516713720053e-05, |
| "loss": 0.7276, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.30060120240480964, |
| "grad_norm": 0.9492384775822614, |
| "learning_rate": 1.9222195128618108e-05, |
| "loss": 0.7729, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.301937207748831, |
| "grad_norm": 1.0739639178999145, |
| "learning_rate": 1.92138306085491e-05, |
| "loss": 0.782, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.3032732130928524, |
| "grad_norm": 0.9639479767673114, |
| "learning_rate": 1.9205423192455014e-05, |
| "loss": 0.849, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.3046092184368738, |
| "grad_norm": 0.8884897514517014, |
| "learning_rate": 1.9196972919477503e-05, |
| "loss": 0.7279, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.3059452237808951, |
| "grad_norm": 0.921226421123449, |
| "learning_rate": 1.9188479828957773e-05, |
| "loss": 0.7807, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.3072812291249165, |
| "grad_norm": 1.2023742759555582, |
| "learning_rate": 1.917994396043636e-05, |
| "loss": 0.8161, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.30861723446893785, |
| "grad_norm": 0.8439712407459901, |
| "learning_rate": 1.917136535365296e-05, |
| "loss": 0.7337, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.30995323981295925, |
| "grad_norm": 0.9037076806654686, |
| "learning_rate": 1.9162744048546242e-05, |
| "loss": 0.7671, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.31128924515698064, |
| "grad_norm": 0.895296829626764, |
| "learning_rate": 1.9154080085253665e-05, |
| "loss": 0.814, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.312625250501002, |
| "grad_norm": 0.846724812083691, |
| "learning_rate": 1.914537350411128e-05, |
| "loss": 0.7859, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.3139612558450234, |
| "grad_norm": 0.9879319650560158, |
| "learning_rate": 1.9136624345653557e-05, |
| "loss": 0.8323, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.3152972611890448, |
| "grad_norm": 1.1096324575175736, |
| "learning_rate": 1.912783265061319e-05, |
| "loss": 0.8297, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.3166332665330661, |
| "grad_norm": 0.9239300064289768, |
| "learning_rate": 1.91189984599209e-05, |
| "loss": 0.7865, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.3179692718770875, |
| "grad_norm": 0.9111867722547871, |
| "learning_rate": 1.9110121814705263e-05, |
| "loss": 0.783, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.3193052772211089, |
| "grad_norm": 0.8851064868690998, |
| "learning_rate": 1.910120275629249e-05, |
| "loss": 0.7349, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.32064128256513025, |
| "grad_norm": 1.0180285168985816, |
| "learning_rate": 1.9092241326206268e-05, |
| "loss": 0.812, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.32197728790915164, |
| "grad_norm": 0.9907242836275669, |
| "learning_rate": 1.908323756616754e-05, |
| "loss": 0.7879, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.32331329325317304, |
| "grad_norm": 0.8553511222617886, |
| "learning_rate": 1.9074191518094326e-05, |
| "loss": 0.7662, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.3246492985971944, |
| "grad_norm": 0.8832151337920235, |
| "learning_rate": 1.906510322410152e-05, |
| "loss": 0.7875, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.3259853039412158, |
| "grad_norm": 0.9081930718770509, |
| "learning_rate": 1.9055972726500696e-05, |
| "loss": 0.7838, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.3273213092852371, |
| "grad_norm": 1.135017842072666, |
| "learning_rate": 1.9046800067799914e-05, |
| "loss": 0.8377, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.3286573146292585, |
| "grad_norm": 1.1532714119764724, |
| "learning_rate": 1.9037585290703514e-05, |
| "loss": 0.7737, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.3299933199732799, |
| "grad_norm": 0.9800486770921085, |
| "learning_rate": 1.9028328438111938e-05, |
| "loss": 0.8067, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.33132932531730125, |
| "grad_norm": 0.9979504949421713, |
| "learning_rate": 1.9019029553121494e-05, |
| "loss": 0.798, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.33266533066132264, |
| "grad_norm": 0.8868928719480308, |
| "learning_rate": 1.900968867902419e-05, |
| "loss": 0.8051, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.33400133600534404, |
| "grad_norm": 0.9772643890716372, |
| "learning_rate": 1.9000305859307527e-05, |
| "loss": 0.8078, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.3353373413493654, |
| "grad_norm": 1.5051612319682424, |
| "learning_rate": 1.899088113765426e-05, |
| "loss": 0.705, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.3366733466933868, |
| "grad_norm": 0.9952405576310039, |
| "learning_rate": 1.8981414557942255e-05, |
| "loss": 0.7927, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.3380093520374082, |
| "grad_norm": 1.062881836098141, |
| "learning_rate": 1.8971906164244232e-05, |
| "loss": 0.7815, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.3393453573814295, |
| "grad_norm": 1.0259148109550424, |
| "learning_rate": 1.896235600082759e-05, |
| "loss": 0.7932, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.3406813627254509, |
| "grad_norm": 0.9170476321584136, |
| "learning_rate": 1.8952764112154193e-05, |
| "loss": 0.7803, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.3420173680694723, |
| "grad_norm": 0.9141907670529551, |
| "learning_rate": 1.894313054288015e-05, |
| "loss": 0.8015, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.34335337341349365, |
| "grad_norm": 0.9556187931162476, |
| "learning_rate": 1.8933455337855633e-05, |
| "loss": 0.8322, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.34468937875751504, |
| "grad_norm": 0.9466969545434704, |
| "learning_rate": 1.8923738542124644e-05, |
| "loss": 0.821, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.3460253841015364, |
| "grad_norm": 1.0409170880273046, |
| "learning_rate": 1.8913980200924822e-05, |
| "loss": 0.8644, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.3473613894455578, |
| "grad_norm": 0.9042506253210668, |
| "learning_rate": 1.8904180359687218e-05, |
| "loss": 0.7608, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.3486973947895792, |
| "grad_norm": 0.9255715906938344, |
| "learning_rate": 1.88943390640361e-05, |
| "loss": 0.7734, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.3500334001336005, |
| "grad_norm": 0.9156808871587376, |
| "learning_rate": 1.8884456359788725e-05, |
| "loss": 0.7984, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.3513694054776219, |
| "grad_norm": 0.9601351675507691, |
| "learning_rate": 1.8874532292955135e-05, |
| "loss": 0.7929, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.3527054108216433, |
| "grad_norm": 0.7796424988188564, |
| "learning_rate": 1.886456690973794e-05, |
| "loss": 0.6588, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.35404141616566465, |
| "grad_norm": 0.9222472620667675, |
| "learning_rate": 1.8854560256532098e-05, |
| "loss": 0.7872, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.35537742150968604, |
| "grad_norm": 1.0205962612465123, |
| "learning_rate": 1.884451237992472e-05, |
| "loss": 0.7986, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.35671342685370744, |
| "grad_norm": 0.973789250872766, |
| "learning_rate": 1.8834423326694814e-05, |
| "loss": 0.812, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.3580494321977288, |
| "grad_norm": 0.962237948235962, |
| "learning_rate": 1.8824293143813112e-05, |
| "loss": 0.8256, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.3593854375417502, |
| "grad_norm": 0.9638974882770617, |
| "learning_rate": 1.8814121878441814e-05, |
| "loss": 0.8064, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.36072144288577157, |
| "grad_norm": 0.9634276691152937, |
| "learning_rate": 1.8803909577934398e-05, |
| "loss": 0.7796, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.3620574482297929, |
| "grad_norm": 1.0411727226985446, |
| "learning_rate": 1.8793656289835365e-05, |
| "loss": 0.8714, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.3633934535738143, |
| "grad_norm": 1.325621257952684, |
| "learning_rate": 1.8783362061880063e-05, |
| "loss": 0.7664, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.36472945891783565, |
| "grad_norm": 1.1015887873495702, |
| "learning_rate": 1.877302694199442e-05, |
| "loss": 0.8063, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.36606546426185704, |
| "grad_norm": 0.9444891604424849, |
| "learning_rate": 1.876265097829476e-05, |
| "loss": 0.8432, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.36740146960587844, |
| "grad_norm": 1.0039307007740597, |
| "learning_rate": 1.8752234219087538e-05, |
| "loss": 0.8185, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.3687374749498998, |
| "grad_norm": 1.062219541121473, |
| "learning_rate": 1.8741776712869154e-05, |
| "loss": 0.8129, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.3700734802939212, |
| "grad_norm": 0.9974048266839923, |
| "learning_rate": 1.873127850832571e-05, |
| "loss": 0.7777, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.37140948563794257, |
| "grad_norm": 1.141507576532494, |
| "learning_rate": 1.872073965433277e-05, |
| "loss": 0.7842, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.3727454909819639, |
| "grad_norm": 0.9399997606328846, |
| "learning_rate": 1.8710160199955158e-05, |
| "loss": 0.8246, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.3740814963259853, |
| "grad_norm": 0.9016882475418688, |
| "learning_rate": 1.8699540194446712e-05, |
| "loss": 0.8251, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.3754175016700067, |
| "grad_norm": 0.8306158799954252, |
| "learning_rate": 1.8688879687250067e-05, |
| "loss": 0.7619, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.37675350701402804, |
| "grad_norm": 0.9625643405936993, |
| "learning_rate": 1.8678178727996412e-05, |
| "loss": 0.782, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.37675350701402804, |
| "eval_loss": 0.7880542278289795, |
| "eval_runtime": 84.1498, |
| "eval_samples_per_second": 4.338, |
| "eval_steps_per_second": 0.273, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.37808951235804944, |
| "grad_norm": 0.8865643900778303, |
| "learning_rate": 1.8667437366505262e-05, |
| "loss": 0.7438, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.37942551770207084, |
| "grad_norm": 0.9137286216934896, |
| "learning_rate": 1.865665565278424e-05, |
| "loss": 0.7762, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.3807615230460922, |
| "grad_norm": 0.9201023232593961, |
| "learning_rate": 1.8645833637028828e-05, |
| "loss": 0.7936, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.38209752839011357, |
| "grad_norm": 0.833106635574073, |
| "learning_rate": 1.863497136962213e-05, |
| "loss": 0.7136, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.3834335337341349, |
| "grad_norm": 0.8652082233529838, |
| "learning_rate": 1.8624068901134662e-05, |
| "loss": 0.7882, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.3847695390781563, |
| "grad_norm": 0.8575094982604357, |
| "learning_rate": 1.8613126282324092e-05, |
| "loss": 0.8311, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.3861055444221777, |
| "grad_norm": 0.9696774165598375, |
| "learning_rate": 1.860214356413501e-05, |
| "loss": 0.7886, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.38744154976619904, |
| "grad_norm": 0.9493185001035063, |
| "learning_rate": 1.8591120797698696e-05, |
| "loss": 0.7975, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.38877755511022044, |
| "grad_norm": 0.8830825137536001, |
| "learning_rate": 1.8580058034332878e-05, |
| "loss": 0.761, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.39011356045424184, |
| "grad_norm": 0.9776572303963238, |
| "learning_rate": 1.8568955325541506e-05, |
| "loss": 0.7662, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.3914495657982632, |
| "grad_norm": 0.8865957210316401, |
| "learning_rate": 1.8557812723014476e-05, |
| "loss": 0.7588, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.3927855711422846, |
| "grad_norm": 1.0193821114908428, |
| "learning_rate": 1.8546630278627437e-05, |
| "loss": 0.782, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.39412157648630597, |
| "grad_norm": 0.8580703986297549, |
| "learning_rate": 1.8535408044441515e-05, |
| "loss": 0.7893, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.3954575818303273, |
| "grad_norm": 1.1420834211942559, |
| "learning_rate": 1.852414607270308e-05, |
| "loss": 0.775, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.3967935871743487, |
| "grad_norm": 0.9162950910785166, |
| "learning_rate": 1.8512844415843514e-05, |
| "loss": 0.7729, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.3981295925183701, |
| "grad_norm": 0.9382189580262706, |
| "learning_rate": 1.8501503126478947e-05, |
| "loss": 0.7917, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.39946559786239144, |
| "grad_norm": 0.9830618649317936, |
| "learning_rate": 1.8490122257410034e-05, |
| "loss": 0.831, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.40080160320641284, |
| "grad_norm": 0.881056742361536, |
| "learning_rate": 1.8478701861621686e-05, |
| "loss": 0.7548, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4021376085504342, |
| "grad_norm": 0.8762842315032566, |
| "learning_rate": 1.8467241992282842e-05, |
| "loss": 0.7734, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.4034736138944556, |
| "grad_norm": 1.027523587678614, |
| "learning_rate": 1.8455742702746216e-05, |
| "loss": 0.8614, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.40480961923847697, |
| "grad_norm": 0.8674697552241131, |
| "learning_rate": 1.844420404654804e-05, |
| "loss": 0.7832, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.4061456245824983, |
| "grad_norm": 0.8696469726934413, |
| "learning_rate": 1.843262607740783e-05, |
| "loss": 0.7252, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.4074816299265197, |
| "grad_norm": 0.8380404854767282, |
| "learning_rate": 1.842100884922812e-05, |
| "loss": 0.7327, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.4088176352705411, |
| "grad_norm": 0.8793971971550516, |
| "learning_rate": 1.8409352416094224e-05, |
| "loss": 0.7518, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.41015364061456244, |
| "grad_norm": 0.963789537566054, |
| "learning_rate": 1.8397656832273982e-05, |
| "loss": 0.7763, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.41148964595858384, |
| "grad_norm": 0.9048764596384147, |
| "learning_rate": 1.8385922152217496e-05, |
| "loss": 0.7021, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.41282565130260523, |
| "grad_norm": 0.907334932613408, |
| "learning_rate": 1.8374148430556888e-05, |
| "loss": 0.834, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.4141616566466266, |
| "grad_norm": 0.8064437894344078, |
| "learning_rate": 1.8362335722106048e-05, |
| "loss": 0.7885, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.41549766199064797, |
| "grad_norm": 0.9018113787224833, |
| "learning_rate": 1.835048408186037e-05, |
| "loss": 0.741, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.4168336673346693, |
| "grad_norm": 0.8612675937204591, |
| "learning_rate": 1.8338593564996497e-05, |
| "loss": 0.781, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.4181696726786907, |
| "grad_norm": 0.9402380563799979, |
| "learning_rate": 1.8326664226872063e-05, |
| "loss": 0.8153, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.4195056780227121, |
| "grad_norm": 0.9199991108589402, |
| "learning_rate": 1.8314696123025456e-05, |
| "loss": 0.8358, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.42084168336673344, |
| "grad_norm": 0.8586516171988818, |
| "learning_rate": 1.8302689309175516e-05, |
| "loss": 0.7505, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.42217768871075484, |
| "grad_norm": 1.1364325732176226, |
| "learning_rate": 1.8290643841221324e-05, |
| "loss": 0.8365, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.42351369405477624, |
| "grad_norm": 0.9392924248023505, |
| "learning_rate": 1.827855977524191e-05, |
| "loss": 0.7586, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.4248496993987976, |
| "grad_norm": 0.8315819706850655, |
| "learning_rate": 1.8266437167496005e-05, |
| "loss": 0.7846, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.42618570474281897, |
| "grad_norm": 1.0366440193958355, |
| "learning_rate": 1.825427607442177e-05, |
| "loss": 0.7818, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.42752171008684037, |
| "grad_norm": 0.8416489852984904, |
| "learning_rate": 1.824207655263654e-05, |
| "loss": 0.7891, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.4288577154308617, |
| "grad_norm": 0.8805548673757602, |
| "learning_rate": 1.8229838658936566e-05, |
| "loss": 0.7332, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.4301937207748831, |
| "grad_norm": 0.8677532917619016, |
| "learning_rate": 1.8217562450296737e-05, |
| "loss": 0.795, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.4315297261189045, |
| "grad_norm": 0.7902727548581008, |
| "learning_rate": 1.8205247983870325e-05, |
| "loss": 0.7358, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.43286573146292584, |
| "grad_norm": 0.9552346458655085, |
| "learning_rate": 1.8192895316988714e-05, |
| "loss": 0.8334, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.43420173680694724, |
| "grad_norm": 0.9205528204489275, |
| "learning_rate": 1.818050450716113e-05, |
| "loss": 0.7288, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.4355377421509686, |
| "grad_norm": 0.8926032527471568, |
| "learning_rate": 1.8168075612074388e-05, |
| "loss": 0.8029, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.43687374749499, |
| "grad_norm": 0.8523550164711797, |
| "learning_rate": 1.8155608689592604e-05, |
| "loss": 0.774, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.43820975283901137, |
| "grad_norm": 0.8390140607328515, |
| "learning_rate": 1.8143103797756942e-05, |
| "loss": 0.8062, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.4395457581830327, |
| "grad_norm": 0.8646682710820988, |
| "learning_rate": 1.8130560994785325e-05, |
| "loss": 0.8361, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.4408817635270541, |
| "grad_norm": 0.7463310187965114, |
| "learning_rate": 1.8117980339072195e-05, |
| "loss": 0.7083, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.4422177688710755, |
| "grad_norm": 0.849400527781498, |
| "learning_rate": 1.8105361889188203e-05, |
| "loss": 0.7842, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.44355377421509684, |
| "grad_norm": 0.8137751188691663, |
| "learning_rate": 1.8092705703879962e-05, |
| "loss": 0.762, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.44488977955911824, |
| "grad_norm": 0.8805052711171745, |
| "learning_rate": 1.8080011842069768e-05, |
| "loss": 0.787, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.44622578490313963, |
| "grad_norm": 0.909561565524386, |
| "learning_rate": 1.8067280362855322e-05, |
| "loss": 0.7908, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.447561790247161, |
| "grad_norm": 0.8732531909802541, |
| "learning_rate": 1.805451132550946e-05, |
| "loss": 0.7546, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.44889779559118237, |
| "grad_norm": 0.9367423192836012, |
| "learning_rate": 1.8041704789479872e-05, |
| "loss": 0.8275, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.45023380093520377, |
| "grad_norm": 0.8953225243864141, |
| "learning_rate": 1.8028860814388826e-05, |
| "loss": 0.8156, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.4515698062792251, |
| "grad_norm": 0.7980098646263484, |
| "learning_rate": 1.801597946003289e-05, |
| "loss": 0.7495, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.4529058116232465, |
| "grad_norm": 0.8105594453593196, |
| "learning_rate": 1.800306078638267e-05, |
| "loss": 0.7379, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.45424181696726784, |
| "grad_norm": 0.8456092325511747, |
| "learning_rate": 1.7990104853582494e-05, |
| "loss": 0.7492, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.45557782231128924, |
| "grad_norm": 0.8639099396677343, |
| "learning_rate": 1.7977111721950163e-05, |
| "loss": 0.7801, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.45691382765531063, |
| "grad_norm": 0.8333562591176576, |
| "learning_rate": 1.7964081451976673e-05, |
| "loss": 0.7795, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.458249832999332, |
| "grad_norm": 0.8807729648357469, |
| "learning_rate": 1.7951014104325907e-05, |
| "loss": 0.8172, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.45958583834335337, |
| "grad_norm": 0.8934888035966929, |
| "learning_rate": 1.7937909739834366e-05, |
| "loss": 0.8615, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.46092184368737477, |
| "grad_norm": 0.838698477333794, |
| "learning_rate": 1.7924768419510906e-05, |
| "loss": 0.7634, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.4622578490313961, |
| "grad_norm": 0.9239881274019001, |
| "learning_rate": 1.7911590204536413e-05, |
| "loss": 0.815, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.4635938543754175, |
| "grad_norm": 0.9754741156791019, |
| "learning_rate": 1.7898375156263555e-05, |
| "loss": 0.7944, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.4649298597194389, |
| "grad_norm": 0.8490315330716274, |
| "learning_rate": 1.7885123336216473e-05, |
| "loss": 0.7492, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.46626586506346024, |
| "grad_norm": 0.9341741782247254, |
| "learning_rate": 1.7871834806090502e-05, |
| "loss": 0.7429, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.46760187040748163, |
| "grad_norm": 0.8923751867355049, |
| "learning_rate": 1.78585096277519e-05, |
| "loss": 0.7842, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.46893787575150303, |
| "grad_norm": 0.8829294552751072, |
| "learning_rate": 1.7845147863237526e-05, |
| "loss": 0.7446, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.47027388109552437, |
| "grad_norm": 0.8895149135560049, |
| "learning_rate": 1.7831749574754577e-05, |
| "loss": 0.8291, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.47160988643954577, |
| "grad_norm": 0.8945837154504075, |
| "learning_rate": 1.78183148246803e-05, |
| "loss": 0.7748, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.4729458917835671, |
| "grad_norm": 0.8974574630224665, |
| "learning_rate": 1.7804843675561678e-05, |
| "loss": 0.7849, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.4742818971275885, |
| "grad_norm": 1.033028360810294, |
| "learning_rate": 1.7791336190115168e-05, |
| "loss": 0.7994, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.4756179024716099, |
| "grad_norm": 1.5863973920381642, |
| "learning_rate": 1.7777792431226384e-05, |
| "loss": 0.7626, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.47695390781563124, |
| "grad_norm": 0.9015785505517224, |
| "learning_rate": 1.776421246194982e-05, |
| "loss": 0.8379, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.47828991315965264, |
| "grad_norm": 0.8805896589378985, |
| "learning_rate": 1.775059634550855e-05, |
| "loss": 0.7916, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.47962591850367403, |
| "grad_norm": 0.8772495666853293, |
| "learning_rate": 1.7736944145293936e-05, |
| "loss": 0.8024, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.48096192384769537, |
| "grad_norm": 0.8680679320150547, |
| "learning_rate": 1.7723255924865338e-05, |
| "loss": 0.761, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.48229792919171677, |
| "grad_norm": 0.9094101324467635, |
| "learning_rate": 1.7709531747949796e-05, |
| "loss": 0.8319, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.48363393453573816, |
| "grad_norm": 0.8465074373858681, |
| "learning_rate": 1.7695771678441768e-05, |
| "loss": 0.7685, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.4849699398797595, |
| "grad_norm": 1.0007908805518602, |
| "learning_rate": 1.7681975780402807e-05, |
| "loss": 0.8049, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.4863059452237809, |
| "grad_norm": 0.8609540863703258, |
| "learning_rate": 1.7668144118061263e-05, |
| "loss": 0.7798, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.4876419505678023, |
| "grad_norm": 0.866314689793943, |
| "learning_rate": 1.7654276755811997e-05, |
| "loss": 0.8006, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.48897795591182364, |
| "grad_norm": 0.9174718757887473, |
| "learning_rate": 1.7640373758216075e-05, |
| "loss": 0.8106, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.49031396125584503, |
| "grad_norm": 0.9301019775437347, |
| "learning_rate": 1.7626435190000468e-05, |
| "loss": 0.8217, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.4916499665998664, |
| "grad_norm": 0.8883644166245096, |
| "learning_rate": 1.761246111605775e-05, |
| "loss": 0.8212, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.49298597194388777, |
| "grad_norm": 0.8577348952749505, |
| "learning_rate": 1.759845160144579e-05, |
| "loss": 0.7793, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.49432197728790916, |
| "grad_norm": 0.9207167682391375, |
| "learning_rate": 1.7584406711387462e-05, |
| "loss": 0.7571, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.4956579826319305, |
| "grad_norm": 0.870399047277893, |
| "learning_rate": 1.7570326511270332e-05, |
| "loss": 0.7541, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.4969939879759519, |
| "grad_norm": 0.8971467990645112, |
| "learning_rate": 1.7556211066646355e-05, |
| "loss": 0.776, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.4983299933199733, |
| "grad_norm": 0.8876408424995392, |
| "learning_rate": 1.7542060443231572e-05, |
| "loss": 0.7948, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.49966599866399464, |
| "grad_norm": 0.8378719459887969, |
| "learning_rate": 1.7527874706905804e-05, |
| "loss": 0.7896, |
| "step": 374 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1496, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 374, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 243633362042880.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|