diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5776 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1638, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003663003663003663, + "grad_norm": 1.5506155490875244, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.681140899658203, + "step": 2 + }, + { + "epoch": 0.007326007326007326, + "grad_norm": 0.4526568055152893, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.6674047708511353, + "step": 4 + }, + { + "epoch": 0.01098901098901099, + "grad_norm": 0.8889099359512329, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.8801467418670654, + "step": 6 + }, + { + "epoch": 0.014652014652014652, + "grad_norm": 0.19507154822349548, + "learning_rate": 2.8000000000000003e-06, + "loss": 2.0659124851226807, + "step": 8 + }, + { + "epoch": 0.018315018315018316, + "grad_norm": 0.3095138967037201, + "learning_rate": 3.6000000000000003e-06, + "loss": 2.2201435565948486, + "step": 10 + }, + { + "epoch": 0.02197802197802198, + "grad_norm": 0.7828930020332336, + "learning_rate": 4.4e-06, + "loss": 2.0232832431793213, + "step": 12 + }, + { + "epoch": 0.02564102564102564, + "grad_norm": 0.21790610253810883, + "learning_rate": 5.2e-06, + "loss": 1.7574424743652344, + "step": 14 + }, + { + "epoch": 0.029304029304029304, + "grad_norm": 0.25006550550460815, + "learning_rate": 6e-06, + "loss": 1.8244725465774536, + "step": 16 + }, + { + "epoch": 0.03296703296703297, + "grad_norm": 1.2375390529632568, + "learning_rate": 6.800000000000001e-06, + "loss": 1.7521573305130005, + "step": 18 + }, + { + "epoch": 0.03663003663003663, + "grad_norm": 0.22365400195121765, + "learning_rate": 7.600000000000001e-06, + "loss": 1.7442874908447266, + "step": 20 + }, + { + "epoch": 0.040293040293040296, + "grad_norm": 1.432734489440918, + "learning_rate": 8.400000000000001e-06, + "loss": 1.137043833732605, + "step": 22 + }, + { + "epoch": 0.04395604395604396, + "grad_norm": 0.23710986971855164, + "learning_rate": 9.200000000000002e-06, + "loss": 1.3199552297592163, + "step": 24 + }, + { + "epoch": 0.047619047619047616, + "grad_norm": 0.6795807480812073, + "learning_rate": 1e-05, + "loss": 1.4519306421279907, + "step": 26 + }, + { + "epoch": 0.05128205128205128, + "grad_norm": 1.1287591457366943, + "learning_rate": 1.0800000000000002e-05, + "loss": 1.4781732559204102, + "step": 28 + }, + { + "epoch": 0.054945054945054944, + "grad_norm": 0.40312108397483826, + "learning_rate": 1.16e-05, + "loss": 1.1731195449829102, + "step": 30 + }, + { + "epoch": 0.05860805860805861, + "grad_norm": 0.3888772428035736, + "learning_rate": 1.2400000000000002e-05, + "loss": 1.10336172580719, + "step": 32 + }, + { + "epoch": 0.06227106227106227, + "grad_norm": 0.057199470698833466, + "learning_rate": 1.3200000000000002e-05, + "loss": 1.2399317026138306, + "step": 34 + }, + { + "epoch": 0.06593406593406594, + "grad_norm": 0.24481792747974396, + "learning_rate": 1.4e-05, + "loss": 1.61336088180542, + "step": 36 + }, + { + "epoch": 0.0695970695970696, + "grad_norm": 0.37236636877059937, + "learning_rate": 1.48e-05, + "loss": 1.372746229171753, + "step": 38 + }, + { + "epoch": 0.07326007326007326, + "grad_norm": 0.5762131810188293, + "learning_rate": 1.5600000000000003e-05, + "loss": 1.4253513813018799, + "step": 40 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.17068041861057281, + "learning_rate": 1.64e-05, + "loss": 1.136276364326477, + "step": 42 + }, + { + "epoch": 0.08058608058608059, + "grad_norm": 0.19196651875972748, + "learning_rate": 1.72e-05, + "loss": 1.5315269231796265, + "step": 44 + }, + { + "epoch": 0.08424908424908426, + "grad_norm": 0.12281376868486404, + "learning_rate": 1.8e-05, + "loss": 1.6208034753799438, + "step": 46 + }, + { + "epoch": 0.08791208791208792, + "grad_norm": 0.40122199058532715, + "learning_rate": 1.88e-05, + "loss": 1.05559241771698, + "step": 48 + }, + { + "epoch": 0.09157509157509157, + "grad_norm": 0.687041163444519, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.5109608173370361, + "step": 50 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 0.15380625426769257, + "learning_rate": 1.999998238790087e-05, + "loss": 0.7463083863258362, + "step": 52 + }, + { + "epoch": 0.0989010989010989, + "grad_norm": 0.37560248374938965, + "learning_rate": 1.999984149152137e-05, + "loss": 0.9765978455543518, + "step": 54 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 0.15966370701789856, + "learning_rate": 1.999955970096814e-05, + "loss": 1.3462445735931396, + "step": 56 + }, + { + "epoch": 0.10622710622710622, + "grad_norm": 0.11719954758882523, + "learning_rate": 1.9999137020652663e-05, + "loss": 1.197383999824524, + "step": 58 + }, + { + "epoch": 0.10989010989010989, + "grad_norm": 0.3115270137786865, + "learning_rate": 1.999857345719207e-05, + "loss": 1.4108028411865234, + "step": 60 + }, + { + "epoch": 0.11355311355311355, + "grad_norm": 0.31386664509773254, + "learning_rate": 1.9997869019409047e-05, + "loss": 1.4261771440505981, + "step": 62 + }, + { + "epoch": 0.11721611721611722, + "grad_norm": 0.25177937746047974, + "learning_rate": 1.9997023718331707e-05, + "loss": 1.3881282806396484, + "step": 64 + }, + { + "epoch": 0.12087912087912088, + "grad_norm": 0.14471372961997986, + "learning_rate": 1.9996037567193388e-05, + "loss": 1.3539735078811646, + "step": 66 + }, + { + "epoch": 0.12454212454212454, + "grad_norm": 0.14020830392837524, + "learning_rate": 1.9994910581432466e-05, + "loss": 1.3212106227874756, + "step": 68 + }, + { + "epoch": 0.1282051282051282, + "grad_norm": 0.6085403561592102, + "learning_rate": 1.9993642778692116e-05, + "loss": 1.0624397993087769, + "step": 70 + }, + { + "epoch": 0.13186813186813187, + "grad_norm": 0.11608725786209106, + "learning_rate": 1.999223417882002e-05, + "loss": 1.4300200939178467, + "step": 72 + }, + { + "epoch": 0.13553113553113552, + "grad_norm": 0.35819724202156067, + "learning_rate": 1.9990684803868068e-05, + "loss": 1.5944573879241943, + "step": 74 + }, + { + "epoch": 0.1391941391941392, + "grad_norm": 0.09426046907901764, + "learning_rate": 1.9988994678092007e-05, + "loss": 1.0820951461791992, + "step": 76 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.1737513393163681, + "learning_rate": 1.9987163827951077e-05, + "loss": 1.4329181909561157, + "step": 78 + }, + { + "epoch": 0.14652014652014653, + "grad_norm": 0.25684013962745667, + "learning_rate": 1.998519228210756e-05, + "loss": 1.5802102088928223, + "step": 80 + }, + { + "epoch": 0.15018315018315018, + "grad_norm": 0.6553322672843933, + "learning_rate": 1.998308007142638e-05, + "loss": 1.1970324516296387, + "step": 82 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.16790863871574402, + "learning_rate": 1.9980827228974575e-05, + "loss": 1.3608276844024658, + "step": 84 + }, + { + "epoch": 0.1575091575091575, + "grad_norm": 0.3864733874797821, + "learning_rate": 1.997843379002081e-05, + "loss": 1.4945706129074097, + "step": 86 + }, + { + "epoch": 0.16117216117216118, + "grad_norm": 0.12279342859983444, + "learning_rate": 1.9975899792034824e-05, + "loss": 0.700541615486145, + "step": 88 + }, + { + "epoch": 0.16483516483516483, + "grad_norm": 0.06569632887840271, + "learning_rate": 1.9973225274686804e-05, + "loss": 0.8554237484931946, + "step": 90 + }, + { + "epoch": 0.1684981684981685, + "grad_norm": 0.14802643656730652, + "learning_rate": 1.9970410279846816e-05, + "loss": 1.31403648853302, + "step": 92 + }, + { + "epoch": 0.17216117216117216, + "grad_norm": 0.13368292152881622, + "learning_rate": 1.9967454851584132e-05, + "loss": 1.3383275270462036, + "step": 94 + }, + { + "epoch": 0.17582417582417584, + "grad_norm": 0.22364209592342377, + "learning_rate": 1.996435903616651e-05, + "loss": 1.2345792055130005, + "step": 96 + }, + { + "epoch": 0.1794871794871795, + "grad_norm": 0.3644048571586609, + "learning_rate": 1.9961122882059523e-05, + "loss": 1.3272985219955444, + "step": 98 + }, + { + "epoch": 0.18315018315018314, + "grad_norm": 0.38108178973197937, + "learning_rate": 1.9957746439925748e-05, + "loss": 1.1760129928588867, + "step": 100 + }, + { + "epoch": 0.18681318681318682, + "grad_norm": 0.37011611461639404, + "learning_rate": 1.9954229762624016e-05, + "loss": 1.2455718517303467, + "step": 102 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.21642433106899261, + "learning_rate": 1.995057290520855e-05, + "loss": 0.897014319896698, + "step": 104 + }, + { + "epoch": 0.19413919413919414, + "grad_norm": 0.3669384717941284, + "learning_rate": 1.9946775924928132e-05, + "loss": 1.0838041305541992, + "step": 106 + }, + { + "epoch": 0.1978021978021978, + "grad_norm": 0.38939031958580017, + "learning_rate": 1.9942838881225183e-05, + "loss": 1.3144299983978271, + "step": 108 + }, + { + "epoch": 0.20146520146520147, + "grad_norm": 0.6144610643386841, + "learning_rate": 1.9938761835734842e-05, + "loss": 1.1635433435440063, + "step": 110 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 0.1933593600988388, + "learning_rate": 1.9934544852284013e-05, + "loss": 1.2244360446929932, + "step": 112 + }, + { + "epoch": 0.2087912087912088, + "grad_norm": 0.30885422229766846, + "learning_rate": 1.9930187996890347e-05, + "loss": 0.6363462209701538, + "step": 114 + }, + { + "epoch": 0.21245421245421245, + "grad_norm": 0.1966598927974701, + "learning_rate": 1.992569133776121e-05, + "loss": 1.300977349281311, + "step": 116 + }, + { + "epoch": 0.21611721611721613, + "grad_norm": 0.8058923482894897, + "learning_rate": 1.992105494529264e-05, + "loss": 1.2940763235092163, + "step": 118 + }, + { + "epoch": 0.21978021978021978, + "grad_norm": 0.2799862325191498, + "learning_rate": 1.99162788920682e-05, + "loss": 1.3638042211532593, + "step": 120 + }, + { + "epoch": 0.22344322344322345, + "grad_norm": 9.588260650634766, + "learning_rate": 1.9911363252857887e-05, + "loss": 1.2911320924758911, + "step": 122 + }, + { + "epoch": 0.2271062271062271, + "grad_norm": 0.3367455303668976, + "learning_rate": 1.990630810461694e-05, + "loss": 1.022411823272705, + "step": 124 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.18774043023586273, + "learning_rate": 1.990111352648463e-05, + "loss": 0.8959170579910278, + "step": 126 + }, + { + "epoch": 0.23443223443223443, + "grad_norm": 0.19097784161567688, + "learning_rate": 1.9895779599783033e-05, + "loss": 1.1906862258911133, + "step": 128 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 0.26235634088516235, + "learning_rate": 1.989030640801576e-05, + "loss": 1.2996340990066528, + "step": 130 + }, + { + "epoch": 0.24175824175824176, + "grad_norm": 0.22878225147724152, + "learning_rate": 1.9884694036866624e-05, + "loss": 1.391095757484436, + "step": 132 + }, + { + "epoch": 0.2454212454212454, + "grad_norm": 0.6518133878707886, + "learning_rate": 1.9878942574198334e-05, + "loss": 1.289358377456665, + "step": 134 + }, + { + "epoch": 0.2490842490842491, + "grad_norm": 0.16628128290176392, + "learning_rate": 1.9873052110051094e-05, + "loss": 1.273111343383789, + "step": 136 + }, + { + "epoch": 0.25274725274725274, + "grad_norm": 0.9112105369567871, + "learning_rate": 1.9867022736641205e-05, + "loss": 1.089441180229187, + "step": 138 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.1825258433818817, + "learning_rate": 1.9860854548359615e-05, + "loss": 1.2736470699310303, + "step": 140 + }, + { + "epoch": 0.2600732600732601, + "grad_norm": 0.40484118461608887, + "learning_rate": 1.9854547641770446e-05, + "loss": 1.2917908430099487, + "step": 142 + }, + { + "epoch": 0.26373626373626374, + "grad_norm": 0.1851384937763214, + "learning_rate": 1.9848102115609483e-05, + "loss": 1.2552529573440552, + "step": 144 + }, + { + "epoch": 0.2673992673992674, + "grad_norm": 0.3763132095336914, + "learning_rate": 1.9841518070782615e-05, + "loss": 1.4075181484222412, + "step": 146 + }, + { + "epoch": 0.27106227106227104, + "grad_norm": 0.19049830734729767, + "learning_rate": 1.983479561036429e-05, + "loss": 1.3139081001281738, + "step": 148 + }, + { + "epoch": 0.27472527472527475, + "grad_norm": 0.06231250986456871, + "learning_rate": 1.982793483959585e-05, + "loss": 0.9256948828697205, + "step": 150 + }, + { + "epoch": 0.2783882783882784, + "grad_norm": 0.09106171131134033, + "learning_rate": 1.9820935865883924e-05, + "loss": 0.635033369064331, + "step": 152 + }, + { + "epoch": 0.28205128205128205, + "grad_norm": 0.18586048483848572, + "learning_rate": 1.981379879879874e-05, + "loss": 1.1024783849716187, + "step": 154 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.18290774524211884, + "learning_rate": 1.9806523750072385e-05, + "loss": 1.3396060466766357, + "step": 156 + }, + { + "epoch": 0.2893772893772894, + "grad_norm": 0.2552820146083832, + "learning_rate": 1.9799110833597093e-05, + "loss": 1.2755292654037476, + "step": 158 + }, + { + "epoch": 0.29304029304029305, + "grad_norm": 0.21683865785598755, + "learning_rate": 1.9791560165423433e-05, + "loss": 0.9428300857543945, + "step": 160 + }, + { + "epoch": 0.2967032967032967, + "grad_norm": 0.16783960163593292, + "learning_rate": 1.9783871863758503e-05, + "loss": 1.5332800149917603, + "step": 162 + }, + { + "epoch": 0.30036630036630035, + "grad_norm": 0.4921853244304657, + "learning_rate": 1.9776046048964082e-05, + "loss": 1.0453159809112549, + "step": 164 + }, + { + "epoch": 0.304029304029304, + "grad_norm": 0.17284567654132843, + "learning_rate": 1.9768082843554737e-05, + "loss": 1.389228105545044, + "step": 166 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.16623765230178833, + "learning_rate": 1.9759982372195918e-05, + "loss": 1.129130244255066, + "step": 168 + }, + { + "epoch": 0.31135531135531136, + "grad_norm": 0.3981800079345703, + "learning_rate": 1.9751744761701984e-05, + "loss": 1.2560561895370483, + "step": 170 + }, + { + "epoch": 0.315018315018315, + "grad_norm": 0.17309850454330444, + "learning_rate": 1.9743370141034248e-05, + "loss": 1.0026013851165771, + "step": 172 + }, + { + "epoch": 0.31868131868131866, + "grad_norm": 0.12312160432338715, + "learning_rate": 1.973485864129894e-05, + "loss": 0.8527880311012268, + "step": 174 + }, + { + "epoch": 0.32234432234432236, + "grad_norm": 0.7391336560249329, + "learning_rate": 1.9726210395745148e-05, + "loss": 1.3922659158706665, + "step": 176 + }, + { + "epoch": 0.326007326007326, + "grad_norm": 0.16699042916297913, + "learning_rate": 1.971742553976275e-05, + "loss": 0.921902596950531, + "step": 178 + }, + { + "epoch": 0.32967032967032966, + "grad_norm": 0.22556649148464203, + "learning_rate": 1.9708504210880284e-05, + "loss": 1.4865270853042603, + "step": 180 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.2530522346496582, + "learning_rate": 1.969944654876279e-05, + "loss": 0.973817765712738, + "step": 182 + }, + { + "epoch": 0.336996336996337, + "grad_norm": 0.2421599179506302, + "learning_rate": 1.9690252695209636e-05, + "loss": 1.2389814853668213, + "step": 184 + }, + { + "epoch": 0.34065934065934067, + "grad_norm": 0.33265063166618347, + "learning_rate": 1.9680922794152294e-05, + "loss": 1.3093231916427612, + "step": 186 + }, + { + "epoch": 0.3443223443223443, + "grad_norm": 0.14455479383468628, + "learning_rate": 1.9671456991652072e-05, + "loss": 1.1712009906768799, + "step": 188 + }, + { + "epoch": 0.34798534798534797, + "grad_norm": 0.16971716284751892, + "learning_rate": 1.9661855435897858e-05, + "loss": 1.2694875001907349, + "step": 190 + }, + { + "epoch": 0.3516483516483517, + "grad_norm": 0.42642462253570557, + "learning_rate": 1.9652118277203767e-05, + "loss": 1.0975149869918823, + "step": 192 + }, + { + "epoch": 0.3553113553113553, + "grad_norm": 0.14698947966098785, + "learning_rate": 1.9642245668006814e-05, + "loss": 1.2505216598510742, + "step": 194 + }, + { + "epoch": 0.358974358974359, + "grad_norm": 0.16152219474315643, + "learning_rate": 1.963223776286451e-05, + "loss": 1.252524495124817, + "step": 196 + }, + { + "epoch": 0.3626373626373626, + "grad_norm": 0.11607574671506882, + "learning_rate": 1.9622094718452448e-05, + "loss": 0.8584736585617065, + "step": 198 + }, + { + "epoch": 0.3663003663003663, + "grad_norm": 0.24364766478538513, + "learning_rate": 1.9611816693561858e-05, + "loss": 1.0111479759216309, + "step": 200 + }, + { + "epoch": 0.36996336996337, + "grad_norm": 0.9354498386383057, + "learning_rate": 1.96014038490971e-05, + "loss": 1.4314929246902466, + "step": 202 + }, + { + "epoch": 0.37362637362637363, + "grad_norm": 0.20628754794597626, + "learning_rate": 1.9590856348073182e-05, + "loss": 1.205211877822876, + "step": 204 + }, + { + "epoch": 0.3772893772893773, + "grad_norm": 0.8566434979438782, + "learning_rate": 1.9580174355613168e-05, + "loss": 0.7619340419769287, + "step": 206 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.222511425614357, + "learning_rate": 1.9569358038945617e-05, + "loss": 1.1318811178207397, + "step": 208 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.20434831082820892, + "learning_rate": 1.9558407567401945e-05, + "loss": 1.407196283340454, + "step": 210 + }, + { + "epoch": 0.3882783882783883, + "grad_norm": 0.14155927300453186, + "learning_rate": 1.9547323112413806e-05, + "loss": 1.0677672624588013, + "step": 212 + }, + { + "epoch": 0.39194139194139194, + "grad_norm": 0.20749327540397644, + "learning_rate": 1.9536104847510384e-05, + "loss": 1.1320164203643799, + "step": 214 + }, + { + "epoch": 0.3956043956043956, + "grad_norm": 0.24210180342197418, + "learning_rate": 1.9524752948315677e-05, + "loss": 1.2214279174804688, + "step": 216 + }, + { + "epoch": 0.3992673992673993, + "grad_norm": 0.20847296714782715, + "learning_rate": 1.9513267592545752e-05, + "loss": 1.2568475008010864, + "step": 218 + }, + { + "epoch": 0.40293040293040294, + "grad_norm": 0.14721128344535828, + "learning_rate": 1.9501648960005964e-05, + "loss": 0.6154600381851196, + "step": 220 + }, + { + "epoch": 0.4065934065934066, + "grad_norm": 0.6793058514595032, + "learning_rate": 1.948989723258815e-05, + "loss": 1.3387889862060547, + "step": 222 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 0.32100027799606323, + "learning_rate": 1.9478012594267757e-05, + "loss": 1.0697791576385498, + "step": 224 + }, + { + "epoch": 0.4139194139194139, + "grad_norm": 0.594149649143219, + "learning_rate": 1.946599523110099e-05, + "loss": 1.2391932010650635, + "step": 226 + }, + { + "epoch": 0.4175824175824176, + "grad_norm": 0.21164406836032867, + "learning_rate": 1.945384533122187e-05, + "loss": 1.3020524978637695, + "step": 228 + }, + { + "epoch": 0.42124542124542125, + "grad_norm": 0.4633813500404358, + "learning_rate": 1.9441563084839324e-05, + "loss": 1.2441799640655518, + "step": 230 + }, + { + "epoch": 0.4249084249084249, + "grad_norm": 0.3936166763305664, + "learning_rate": 1.942914868423417e-05, + "loss": 0.9889364838600159, + "step": 232 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.3139214515686035, + "learning_rate": 1.941660232375614e-05, + "loss": 1.4886717796325684, + "step": 234 + }, + { + "epoch": 0.43223443223443225, + "grad_norm": 0.3724687993526459, + "learning_rate": 1.9403924199820813e-05, + "loss": 1.016167163848877, + "step": 236 + }, + { + "epoch": 0.4358974358974359, + "grad_norm": 0.17974892258644104, + "learning_rate": 1.9391114510906546e-05, + "loss": 1.0665429830551147, + "step": 238 + }, + { + "epoch": 0.43956043956043955, + "grad_norm": 0.3367801010608673, + "learning_rate": 1.937817345755138e-05, + "loss": 0.8994011878967285, + "step": 240 + }, + { + "epoch": 0.4432234432234432, + "grad_norm": 0.4172503650188446, + "learning_rate": 1.9365101242349883e-05, + "loss": 0.8775147795677185, + "step": 242 + }, + { + "epoch": 0.4468864468864469, + "grad_norm": 0.09528416395187378, + "learning_rate": 1.9351898069949985e-05, + "loss": 0.5708340406417847, + "step": 244 + }, + { + "epoch": 0.45054945054945056, + "grad_norm": 0.16236096620559692, + "learning_rate": 1.9338564147049785e-05, + "loss": 1.2500817775726318, + "step": 246 + }, + { + "epoch": 0.4542124542124542, + "grad_norm": 2.8649377822875977, + "learning_rate": 1.9325099682394296e-05, + "loss": 0.8762341141700745, + "step": 248 + }, + { + "epoch": 0.45787545787545786, + "grad_norm": 0.45987099409103394, + "learning_rate": 1.9311504886772183e-05, + "loss": 1.2730751037597656, + "step": 250 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 0.11307661235332489, + "learning_rate": 1.929777997301248e-05, + "loss": 1.1749809980392456, + "step": 252 + }, + { + "epoch": 0.4652014652014652, + "grad_norm": 0.277836412191391, + "learning_rate": 1.9283925155981228e-05, + "loss": 0.9623442888259888, + "step": 254 + }, + { + "epoch": 0.46886446886446886, + "grad_norm": 0.15862256288528442, + "learning_rate": 1.9269940652578143e-05, + "loss": 1.2541102170944214, + "step": 256 + }, + { + "epoch": 0.4725274725274725, + "grad_norm": 0.20002801716327667, + "learning_rate": 1.9255826681733194e-05, + "loss": 1.2813372611999512, + "step": 258 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.0928114503622055, + "learning_rate": 1.924158346440319e-05, + "loss": 0.7289823293685913, + "step": 260 + }, + { + "epoch": 0.47985347985347987, + "grad_norm": 0.1095692440867424, + "learning_rate": 1.9227211223568317e-05, + "loss": 1.1414886713027954, + "step": 262 + }, + { + "epoch": 0.4835164835164835, + "grad_norm": 0.13363513350486755, + "learning_rate": 1.9212710184228654e-05, + "loss": 1.2244765758514404, + "step": 264 + }, + { + "epoch": 0.48717948717948717, + "grad_norm": 0.1490332931280136, + "learning_rate": 1.9198080573400634e-05, + "loss": 1.4965099096298218, + "step": 266 + }, + { + "epoch": 0.4908424908424908, + "grad_norm": 0.43857482075691223, + "learning_rate": 1.9183322620113505e-05, + "loss": 0.7830735445022583, + "step": 268 + }, + { + "epoch": 0.4945054945054945, + "grad_norm": 0.3251877427101135, + "learning_rate": 1.916843655540574e-05, + "loss": 1.1967202425003052, + "step": 270 + }, + { + "epoch": 0.4981684981684982, + "grad_norm": 0.27459704875946045, + "learning_rate": 1.915342261232142e-05, + "loss": 0.8744536638259888, + "step": 272 + }, + { + "epoch": 0.5018315018315018, + "grad_norm": 0.21172094345092773, + "learning_rate": 1.913828102590659e-05, + "loss": 1.248273253440857, + "step": 274 + }, + { + "epoch": 0.5054945054945055, + "grad_norm": 0.3026851713657379, + "learning_rate": 1.9123012033205564e-05, + "loss": 0.8064572215080261, + "step": 276 + }, + { + "epoch": 0.5091575091575091, + "grad_norm": 0.0853167474269867, + "learning_rate": 1.9107615873257234e-05, + "loss": 0.8765072226524353, + "step": 278 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.3027302026748657, + "learning_rate": 1.909209278709131e-05, + "loss": 1.2487154006958008, + "step": 280 + }, + { + "epoch": 0.5164835164835165, + "grad_norm": 0.40836283564567566, + "learning_rate": 1.9076443017724568e-05, + "loss": 1.2448886632919312, + "step": 282 + }, + { + "epoch": 0.5201465201465202, + "grad_norm": 0.2344697266817093, + "learning_rate": 1.9060666810157025e-05, + "loss": 1.2436648607254028, + "step": 284 + }, + { + "epoch": 0.5238095238095238, + "grad_norm": 1.9295133352279663, + "learning_rate": 1.9044764411368106e-05, + "loss": 1.0280476808547974, + "step": 286 + }, + { + "epoch": 0.5274725274725275, + "grad_norm": 0.2943372130393982, + "learning_rate": 1.9028736070312796e-05, + "loss": 1.2490639686584473, + "step": 288 + }, + { + "epoch": 0.5311355311355311, + "grad_norm": 0.20403721928596497, + "learning_rate": 1.9012582037917713e-05, + "loss": 1.2165021896362305, + "step": 290 + }, + { + "epoch": 0.5347985347985348, + "grad_norm": 0.07780837267637253, + "learning_rate": 1.8996302567077217e-05, + "loss": 0.7315054535865784, + "step": 292 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 0.02841615118086338, + "learning_rate": 1.897989791264941e-05, + "loss": 0.9443866610527039, + "step": 294 + }, + { + "epoch": 0.5421245421245421, + "grad_norm": 1.233228087425232, + "learning_rate": 1.8963368331452172e-05, + "loss": 1.0225800275802612, + "step": 296 + }, + { + "epoch": 0.5457875457875457, + "grad_norm": 0.4084673523902893, + "learning_rate": 1.8946714082259145e-05, + "loss": 1.2971231937408447, + "step": 298 + }, + { + "epoch": 0.5494505494505495, + "grad_norm": 0.10006222873926163, + "learning_rate": 1.8929935425795655e-05, + "loss": 1.1916959285736084, + "step": 300 + }, + { + "epoch": 0.5531135531135531, + "grad_norm": 0.09651292860507965, + "learning_rate": 1.8913032624734657e-05, + "loss": 1.1871459484100342, + "step": 302 + }, + { + "epoch": 0.5567765567765568, + "grad_norm": 1.0658628940582275, + "learning_rate": 1.8896005943692614e-05, + "loss": 0.9745575189590454, + "step": 304 + }, + { + "epoch": 0.5604395604395604, + "grad_norm": 1.3608713150024414, + "learning_rate": 1.8878855649225346e-05, + "loss": 0.9455310106277466, + "step": 306 + }, + { + "epoch": 0.5641025641025641, + "grad_norm": 0.2951708436012268, + "learning_rate": 1.8861582009823868e-05, + "loss": 1.4047762155532837, + "step": 308 + }, + { + "epoch": 0.5677655677655677, + "grad_norm": 0.21646811068058014, + "learning_rate": 1.884418529591018e-05, + "loss": 0.9867649078369141, + "step": 310 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.16589109599590302, + "learning_rate": 1.882666577983304e-05, + "loss": 1.2163362503051758, + "step": 312 + }, + { + "epoch": 0.575091575091575, + "grad_norm": 0.13253678381443024, + "learning_rate": 1.8809023735863693e-05, + "loss": 1.1416099071502686, + "step": 314 + }, + { + "epoch": 0.5787545787545788, + "grad_norm": 0.34199172258377075, + "learning_rate": 1.879125944019158e-05, + "loss": 1.2828210592269897, + "step": 316 + }, + { + "epoch": 0.5824175824175825, + "grad_norm": 0.16684100031852722, + "learning_rate": 1.8773373170920022e-05, + "loss": 1.1197882890701294, + "step": 318 + }, + { + "epoch": 0.5860805860805861, + "grad_norm": 0.1224118322134018, + "learning_rate": 1.875536520806185e-05, + "loss": 1.3375335931777954, + "step": 320 + }, + { + "epoch": 0.5897435897435898, + "grad_norm": 0.20716524124145508, + "learning_rate": 1.8737235833535033e-05, + "loss": 1.5252546072006226, + "step": 322 + }, + { + "epoch": 0.5934065934065934, + "grad_norm": 0.15489473938941956, + "learning_rate": 1.871898533115827e-05, + "loss": 1.2672544717788696, + "step": 324 + }, + { + "epoch": 0.5970695970695971, + "grad_norm": 0.10702221095561981, + "learning_rate": 1.870061398664653e-05, + "loss": 1.359837293624878, + "step": 326 + }, + { + "epoch": 0.6007326007326007, + "grad_norm": 0.22020572423934937, + "learning_rate": 1.868212208760658e-05, + "loss": 1.2261296510696411, + "step": 328 + }, + { + "epoch": 0.6043956043956044, + "grad_norm": 0.2609195113182068, + "learning_rate": 1.8663509923532514e-05, + "loss": 1.1154756546020508, + "step": 330 + }, + { + "epoch": 0.608058608058608, + "grad_norm": 0.37561291456222534, + "learning_rate": 1.8644777785801175e-05, + "loss": 1.1825737953186035, + "step": 332 + }, + { + "epoch": 0.6117216117216118, + "grad_norm": 0.32921549677848816, + "learning_rate": 1.862592596766763e-05, + "loss": 1.2822954654693604, + "step": 334 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.40565115213394165, + "learning_rate": 1.8606954764260556e-05, + "loss": 0.8941524028778076, + "step": 336 + }, + { + "epoch": 0.6190476190476191, + "grad_norm": 0.12613314390182495, + "learning_rate": 1.8587864472577632e-05, + "loss": 1.2352339029312134, + "step": 338 + }, + { + "epoch": 0.6227106227106227, + "grad_norm": 0.38321077823638916, + "learning_rate": 1.8568655391480882e-05, + "loss": 1.2283233404159546, + "step": 340 + }, + { + "epoch": 0.6263736263736264, + "grad_norm": 0.17666201293468475, + "learning_rate": 1.8549327821692008e-05, + "loss": 0.5744314193725586, + "step": 342 + }, + { + "epoch": 0.63003663003663, + "grad_norm": 0.18057258427143097, + "learning_rate": 1.852988206578767e-05, + "loss": 1.4381451606750488, + "step": 344 + }, + { + "epoch": 0.6336996336996337, + "grad_norm": 0.29170721769332886, + "learning_rate": 1.851031842819475e-05, + "loss": 0.6940987706184387, + "step": 346 + }, + { + "epoch": 0.6373626373626373, + "grad_norm": 0.3080287277698517, + "learning_rate": 1.849063721518559e-05, + "loss": 1.1522477865219116, + "step": 348 + }, + { + "epoch": 0.6410256410256411, + "grad_norm": 0.5422199964523315, + "learning_rate": 1.8470838734873205e-05, + "loss": 0.8789457082748413, + "step": 350 + }, + { + "epoch": 0.6446886446886447, + "grad_norm": 0.3600572347640991, + "learning_rate": 1.8450923297206446e-05, + "loss": 0.921578049659729, + "step": 352 + }, + { + "epoch": 0.6483516483516484, + "grad_norm": 0.2704165279865265, + "learning_rate": 1.8430891213965146e-05, + "loss": 0.9506340026855469, + "step": 354 + }, + { + "epoch": 0.652014652014652, + "grad_norm": 0.193269282579422, + "learning_rate": 1.8410742798755255e-05, + "loss": 1.1715575456619263, + "step": 356 + }, + { + "epoch": 0.6556776556776557, + "grad_norm": 0.27199605107307434, + "learning_rate": 1.8390478367003922e-05, + "loss": 1.151785135269165, + "step": 358 + }, + { + "epoch": 0.6593406593406593, + "grad_norm": 0.4195846915245056, + "learning_rate": 1.8370098235954553e-05, + "loss": 0.6752058267593384, + "step": 360 + }, + { + "epoch": 0.663003663003663, + "grad_norm": 0.23578767478466034, + "learning_rate": 1.834960272466184e-05, + "loss": 0.9451608657836914, + "step": 362 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.24091851711273193, + "learning_rate": 1.832899215398679e-05, + "loss": 0.9306972026824951, + "step": 364 + }, + { + "epoch": 0.6703296703296703, + "grad_norm": 0.2906991243362427, + "learning_rate": 1.8308266846591673e-05, + "loss": 1.1896872520446777, + "step": 366 + }, + { + "epoch": 0.673992673992674, + "grad_norm": 0.8038737177848816, + "learning_rate": 1.828742712693499e-05, + "loss": 1.0567468404769897, + "step": 368 + }, + { + "epoch": 0.6776556776556777, + "grad_norm": 0.3876494765281677, + "learning_rate": 1.8266473321266385e-05, + "loss": 1.0929261445999146, + "step": 370 + }, + { + "epoch": 0.6813186813186813, + "grad_norm": 0.34046557545661926, + "learning_rate": 1.824540575762154e-05, + "loss": 1.1846730709075928, + "step": 372 + }, + { + "epoch": 0.684981684981685, + "grad_norm": 0.17690998315811157, + "learning_rate": 1.8224224765817033e-05, + "loss": 1.2002733945846558, + "step": 374 + }, + { + "epoch": 0.6886446886446886, + "grad_norm": 0.9796584844589233, + "learning_rate": 1.820293067744519e-05, + "loss": 0.8656985759735107, + "step": 376 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.1290479600429535, + "learning_rate": 1.8181523825868882e-05, + "loss": 0.8376519680023193, + "step": 378 + }, + { + "epoch": 0.6959706959706959, + "grad_norm": 0.5467531085014343, + "learning_rate": 1.816000454621631e-05, + "loss": 1.0472257137298584, + "step": 380 + }, + { + "epoch": 0.6996336996336996, + "grad_norm": 0.1477574110031128, + "learning_rate": 1.8138373175375744e-05, + "loss": 0.9799332022666931, + "step": 382 + }, + { + "epoch": 0.7032967032967034, + "grad_norm": 0.2413100004196167, + "learning_rate": 1.8116630051990283e-05, + "loss": 1.1842981576919556, + "step": 384 + }, + { + "epoch": 0.706959706959707, + "grad_norm": 0.20904967188835144, + "learning_rate": 1.8094775516452522e-05, + "loss": 1.0900828838348389, + "step": 386 + }, + { + "epoch": 0.7106227106227107, + "grad_norm": 0.13387838006019592, + "learning_rate": 1.807280991089923e-05, + "loss": 0.8862177729606628, + "step": 388 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.2151884138584137, + "learning_rate": 1.8050733579206005e-05, + "loss": 1.094993233680725, + "step": 390 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 0.12963874638080597, + "learning_rate": 1.8028546866981875e-05, + "loss": 1.1782118082046509, + "step": 392 + }, + { + "epoch": 0.7216117216117216, + "grad_norm": 0.2768458127975464, + "learning_rate": 1.8006250121563903e-05, + "loss": 1.118064284324646, + "step": 394 + }, + { + "epoch": 0.7252747252747253, + "grad_norm": 0.1374271959066391, + "learning_rate": 1.798384369201174e-05, + "loss": 1.2445368766784668, + "step": 396 + }, + { + "epoch": 0.7289377289377289, + "grad_norm": 0.1631670594215393, + "learning_rate": 1.796132792910216e-05, + "loss": 0.921625554561615, + "step": 398 + }, + { + "epoch": 0.7326007326007326, + "grad_norm": 0.13680344820022583, + "learning_rate": 1.7938703185323575e-05, + "loss": 0.8475565910339355, + "step": 400 + }, + { + "epoch": 0.7362637362637363, + "grad_norm": 0.17989139258861542, + "learning_rate": 1.7915969814870508e-05, + "loss": 1.2503498792648315, + "step": 402 + }, + { + "epoch": 0.73992673992674, + "grad_norm": 0.10058735311031342, + "learning_rate": 1.789312817363805e-05, + "loss": 0.8367084264755249, + "step": 404 + }, + { + "epoch": 0.7435897435897436, + "grad_norm": 0.1782654970884323, + "learning_rate": 1.7870178619216304e-05, + "loss": 1.018811821937561, + "step": 406 + }, + { + "epoch": 0.7472527472527473, + "grad_norm": 0.18461261689662933, + "learning_rate": 1.784712151088476e-05, + "loss": 1.0193127393722534, + "step": 408 + }, + { + "epoch": 0.7509157509157509, + "grad_norm": 0.48014354705810547, + "learning_rate": 1.782395720960669e-05, + "loss": 0.8782365322113037, + "step": 410 + }, + { + "epoch": 0.7545787545787546, + "grad_norm": 0.20913685858249664, + "learning_rate": 1.780068607802349e-05, + "loss": 1.1719920635223389, + "step": 412 + }, + { + "epoch": 0.7582417582417582, + "grad_norm": 0.18413150310516357, + "learning_rate": 1.7777308480449006e-05, + "loss": 0.9983189105987549, + "step": 414 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.13885222375392914, + "learning_rate": 1.7753824782863827e-05, + "loss": 1.2759360074996948, + "step": 416 + }, + { + "epoch": 0.7655677655677655, + "grad_norm": 0.05709720030426979, + "learning_rate": 1.773023535290956e-05, + "loss": 0.6787292957305908, + "step": 418 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.18361720442771912, + "learning_rate": 1.7706540559883066e-05, + "loss": 1.2402327060699463, + "step": 420 + }, + { + "epoch": 0.7728937728937729, + "grad_norm": 0.1252235621213913, + "learning_rate": 1.7682740774730688e-05, + "loss": 0.9981698989868164, + "step": 422 + }, + { + "epoch": 0.7765567765567766, + "grad_norm": 0.4331834018230438, + "learning_rate": 1.7658836370042443e-05, + "loss": 0.4972750246524811, + "step": 424 + }, + { + "epoch": 0.7802197802197802, + "grad_norm": 0.24165484309196472, + "learning_rate": 1.7634827720046178e-05, + "loss": 0.7953031063079834, + "step": 426 + }, + { + "epoch": 0.7838827838827839, + "grad_norm": 0.28377044200897217, + "learning_rate": 1.7610715200601727e-05, + "loss": 1.0705211162567139, + "step": 428 + }, + { + "epoch": 0.7875457875457875, + "grad_norm": 0.14546895027160645, + "learning_rate": 1.7586499189195016e-05, + "loss": 1.2132048606872559, + "step": 430 + }, + { + "epoch": 0.7912087912087912, + "grad_norm": 0.12278863787651062, + "learning_rate": 1.7562180064932158e-05, + "loss": 1.2941319942474365, + "step": 432 + }, + { + "epoch": 0.7948717948717948, + "grad_norm": 0.3506087064743042, + "learning_rate": 1.7537758208533516e-05, + "loss": 0.8893874883651733, + "step": 434 + }, + { + "epoch": 0.7985347985347986, + "grad_norm": 0.1808759868144989, + "learning_rate": 1.7513234002327738e-05, + "loss": 0.963141918182373, + "step": 436 + }, + { + "epoch": 0.8021978021978022, + "grad_norm": 0.5765663385391235, + "learning_rate": 1.748860783024579e-05, + "loss": 0.8931108117103577, + "step": 438 + }, + { + "epoch": 0.8058608058608059, + "grad_norm": 0.16014474630355835, + "learning_rate": 1.746388007781492e-05, + "loss": 1.3087202310562134, + "step": 440 + }, + { + "epoch": 0.8095238095238095, + "grad_norm": 0.10628227144479752, + "learning_rate": 1.7439051132152644e-05, + "loss": 1.202932596206665, + "step": 442 + }, + { + "epoch": 0.8131868131868132, + "grad_norm": 0.1464785486459732, + "learning_rate": 1.741412138196067e-05, + "loss": 1.2049145698547363, + "step": 444 + }, + { + "epoch": 0.8168498168498168, + "grad_norm": 0.25733712315559387, + "learning_rate": 1.738909121751882e-05, + "loss": 1.2221276760101318, + "step": 446 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 0.35550013184547424, + "learning_rate": 1.736396103067893e-05, + "loss": 1.230087161064148, + "step": 448 + }, + { + "epoch": 0.8241758241758241, + "grad_norm": 0.18633843958377838, + "learning_rate": 1.7338731214858688e-05, + "loss": 1.3565971851348877, + "step": 450 + }, + { + "epoch": 0.8278388278388278, + "grad_norm": 0.34363844990730286, + "learning_rate": 1.7313402165035504e-05, + "loss": 0.9984432458877563, + "step": 452 + }, + { + "epoch": 0.8315018315018315, + "grad_norm": 0.05576507747173309, + "learning_rate": 1.728797427774031e-05, + "loss": 0.4852646291255951, + "step": 454 + }, + { + "epoch": 0.8351648351648352, + "grad_norm": 0.07310578972101212, + "learning_rate": 1.7262447951051366e-05, + "loss": 0.8963858485221863, + "step": 456 + }, + { + "epoch": 0.8388278388278388, + "grad_norm": 0.28023576736450195, + "learning_rate": 1.7236823584587995e-05, + "loss": 0.8434333801269531, + "step": 458 + }, + { + "epoch": 0.8424908424908425, + "grad_norm": 1.2096292972564697, + "learning_rate": 1.7211101579504382e-05, + "loss": 1.029900312423706, + "step": 460 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.3247793912887573, + "learning_rate": 1.7185282338483243e-05, + "loss": 1.2301326990127563, + "step": 462 + }, + { + "epoch": 0.8498168498168498, + "grad_norm": 0.11958979070186615, + "learning_rate": 1.7159366265729537e-05, + "loss": 1.1807194948196411, + "step": 464 + }, + { + "epoch": 0.8534798534798534, + "grad_norm": 0.04206932708621025, + "learning_rate": 1.713335376696416e-05, + "loss": 1.2045433521270752, + "step": 466 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.15847940742969513, + "learning_rate": 1.7107245249417556e-05, + "loss": 0.8860416412353516, + "step": 468 + }, + { + "epoch": 0.8608058608058609, + "grad_norm": 0.2803979516029358, + "learning_rate": 1.7081041121823375e-05, + "loss": 0.9149615168571472, + "step": 470 + }, + { + "epoch": 0.8644688644688645, + "grad_norm": 0.13727536797523499, + "learning_rate": 1.705474179441205e-05, + "loss": 1.1724745035171509, + "step": 472 + }, + { + "epoch": 0.8681318681318682, + "grad_norm": 0.09685394912958145, + "learning_rate": 1.7028347678904388e-05, + "loss": 0.863320529460907, + "step": 474 + }, + { + "epoch": 0.8717948717948718, + "grad_norm": 0.17197725176811218, + "learning_rate": 1.700185918850512e-05, + "loss": 1.096718192100525, + "step": 476 + }, + { + "epoch": 0.8754578754578755, + "grad_norm": 0.5562029480934143, + "learning_rate": 1.6975276737896443e-05, + "loss": 1.0467816591262817, + "step": 478 + }, + { + "epoch": 0.8791208791208791, + "grad_norm": 2.393152952194214, + "learning_rate": 1.69486007432315e-05, + "loss": 1.0700321197509766, + "step": 480 + }, + { + "epoch": 0.8827838827838828, + "grad_norm": 0.2319096475839615, + "learning_rate": 1.6921831622127905e-05, + "loss": 1.1908187866210938, + "step": 482 + }, + { + "epoch": 0.8864468864468864, + "grad_norm": 0.1952940672636032, + "learning_rate": 1.6894969793661163e-05, + "loss": 1.2682039737701416, + "step": 484 + }, + { + "epoch": 0.8901098901098901, + "grad_norm": 0.1970641314983368, + "learning_rate": 1.686801567835814e-05, + "loss": 0.9106331467628479, + "step": 486 + }, + { + "epoch": 0.8937728937728938, + "grad_norm": 0.3220268785953522, + "learning_rate": 1.6840969698190467e-05, + "loss": 1.1676743030548096, + "step": 488 + }, + { + "epoch": 0.8974358974358975, + "grad_norm": 0.2038663923740387, + "learning_rate": 1.6813832276567942e-05, + "loss": 1.1185270547866821, + "step": 490 + }, + { + "epoch": 0.9010989010989011, + "grad_norm": 0.17804059386253357, + "learning_rate": 1.6786603838331894e-05, + "loss": 1.0551954507827759, + "step": 492 + }, + { + "epoch": 0.9047619047619048, + "grad_norm": 0.4494330883026123, + "learning_rate": 1.6759284809748522e-05, + "loss": 0.5789248943328857, + "step": 494 + }, + { + "epoch": 0.9084249084249084, + "grad_norm": 0.37853649258613586, + "learning_rate": 1.673187561850225e-05, + "loss": 1.2827361822128296, + "step": 496 + }, + { + "epoch": 0.9120879120879121, + "grad_norm": 0.2299915850162506, + "learning_rate": 1.6704376693689003e-05, + "loss": 1.1320176124572754, + "step": 498 + }, + { + "epoch": 0.9157509157509157, + "grad_norm": 0.2586250603199005, + "learning_rate": 1.6676788465809506e-05, + "loss": 0.8153626322746277, + "step": 500 + }, + { + "epoch": 0.9194139194139194, + "grad_norm": 0.061475999653339386, + "learning_rate": 1.6649111366762552e-05, + "loss": 0.8643592596054077, + "step": 502 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.24921494722366333, + "learning_rate": 1.66213458298382e-05, + "loss": 0.9420300126075745, + "step": 504 + }, + { + "epoch": 0.9267399267399268, + "grad_norm": 0.0703081414103508, + "learning_rate": 1.659349228971105e-05, + "loss": 0.8538585305213928, + "step": 506 + }, + { + "epoch": 0.9304029304029304, + "grad_norm": 0.14408151805400848, + "learning_rate": 1.6565551182433382e-05, + "loss": 1.1675981283187866, + "step": 508 + }, + { + "epoch": 0.9340659340659341, + "grad_norm": 0.2426132708787918, + "learning_rate": 1.6537522945428386e-05, + "loss": 1.2252295017242432, + "step": 510 + }, + { + "epoch": 0.9377289377289377, + "grad_norm": 0.5424879789352417, + "learning_rate": 1.6509408017483258e-05, + "loss": 1.1778167486190796, + "step": 512 + }, + { + "epoch": 0.9413919413919414, + "grad_norm": 0.142863929271698, + "learning_rate": 1.6481206838742362e-05, + "loss": 0.9604276418685913, + "step": 514 + }, + { + "epoch": 0.945054945054945, + "grad_norm": 0.27180016040802, + "learning_rate": 1.645291985070034e-05, + "loss": 1.1854896545410156, + "step": 516 + }, + { + "epoch": 0.9487179487179487, + "grad_norm": 0.16782020032405853, + "learning_rate": 1.64245474961952e-05, + "loss": 1.2195581197738647, + "step": 518 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.15222570300102234, + "learning_rate": 1.639609021940136e-05, + "loss": 1.218988299369812, + "step": 520 + }, + { + "epoch": 0.9560439560439561, + "grad_norm": 0.2450752556324005, + "learning_rate": 1.6367548465822723e-05, + "loss": 0.8905650973320007, + "step": 522 + }, + { + "epoch": 0.9597069597069597, + "grad_norm": 0.5845417380332947, + "learning_rate": 1.6338922682285697e-05, + "loss": 1.0242419242858887, + "step": 524 + }, + { + "epoch": 0.9633699633699634, + "grad_norm": 2.221842050552368, + "learning_rate": 1.6310213316932187e-05, + "loss": 0.9667062759399414, + "step": 526 + }, + { + "epoch": 0.967032967032967, + "grad_norm": 0.15865476429462433, + "learning_rate": 1.6281420819212578e-05, + "loss": 0.6576095819473267, + "step": 528 + }, + { + "epoch": 0.9706959706959707, + "grad_norm": 0.10818128287792206, + "learning_rate": 1.6252545639878728e-05, + "loss": 0.907448947429657, + "step": 530 + }, + { + "epoch": 0.9743589743589743, + "grad_norm": 1.5423500537872314, + "learning_rate": 1.6223588230976874e-05, + "loss": 1.3604565858840942, + "step": 532 + }, + { + "epoch": 0.978021978021978, + "grad_norm": 0.38299986720085144, + "learning_rate": 1.6194549045840582e-05, + "loss": 0.604587733745575, + "step": 534 + }, + { + "epoch": 0.9816849816849816, + "grad_norm": 0.35183185338974, + "learning_rate": 1.616542853908363e-05, + "loss": 0.8585751056671143, + "step": 536 + }, + { + "epoch": 0.9853479853479854, + "grad_norm": 8.051664352416992, + "learning_rate": 1.6136227166592912e-05, + "loss": 0.8037823438644409, + "step": 538 + }, + { + "epoch": 0.989010989010989, + "grad_norm": 0.36428943276405334, + "learning_rate": 1.6106945385521286e-05, + "loss": 1.1241040229797363, + "step": 540 + }, + { + "epoch": 0.9926739926739927, + "grad_norm": 0.39597558975219727, + "learning_rate": 1.6077583654280416e-05, + "loss": 1.1745156049728394, + "step": 542 + }, + { + "epoch": 0.9963369963369964, + "grad_norm": 1.5844489336013794, + "learning_rate": 1.60481424325336e-05, + "loss": 1.186415195465088, + "step": 544 + }, + { + "epoch": 1.0, + "grad_norm": 0.17938172817230225, + "learning_rate": 1.6018622181188594e-05, + "loss": 1.3618619441986084, + "step": 546 + }, + { + "epoch": 1.0036630036630036, + "grad_norm": 0.3727249503135681, + "learning_rate": 1.598902336239035e-05, + "loss": 0.9441794157028198, + "step": 548 + }, + { + "epoch": 1.0073260073260073, + "grad_norm": 0.24385389685630798, + "learning_rate": 1.595934643951382e-05, + "loss": 1.1801525354385376, + "step": 550 + }, + { + "epoch": 1.010989010989011, + "grad_norm": 0.1038205698132515, + "learning_rate": 1.5929591877156694e-05, + "loss": 0.6706070899963379, + "step": 552 + }, + { + "epoch": 1.0146520146520146, + "grad_norm": 0.2101455181837082, + "learning_rate": 1.5899760141132115e-05, + "loss": 1.1351317167282104, + "step": 554 + }, + { + "epoch": 1.0183150183150182, + "grad_norm": 0.2257443219423294, + "learning_rate": 1.58698516984614e-05, + "loss": 1.0271389484405518, + "step": 556 + }, + { + "epoch": 1.021978021978022, + "grad_norm": 0.1515074372291565, + "learning_rate": 1.583986701736672e-05, + "loss": 1.2197285890579224, + "step": 558 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.11847762763500214, + "learning_rate": 1.5809806567263767e-05, + "loss": 0.9212762713432312, + "step": 560 + }, + { + "epoch": 1.0293040293040292, + "grad_norm": 0.40212902426719666, + "learning_rate": 1.577967081875442e-05, + "loss": 1.1707442998886108, + "step": 562 + }, + { + "epoch": 1.032967032967033, + "grad_norm": 0.2982182502746582, + "learning_rate": 1.574946024361936e-05, + "loss": 1.1547435522079468, + "step": 564 + }, + { + "epoch": 1.0366300366300367, + "grad_norm": 0.34150704741477966, + "learning_rate": 1.5719175314810706e-05, + "loss": 1.044006109237671, + "step": 566 + }, + { + "epoch": 1.0402930402930404, + "grad_norm": 0.42247244715690613, + "learning_rate": 1.568881650644458e-05, + "loss": 1.0475645065307617, + "step": 568 + }, + { + "epoch": 1.043956043956044, + "grad_norm": 0.2374022752046585, + "learning_rate": 1.565838429379371e-05, + "loss": 1.1118239164352417, + "step": 570 + }, + { + "epoch": 1.0476190476190477, + "grad_norm": 0.48579737544059753, + "learning_rate": 1.5627879153279986e-05, + "loss": 1.2503812313079834, + "step": 572 + }, + { + "epoch": 1.0512820512820513, + "grad_norm": 0.6810855269432068, + "learning_rate": 1.559730156246699e-05, + "loss": 1.0755858421325684, + "step": 574 + }, + { + "epoch": 1.054945054945055, + "grad_norm": 0.2626459002494812, + "learning_rate": 1.5566652000052533e-05, + "loss": 1.2320541143417358, + "step": 576 + }, + { + "epoch": 1.0586080586080586, + "grad_norm": 1.1261385679244995, + "learning_rate": 1.553593094586115e-05, + "loss": 1.2240521907806396, + "step": 578 + }, + { + "epoch": 1.0622710622710623, + "grad_norm": 0.27919474244117737, + "learning_rate": 1.5505138880836595e-05, + "loss": 1.2212425470352173, + "step": 580 + }, + { + "epoch": 1.065934065934066, + "grad_norm": 0.2261185646057129, + "learning_rate": 1.5474276287034305e-05, + "loss": 0.9912468194961548, + "step": 582 + }, + { + "epoch": 1.0695970695970696, + "grad_norm": 0.23183542490005493, + "learning_rate": 1.544334364761387e-05, + "loss": 1.147226333618164, + "step": 584 + }, + { + "epoch": 1.0732600732600732, + "grad_norm": 0.3248075246810913, + "learning_rate": 1.541234144683144e-05, + "loss": 1.0802392959594727, + "step": 586 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 0.23752079904079437, + "learning_rate": 1.5381270170032173e-05, + "loss": 0.821092426776886, + "step": 588 + }, + { + "epoch": 1.0805860805860805, + "grad_norm": 0.20078615844249725, + "learning_rate": 1.5350130303642625e-05, + "loss": 1.2033530473709106, + "step": 590 + }, + { + "epoch": 1.0842490842490842, + "grad_norm": 0.13361191749572754, + "learning_rate": 1.5318922335163128e-05, + "loss": 1.0244792699813843, + "step": 592 + }, + { + "epoch": 1.0879120879120878, + "grad_norm": 0.21560516953468323, + "learning_rate": 1.5287646753160174e-05, + "loss": 0.9856408834457397, + "step": 594 + }, + { + "epoch": 1.0915750915750915, + "grad_norm": 0.1309068351984024, + "learning_rate": 1.5256304047258739e-05, + "loss": 1.0107301473617554, + "step": 596 + }, + { + "epoch": 1.0952380952380953, + "grad_norm": 0.40338659286499023, + "learning_rate": 1.522489470813466e-05, + "loss": 1.0993225574493408, + "step": 598 + }, + { + "epoch": 1.098901098901099, + "grad_norm": 0.3209727108478546, + "learning_rate": 1.5193419227506913e-05, + "loss": 1.1531182527542114, + "step": 600 + }, + { + "epoch": 1.1025641025641026, + "grad_norm": 0.1865486353635788, + "learning_rate": 1.5161878098129937e-05, + "loss": 0.8158029317855835, + "step": 602 + }, + { + "epoch": 1.1062271062271063, + "grad_norm": 0.3306953012943268, + "learning_rate": 1.5130271813785908e-05, + "loss": 0.9586283564567566, + "step": 604 + }, + { + "epoch": 1.10989010989011, + "grad_norm": 0.1971583068370819, + "learning_rate": 1.509860086927703e-05, + "loss": 0.8959888815879822, + "step": 606 + }, + { + "epoch": 1.1135531135531136, + "grad_norm": 0.34381699562072754, + "learning_rate": 1.5066865760417757e-05, + "loss": 1.2862759828567505, + "step": 608 + }, + { + "epoch": 1.1172161172161172, + "grad_norm": 0.27361565828323364, + "learning_rate": 1.5035066984027053e-05, + "loss": 0.7218859195709229, + "step": 610 + }, + { + "epoch": 1.120879120879121, + "grad_norm": 0.3765149414539337, + "learning_rate": 1.5003205037920616e-05, + "loss": 1.1658059358596802, + "step": 612 + }, + { + "epoch": 1.1245421245421245, + "grad_norm": 0.21422307193279266, + "learning_rate": 1.497128042090307e-05, + "loss": 0.9905625581741333, + "step": 614 + }, + { + "epoch": 1.1282051282051282, + "grad_norm": 0.1535208374261856, + "learning_rate": 1.493929363276017e-05, + "loss": 1.1560765504837036, + "step": 616 + }, + { + "epoch": 1.1318681318681318, + "grad_norm": 0.22722600400447845, + "learning_rate": 1.4907245174250957e-05, + "loss": 0.41150620579719543, + "step": 618 + }, + { + "epoch": 1.1355311355311355, + "grad_norm": 0.1882268637418747, + "learning_rate": 1.4875135547099953e-05, + "loss": 1.0893880128860474, + "step": 620 + }, + { + "epoch": 1.1391941391941391, + "grad_norm": 1.5121841430664062, + "learning_rate": 1.484296525398927e-05, + "loss": 0.7574386596679688, + "step": 622 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.581392228603363, + "learning_rate": 1.4810734798550769e-05, + "loss": 1.0913819074630737, + "step": 624 + }, + { + "epoch": 1.1465201465201464, + "grad_norm": 0.2060534954071045, + "learning_rate": 1.4778444685358147e-05, + "loss": 1.3770023584365845, + "step": 626 + }, + { + "epoch": 1.15018315018315, + "grad_norm": 0.27674826979637146, + "learning_rate": 1.4746095419919075e-05, + "loss": 0.4543880820274353, + "step": 628 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 0.060255009680986404, + "learning_rate": 1.4713687508667251e-05, + "loss": 1.114593744277954, + "step": 630 + }, + { + "epoch": 1.1575091575091574, + "grad_norm": 1.074992060661316, + "learning_rate": 1.4681221458954484e-05, + "loss": 1.0868229866027832, + "step": 632 + }, + { + "epoch": 1.1611721611721613, + "grad_norm": 0.42347779870033264, + "learning_rate": 1.4648697779042754e-05, + "loss": 0.8624401092529297, + "step": 634 + }, + { + "epoch": 1.164835164835165, + "grad_norm": 0.3118894398212433, + "learning_rate": 1.461611697809625e-05, + "loss": 0.9895141124725342, + "step": 636 + }, + { + "epoch": 1.1684981684981686, + "grad_norm": 0.22496363520622253, + "learning_rate": 1.4583479566173401e-05, + "loss": 1.1844947338104248, + "step": 638 + }, + { + "epoch": 1.1721611721611722, + "grad_norm": 0.07173963636159897, + "learning_rate": 1.4550786054218902e-05, + "loss": 0.7458541989326477, + "step": 640 + }, + { + "epoch": 1.1758241758241759, + "grad_norm": 0.26074904203414917, + "learning_rate": 1.4518036954055685e-05, + "loss": 1.1217985153198242, + "step": 642 + }, + { + "epoch": 1.1794871794871795, + "grad_norm": 0.49440035223960876, + "learning_rate": 1.4485232778376945e-05, + "loss": 1.041925311088562, + "step": 644 + }, + { + "epoch": 1.1831501831501832, + "grad_norm": 0.214096337556839, + "learning_rate": 1.4452374040738078e-05, + "loss": 0.7592092156410217, + "step": 646 + }, + { + "epoch": 1.1868131868131868, + "grad_norm": 1.0578991174697876, + "learning_rate": 1.4419461255548666e-05, + "loss": 0.8962647914886475, + "step": 648 + }, + { + "epoch": 1.1904761904761905, + "grad_norm": 0.26899221539497375, + "learning_rate": 1.4386494938064417e-05, + "loss": 1.141674280166626, + "step": 650 + }, + { + "epoch": 1.1941391941391941, + "grad_norm": 0.21340245008468628, + "learning_rate": 1.4353475604379093e-05, + "loss": 0.6671714186668396, + "step": 652 + }, + { + "epoch": 1.1978021978021978, + "grad_norm": 0.21647410094738007, + "learning_rate": 1.4320403771416438e-05, + "loss": 1.2914996147155762, + "step": 654 + }, + { + "epoch": 1.2014652014652014, + "grad_norm": 0.2455645203590393, + "learning_rate": 1.4287279956922076e-05, + "loss": 0.8283839225769043, + "step": 656 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 1.0074414014816284, + "learning_rate": 1.4254104679455416e-05, + "loss": 0.808253824710846, + "step": 658 + }, + { + "epoch": 1.2087912087912087, + "grad_norm": 0.19323746860027313, + "learning_rate": 1.4220878458381523e-05, + "loss": 1.1553109884262085, + "step": 660 + }, + { + "epoch": 1.2124542124542124, + "grad_norm": 0.48162999749183655, + "learning_rate": 1.418760181386301e-05, + "loss": 1.0250654220581055, + "step": 662 + }, + { + "epoch": 1.2161172161172162, + "grad_norm": 0.23541465401649475, + "learning_rate": 1.4154275266851856e-05, + "loss": 0.8853683471679688, + "step": 664 + }, + { + "epoch": 1.2197802197802199, + "grad_norm": 0.5028203129768372, + "learning_rate": 1.4120899339081291e-05, + "loss": 0.8249969482421875, + "step": 666 + }, + { + "epoch": 1.2234432234432235, + "grad_norm": 0.2646612823009491, + "learning_rate": 1.4087474553057599e-05, + "loss": 0.9055181741714478, + "step": 668 + }, + { + "epoch": 1.2271062271062272, + "grad_norm": 0.2554120123386383, + "learning_rate": 1.405400143205195e-05, + "loss": 0.6745082139968872, + "step": 670 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 0.25614455342292786, + "learning_rate": 1.4020480500092217e-05, + "loss": 1.1828240156173706, + "step": 672 + }, + { + "epoch": 1.2344322344322345, + "grad_norm": 0.5676877498626709, + "learning_rate": 1.3986912281954745e-05, + "loss": 1.1596636772155762, + "step": 674 + }, + { + "epoch": 1.2380952380952381, + "grad_norm": 0.25206154584884644, + "learning_rate": 1.3953297303156174e-05, + "loss": 1.1791561841964722, + "step": 676 + }, + { + "epoch": 1.2417582417582418, + "grad_norm": 0.3657374382019043, + "learning_rate": 1.391963608994517e-05, + "loss": 0.7706201076507568, + "step": 678 + }, + { + "epoch": 1.2454212454212454, + "grad_norm": 0.9007764458656311, + "learning_rate": 1.3885929169294218e-05, + "loss": 0.8264885544776917, + "step": 680 + }, + { + "epoch": 1.249084249084249, + "grad_norm": 0.6578343510627747, + "learning_rate": 1.3852177068891364e-05, + "loss": 1.192352294921875, + "step": 682 + }, + { + "epoch": 1.2527472527472527, + "grad_norm": 0.17964546382427216, + "learning_rate": 1.3818380317131946e-05, + "loss": 1.1608870029449463, + "step": 684 + }, + { + "epoch": 1.2564102564102564, + "grad_norm": 0.21571148931980133, + "learning_rate": 1.3784539443110323e-05, + "loss": 0.8176043629646301, + "step": 686 + }, + { + "epoch": 1.26007326007326, + "grad_norm": 0.2216089963912964, + "learning_rate": 1.375065497661161e-05, + "loss": 1.1090242862701416, + "step": 688 + }, + { + "epoch": 1.2637362637362637, + "grad_norm": 0.2962993085384369, + "learning_rate": 1.3716727448103356e-05, + "loss": 1.1419543027877808, + "step": 690 + }, + { + "epoch": 1.2673992673992673, + "grad_norm": 0.1734267771244049, + "learning_rate": 1.3682757388727261e-05, + "loss": 1.1804542541503906, + "step": 692 + }, + { + "epoch": 1.271062271062271, + "grad_norm": 0.4003927707672119, + "learning_rate": 1.3648745330290848e-05, + "loss": 0.9813081622123718, + "step": 694 + }, + { + "epoch": 1.2747252747252746, + "grad_norm": 0.7455025315284729, + "learning_rate": 1.361469180525916e-05, + "loss": 1.0020716190338135, + "step": 696 + }, + { + "epoch": 1.2783882783882783, + "grad_norm": 0.23015426099300385, + "learning_rate": 1.358059734674638e-05, + "loss": 0.9081999063491821, + "step": 698 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.2477727234363556, + "learning_rate": 1.3546462488507532e-05, + "loss": 0.6512075662612915, + "step": 700 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.2636442184448242, + "learning_rate": 1.3512287764930102e-05, + "loss": 0.5245524644851685, + "step": 702 + }, + { + "epoch": 1.2893772893772895, + "grad_norm": 0.3451531231403351, + "learning_rate": 1.347807371102567e-05, + "loss": 1.2959914207458496, + "step": 704 + }, + { + "epoch": 1.293040293040293, + "grad_norm": 0.19401608407497406, + "learning_rate": 1.3443820862421542e-05, + "loss": 0.9994240403175354, + "step": 706 + }, + { + "epoch": 1.2967032967032968, + "grad_norm": 0.09005500376224518, + "learning_rate": 1.3409529755352361e-05, + "loss": 0.955507755279541, + "step": 708 + }, + { + "epoch": 1.3003663003663004, + "grad_norm": 0.14722907543182373, + "learning_rate": 1.3375200926651719e-05, + "loss": 0.5620253682136536, + "step": 710 + }, + { + "epoch": 1.304029304029304, + "grad_norm": 1.1694523096084595, + "learning_rate": 1.3340834913743742e-05, + "loss": 0.8808104991912842, + "step": 712 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 0.21442115306854248, + "learning_rate": 1.3306432254634676e-05, + "loss": 1.2082892656326294, + "step": 714 + }, + { + "epoch": 1.3113553113553114, + "grad_norm": 0.3201224207878113, + "learning_rate": 1.3271993487904485e-05, + "loss": 1.1394985914230347, + "step": 716 + }, + { + "epoch": 1.315018315018315, + "grad_norm": 0.18615205585956573, + "learning_rate": 1.3237519152698392e-05, + "loss": 0.9257374405860901, + "step": 718 + }, + { + "epoch": 1.3186813186813187, + "grad_norm": 0.23433499038219452, + "learning_rate": 1.3203009788718454e-05, + "loss": 0.92364901304245, + "step": 720 + }, + { + "epoch": 1.3223443223443223, + "grad_norm": 0.5179888606071472, + "learning_rate": 1.3168465936215114e-05, + "loss": 0.9131177067756653, + "step": 722 + }, + { + "epoch": 1.326007326007326, + "grad_norm": 0.27583321928977966, + "learning_rate": 1.3133888135978733e-05, + "loss": 1.2074042558670044, + "step": 724 + }, + { + "epoch": 1.3296703296703296, + "grad_norm": 0.43936124444007874, + "learning_rate": 1.3099276929331132e-05, + "loss": 1.0659313201904297, + "step": 726 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.1440553665161133, + "learning_rate": 1.3064632858117123e-05, + "loss": 1.1416211128234863, + "step": 728 + }, + { + "epoch": 1.3369963369963371, + "grad_norm": 0.4313775599002838, + "learning_rate": 1.3029956464696006e-05, + "loss": 0.7388544082641602, + "step": 730 + }, + { + "epoch": 1.3406593406593408, + "grad_norm": 0.38146156072616577, + "learning_rate": 1.2995248291933099e-05, + "loss": 0.8170838356018066, + "step": 732 + }, + { + "epoch": 1.3443223443223444, + "grad_norm": 0.32572418451309204, + "learning_rate": 1.296050888319123e-05, + "loss": 0.6172389984130859, + "step": 734 + }, + { + "epoch": 1.347985347985348, + "grad_norm": 0.29194140434265137, + "learning_rate": 1.2925738782322232e-05, + "loss": 1.132319450378418, + "step": 736 + }, + { + "epoch": 1.3516483516483517, + "grad_norm": 0.29850953817367554, + "learning_rate": 1.2890938533658429e-05, + "loss": 0.7506582736968994, + "step": 738 + }, + { + "epoch": 1.3553113553113554, + "grad_norm": 0.275314599275589, + "learning_rate": 1.2856108682004116e-05, + "loss": 1.0084264278411865, + "step": 740 + }, + { + "epoch": 1.358974358974359, + "grad_norm": 1.8079997301101685, + "learning_rate": 1.282124977262702e-05, + "loss": 0.6014983057975769, + "step": 742 + }, + { + "epoch": 1.3626373626373627, + "grad_norm": 0.39328068494796753, + "learning_rate": 1.2786362351249785e-05, + "loss": 1.23367178440094, + "step": 744 + }, + { + "epoch": 1.3663003663003663, + "grad_norm": 0.19623665511608124, + "learning_rate": 1.2751446964041405e-05, + "loss": 1.0585216283798218, + "step": 746 + }, + { + "epoch": 1.36996336996337, + "grad_norm": 0.2618620991706848, + "learning_rate": 1.2716504157608693e-05, + "loss": 1.06695556640625, + "step": 748 + }, + { + "epoch": 1.3736263736263736, + "grad_norm": 0.2631770968437195, + "learning_rate": 1.2681534478987703e-05, + "loss": 0.7815660834312439, + "step": 750 + }, + { + "epoch": 1.3772893772893773, + "grad_norm": 0.25882411003112793, + "learning_rate": 1.264653847563519e-05, + "loss": 1.120620608329773, + "step": 752 + }, + { + "epoch": 1.380952380952381, + "grad_norm": 0.09388554096221924, + "learning_rate": 1.2611516695420023e-05, + "loss": 0.9709092974662781, + "step": 754 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 0.1428227424621582, + "learning_rate": 1.2576469686614608e-05, + "loss": 1.3016420602798462, + "step": 756 + }, + { + "epoch": 1.3882783882783882, + "grad_norm": 0.22650009393692017, + "learning_rate": 1.2541397997886317e-05, + "loss": 1.2032549381256104, + "step": 758 + }, + { + "epoch": 1.3919413919413919, + "grad_norm": 2.37180233001709, + "learning_rate": 1.2506302178288887e-05, + "loss": 1.1462368965148926, + "step": 760 + }, + { + "epoch": 1.3956043956043955, + "grad_norm": 0.25279995799064636, + "learning_rate": 1.2471182777253832e-05, + "loss": 1.1458882093429565, + "step": 762 + }, + { + "epoch": 1.3992673992673992, + "grad_norm": 1.0492079257965088, + "learning_rate": 1.2436040344581824e-05, + "loss": 0.6942178606987, + "step": 764 + }, + { + "epoch": 1.4029304029304028, + "grad_norm": 0.27227652072906494, + "learning_rate": 1.2400875430434119e-05, + "loss": 0.8875712752342224, + "step": 766 + }, + { + "epoch": 1.4065934065934065, + "grad_norm": 0.7895340919494629, + "learning_rate": 1.236568858532391e-05, + "loss": 0.8964008688926697, + "step": 768 + }, + { + "epoch": 1.4102564102564101, + "grad_norm": 0.17773286998271942, + "learning_rate": 1.2330480360107728e-05, + "loss": 1.1805744171142578, + "step": 770 + }, + { + "epoch": 1.4139194139194138, + "grad_norm": 0.23372013866901398, + "learning_rate": 1.2295251305976818e-05, + "loss": 1.2107068300247192, + "step": 772 + }, + { + "epoch": 1.4175824175824177, + "grad_norm": 1.012639045715332, + "learning_rate": 1.2260001974448504e-05, + "loss": 1.019040584564209, + "step": 774 + }, + { + "epoch": 1.4212454212454213, + "grad_norm": 0.23378640413284302, + "learning_rate": 1.222473291735754e-05, + "loss": 1.1538594961166382, + "step": 776 + }, + { + "epoch": 1.424908424908425, + "grad_norm": 0.30686837434768677, + "learning_rate": 1.218944468684752e-05, + "loss": 0.8257187604904175, + "step": 778 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.1656091809272766, + "learning_rate": 1.215413783536217e-05, + "loss": 1.3644243478775024, + "step": 780 + }, + { + "epoch": 1.4322344322344323, + "grad_norm": 0.3846034109592438, + "learning_rate": 1.2118812915636744e-05, + "loss": 1.2287310361862183, + "step": 782 + }, + { + "epoch": 1.435897435897436, + "grad_norm": 0.1750563234090805, + "learning_rate": 1.2083470480689363e-05, + "loss": 1.1567542552947998, + "step": 784 + }, + { + "epoch": 1.4395604395604396, + "grad_norm": 3.485445499420166, + "learning_rate": 1.2048111083812342e-05, + "loss": 0.9774308800697327, + "step": 786 + }, + { + "epoch": 1.4432234432234432, + "grad_norm": 0.34514904022216797, + "learning_rate": 1.2012735278563546e-05, + "loss": 1.1295884847640991, + "step": 788 + }, + { + "epoch": 1.4468864468864469, + "grad_norm": 0.12276670336723328, + "learning_rate": 1.1977343618757702e-05, + "loss": 0.7207637429237366, + "step": 790 + }, + { + "epoch": 1.4505494505494505, + "grad_norm": 0.1892281472682953, + "learning_rate": 1.1941936658457769e-05, + "loss": 0.9219919443130493, + "step": 792 + }, + { + "epoch": 1.4542124542124542, + "grad_norm": 0.21250323951244354, + "learning_rate": 1.1906514951966208e-05, + "loss": 0.8157789707183838, + "step": 794 + }, + { + "epoch": 1.4578754578754578, + "grad_norm": 0.16933120787143707, + "learning_rate": 1.1871079053816357e-05, + "loss": 1.1426329612731934, + "step": 796 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 0.4070507884025574, + "learning_rate": 1.1835629518763714e-05, + "loss": 0.7938690781593323, + "step": 798 + }, + { + "epoch": 1.4652014652014653, + "grad_norm": 0.47800537943840027, + "learning_rate": 1.1800166901777272e-05, + "loss": 1.024507999420166, + "step": 800 + }, + { + "epoch": 1.468864468864469, + "grad_norm": 0.3107675313949585, + "learning_rate": 1.1764691758030825e-05, + "loss": 1.5597076416015625, + "step": 802 + }, + { + "epoch": 1.4725274725274726, + "grad_norm": 1.4179226160049438, + "learning_rate": 1.1729204642894265e-05, + "loss": 1.0233888626098633, + "step": 804 + }, + { + "epoch": 1.4761904761904763, + "grad_norm": 0.335848331451416, + "learning_rate": 1.1693706111924912e-05, + "loss": 1.1873747110366821, + "step": 806 + }, + { + "epoch": 1.47985347985348, + "grad_norm": 0.2710278630256653, + "learning_rate": 1.1658196720858794e-05, + "loss": 1.1727930307388306, + "step": 808 + }, + { + "epoch": 1.4835164835164836, + "grad_norm": 0.37410542368888855, + "learning_rate": 1.1622677025601966e-05, + "loss": 1.0092246532440186, + "step": 810 + }, + { + "epoch": 1.4871794871794872, + "grad_norm": 0.20915468037128448, + "learning_rate": 1.1587147582221776e-05, + "loss": 0.8401330709457397, + "step": 812 + }, + { + "epoch": 1.4908424908424909, + "grad_norm": 0.6524186134338379, + "learning_rate": 1.1551608946938208e-05, + "loss": 1.2012218236923218, + "step": 814 + }, + { + "epoch": 1.4945054945054945, + "grad_norm": 1.3064533472061157, + "learning_rate": 1.1516061676115124e-05, + "loss": 1.137012004852295, + "step": 816 + }, + { + "epoch": 1.4981684981684982, + "grad_norm": 0.08933006227016449, + "learning_rate": 1.1480506326251595e-05, + "loss": 0.4151400327682495, + "step": 818 + }, + { + "epoch": 1.5018315018315018, + "grad_norm": 0.3812463879585266, + "learning_rate": 1.1444943453973155e-05, + "loss": 1.1498603820800781, + "step": 820 + }, + { + "epoch": 1.5054945054945055, + "grad_norm": 0.16644573211669922, + "learning_rate": 1.1409373616023111e-05, + "loss": 0.9069132804870605, + "step": 822 + }, + { + "epoch": 1.5091575091575091, + "grad_norm": 0.9926286935806274, + "learning_rate": 1.1373797369253818e-05, + "loss": 0.828985869884491, + "step": 824 + }, + { + "epoch": 1.5128205128205128, + "grad_norm": 0.27975189685821533, + "learning_rate": 1.1338215270617967e-05, + "loss": 1.0592471361160278, + "step": 826 + }, + { + "epoch": 1.5164835164835164, + "grad_norm": 1.065266728401184, + "learning_rate": 1.130262787715985e-05, + "loss": 1.0117770433425903, + "step": 828 + }, + { + "epoch": 1.52014652014652, + "grad_norm": 0.755531907081604, + "learning_rate": 1.1267035746006658e-05, + "loss": 1.012010931968689, + "step": 830 + }, + { + "epoch": 1.5238095238095237, + "grad_norm": 0.2652962803840637, + "learning_rate": 1.1231439434359755e-05, + "loss": 1.2559067010879517, + "step": 832 + }, + { + "epoch": 1.5274725274725274, + "grad_norm": 1.055659294128418, + "learning_rate": 1.119583949948594e-05, + "loss": 0.8118237853050232, + "step": 834 + }, + { + "epoch": 1.531135531135531, + "grad_norm": 0.16545364260673523, + "learning_rate": 1.1160236498708742e-05, + "loss": 1.156209111213684, + "step": 836 + }, + { + "epoch": 1.5347985347985347, + "grad_norm": 0.29566454887390137, + "learning_rate": 1.112463098939969e-05, + "loss": 1.207381248474121, + "step": 838 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.3882679343223572, + "learning_rate": 1.1089023528969576e-05, + "loss": 1.0491501092910767, + "step": 840 + }, + { + "epoch": 1.542124542124542, + "grad_norm": 0.14187085628509521, + "learning_rate": 1.1053414674859741e-05, + "loss": 0.872413694858551, + "step": 842 + }, + { + "epoch": 1.5457875457875456, + "grad_norm": 0.3208055794239044, + "learning_rate": 1.1017804984533351e-05, + "loss": 1.038373589515686, + "step": 844 + }, + { + "epoch": 1.5494505494505495, + "grad_norm": 0.274668425321579, + "learning_rate": 1.0982195015466652e-05, + "loss": 1.182910680770874, + "step": 846 + }, + { + "epoch": 1.5531135531135531, + "grad_norm": 1.7933740615844727, + "learning_rate": 1.0946585325140261e-05, + "loss": 0.5585577487945557, + "step": 848 + }, + { + "epoch": 1.5567765567765568, + "grad_norm": 0.17488670349121094, + "learning_rate": 1.0910976471030428e-05, + "loss": 1.0341525077819824, + "step": 850 + }, + { + "epoch": 1.5604395604395604, + "grad_norm": 0.38739433884620667, + "learning_rate": 1.0875369010600317e-05, + "loss": 1.1761656999588013, + "step": 852 + }, + { + "epoch": 1.564102564102564, + "grad_norm": 0.4399110674858093, + "learning_rate": 1.083976350129126e-05, + "loss": 0.8638155460357666, + "step": 854 + }, + { + "epoch": 1.5677655677655677, + "grad_norm": 0.34226399660110474, + "learning_rate": 1.0804160500514062e-05, + "loss": 0.8107349872589111, + "step": 856 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 1.9198914766311646, + "learning_rate": 1.0768560565640252e-05, + "loss": 0.9437478184700012, + "step": 858 + }, + { + "epoch": 1.575091575091575, + "grad_norm": 0.15224924683570862, + "learning_rate": 1.0732964253993343e-05, + "loss": 0.7484935522079468, + "step": 860 + }, + { + "epoch": 1.578754578754579, + "grad_norm": 0.26914143562316895, + "learning_rate": 1.0697372122840156e-05, + "loss": 1.1590977907180786, + "step": 862 + }, + { + "epoch": 1.5824175824175826, + "grad_norm": 1.1250046491622925, + "learning_rate": 1.0661784729382036e-05, + "loss": 0.9178829789161682, + "step": 864 + }, + { + "epoch": 1.5860805860805862, + "grad_norm": 0.2830469310283661, + "learning_rate": 1.0626202630746183e-05, + "loss": 1.1874239444732666, + "step": 866 + }, + { + "epoch": 1.5897435897435899, + "grad_norm": 0.4739990830421448, + "learning_rate": 1.0590626383976894e-05, + "loss": 1.1853182315826416, + "step": 868 + }, + { + "epoch": 1.5934065934065935, + "grad_norm": 0.2765795886516571, + "learning_rate": 1.055505654602685e-05, + "loss": 0.5874127149581909, + "step": 870 + }, + { + "epoch": 1.5970695970695972, + "grad_norm": 0.18368150293827057, + "learning_rate": 1.0519493673748406e-05, + "loss": 1.1814969778060913, + "step": 872 + }, + { + "epoch": 1.6007326007326008, + "grad_norm": 0.1637224406003952, + "learning_rate": 1.0483938323884879e-05, + "loss": 1.0709137916564941, + "step": 874 + }, + { + "epoch": 1.6043956043956045, + "grad_norm": 1.2013723850250244, + "learning_rate": 1.0448391053061795e-05, + "loss": 0.7793064117431641, + "step": 876 + }, + { + "epoch": 1.6080586080586081, + "grad_norm": 0.38753247261047363, + "learning_rate": 1.0412852417778225e-05, + "loss": 0.9944717884063721, + "step": 878 + }, + { + "epoch": 1.6117216117216118, + "grad_norm": 0.24931028485298157, + "learning_rate": 1.037732297439804e-05, + "loss": 0.9063097834587097, + "step": 880 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.24992220103740692, + "learning_rate": 1.034180327914121e-05, + "loss": 1.1387708187103271, + "step": 882 + }, + { + "epoch": 1.619047619047619, + "grad_norm": 1.5768555402755737, + "learning_rate": 1.030629388807509e-05, + "loss": 0.9485580921173096, + "step": 884 + }, + { + "epoch": 1.6227106227106227, + "grad_norm": 0.31033453345298767, + "learning_rate": 1.0270795357105738e-05, + "loss": 1.145193099975586, + "step": 886 + }, + { + "epoch": 1.6263736263736264, + "grad_norm": 1.4770396947860718, + "learning_rate": 1.023530824196918e-05, + "loss": 0.8466076254844666, + "step": 888 + }, + { + "epoch": 1.63003663003663, + "grad_norm": 0.24302436411380768, + "learning_rate": 1.019983309822273e-05, + "loss": 1.2269943952560425, + "step": 890 + }, + { + "epoch": 1.6336996336996337, + "grad_norm": 2.4740350246429443, + "learning_rate": 1.0164370481236292e-05, + "loss": 0.9066356420516968, + "step": 892 + }, + { + "epoch": 1.6373626373626373, + "grad_norm": 0.26794663071632385, + "learning_rate": 1.0128920946183646e-05, + "loss": 1.1919015645980835, + "step": 894 + }, + { + "epoch": 1.641025641025641, + "grad_norm": 0.5033538937568665, + "learning_rate": 1.0093485048033798e-05, + "loss": 0.7559452652931213, + "step": 896 + }, + { + "epoch": 1.6446886446886446, + "grad_norm": 0.4728987514972687, + "learning_rate": 1.0058063341542238e-05, + "loss": 1.141265869140625, + "step": 898 + }, + { + "epoch": 1.6483516483516483, + "grad_norm": 0.0506194643676281, + "learning_rate": 1.0022656381242297e-05, + "loss": 0.835241973400116, + "step": 900 + }, + { + "epoch": 1.652014652014652, + "grad_norm": 0.20770415663719177, + "learning_rate": 9.98726472143646e-06, + "loss": 0.8866770267486572, + "step": 902 + }, + { + "epoch": 1.6556776556776556, + "grad_norm": 0.3993974030017853, + "learning_rate": 9.951888916187662e-06, + "loss": 1.027202844619751, + "step": 904 + }, + { + "epoch": 1.6593406593406592, + "grad_norm": 0.19648966193199158, + "learning_rate": 9.916529519310638e-06, + "loss": 1.1409128904342651, + "step": 906 + }, + { + "epoch": 1.6630036630036629, + "grad_norm": 0.18085846304893494, + "learning_rate": 9.881187084363257e-06, + "loss": 0.7516112923622131, + "step": 908 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.21360376477241516, + "learning_rate": 9.845862164637834e-06, + "loss": 1.2104125022888184, + "step": 910 + }, + { + "epoch": 1.6703296703296702, + "grad_norm": 0.18259240686893463, + "learning_rate": 9.810555313152486e-06, + "loss": 1.2229918241500854, + "step": 912 + }, + { + "epoch": 1.673992673992674, + "grad_norm": 0.20374548435211182, + "learning_rate": 9.775267082642461e-06, + "loss": 1.1026571989059448, + "step": 914 + }, + { + "epoch": 1.6776556776556777, + "grad_norm": 0.21295559406280518, + "learning_rate": 9.7399980255515e-06, + "loss": 1.2173269987106323, + "step": 916 + }, + { + "epoch": 1.6813186813186813, + "grad_norm": 0.2531648278236389, + "learning_rate": 9.704748694023183e-06, + "loss": 0.8437496423721313, + "step": 918 + }, + { + "epoch": 1.684981684981685, + "grad_norm": 0.16702847182750702, + "learning_rate": 9.669519639892275e-06, + "loss": 1.2237019538879395, + "step": 920 + }, + { + "epoch": 1.6886446886446886, + "grad_norm": 0.2449825406074524, + "learning_rate": 9.634311414676096e-06, + "loss": 1.0298762321472168, + "step": 922 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 0.43799275159835815, + "learning_rate": 9.599124569565887e-06, + "loss": 0.8457880616188049, + "step": 924 + }, + { + "epoch": 1.695970695970696, + "grad_norm": 0.462398886680603, + "learning_rate": 9.56395965541818e-06, + "loss": 0.9856649041175842, + "step": 926 + }, + { + "epoch": 1.6996336996336996, + "grad_norm": 0.22268880903720856, + "learning_rate": 9.528817222746171e-06, + "loss": 0.8122522234916687, + "step": 928 + }, + { + "epoch": 1.7032967032967035, + "grad_norm": 0.12412460893392563, + "learning_rate": 9.493697821711116e-06, + "loss": 0.9117051362991333, + "step": 930 + }, + { + "epoch": 1.7069597069597071, + "grad_norm": 0.3573848605155945, + "learning_rate": 9.458602002113684e-06, + "loss": 0.9280415773391724, + "step": 932 + }, + { + "epoch": 1.7106227106227108, + "grad_norm": 0.19333380460739136, + "learning_rate": 9.423530313385395e-06, + "loss": 1.3979382514953613, + "step": 934 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.28463131189346313, + "learning_rate": 9.388483304579983e-06, + "loss": 1.1958733797073364, + "step": 936 + }, + { + "epoch": 1.717948717948718, + "grad_norm": 0.5562649965286255, + "learning_rate": 9.353461524364814e-06, + "loss": 0.48058995604515076, + "step": 938 + }, + { + "epoch": 1.7216117216117217, + "grad_norm": 0.2503082752227783, + "learning_rate": 9.318465521012298e-06, + "loss": 0.556159257888794, + "step": 940 + }, + { + "epoch": 1.7252747252747254, + "grad_norm": 0.28722119331359863, + "learning_rate": 9.283495842391313e-06, + "loss": 1.130286693572998, + "step": 942 + }, + { + "epoch": 1.728937728937729, + "grad_norm": 1.835166573524475, + "learning_rate": 9.248553035958596e-06, + "loss": 0.9355916380882263, + "step": 944 + }, + { + "epoch": 1.7326007326007327, + "grad_norm": 0.5465191006660461, + "learning_rate": 9.213637648750217e-06, + "loss": 1.1549943685531616, + "step": 946 + }, + { + "epoch": 1.7362637362637363, + "grad_norm": 0.2970430850982666, + "learning_rate": 9.178750227372983e-06, + "loss": 1.100421667098999, + "step": 948 + }, + { + "epoch": 1.73992673992674, + "grad_norm": 0.36570602655410767, + "learning_rate": 9.143891317995888e-06, + "loss": 0.9944745302200317, + "step": 950 + }, + { + "epoch": 1.7435897435897436, + "grad_norm": 0.16801096498966217, + "learning_rate": 9.109061466341576e-06, + "loss": 0.9477764368057251, + "step": 952 + }, + { + "epoch": 1.7472527472527473, + "grad_norm": 0.3478679358959198, + "learning_rate": 9.074261217677771e-06, + "loss": 1.2121840715408325, + "step": 954 + }, + { + "epoch": 1.750915750915751, + "grad_norm": 0.25078827142715454, + "learning_rate": 9.039491116808773e-06, + "loss": 0.7902039885520935, + "step": 956 + }, + { + "epoch": 1.7545787545787546, + "grad_norm": 0.48620137572288513, + "learning_rate": 9.004751708066906e-06, + "loss": 1.1979793310165405, + "step": 958 + }, + { + "epoch": 1.7582417582417582, + "grad_norm": 0.12998303771018982, + "learning_rate": 8.970043535303999e-06, + "loss": 0.5474309325218201, + "step": 960 + }, + { + "epoch": 1.7619047619047619, + "grad_norm": 0.28665241599082947, + "learning_rate": 8.93536714188288e-06, + "loss": 0.9992084503173828, + "step": 962 + }, + { + "epoch": 1.7655677655677655, + "grad_norm": 0.17530608177185059, + "learning_rate": 8.900723070668869e-06, + "loss": 1.0385854244232178, + "step": 964 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 0.5553234815597534, + "learning_rate": 8.86611186402127e-06, + "loss": 1.1807163953781128, + "step": 966 + }, + { + "epoch": 1.7728937728937728, + "grad_norm": 2.357882499694824, + "learning_rate": 8.831534063784891e-06, + "loss": 0.5750354528427124, + "step": 968 + }, + { + "epoch": 1.7765567765567765, + "grad_norm": 0.8607165813446045, + "learning_rate": 8.796990211281549e-06, + "loss": 0.8479418158531189, + "step": 970 + }, + { + "epoch": 1.7802197802197801, + "grad_norm": 0.3071545362472534, + "learning_rate": 8.76248084730161e-06, + "loss": 0.9116554856300354, + "step": 972 + }, + { + "epoch": 1.7838827838827838, + "grad_norm": 0.39947524666786194, + "learning_rate": 8.728006512095517e-06, + "loss": 1.2301732301712036, + "step": 974 + }, + { + "epoch": 1.7875457875457874, + "grad_norm": 0.37182819843292236, + "learning_rate": 8.693567745365325e-06, + "loss": 1.1915199756622314, + "step": 976 + }, + { + "epoch": 1.791208791208791, + "grad_norm": 0.155500128865242, + "learning_rate": 8.659165086256263e-06, + "loss": 0.9201015830039978, + "step": 978 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 0.21511544287204742, + "learning_rate": 8.624799073348282e-06, + "loss": 0.9540326595306396, + "step": 980 + }, + { + "epoch": 1.7985347985347986, + "grad_norm": 0.26243695616722107, + "learning_rate": 8.590470244647643e-06, + "loss": 1.1440948247909546, + "step": 982 + }, + { + "epoch": 1.8021978021978022, + "grad_norm": 1.7372164726257324, + "learning_rate": 8.556179137578461e-06, + "loss": 1.100319504737854, + "step": 984 + }, + { + "epoch": 1.8058608058608059, + "grad_norm": 0.1159660816192627, + "learning_rate": 8.521926288974336e-06, + "loss": 0.6495481133460999, + "step": 986 + }, + { + "epoch": 1.8095238095238095, + "grad_norm": 0.2123069316148758, + "learning_rate": 8.487712235069901e-06, + "loss": 0.8149735927581787, + "step": 988 + }, + { + "epoch": 1.8131868131868132, + "grad_norm": 0.42196395993232727, + "learning_rate": 8.453537511492469e-06, + "loss": 0.7469933032989502, + "step": 990 + }, + { + "epoch": 1.8168498168498168, + "grad_norm": 0.2649979293346405, + "learning_rate": 8.419402653253623e-06, + "loss": 0.7891843914985657, + "step": 992 + }, + { + "epoch": 1.8205128205128205, + "grad_norm": 0.21673156321048737, + "learning_rate": 8.385308194740846e-06, + "loss": 0.8571860790252686, + "step": 994 + }, + { + "epoch": 1.8241758241758241, + "grad_norm": 0.35649216175079346, + "learning_rate": 8.35125466970915e-06, + "loss": 1.1353570222854614, + "step": 996 + }, + { + "epoch": 1.8278388278388278, + "grad_norm": 0.2921370565891266, + "learning_rate": 8.317242611272745e-06, + "loss": 0.8522858023643494, + "step": 998 + }, + { + "epoch": 1.8315018315018317, + "grad_norm": 0.577377438545227, + "learning_rate": 8.283272551896649e-06, + "loss": 1.1177325248718262, + "step": 1000 + }, + { + "epoch": 1.8351648351648353, + "grad_norm": 0.3225407898426056, + "learning_rate": 8.249345023388393e-06, + "loss": 1.145124912261963, + "step": 1002 + }, + { + "epoch": 1.838827838827839, + "grad_norm": 0.34550943970680237, + "learning_rate": 8.21546055688968e-06, + "loss": 1.1394211053848267, + "step": 1004 + }, + { + "epoch": 1.8424908424908426, + "grad_norm": 0.22849220037460327, + "learning_rate": 8.181619682868059e-06, + "loss": 1.1420109272003174, + "step": 1006 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 0.3721167743206024, + "learning_rate": 8.147822931108638e-06, + "loss": 0.7952710390090942, + "step": 1008 + }, + { + "epoch": 1.84981684981685, + "grad_norm": 0.17802205681800842, + "learning_rate": 8.114070830705785e-06, + "loss": 1.1106821298599243, + "step": 1010 + }, + { + "epoch": 1.8534798534798536, + "grad_norm": 0.15619701147079468, + "learning_rate": 8.080363910054833e-06, + "loss": 0.7631734609603882, + "step": 1012 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.3461393117904663, + "learning_rate": 8.04670269684383e-06, + "loss": 1.162695288658142, + "step": 1014 + }, + { + "epoch": 1.8608058608058609, + "grad_norm": 0.18843670189380646, + "learning_rate": 8.013087718045256e-06, + "loss": 1.1480703353881836, + "step": 1016 + }, + { + "epoch": 1.8644688644688645, + "grad_norm": 0.30292752385139465, + "learning_rate": 7.979519499907786e-06, + "loss": 1.2293591499328613, + "step": 1018 + }, + { + "epoch": 1.8681318681318682, + "grad_norm": 0.09791062027215958, + "learning_rate": 7.945998567948052e-06, + "loss": 0.9643331170082092, + "step": 1020 + }, + { + "epoch": 1.8717948717948718, + "grad_norm": 0.20030497014522552, + "learning_rate": 7.912525446942406e-06, + "loss": 1.1193994283676147, + "step": 1022 + }, + { + "epoch": 1.8754578754578755, + "grad_norm": 0.3839741051197052, + "learning_rate": 7.879100660918713e-06, + "loss": 0.5604696869850159, + "step": 1024 + }, + { + "epoch": 1.879120879120879, + "grad_norm": 0.15717224776744843, + "learning_rate": 7.845724733148149e-06, + "loss": 1.1571592092514038, + "step": 1026 + }, + { + "epoch": 1.8827838827838828, + "grad_norm": 0.27877768874168396, + "learning_rate": 7.812398186136994e-06, + "loss": 0.954494059085846, + "step": 1028 + }, + { + "epoch": 1.8864468864468864, + "grad_norm": 0.16563951969146729, + "learning_rate": 7.779121541618478e-06, + "loss": 1.153045892715454, + "step": 1030 + }, + { + "epoch": 1.89010989010989, + "grad_norm": 0.13534830510616302, + "learning_rate": 7.74589532054459e-06, + "loss": 0.9504430890083313, + "step": 1032 + }, + { + "epoch": 1.8937728937728937, + "grad_norm": 0.3428526222705841, + "learning_rate": 7.712720043077929e-06, + "loss": 0.8016744256019592, + "step": 1034 + }, + { + "epoch": 1.8974358974358974, + "grad_norm": 0.3059753477573395, + "learning_rate": 7.679596228583563e-06, + "loss": 1.1903191804885864, + "step": 1036 + }, + { + "epoch": 1.901098901098901, + "grad_norm": 0.17783567309379578, + "learning_rate": 7.646524395620908e-06, + "loss": 1.157071828842163, + "step": 1038 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.7491412162780762, + "learning_rate": 7.613505061935584e-06, + "loss": 1.2254270315170288, + "step": 1040 + }, + { + "epoch": 1.9084249084249083, + "grad_norm": 0.44477730989456177, + "learning_rate": 7.580538744451336e-06, + "loss": 0.6230685710906982, + "step": 1042 + }, + { + "epoch": 1.912087912087912, + "grad_norm": 0.14885884523391724, + "learning_rate": 7.547625959261928e-06, + "loss": 0.8747984766960144, + "step": 1044 + }, + { + "epoch": 1.9157509157509156, + "grad_norm": 0.1716078519821167, + "learning_rate": 7.5147672216230605e-06, + "loss": 1.117228388786316, + "step": 1046 + }, + { + "epoch": 1.9194139194139193, + "grad_norm": 0.20717094838619232, + "learning_rate": 7.481963045944318e-06, + "loss": 0.45573827624320984, + "step": 1048 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.5965511798858643, + "learning_rate": 7.449213945781102e-06, + "loss": 0.8882296085357666, + "step": 1050 + }, + { + "epoch": 1.9267399267399268, + "grad_norm": 0.2176082581281662, + "learning_rate": 7.416520433826599e-06, + "loss": 0.8158991932868958, + "step": 1052 + }, + { + "epoch": 1.9304029304029304, + "grad_norm": 0.31369099020957947, + "learning_rate": 7.383883021903755e-06, + "loss": 1.1231168508529663, + "step": 1054 + }, + { + "epoch": 1.934065934065934, + "grad_norm": 0.2577856183052063, + "learning_rate": 7.351302220957251e-06, + "loss": 0.748049259185791, + "step": 1056 + }, + { + "epoch": 1.9377289377289377, + "grad_norm": 0.31049013137817383, + "learning_rate": 7.318778541045517e-06, + "loss": 0.9868760704994202, + "step": 1058 + }, + { + "epoch": 1.9413919413919414, + "grad_norm": 0.3126990795135498, + "learning_rate": 7.286312491332754e-06, + "loss": 1.0847361087799072, + "step": 1060 + }, + { + "epoch": 1.945054945054945, + "grad_norm": 0.235934779047966, + "learning_rate": 7.253904580080926e-06, + "loss": 0.84217369556427, + "step": 1062 + }, + { + "epoch": 1.9487179487179487, + "grad_norm": 0.2684265375137329, + "learning_rate": 7.221555314641853e-06, + "loss": 0.8080073595046997, + "step": 1064 + }, + { + "epoch": 1.9523809523809523, + "grad_norm": 0.28575268387794495, + "learning_rate": 7.18926520144924e-06, + "loss": 1.1642062664031982, + "step": 1066 + }, + { + "epoch": 1.9560439560439562, + "grad_norm": 0.25401219725608826, + "learning_rate": 7.1570347460107335e-06, + "loss": 1.1827412843704224, + "step": 1068 + }, + { + "epoch": 1.9597069597069599, + "grad_norm": 0.11555124074220657, + "learning_rate": 7.124864452900049e-06, + "loss": 0.706343412399292, + "step": 1070 + }, + { + "epoch": 1.9633699633699635, + "grad_norm": 0.2669273912906647, + "learning_rate": 7.0927548257490465e-06, + "loss": 0.8516585230827332, + "step": 1072 + }, + { + "epoch": 1.9670329670329672, + "grad_norm": 0.31186506152153015, + "learning_rate": 7.060706367239836e-06, + "loss": 1.1490978002548218, + "step": 1074 + }, + { + "epoch": 1.9706959706959708, + "grad_norm": 0.19497060775756836, + "learning_rate": 7.028719579096932e-06, + "loss": 1.1234122514724731, + "step": 1076 + }, + { + "epoch": 1.9743589743589745, + "grad_norm": 0.21999667584896088, + "learning_rate": 6.9967949620793854e-06, + "loss": 0.9970806241035461, + "step": 1078 + }, + { + "epoch": 1.978021978021978, + "grad_norm": 0.5335208177566528, + "learning_rate": 6.964933015972947e-06, + "loss": 0.9913358688354492, + "step": 1080 + }, + { + "epoch": 1.9816849816849818, + "grad_norm": 0.2547147572040558, + "learning_rate": 6.933134239582246e-06, + "loss": 1.1181188821792603, + "step": 1082 + }, + { + "epoch": 1.9853479853479854, + "grad_norm": 0.07284919172525406, + "learning_rate": 6.9013991307229745e-06, + "loss": 0.7213895916938782, + "step": 1084 + }, + { + "epoch": 1.989010989010989, + "grad_norm": 1.2812455892562866, + "learning_rate": 6.869728186214093e-06, + "loss": 0.9767944812774658, + "step": 1086 + }, + { + "epoch": 1.9926739926739927, + "grad_norm": 0.2086431384086609, + "learning_rate": 6.8381219018700675e-06, + "loss": 0.9789687395095825, + "step": 1088 + }, + { + "epoch": 1.9963369963369964, + "grad_norm": 0.11151088029146194, + "learning_rate": 6.806580772493088e-06, + "loss": 0.930722713470459, + "step": 1090 + }, + { + "epoch": 2.0, + "grad_norm": 0.6231138706207275, + "learning_rate": 6.775105291865343e-06, + "loss": 1.042896032333374, + "step": 1092 + }, + { + "epoch": 2.0036630036630036, + "grad_norm": 0.1892917901277542, + "learning_rate": 6.743695952741265e-06, + "loss": 1.0761141777038574, + "step": 1094 + }, + { + "epoch": 2.0073260073260073, + "grad_norm": 0.39348432421684265, + "learning_rate": 6.71235324683983e-06, + "loss": 1.1419814825057983, + "step": 1096 + }, + { + "epoch": 2.010989010989011, + "grad_norm": 0.16642175614833832, + "learning_rate": 6.681077664836872e-06, + "loss": 1.088594675064087, + "step": 1098 + }, + { + "epoch": 2.0146520146520146, + "grad_norm": 0.3197147846221924, + "learning_rate": 6.649869696357381e-06, + "loss": 1.1661202907562256, + "step": 1100 + }, + { + "epoch": 2.0183150183150182, + "grad_norm": 0.43613532185554504, + "learning_rate": 6.6187298299678295e-06, + "loss": 0.8467984795570374, + "step": 1102 + }, + { + "epoch": 2.021978021978022, + "grad_norm": 0.24174755811691284, + "learning_rate": 6.587658553168563e-06, + "loss": 1.142805576324463, + "step": 1104 + }, + { + "epoch": 2.0256410256410255, + "grad_norm": 0.1317332535982132, + "learning_rate": 6.556656352386135e-06, + "loss": 0.7679157853126526, + "step": 1106 + }, + { + "epoch": 2.029304029304029, + "grad_norm": 0.18263843655586243, + "learning_rate": 6.525723712965698e-06, + "loss": 1.1841180324554443, + "step": 1108 + }, + { + "epoch": 2.032967032967033, + "grad_norm": 0.3168501853942871, + "learning_rate": 6.494861119163412e-06, + "loss": 0.9058336019515991, + "step": 1110 + }, + { + "epoch": 2.0366300366300365, + "grad_norm": 0.2593517303466797, + "learning_rate": 6.464069054138853e-06, + "loss": 0.6124511957168579, + "step": 1112 + }, + { + "epoch": 2.04029304029304, + "grad_norm": 0.245356947183609, + "learning_rate": 6.433347999947468e-06, + "loss": 0.845076322555542, + "step": 1114 + }, + { + "epoch": 2.043956043956044, + "grad_norm": 0.3254038095474243, + "learning_rate": 6.402698437533012e-06, + "loss": 1.1547578573226929, + "step": 1116 + }, + { + "epoch": 2.0476190476190474, + "grad_norm": 0.5246717929840088, + "learning_rate": 6.372120846720018e-06, + "loss": 1.1199109554290771, + "step": 1118 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.6534336805343628, + "learning_rate": 6.341615706206292e-06, + "loss": 0.8209899067878723, + "step": 1120 + }, + { + "epoch": 2.0549450549450547, + "grad_norm": 0.8080871105194092, + "learning_rate": 6.311183493555426e-06, + "loss": 1.3262028694152832, + "step": 1122 + }, + { + "epoch": 2.0586080586080584, + "grad_norm": 0.23124472796916962, + "learning_rate": 6.280824685189296e-06, + "loss": 1.1404337882995605, + "step": 1124 + }, + { + "epoch": 2.062271062271062, + "grad_norm": 0.2290889322757721, + "learning_rate": 6.25053975638064e-06, + "loss": 0.7613569498062134, + "step": 1126 + }, + { + "epoch": 2.065934065934066, + "grad_norm": 0.18532392382621765, + "learning_rate": 6.220329181245585e-06, + "loss": 1.1030632257461548, + "step": 1128 + }, + { + "epoch": 2.06959706959707, + "grad_norm": 0.3047994375228882, + "learning_rate": 6.1901934327362355e-06, + "loss": 1.098275899887085, + "step": 1130 + }, + { + "epoch": 2.0732600732600734, + "grad_norm": 0.24303042888641357, + "learning_rate": 6.16013298263328e-06, + "loss": 1.130845069885254, + "step": 1132 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 0.4075912535190582, + "learning_rate": 6.130148301538601e-06, + "loss": 1.1122570037841797, + "step": 1134 + }, + { + "epoch": 2.0805860805860807, + "grad_norm": 0.635519802570343, + "learning_rate": 6.100239858867887e-06, + "loss": 0.7130240201950073, + "step": 1136 + }, + { + "epoch": 2.0842490842490844, + "grad_norm": 0.19734634459018707, + "learning_rate": 6.070408122843311e-06, + "loss": 1.1177456378936768, + "step": 1138 + }, + { + "epoch": 2.087912087912088, + "grad_norm": 0.28150826692581177, + "learning_rate": 6.040653560486183e-06, + "loss": 1.0220168828964233, + "step": 1140 + }, + { + "epoch": 2.0915750915750917, + "grad_norm": 0.17299877107143402, + "learning_rate": 6.010976637609653e-06, + "loss": 1.106982707977295, + "step": 1142 + }, + { + "epoch": 2.0952380952380953, + "grad_norm": 0.15779122710227966, + "learning_rate": 5.9813778188114125e-06, + "loss": 0.3823546767234802, + "step": 1144 + }, + { + "epoch": 2.098901098901099, + "grad_norm": 0.36108678579330444, + "learning_rate": 5.951857567466401e-06, + "loss": 0.9157997369766235, + "step": 1146 + }, + { + "epoch": 2.1025641025641026, + "grad_norm": 0.177387997508049, + "learning_rate": 5.922416345719588e-06, + "loss": 0.8090324997901917, + "step": 1148 + }, + { + "epoch": 2.1062271062271063, + "grad_norm": 0.3467074930667877, + "learning_rate": 5.893054614478718e-06, + "loss": 0.8260840773582458, + "step": 1150 + }, + { + "epoch": 2.10989010989011, + "grad_norm": 0.2603940963745117, + "learning_rate": 5.8637728334070905e-06, + "loss": 0.9580783843994141, + "step": 1152 + }, + { + "epoch": 2.1135531135531136, + "grad_norm": 0.3268923759460449, + "learning_rate": 5.834571460916371e-06, + "loss": 0.7878354787826538, + "step": 1154 + }, + { + "epoch": 2.1172161172161172, + "grad_norm": 0.4938827455043793, + "learning_rate": 5.805450954159422e-06, + "loss": 1.0745412111282349, + "step": 1156 + }, + { + "epoch": 2.120879120879121, + "grad_norm": 0.2504938244819641, + "learning_rate": 5.776411769023127e-06, + "loss": 1.0261658430099487, + "step": 1158 + }, + { + "epoch": 2.1245421245421245, + "grad_norm": 0.24915434420108795, + "learning_rate": 5.747454360121274e-06, + "loss": 0.870224118232727, + "step": 1160 + }, + { + "epoch": 2.128205128205128, + "grad_norm": 0.4833395183086395, + "learning_rate": 5.718579180787425e-06, + "loss": 0.7795557379722595, + "step": 1162 + }, + { + "epoch": 2.131868131868132, + "grad_norm": 0.9497178196907043, + "learning_rate": 5.689786683067817e-06, + "loss": 0.918286144733429, + "step": 1164 + }, + { + "epoch": 2.1355311355311355, + "grad_norm": 0.6461044549942017, + "learning_rate": 5.661077317714303e-06, + "loss": 0.42682868242263794, + "step": 1166 + }, + { + "epoch": 2.139194139194139, + "grad_norm": 0.6395738124847412, + "learning_rate": 5.632451534177276e-06, + "loss": 0.4123497009277344, + "step": 1168 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.4208049178123474, + "learning_rate": 5.603909780598644e-06, + "loss": 0.8927979469299316, + "step": 1170 + }, + { + "epoch": 2.1465201465201464, + "grad_norm": 0.2959461808204651, + "learning_rate": 5.575452503804805e-06, + "loss": 1.1349587440490723, + "step": 1172 + }, + { + "epoch": 2.15018315018315, + "grad_norm": 0.8988657593727112, + "learning_rate": 5.5470801492996605e-06, + "loss": 1.2815641164779663, + "step": 1174 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 0.27057546377182007, + "learning_rate": 5.518793161257641e-06, + "loss": 0.7825716137886047, + "step": 1176 + }, + { + "epoch": 2.1575091575091574, + "grad_norm": 0.17563986778259277, + "learning_rate": 5.490591982516749e-06, + "loss": 1.142047643661499, + "step": 1178 + }, + { + "epoch": 2.161172161172161, + "grad_norm": 0.09482431411743164, + "learning_rate": 5.462477054571617e-06, + "loss": 1.127107858657837, + "step": 1180 + }, + { + "epoch": 2.1648351648351647, + "grad_norm": 0.17411017417907715, + "learning_rate": 5.4344488175666154e-06, + "loss": 1.1234813928604126, + "step": 1182 + }, + { + "epoch": 2.1684981684981683, + "grad_norm": 0.13109169900417328, + "learning_rate": 5.406507710288955e-06, + "loss": 1.1429002285003662, + "step": 1184 + }, + { + "epoch": 2.172161172161172, + "grad_norm": 0.16202552616596222, + "learning_rate": 5.378654170161805e-06, + "loss": 0.318652480840683, + "step": 1186 + }, + { + "epoch": 2.1758241758241756, + "grad_norm": 0.44094666838645935, + "learning_rate": 5.3508886332374534e-06, + "loss": 1.2102477550506592, + "step": 1188 + }, + { + "epoch": 2.1794871794871793, + "grad_norm": 0.8763576745986938, + "learning_rate": 5.323211534190496e-06, + "loss": 0.7540980577468872, + "step": 1190 + }, + { + "epoch": 2.183150183150183, + "grad_norm": 0.1877157837152481, + "learning_rate": 5.295623306310999e-06, + "loss": 0.9759551286697388, + "step": 1192 + }, + { + "epoch": 2.186813186813187, + "grad_norm": 0.120526023209095, + "learning_rate": 5.268124381497755e-06, + "loss": 0.802385687828064, + "step": 1194 + }, + { + "epoch": 2.1904761904761907, + "grad_norm": 0.27409690618515015, + "learning_rate": 5.240715190251484e-06, + "loss": 0.9069569110870361, + "step": 1196 + }, + { + "epoch": 2.1941391941391943, + "grad_norm": 0.20103105902671814, + "learning_rate": 5.213396161668111e-06, + "loss": 1.1158097982406616, + "step": 1198 + }, + { + "epoch": 2.197802197802198, + "grad_norm": 0.3620758652687073, + "learning_rate": 5.186167723432061e-06, + "loss": 0.7542502284049988, + "step": 1200 + }, + { + "epoch": 2.2014652014652016, + "grad_norm": 0.3984181582927704, + "learning_rate": 5.159030301809534e-06, + "loss": 1.1818705797195435, + "step": 1202 + }, + { + "epoch": 2.2051282051282053, + "grad_norm": 0.17239412665367126, + "learning_rate": 5.131984321641865e-06, + "loss": 1.1314865350723267, + "step": 1204 + }, + { + "epoch": 2.208791208791209, + "grad_norm": 0.23334990441799164, + "learning_rate": 5.105030206338843e-06, + "loss": 0.4661150276660919, + "step": 1206 + }, + { + "epoch": 2.2124542124542126, + "grad_norm": 3.1555933952331543, + "learning_rate": 5.0781683778720965e-06, + "loss": 0.7386508584022522, + "step": 1208 + }, + { + "epoch": 2.2161172161172162, + "grad_norm": 0.14592549204826355, + "learning_rate": 5.051399256768498e-06, + "loss": 0.6242368817329407, + "step": 1210 + }, + { + "epoch": 2.21978021978022, + "grad_norm": 0.23044905066490173, + "learning_rate": 5.024723262103559e-06, + "loss": 1.0724856853485107, + "step": 1212 + }, + { + "epoch": 2.2234432234432235, + "grad_norm": 0.2958521842956543, + "learning_rate": 4.998140811494881e-06, + "loss": 0.8155311942100525, + "step": 1214 + }, + { + "epoch": 2.227106227106227, + "grad_norm": 0.3939032256603241, + "learning_rate": 4.971652321095614e-06, + "loss": 0.8991819024085999, + "step": 1216 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 0.3258177638053894, + "learning_rate": 4.945258205587955e-06, + "loss": 1.1150569915771484, + "step": 1218 + }, + { + "epoch": 2.2344322344322345, + "grad_norm": 0.2855719029903412, + "learning_rate": 4.918958878176628e-06, + "loss": 1.1184298992156982, + "step": 1220 + }, + { + "epoch": 2.238095238095238, + "grad_norm": 0.2457854449748993, + "learning_rate": 4.8927547505824465e-06, + "loss": 1.1249486207962036, + "step": 1222 + }, + { + "epoch": 2.241758241758242, + "grad_norm": 0.31648901104927063, + "learning_rate": 4.866646233035845e-06, + "loss": 0.49614107608795166, + "step": 1224 + }, + { + "epoch": 2.2454212454212454, + "grad_norm": 0.19430601596832275, + "learning_rate": 4.840633734270464e-06, + "loss": 1.0849982500076294, + "step": 1226 + }, + { + "epoch": 2.249084249084249, + "grad_norm": 0.7305367588996887, + "learning_rate": 4.814717661516762e-06, + "loss": 0.6019390821456909, + "step": 1228 + }, + { + "epoch": 2.2527472527472527, + "grad_norm": 0.03867306187748909, + "learning_rate": 4.788898420495622e-06, + "loss": 0.5977374911308289, + "step": 1230 + }, + { + "epoch": 2.2564102564102564, + "grad_norm": 0.1346009522676468, + "learning_rate": 4.763176415412006e-06, + "loss": 0.6154077649116516, + "step": 1232 + }, + { + "epoch": 2.26007326007326, + "grad_norm": 0.7410935759544373, + "learning_rate": 4.7375520489486395e-06, + "loss": 1.1026445627212524, + "step": 1234 + }, + { + "epoch": 2.2637362637362637, + "grad_norm": 0.2963329553604126, + "learning_rate": 4.71202572225969e-06, + "loss": 0.7703139185905457, + "step": 1236 + }, + { + "epoch": 2.2673992673992673, + "grad_norm": 0.18143776059150696, + "learning_rate": 4.686597834964499e-06, + "loss": 0.9660443067550659, + "step": 1238 + }, + { + "epoch": 2.271062271062271, + "grad_norm": 0.14209656417369843, + "learning_rate": 4.661268785141316e-06, + "loss": 0.8756354451179504, + "step": 1240 + }, + { + "epoch": 2.2747252747252746, + "grad_norm": 0.33028721809387207, + "learning_rate": 4.636038969321073e-06, + "loss": 0.9074218273162842, + "step": 1242 + }, + { + "epoch": 2.2783882783882783, + "grad_norm": 0.263667494058609, + "learning_rate": 4.610908782481179e-06, + "loss": 1.1067166328430176, + "step": 1244 + }, + { + "epoch": 2.282051282051282, + "grad_norm": 0.07068176567554474, + "learning_rate": 4.5858786180393326e-06, + "loss": 0.730255126953125, + "step": 1246 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.4124971628189087, + "learning_rate": 4.560948867847359e-06, + "loss": 0.714017391204834, + "step": 1248 + }, + { + "epoch": 2.2893772893772892, + "grad_norm": 0.3742057681083679, + "learning_rate": 4.536119922185082e-06, + "loss": 1.1044319868087769, + "step": 1250 + }, + { + "epoch": 2.293040293040293, + "grad_norm": 0.6701974272727966, + "learning_rate": 4.511392169754214e-06, + "loss": 0.839243471622467, + "step": 1252 + }, + { + "epoch": 2.2967032967032965, + "grad_norm": 0.33513882756233215, + "learning_rate": 4.486765997672263e-06, + "loss": 1.1593670845031738, + "step": 1254 + }, + { + "epoch": 2.3003663003663, + "grad_norm": 0.20222492516040802, + "learning_rate": 4.46224179146649e-06, + "loss": 1.1839113235473633, + "step": 1256 + }, + { + "epoch": 2.304029304029304, + "grad_norm": 0.21135662496089935, + "learning_rate": 4.437819935067847e-06, + "loss": 0.7787933945655823, + "step": 1258 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.8632537126541138, + "learning_rate": 4.413500810804986e-06, + "loss": 1.1648868322372437, + "step": 1260 + }, + { + "epoch": 2.311355311355311, + "grad_norm": 0.31939440965652466, + "learning_rate": 4.389284799398276e-06, + "loss": 1.0294675827026367, + "step": 1262 + }, + { + "epoch": 2.315018315018315, + "grad_norm": 0.2278747856616974, + "learning_rate": 4.365172279953825e-06, + "loss": 0.8878557085990906, + "step": 1264 + }, + { + "epoch": 2.3186813186813184, + "grad_norm": 0.6909482479095459, + "learning_rate": 4.34116362995756e-06, + "loss": 0.7896403670310974, + "step": 1266 + }, + { + "epoch": 2.3223443223443225, + "grad_norm": 0.32261016964912415, + "learning_rate": 4.317259225269313e-06, + "loss": 0.8231415748596191, + "step": 1268 + }, + { + "epoch": 2.326007326007326, + "grad_norm": 0.6928750872612, + "learning_rate": 4.293459440116935e-06, + "loss": 1.1856201887130737, + "step": 1270 + }, + { + "epoch": 2.32967032967033, + "grad_norm": 0.5950198769569397, + "learning_rate": 4.269764647090442e-06, + "loss": 1.0062392950057983, + "step": 1272 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.4709881544113159, + "learning_rate": 4.246175217136176e-06, + "loss": 0.6258052587509155, + "step": 1274 + }, + { + "epoch": 2.336996336996337, + "grad_norm": 1.007083773612976, + "learning_rate": 4.2226915195509954e-06, + "loss": 1.0182530879974365, + "step": 1276 + }, + { + "epoch": 2.340659340659341, + "grad_norm": 0.23191671073436737, + "learning_rate": 4.199313921976511e-06, + "loss": 0.8344160318374634, + "step": 1278 + }, + { + "epoch": 2.3443223443223444, + "grad_norm": 0.3454512059688568, + "learning_rate": 4.176042790393313e-06, + "loss": 0.7739337682723999, + "step": 1280 + }, + { + "epoch": 2.347985347985348, + "grad_norm": 0.49713563919067383, + "learning_rate": 4.152878489115244e-06, + "loss": 0.6238831877708435, + "step": 1282 + }, + { + "epoch": 2.3516483516483517, + "grad_norm": 0.21321338415145874, + "learning_rate": 4.129821380783698e-06, + "loss": 1.091771125793457, + "step": 1284 + }, + { + "epoch": 2.3553113553113554, + "grad_norm": 0.9252800941467285, + "learning_rate": 4.106871826361952e-06, + "loss": 0.6089442372322083, + "step": 1286 + }, + { + "epoch": 2.358974358974359, + "grad_norm": 0.38212814927101135, + "learning_rate": 4.084030185129495e-06, + "loss": 0.9349772334098816, + "step": 1288 + }, + { + "epoch": 2.3626373626373627, + "grad_norm": 0.16641326248645782, + "learning_rate": 4.061296814676429e-06, + "loss": 0.896765947341919, + "step": 1290 + }, + { + "epoch": 2.3663003663003663, + "grad_norm": 0.3341747522354126, + "learning_rate": 4.038672070897844e-06, + "loss": 0.7659744024276733, + "step": 1292 + }, + { + "epoch": 2.36996336996337, + "grad_norm": 0.5208961963653564, + "learning_rate": 4.016156307988262e-06, + "loss": 0.8882443308830261, + "step": 1294 + }, + { + "epoch": 2.3736263736263736, + "grad_norm": 0.2249266356229782, + "learning_rate": 3.9937498784361e-06, + "loss": 1.2214046716690063, + "step": 1296 + }, + { + "epoch": 2.3772893772893773, + "grad_norm": 0.21020947396755219, + "learning_rate": 3.9714531330181275e-06, + "loss": 1.1040786504745483, + "step": 1298 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 0.19527573883533478, + "learning_rate": 3.949266420793999e-06, + "loss": 1.1567643880844116, + "step": 1300 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 0.44200703501701355, + "learning_rate": 3.9271900891007734e-06, + "loss": 0.8775418996810913, + "step": 1302 + }, + { + "epoch": 2.3882783882783882, + "grad_norm": 0.41954803466796875, + "learning_rate": 3.905224483547479e-06, + "loss": 0.47357720136642456, + "step": 1304 + }, + { + "epoch": 2.391941391941392, + "grad_norm": 0.7417261600494385, + "learning_rate": 3.883369948009714e-06, + "loss": 0.9344196915626526, + "step": 1306 + }, + { + "epoch": 2.3956043956043955, + "grad_norm": 0.6811460256576538, + "learning_rate": 3.861626824624258e-06, + "loss": 1.1155997514724731, + "step": 1308 + }, + { + "epoch": 2.399267399267399, + "grad_norm": 0.42083945870399475, + "learning_rate": 3.839995453783694e-06, + "loss": 0.5117136836051941, + "step": 1310 + }, + { + "epoch": 2.402930402930403, + "grad_norm": 0.18507151305675507, + "learning_rate": 3.818476174131118e-06, + "loss": 1.11769437789917, + "step": 1312 + }, + { + "epoch": 2.4065934065934065, + "grad_norm": 0.1388321965932846, + "learning_rate": 3.7970693225548116e-06, + "loss": 0.8328091502189636, + "step": 1314 + }, + { + "epoch": 2.41025641025641, + "grad_norm": 0.31094399094581604, + "learning_rate": 3.7757752341829723e-06, + "loss": 1.115455985069275, + "step": 1316 + }, + { + "epoch": 2.413919413919414, + "grad_norm": 0.845039963722229, + "learning_rate": 3.754594242378466e-06, + "loss": 0.7884094715118408, + "step": 1318 + }, + { + "epoch": 2.4175824175824174, + "grad_norm": 0.31491175293922424, + "learning_rate": 3.7335266787336194e-06, + "loss": 0.7719835042953491, + "step": 1320 + }, + { + "epoch": 2.421245421245421, + "grad_norm": 0.5341482758522034, + "learning_rate": 3.712572873065012e-06, + "loss": 0.6111771464347839, + "step": 1322 + }, + { + "epoch": 2.4249084249084247, + "grad_norm": 1.4491015672683716, + "learning_rate": 3.69173315340833e-06, + "loss": 0.7621108889579773, + "step": 1324 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 1.1738007068634033, + "learning_rate": 3.6710078460132137e-06, + "loss": 0.7748345732688904, + "step": 1326 + }, + { + "epoch": 2.4322344322344325, + "grad_norm": 0.13677427172660828, + "learning_rate": 3.650397275338161e-06, + "loss": 0.6837164163589478, + "step": 1328 + }, + { + "epoch": 2.435897435897436, + "grad_norm": 1.0551925897598267, + "learning_rate": 3.6299017640454516e-06, + "loss": 0.8087068796157837, + "step": 1330 + }, + { + "epoch": 2.4395604395604398, + "grad_norm": 0.1782245635986328, + "learning_rate": 3.6095216329960786e-06, + "loss": 1.0505911111831665, + "step": 1332 + }, + { + "epoch": 2.4432234432234434, + "grad_norm": 0.20206193625926971, + "learning_rate": 3.5892572012447457e-06, + "loss": 0.7000587582588196, + "step": 1334 + }, + { + "epoch": 2.446886446886447, + "grad_norm": 0.2818435728549957, + "learning_rate": 3.5691087860348577e-06, + "loss": 0.6738724708557129, + "step": 1336 + }, + { + "epoch": 2.4505494505494507, + "grad_norm": 0.43494996428489685, + "learning_rate": 3.549076702793557e-06, + "loss": 1.0319753885269165, + "step": 1338 + }, + { + "epoch": 2.4542124542124544, + "grad_norm": 0.25145280361175537, + "learning_rate": 3.529161265126795e-06, + "loss": 0.8778097033500671, + "step": 1340 + }, + { + "epoch": 2.457875457875458, + "grad_norm": 0.08903591334819794, + "learning_rate": 3.5093627848144128e-06, + "loss": 0.6379270553588867, + "step": 1342 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 0.2862868309020996, + "learning_rate": 3.4896815718052534e-06, + "loss": 0.8156915903091431, + "step": 1344 + }, + { + "epoch": 2.4652014652014653, + "grad_norm": 0.14661253988742828, + "learning_rate": 3.4701179342123313e-06, + "loss": 1.0697602033615112, + "step": 1346 + }, + { + "epoch": 2.468864468864469, + "grad_norm": 0.1793731302022934, + "learning_rate": 3.4506721783079925e-06, + "loss": 1.1031157970428467, + "step": 1348 + }, + { + "epoch": 2.4725274725274726, + "grad_norm": 0.24754835665225983, + "learning_rate": 3.4313446085191203e-06, + "loss": 0.7289459705352783, + "step": 1350 + }, + { + "epoch": 2.4761904761904763, + "grad_norm": 0.159286767244339, + "learning_rate": 3.4121355274223727e-06, + "loss": 0.8309732675552368, + "step": 1352 + }, + { + "epoch": 2.47985347985348, + "grad_norm": 1.1426700353622437, + "learning_rate": 3.3930452357394473e-06, + "loss": 0.9143206477165222, + "step": 1354 + }, + { + "epoch": 2.4835164835164836, + "grad_norm": 0.05411989986896515, + "learning_rate": 3.3740740323323705e-06, + "loss": 0.9112240672111511, + "step": 1356 + }, + { + "epoch": 2.4871794871794872, + "grad_norm": 0.1567273885011673, + "learning_rate": 3.3552222141988257e-06, + "loss": 1.0814073085784912, + "step": 1358 + }, + { + "epoch": 2.490842490842491, + "grad_norm": 0.3032623827457428, + "learning_rate": 3.336490076467489e-06, + "loss": 0.8779569268226624, + "step": 1360 + }, + { + "epoch": 2.4945054945054945, + "grad_norm": 0.2695983350276947, + "learning_rate": 3.31787791239342e-06, + "loss": 0.8037658929824829, + "step": 1362 + }, + { + "epoch": 2.498168498168498, + "grad_norm": 0.2464946210384369, + "learning_rate": 3.2993860133534763e-06, + "loss": 0.8425225615501404, + "step": 1364 + }, + { + "epoch": 2.501831501831502, + "grad_norm": 0.16769325733184814, + "learning_rate": 3.2810146688417304e-06, + "loss": 1.1095960140228271, + "step": 1366 + }, + { + "epoch": 2.5054945054945055, + "grad_norm": 0.16507335007190704, + "learning_rate": 3.2627641664649666e-06, + "loss": 0.7693407535552979, + "step": 1368 + }, + { + "epoch": 2.509157509157509, + "grad_norm": 0.9319798350334167, + "learning_rate": 3.2446347919381533e-06, + "loss": 0.9375527501106262, + "step": 1370 + }, + { + "epoch": 2.5128205128205128, + "grad_norm": 0.4757522940635681, + "learning_rate": 3.226626829079979e-06, + "loss": 0.6353393197059631, + "step": 1372 + }, + { + "epoch": 2.5164835164835164, + "grad_norm": 0.20639510452747345, + "learning_rate": 3.2087405598084194e-06, + "loss": 0.7712477445602417, + "step": 1374 + }, + { + "epoch": 2.52014652014652, + "grad_norm": 0.2182164192199707, + "learning_rate": 3.1909762641363083e-06, + "loss": 0.9633672833442688, + "step": 1376 + }, + { + "epoch": 2.5238095238095237, + "grad_norm": 0.335929274559021, + "learning_rate": 3.173334220166962e-06, + "loss": 0.7830007076263428, + "step": 1378 + }, + { + "epoch": 2.5274725274725274, + "grad_norm": 0.23919770121574402, + "learning_rate": 3.155814704089823e-06, + "loss": 0.8970922827720642, + "step": 1380 + }, + { + "epoch": 2.531135531135531, + "grad_norm": 0.034020014107227325, + "learning_rate": 3.1384179901761343e-06, + "loss": 0.8635251522064209, + "step": 1382 + }, + { + "epoch": 2.5347985347985347, + "grad_norm": 0.215301051735878, + "learning_rate": 3.1211443507746546e-06, + "loss": 0.7926411628723145, + "step": 1384 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 0.14359386265277863, + "learning_rate": 3.1039940563073894e-06, + "loss": 1.1008884906768799, + "step": 1386 + }, + { + "epoch": 2.542124542124542, + "grad_norm": 0.46346187591552734, + "learning_rate": 3.0869673752653447e-06, + "loss": 0.7606490254402161, + "step": 1388 + }, + { + "epoch": 2.5457875457875456, + "grad_norm": 0.342520147562027, + "learning_rate": 3.0700645742043476e-06, + "loss": 0.8070803880691528, + "step": 1390 + }, + { + "epoch": 2.5494505494505493, + "grad_norm": 0.3418232202529907, + "learning_rate": 3.0532859177408587e-06, + "loss": 0.983840823173523, + "step": 1392 + }, + { + "epoch": 2.553113553113553, + "grad_norm": 0.1977006494998932, + "learning_rate": 3.03663166854783e-06, + "loss": 0.7210864424705505, + "step": 1394 + }, + { + "epoch": 2.5567765567765566, + "grad_norm": 0.16176871955394745, + "learning_rate": 3.020102087350594e-06, + "loss": 0.34565648436546326, + "step": 1396 + }, + { + "epoch": 2.5604395604395602, + "grad_norm": 0.2752530574798584, + "learning_rate": 3.0036974329227862e-06, + "loss": 1.1138232946395874, + "step": 1398 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.2671622931957245, + "learning_rate": 2.9874179620822856e-06, + "loss": 1.1061241626739502, + "step": 1400 + }, + { + "epoch": 2.5677655677655675, + "grad_norm": 0.5395840406417847, + "learning_rate": 2.971263929687207e-06, + "loss": 0.8952687978744507, + "step": 1402 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.2224486917257309, + "learning_rate": 2.9552355886318968e-06, + "loss": 1.105149269104004, + "step": 1404 + }, + { + "epoch": 2.575091575091575, + "grad_norm": 1.0579370260238647, + "learning_rate": 2.9393331898429777e-06, + "loss": 0.5482126474380493, + "step": 1406 + }, + { + "epoch": 2.578754578754579, + "grad_norm": 0.2807334065437317, + "learning_rate": 2.9235569822754317e-06, + "loss": 1.1531801223754883, + "step": 1408 + }, + { + "epoch": 2.5824175824175826, + "grad_norm": 0.360220730304718, + "learning_rate": 2.9079072129086906e-06, + "loss": 0.8873265981674194, + "step": 1410 + }, + { + "epoch": 2.586080586080586, + "grad_norm": 0.6387040019035339, + "learning_rate": 2.89238412674277e-06, + "loss": 0.8655451536178589, + "step": 1412 + }, + { + "epoch": 2.58974358974359, + "grad_norm": 0.5153301358222961, + "learning_rate": 2.8769879667944393e-06, + "loss": 0.953106164932251, + "step": 1414 + }, + { + "epoch": 2.5934065934065935, + "grad_norm": 0.44717395305633545, + "learning_rate": 2.8617189740934113e-06, + "loss": 1.0878214836120605, + "step": 1416 + }, + { + "epoch": 2.597069597069597, + "grad_norm": 0.2903793454170227, + "learning_rate": 2.8465773876785786e-06, + "loss": 0.7813395261764526, + "step": 1418 + }, + { + "epoch": 2.600732600732601, + "grad_norm": 0.13546399772167206, + "learning_rate": 2.8315634445942623e-06, + "loss": 1.085912823677063, + "step": 1420 + }, + { + "epoch": 2.6043956043956045, + "grad_norm": 0.4397055208683014, + "learning_rate": 2.8166773798864978e-06, + "loss": 1.0346283912658691, + "step": 1422 + }, + { + "epoch": 2.608058608058608, + "grad_norm": 0.25535109639167786, + "learning_rate": 2.8019194265993683e-06, + "loss": 1.1859427690505981, + "step": 1424 + }, + { + "epoch": 2.6117216117216118, + "grad_norm": 0.4347076416015625, + "learning_rate": 2.787289815771348e-06, + "loss": 1.2661553621292114, + "step": 1426 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 0.15085160732269287, + "learning_rate": 2.7727887764316835e-06, + "loss": 1.1397687196731567, + "step": 1428 + }, + { + "epoch": 2.619047619047619, + "grad_norm": 0.3919295370578766, + "learning_rate": 2.758416535596812e-06, + "loss": 1.078932285308838, + "step": 1430 + }, + { + "epoch": 2.6227106227106227, + "grad_norm": 0.67396479845047, + "learning_rate": 2.744173318266809e-06, + "loss": 0.9404812455177307, + "step": 1432 + }, + { + "epoch": 2.6263736263736264, + "grad_norm": 0.6748982667922974, + "learning_rate": 2.7300593474218583e-06, + "loss": 0.944557785987854, + "step": 1434 + }, + { + "epoch": 2.63003663003663, + "grad_norm": 0.33066844940185547, + "learning_rate": 2.7160748440187736e-06, + "loss": 1.3000890016555786, + "step": 1436 + }, + { + "epoch": 2.6336996336996337, + "grad_norm": 0.7671491503715515, + "learning_rate": 2.702220026987525e-06, + "loss": 1.1358200311660767, + "step": 1438 + }, + { + "epoch": 2.6373626373626373, + "grad_norm": 0.3073434829711914, + "learning_rate": 2.6884951132278185e-06, + "loss": 1.0983126163482666, + "step": 1440 + }, + { + "epoch": 2.641025641025641, + "grad_norm": 0.21354691684246063, + "learning_rate": 2.6749003176057092e-06, + "loss": 1.1217743158340454, + "step": 1442 + }, + { + "epoch": 2.6446886446886446, + "grad_norm": 0.8426246047019958, + "learning_rate": 2.6614358529502165e-06, + "loss": 1.0780593156814575, + "step": 1444 + }, + { + "epoch": 2.6483516483516483, + "grad_norm": 0.23787999153137207, + "learning_rate": 2.6481019300500166e-06, + "loss": 0.7865286469459534, + "step": 1446 + }, + { + "epoch": 2.652014652014652, + "grad_norm": 0.1286686360836029, + "learning_rate": 2.634898757650121e-06, + "loss": 0.8131098747253418, + "step": 1448 + }, + { + "epoch": 2.6556776556776556, + "grad_norm": 0.1445346623659134, + "learning_rate": 2.6218265424486233e-06, + "loss": 1.0926539897918701, + "step": 1450 + }, + { + "epoch": 2.659340659340659, + "grad_norm": 0.7427401542663574, + "learning_rate": 2.608885489093455e-06, + "loss": 0.6913706660270691, + "step": 1452 + }, + { + "epoch": 2.663003663003663, + "grad_norm": 0.06115258112549782, + "learning_rate": 2.5960758001791893e-06, + "loss": 0.6983692646026611, + "step": 1454 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.2783794701099396, + "learning_rate": 2.5833976762438605e-06, + "loss": 1.070138692855835, + "step": 1456 + }, + { + "epoch": 2.67032967032967, + "grad_norm": 0.2989695966243744, + "learning_rate": 2.5708513157658295e-06, + "loss": 0.7558898329734802, + "step": 1458 + }, + { + "epoch": 2.6739926739926743, + "grad_norm": 0.06930552423000336, + "learning_rate": 2.5584369151606785e-06, + "loss": 0.7775593400001526, + "step": 1460 + }, + { + "epoch": 2.677655677655678, + "grad_norm": 0.25155073404312134, + "learning_rate": 2.5461546687781325e-06, + "loss": 0.7822065949440002, + "step": 1462 + }, + { + "epoch": 2.6813186813186816, + "grad_norm": 0.24393150210380554, + "learning_rate": 2.5340047688990142e-06, + "loss": 1.0643365383148193, + "step": 1464 + }, + { + "epoch": 2.684981684981685, + "grad_norm": 0.23728282749652863, + "learning_rate": 2.5219874057322453e-06, + "loss": 0.8780837655067444, + "step": 1466 + }, + { + "epoch": 2.688644688644689, + "grad_norm": 0.19571846723556519, + "learning_rate": 2.5101027674118523e-06, + "loss": 1.0327012538909912, + "step": 1468 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 0.4742324650287628, + "learning_rate": 2.4983510399940377e-06, + "loss": 0.6416606903076172, + "step": 1470 + }, + { + "epoch": 2.695970695970696, + "grad_norm": 0.19169728457927704, + "learning_rate": 2.4867324074542525e-06, + "loss": 0.7808413505554199, + "step": 1472 + }, + { + "epoch": 2.6996336996337, + "grad_norm": 0.18114076554775238, + "learning_rate": 2.4752470516843257e-06, + "loss": 0.4357774257659912, + "step": 1474 + }, + { + "epoch": 2.7032967032967035, + "grad_norm": 0.2890799343585968, + "learning_rate": 2.463895152489617e-06, + "loss": 1.1387816667556763, + "step": 1476 + }, + { + "epoch": 2.706959706959707, + "grad_norm": 0.32990363240242004, + "learning_rate": 2.4526768875861938e-06, + "loss": 0.6536518335342407, + "step": 1478 + }, + { + "epoch": 2.7106227106227108, + "grad_norm": 0.3412167727947235, + "learning_rate": 2.4415924325980575e-06, + "loss": 1.0885263681411743, + "step": 1480 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.17220936715602875, + "learning_rate": 2.4306419610543885e-06, + "loss": 1.111185073852539, + "step": 1482 + }, + { + "epoch": 2.717948717948718, + "grad_norm": 0.3688224256038666, + "learning_rate": 2.4198256443868327e-06, + "loss": 1.0014547109603882, + "step": 1484 + }, + { + "epoch": 2.7216117216117217, + "grad_norm": 0.23104700446128845, + "learning_rate": 2.4091436519268167e-06, + "loss": 1.1393983364105225, + "step": 1486 + }, + { + "epoch": 2.7252747252747254, + "grad_norm": 0.22667832672595978, + "learning_rate": 2.3985961509028994e-06, + "loss": 1.1307849884033203, + "step": 1488 + }, + { + "epoch": 2.728937728937729, + "grad_norm": 0.7048260569572449, + "learning_rate": 2.3881833064381478e-06, + "loss": 0.5263049006462097, + "step": 1490 + }, + { + "epoch": 2.7326007326007327, + "grad_norm": 0.1926041543483734, + "learning_rate": 2.3779052815475553e-06, + "loss": 1.1333121061325073, + "step": 1492 + }, + { + "epoch": 2.7362637362637363, + "grad_norm": 0.42971038818359375, + "learning_rate": 2.3677622371354932e-06, + "loss": 0.6400864124298096, + "step": 1494 + }, + { + "epoch": 2.73992673992674, + "grad_norm": 0.2434394806623459, + "learning_rate": 2.357754331993187e-06, + "loss": 1.201290249824524, + "step": 1496 + }, + { + "epoch": 2.7435897435897436, + "grad_norm": 0.16831986606121063, + "learning_rate": 2.347881722796234e-06, + "loss": 1.0851210355758667, + "step": 1498 + }, + { + "epoch": 2.7472527472527473, + "grad_norm": 0.3970390856266022, + "learning_rate": 2.3381445641021445e-06, + "loss": 0.7578325271606445, + "step": 1500 + }, + { + "epoch": 2.750915750915751, + "grad_norm": 0.4430757761001587, + "learning_rate": 2.328543008347928e-06, + "loss": 1.035079002380371, + "step": 1502 + }, + { + "epoch": 2.7545787545787546, + "grad_norm": 0.23971140384674072, + "learning_rate": 2.31907720584771e-06, + "loss": 1.1079963445663452, + "step": 1504 + }, + { + "epoch": 2.758241758241758, + "grad_norm": 0.37300434708595276, + "learning_rate": 2.3097473047903645e-06, + "loss": 1.1286638975143433, + "step": 1506 + }, + { + "epoch": 2.761904761904762, + "grad_norm": 0.0422448106110096, + "learning_rate": 2.3005534512372106e-06, + "loss": 0.9173861145973206, + "step": 1508 + }, + { + "epoch": 2.7655677655677655, + "grad_norm": 0.25997260212898254, + "learning_rate": 2.2914957891197182e-06, + "loss": 1.0080313682556152, + "step": 1510 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 0.30770131945610046, + "learning_rate": 2.2825744602372506e-06, + "loss": 0.9287357330322266, + "step": 1512 + }, + { + "epoch": 2.772893772893773, + "grad_norm": 0.2603771686553955, + "learning_rate": 2.2737896042548537e-06, + "loss": 1.0911868810653687, + "step": 1514 + }, + { + "epoch": 2.7765567765567765, + "grad_norm": 0.38838595151901245, + "learning_rate": 2.2651413587010634e-06, + "loss": 1.0172020196914673, + "step": 1516 + }, + { + "epoch": 2.78021978021978, + "grad_norm": 0.04338948056101799, + "learning_rate": 2.2566298589657546e-06, + "loss": 0.9788475036621094, + "step": 1518 + }, + { + "epoch": 2.7838827838827838, + "grad_norm": 0.3132210373878479, + "learning_rate": 2.2482552382980194e-06, + "loss": 0.5113797187805176, + "step": 1520 + }, + { + "epoch": 2.7875457875457874, + "grad_norm": 0.41038262844085693, + "learning_rate": 2.240017627804088e-06, + "loss": 0.7732734084129333, + "step": 1522 + }, + { + "epoch": 2.791208791208791, + "grad_norm": 0.23137636482715607, + "learning_rate": 2.231917156445265e-06, + "loss": 0.8295901417732239, + "step": 1524 + }, + { + "epoch": 2.7948717948717947, + "grad_norm": 0.3408252000808716, + "learning_rate": 2.223953951035919e-06, + "loss": 1.143431544303894, + "step": 1526 + }, + { + "epoch": 2.7985347985347984, + "grad_norm": 0.2596583664417267, + "learning_rate": 2.216128136241497e-06, + "loss": 1.099791407585144, + "step": 1528 + }, + { + "epoch": 2.802197802197802, + "grad_norm": 0.18814221024513245, + "learning_rate": 2.208439834576568e-06, + "loss": 1.0971970558166504, + "step": 1530 + }, + { + "epoch": 2.8058608058608057, + "grad_norm": 0.20894794166088104, + "learning_rate": 2.200889166402908e-06, + "loss": 0.9817790389060974, + "step": 1532 + }, + { + "epoch": 2.8095238095238093, + "grad_norm": 0.3537425696849823, + "learning_rate": 2.193476249927617e-06, + "loss": 0.7601557970046997, + "step": 1534 + }, + { + "epoch": 2.813186813186813, + "grad_norm": 0.22301346063613892, + "learning_rate": 2.1862012012012647e-06, + "loss": 1.2858803272247314, + "step": 1536 + }, + { + "epoch": 2.8168498168498166, + "grad_norm": 0.5590947270393372, + "learning_rate": 2.179064134116078e-06, + "loss": 0.9057199954986572, + "step": 1538 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.38495972752571106, + "learning_rate": 2.1720651604041543e-06, + "loss": 0.7820447683334351, + "step": 1540 + }, + { + "epoch": 2.824175824175824, + "grad_norm": 0.41492217779159546, + "learning_rate": 2.1652043896357132e-06, + "loss": 0.8878074288368225, + "step": 1542 + }, + { + "epoch": 2.8278388278388276, + "grad_norm": 0.45532068610191345, + "learning_rate": 2.1584819292173844e-06, + "loss": 1.0764187574386597, + "step": 1544 + }, + { + "epoch": 2.8315018315018317, + "grad_norm": 0.2862100899219513, + "learning_rate": 2.1518978843905204e-06, + "loss": 1.1286197900772095, + "step": 1546 + }, + { + "epoch": 2.8351648351648353, + "grad_norm": 0.18794697523117065, + "learning_rate": 2.1454523582295567e-06, + "loss": 1.1888434886932373, + "step": 1548 + }, + { + "epoch": 2.838827838827839, + "grad_norm": 0.4953882396221161, + "learning_rate": 2.1391454516403876e-06, + "loss": 0.8518368601799011, + "step": 1550 + }, + { + "epoch": 2.8424908424908426, + "grad_norm": 0.4500260353088379, + "learning_rate": 2.1329772633587976e-06, + "loss": 0.5600649118423462, + "step": 1552 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 0.21223555505275726, + "learning_rate": 2.1269478899489068e-06, + "loss": 1.1281017065048218, + "step": 1554 + }, + { + "epoch": 2.84981684981685, + "grad_norm": 0.17230379581451416, + "learning_rate": 2.1210574258016675e-06, + "loss": 0.9187098741531372, + "step": 1556 + }, + { + "epoch": 2.8534798534798536, + "grad_norm": 0.13780535757541656, + "learning_rate": 2.1153059631333785e-06, + "loss": 1.0671217441558838, + "step": 1558 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.29513898491859436, + "learning_rate": 2.1096935919842434e-06, + "loss": 0.5967673063278198, + "step": 1560 + }, + { + "epoch": 2.860805860805861, + "grad_norm": 0.2297191023826599, + "learning_rate": 2.104220400216967e-06, + "loss": 0.705269992351532, + "step": 1562 + }, + { + "epoch": 2.8644688644688645, + "grad_norm": 0.23687344789505005, + "learning_rate": 2.0988864735153724e-06, + "loss": 0.8660311102867126, + "step": 1564 + }, + { + "epoch": 2.868131868131868, + "grad_norm": 0.4433053433895111, + "learning_rate": 2.0936918953830633e-06, + "loss": 0.6954091787338257, + "step": 1566 + }, + { + "epoch": 2.871794871794872, + "grad_norm": 0.30360910296440125, + "learning_rate": 2.088636747142114e-06, + "loss": 0.7069787979125977, + "step": 1568 + }, + { + "epoch": 2.8754578754578755, + "grad_norm": 0.06758692115545273, + "learning_rate": 2.083721107931803e-06, + "loss": 0.7010313868522644, + "step": 1570 + }, + { + "epoch": 2.879120879120879, + "grad_norm": 0.30862146615982056, + "learning_rate": 2.0789450547073634e-06, + "loss": 0.66036057472229, + "step": 1572 + }, + { + "epoch": 2.8827838827838828, + "grad_norm": 0.2272968739271164, + "learning_rate": 2.074308662238789e-06, + "loss": 0.9868662357330322, + "step": 1574 + }, + { + "epoch": 2.8864468864468864, + "grad_norm": 0.4267461895942688, + "learning_rate": 2.069812003109654e-06, + "loss": 0.9745535254478455, + "step": 1576 + }, + { + "epoch": 2.89010989010989, + "grad_norm": 0.08947110921144485, + "learning_rate": 2.0654551477159868e-06, + "loss": 0.8887320160865784, + "step": 1578 + }, + { + "epoch": 2.8937728937728937, + "grad_norm": 0.20402641594409943, + "learning_rate": 2.0612381642651584e-06, + "loss": 1.170695185661316, + "step": 1580 + }, + { + "epoch": 2.8974358974358974, + "grad_norm": 0.3134024441242218, + "learning_rate": 2.057161118774821e-06, + "loss": 0.8388773798942566, + "step": 1582 + }, + { + "epoch": 2.901098901098901, + "grad_norm": 0.04043447971343994, + "learning_rate": 2.05322407507187e-06, + "loss": 0.7783088684082031, + "step": 1584 + }, + { + "epoch": 2.9047619047619047, + "grad_norm": 0.30711326003074646, + "learning_rate": 2.0494270947914507e-06, + "loss": 0.788719892501831, + "step": 1586 + }, + { + "epoch": 2.9084249084249083, + "grad_norm": 0.3433006703853607, + "learning_rate": 2.0457702373759864e-06, + "loss": 1.1323091983795166, + "step": 1588 + }, + { + "epoch": 2.912087912087912, + "grad_norm": 0.2119935005903244, + "learning_rate": 2.0422535600742526e-06, + "loss": 1.1913877725601196, + "step": 1590 + }, + { + "epoch": 2.9157509157509156, + "grad_norm": 0.4435896575450897, + "learning_rate": 2.03887711794048e-06, + "loss": 0.5050473809242249, + "step": 1592 + }, + { + "epoch": 2.9194139194139193, + "grad_norm": 0.5109768509864807, + "learning_rate": 2.0356409638334902e-06, + "loss": 1.1414501667022705, + "step": 1594 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 0.4784262776374817, + "learning_rate": 2.032545148415871e-06, + "loss": 0.747880756855011, + "step": 1596 + }, + { + "epoch": 2.926739926739927, + "grad_norm": 0.19986164569854736, + "learning_rate": 2.0295897201531838e-06, + "loss": 1.1563737392425537, + "step": 1598 + }, + { + "epoch": 2.9304029304029307, + "grad_norm": 0.2983030378818512, + "learning_rate": 2.026774725313199e-06, + "loss": 0.7789967060089111, + "step": 1600 + }, + { + "epoch": 2.9340659340659343, + "grad_norm": 0.21115927398204803, + "learning_rate": 2.0241002079651803e-06, + "loss": 1.1291173696517944, + "step": 1602 + }, + { + "epoch": 2.937728937728938, + "grad_norm": 0.08551673591136932, + "learning_rate": 2.0215662099791874e-06, + "loss": 0.8520828485488892, + "step": 1604 + }, + { + "epoch": 2.9413919413919416, + "grad_norm": 0.17809857428073883, + "learning_rate": 2.019172771025426e-06, + "loss": 1.1100882291793823, + "step": 1606 + }, + { + "epoch": 2.9450549450549453, + "grad_norm": 0.18868334591388702, + "learning_rate": 2.0169199285736234e-06, + "loss": 0.7092351317405701, + "step": 1608 + }, + { + "epoch": 2.948717948717949, + "grad_norm": 0.32582998275756836, + "learning_rate": 2.0148077178924412e-06, + "loss": 1.0054452419281006, + "step": 1610 + }, + { + "epoch": 2.9523809523809526, + "grad_norm": 1.538607120513916, + "learning_rate": 2.0128361720489263e-06, + "loss": 0.8747723698616028, + "step": 1612 + }, + { + "epoch": 2.956043956043956, + "grad_norm": 1.1463967561721802, + "learning_rate": 2.0110053219079927e-06, + "loss": 0.6871626377105713, + "step": 1614 + }, + { + "epoch": 2.95970695970696, + "grad_norm": 0.29785850644111633, + "learning_rate": 2.009315196131934e-06, + "loss": 0.8241419792175293, + "step": 1616 + }, + { + "epoch": 2.9633699633699635, + "grad_norm": 0.2533627152442932, + "learning_rate": 2.0077658211799823e-06, + "loss": 1.3680229187011719, + "step": 1618 + }, + { + "epoch": 2.967032967032967, + "grad_norm": 0.41148898005485535, + "learning_rate": 2.0063572213078856e-06, + "loss": 1.2273290157318115, + "step": 1620 + }, + { + "epoch": 2.970695970695971, + "grad_norm": 0.2996610701084137, + "learning_rate": 2.0050894185675354e-06, + "loss": 0.9176530838012695, + "step": 1622 + }, + { + "epoch": 2.9743589743589745, + "grad_norm": 0.6570471525192261, + "learning_rate": 2.0039624328066154e-06, + "loss": 0.780877411365509, + "step": 1624 + }, + { + "epoch": 2.978021978021978, + "grad_norm": 0.5498982667922974, + "learning_rate": 2.0029762816682963e-06, + "loss": 0.8718687295913696, + "step": 1626 + }, + { + "epoch": 2.9816849816849818, + "grad_norm": 0.1393526792526245, + "learning_rate": 2.0021309805909546e-06, + "loss": 0.9456213116645813, + "step": 1628 + }, + { + "epoch": 2.9853479853479854, + "grad_norm": 1.554042935371399, + "learning_rate": 2.001426542807935e-06, + "loss": 1.4017192125320435, + "step": 1630 + }, + { + "epoch": 2.989010989010989, + "grad_norm": 0.4462849199771881, + "learning_rate": 2.000862979347339e-06, + "loss": 1.0045907497406006, + "step": 1632 + }, + { + "epoch": 2.9926739926739927, + "grad_norm": 0.33116626739501953, + "learning_rate": 2.0004402990318574e-06, + "loss": 0.7610074281692505, + "step": 1634 + }, + { + "epoch": 2.9963369963369964, + "grad_norm": 0.1885322779417038, + "learning_rate": 2.000158508478629e-06, + "loss": 1.2684826850891113, + "step": 1636 + }, + { + "epoch": 3.0, + "grad_norm": 0.30161595344543457, + "learning_rate": 2.0000176120991345e-06, + "loss": 1.06321382522583, + "step": 1638 + }, + { + "epoch": 3.0, + "step": 1638, + "total_flos": 8.4482141520606e+18, + "train_loss": 1.0233670397410317, + "train_runtime": 54643.5024, + "train_samples_per_second": 0.719, + "train_steps_per_second": 0.03 + } + ], + "logging_steps": 2, + "max_steps": 1638, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 99999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.4482141520606e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}