| { | |
| "best_metric": 0.798653244972229, | |
| "best_model_checkpoint": "FastCoderL4-ITX/checkpoint-500", | |
| "epoch": 1.0, | |
| "eval_steps": 250, | |
| "global_step": 547, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0018281535648994515, | |
| "grad_norm": 16.024444580078125, | |
| "learning_rate": 1.2000000000000002e-07, | |
| "loss": 1.6383, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.003656307129798903, | |
| "grad_norm": 16.114477157592773, | |
| "learning_rate": 2.4000000000000003e-07, | |
| "loss": 1.7323, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.005484460694698354, | |
| "grad_norm": 14.292167663574219, | |
| "learning_rate": 3.6e-07, | |
| "loss": 1.4207, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.007312614259597806, | |
| "grad_norm": 15.010176658630371, | |
| "learning_rate": 4.800000000000001e-07, | |
| "loss": 1.5956, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.009140767824497258, | |
| "grad_norm": 13.827630996704102, | |
| "learning_rate": 6.000000000000001e-07, | |
| "loss": 1.49, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.010968921389396709, | |
| "grad_norm": 15.43071174621582, | |
| "learning_rate": 7.2e-07, | |
| "loss": 1.6081, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.012797074954296161, | |
| "grad_norm": 14.97592544555664, | |
| "learning_rate": 8.4e-07, | |
| "loss": 1.6164, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.014625228519195612, | |
| "grad_norm": 11.73971939086914, | |
| "learning_rate": 9.600000000000001e-07, | |
| "loss": 1.4299, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.016453382084095063, | |
| "grad_norm": 12.449714660644531, | |
| "learning_rate": 1.08e-06, | |
| "loss": 1.3328, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.018281535648994516, | |
| "grad_norm": 12.710100173950195, | |
| "learning_rate": 1.2000000000000002e-06, | |
| "loss": 1.4129, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02010968921389397, | |
| "grad_norm": 12.13203239440918, | |
| "learning_rate": 1.3199999999999999e-06, | |
| "loss": 1.3971, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.021937842778793418, | |
| "grad_norm": 10.500185012817383, | |
| "learning_rate": 1.44e-06, | |
| "loss": 1.4321, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.02376599634369287, | |
| "grad_norm": 10.064560890197754, | |
| "learning_rate": 1.5599999999999999e-06, | |
| "loss": 1.2872, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.025594149908592323, | |
| "grad_norm": 7.85143518447876, | |
| "learning_rate": 1.68e-06, | |
| "loss": 1.2345, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.027422303473491772, | |
| "grad_norm": 7.530126094818115, | |
| "learning_rate": 1.8e-06, | |
| "loss": 1.1803, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.029250457038391225, | |
| "grad_norm": 6.091775417327881, | |
| "learning_rate": 1.9200000000000003e-06, | |
| "loss": 1.2247, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.031078610603290677, | |
| "grad_norm": 4.9651384353637695, | |
| "learning_rate": 2.0400000000000004e-06, | |
| "loss": 1.1655, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.03290676416819013, | |
| "grad_norm": 6.209571361541748, | |
| "learning_rate": 2.16e-06, | |
| "loss": 1.0649, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.03473491773308958, | |
| "grad_norm": 4.946502208709717, | |
| "learning_rate": 2.28e-06, | |
| "loss": 1.1046, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.03656307129798903, | |
| "grad_norm": 4.954932689666748, | |
| "learning_rate": 2.4000000000000003e-06, | |
| "loss": 1.0964, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.038391224862888484, | |
| "grad_norm": 3.8354671001434326, | |
| "learning_rate": 2.52e-06, | |
| "loss": 1.2277, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.04021937842778794, | |
| "grad_norm": 4.310220718383789, | |
| "learning_rate": 2.6399999999999997e-06, | |
| "loss": 1.042, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.04204753199268738, | |
| "grad_norm": 3.9748997688293457, | |
| "learning_rate": 2.76e-06, | |
| "loss": 1.0234, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.043875685557586835, | |
| "grad_norm": 3.9019360542297363, | |
| "learning_rate": 2.88e-06, | |
| "loss": 1.1286, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.04570383912248629, | |
| "grad_norm": 4.246694564819336, | |
| "learning_rate": 3e-06, | |
| "loss": 0.9793, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.04753199268738574, | |
| "grad_norm": 3.8797051906585693, | |
| "learning_rate": 3.1199999999999998e-06, | |
| "loss": 1.0747, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.04936014625228519, | |
| "grad_norm": 4.0023908615112305, | |
| "learning_rate": 3.24e-06, | |
| "loss": 1.1031, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.051188299817184646, | |
| "grad_norm": 4.26245641708374, | |
| "learning_rate": 3.36e-06, | |
| "loss": 1.003, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.05301645338208409, | |
| "grad_norm": 4.6040215492248535, | |
| "learning_rate": 3.48e-06, | |
| "loss": 0.9311, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.054844606946983544, | |
| "grad_norm": 4.464705467224121, | |
| "learning_rate": 3.6e-06, | |
| "loss": 1.0341, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.056672760511883, | |
| "grad_norm": 3.787562608718872, | |
| "learning_rate": 3.72e-06, | |
| "loss": 0.984, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.05850091407678245, | |
| "grad_norm": 3.2259016036987305, | |
| "learning_rate": 3.8400000000000005e-06, | |
| "loss": 0.9167, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.0603290676416819, | |
| "grad_norm": 3.7597789764404297, | |
| "learning_rate": 3.96e-06, | |
| "loss": 1.0784, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.062157221206581355, | |
| "grad_norm": 3.173090934753418, | |
| "learning_rate": 4.080000000000001e-06, | |
| "loss": 0.9436, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.06398537477148081, | |
| "grad_norm": 3.336909055709839, | |
| "learning_rate": 4.2000000000000004e-06, | |
| "loss": 0.8013, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.06581352833638025, | |
| "grad_norm": 2.738156318664551, | |
| "learning_rate": 4.32e-06, | |
| "loss": 1.1238, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.06764168190127971, | |
| "grad_norm": 3.3270339965820312, | |
| "learning_rate": 4.44e-06, | |
| "loss": 0.8423, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.06946983546617916, | |
| "grad_norm": 2.872663736343384, | |
| "learning_rate": 4.56e-06, | |
| "loss": 0.9931, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.0712979890310786, | |
| "grad_norm": 3.2571451663970947, | |
| "learning_rate": 4.68e-06, | |
| "loss": 0.9323, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.07312614259597806, | |
| "grad_norm": 2.999234437942505, | |
| "learning_rate": 4.800000000000001e-06, | |
| "loss": 0.9247, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.07495429616087751, | |
| "grad_norm": 2.9580419063568115, | |
| "learning_rate": 4.92e-06, | |
| "loss": 0.8751, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.07678244972577697, | |
| "grad_norm": 2.8437395095825195, | |
| "learning_rate": 5.04e-06, | |
| "loss": 0.8857, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.07861060329067641, | |
| "grad_norm": 3.175656318664551, | |
| "learning_rate": 5.16e-06, | |
| "loss": 0.8942, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.08043875685557587, | |
| "grad_norm": 2.684788703918457, | |
| "learning_rate": 5.279999999999999e-06, | |
| "loss": 0.8725, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.08226691042047532, | |
| "grad_norm": 3.000286340713501, | |
| "learning_rate": 5.4e-06, | |
| "loss": 0.8803, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.08409506398537477, | |
| "grad_norm": 2.856066942214966, | |
| "learning_rate": 5.52e-06, | |
| "loss": 0.9705, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.08592321755027423, | |
| "grad_norm": 3.0575389862060547, | |
| "learning_rate": 5.64e-06, | |
| "loss": 0.8106, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.08775137111517367, | |
| "grad_norm": 2.649608612060547, | |
| "learning_rate": 5.76e-06, | |
| "loss": 1.0701, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.08957952468007313, | |
| "grad_norm": 3.1014580726623535, | |
| "learning_rate": 5.8800000000000005e-06, | |
| "loss": 0.9607, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.09140767824497258, | |
| "grad_norm": 2.6570193767547607, | |
| "learning_rate": 6e-06, | |
| "loss": 0.9685, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.09323583180987204, | |
| "grad_norm": 3.082258462905884, | |
| "learning_rate": 6.12e-06, | |
| "loss": 1.0039, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.09506398537477148, | |
| "grad_norm": 2.4003512859344482, | |
| "learning_rate": 6.2399999999999995e-06, | |
| "loss": 0.8934, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.09689213893967093, | |
| "grad_norm": 2.605583667755127, | |
| "learning_rate": 6.36e-06, | |
| "loss": 0.8891, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.09872029250457039, | |
| "grad_norm": 2.541799306869507, | |
| "learning_rate": 6.48e-06, | |
| "loss": 0.8183, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.10054844606946983, | |
| "grad_norm": 2.594459056854248, | |
| "learning_rate": 6.6e-06, | |
| "loss": 0.9906, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.10237659963436929, | |
| "grad_norm": 2.9506289958953857, | |
| "learning_rate": 6.72e-06, | |
| "loss": 0.8263, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.10420475319926874, | |
| "grad_norm": 2.8362669944763184, | |
| "learning_rate": 6.840000000000001e-06, | |
| "loss": 0.9, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.10603290676416818, | |
| "grad_norm": 2.6192896366119385, | |
| "learning_rate": 6.96e-06, | |
| "loss": 1.05, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.10786106032906764, | |
| "grad_norm": 2.7502949237823486, | |
| "learning_rate": 7.08e-06, | |
| "loss": 0.87, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.10968921389396709, | |
| "grad_norm": 2.6745474338531494, | |
| "learning_rate": 7.2e-06, | |
| "loss": 0.8163, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.11151736745886655, | |
| "grad_norm": 2.6584086418151855, | |
| "learning_rate": 7.32e-06, | |
| "loss": 0.8813, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.113345521023766, | |
| "grad_norm": 2.689574956893921, | |
| "learning_rate": 7.44e-06, | |
| "loss": 0.9404, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.11517367458866545, | |
| "grad_norm": 2.754441738128662, | |
| "learning_rate": 7.5600000000000005e-06, | |
| "loss": 0.7416, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.1170018281535649, | |
| "grad_norm": 2.8178014755249023, | |
| "learning_rate": 7.680000000000001e-06, | |
| "loss": 0.8377, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.11882998171846434, | |
| "grad_norm": 2.8821122646331787, | |
| "learning_rate": 7.8e-06, | |
| "loss": 0.7101, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.1206581352833638, | |
| "grad_norm": 2.6646909713745117, | |
| "learning_rate": 7.92e-06, | |
| "loss": 1.0581, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.12248628884826325, | |
| "grad_norm": 2.9155476093292236, | |
| "learning_rate": 8.040000000000001e-06, | |
| "loss": 0.8417, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.12431444241316271, | |
| "grad_norm": 2.7877771854400635, | |
| "learning_rate": 8.160000000000001e-06, | |
| "loss": 0.9266, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.12614259597806216, | |
| "grad_norm": 2.625126361846924, | |
| "learning_rate": 8.28e-06, | |
| "loss": 1.0048, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.12797074954296161, | |
| "grad_norm": 2.7259960174560547, | |
| "learning_rate": 8.400000000000001e-06, | |
| "loss": 0.9485, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.12979890310786105, | |
| "grad_norm": 2.743478536605835, | |
| "learning_rate": 8.52e-06, | |
| "loss": 0.9221, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.1316270566727605, | |
| "grad_norm": 2.586174964904785, | |
| "learning_rate": 8.64e-06, | |
| "loss": 0.8967, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.13345521023765997, | |
| "grad_norm": 2.817873954772949, | |
| "learning_rate": 8.759999999999999e-06, | |
| "loss": 0.943, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.13528336380255943, | |
| "grad_norm": 2.692861557006836, | |
| "learning_rate": 8.88e-06, | |
| "loss": 0.8334, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.13711151736745886, | |
| "grad_norm": 2.9305572509765625, | |
| "learning_rate": 9e-06, | |
| "loss": 0.8215, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.13893967093235832, | |
| "grad_norm": 2.898930072784424, | |
| "learning_rate": 9.12e-06, | |
| "loss": 0.8979, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.14076782449725778, | |
| "grad_norm": 2.8066327571868896, | |
| "learning_rate": 9.24e-06, | |
| "loss": 1.0717, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.1425959780621572, | |
| "grad_norm": 3.126624584197998, | |
| "learning_rate": 9.36e-06, | |
| "loss": 0.8887, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.14442413162705667, | |
| "grad_norm": 2.469200611114502, | |
| "learning_rate": 9.48e-06, | |
| "loss": 0.9542, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.14625228519195613, | |
| "grad_norm": 2.6940770149230957, | |
| "learning_rate": 9.600000000000001e-06, | |
| "loss": 0.9756, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.1480804387568556, | |
| "grad_norm": 2.847891330718994, | |
| "learning_rate": 9.72e-06, | |
| "loss": 0.8966, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.14990859232175502, | |
| "grad_norm": 2.9159109592437744, | |
| "learning_rate": 9.84e-06, | |
| "loss": 0.8055, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.15173674588665448, | |
| "grad_norm": 2.9693570137023926, | |
| "learning_rate": 9.960000000000001e-06, | |
| "loss": 0.8913, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.15356489945155394, | |
| "grad_norm": 2.6382272243499756, | |
| "learning_rate": 1.008e-05, | |
| "loss": 0.8565, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.15539305301645337, | |
| "grad_norm": 2.7299423217773438, | |
| "learning_rate": 1.02e-05, | |
| "loss": 0.8096, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.15722120658135283, | |
| "grad_norm": 2.7661237716674805, | |
| "learning_rate": 1.032e-05, | |
| "loss": 0.9193, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.1590493601462523, | |
| "grad_norm": 3.0896854400634766, | |
| "learning_rate": 1.044e-05, | |
| "loss": 0.7745, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.16087751371115175, | |
| "grad_norm": 2.6443893909454346, | |
| "learning_rate": 1.0559999999999999e-05, | |
| "loss": 0.8674, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.16270566727605118, | |
| "grad_norm": 3.047353506088257, | |
| "learning_rate": 1.068e-05, | |
| "loss": 0.9062, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.16453382084095064, | |
| "grad_norm": 2.7751214504241943, | |
| "learning_rate": 1.08e-05, | |
| "loss": 0.8222, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1663619744058501, | |
| "grad_norm": 2.5556681156158447, | |
| "learning_rate": 1.092e-05, | |
| "loss": 0.7737, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.16819012797074953, | |
| "grad_norm": 2.840104103088379, | |
| "learning_rate": 1.104e-05, | |
| "loss": 0.9967, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.170018281535649, | |
| "grad_norm": 2.784130811691284, | |
| "learning_rate": 1.116e-05, | |
| "loss": 0.8571, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.17184643510054845, | |
| "grad_norm": 2.5982677936553955, | |
| "learning_rate": 1.128e-05, | |
| "loss": 0.7934, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.1736745886654479, | |
| "grad_norm": 3.1838393211364746, | |
| "learning_rate": 1.1400000000000001e-05, | |
| "loss": 0.8569, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.17550274223034734, | |
| "grad_norm": 2.793653726577759, | |
| "learning_rate": 1.152e-05, | |
| "loss": 0.9144, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.1773308957952468, | |
| "grad_norm": 2.6756796836853027, | |
| "learning_rate": 1.164e-05, | |
| "loss": 0.8517, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.17915904936014626, | |
| "grad_norm": 2.6979010105133057, | |
| "learning_rate": 1.1760000000000001e-05, | |
| "loss": 0.7551, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.1809872029250457, | |
| "grad_norm": 2.9032483100891113, | |
| "learning_rate": 1.1880000000000001e-05, | |
| "loss": 0.777, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.18281535648994515, | |
| "grad_norm": 2.555727243423462, | |
| "learning_rate": 1.2e-05, | |
| "loss": 0.7583, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1846435100548446, | |
| "grad_norm": 2.7780463695526123, | |
| "learning_rate": 1.2120000000000001e-05, | |
| "loss": 1.0916, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.18647166361974407, | |
| "grad_norm": 2.791424512863159, | |
| "learning_rate": 1.224e-05, | |
| "loss": 0.9344, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.1882998171846435, | |
| "grad_norm": 2.590106248855591, | |
| "learning_rate": 1.236e-05, | |
| "loss": 0.8391, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.19012797074954296, | |
| "grad_norm": 2.7519073486328125, | |
| "learning_rate": 1.2479999999999999e-05, | |
| "loss": 0.7809, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.19195612431444242, | |
| "grad_norm": 2.8074002265930176, | |
| "learning_rate": 1.26e-05, | |
| "loss": 0.8258, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.19378427787934185, | |
| "grad_norm": 2.6220719814300537, | |
| "learning_rate": 1.272e-05, | |
| "loss": 0.7542, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.1956124314442413, | |
| "grad_norm": 2.8143625259399414, | |
| "learning_rate": 1.284e-05, | |
| "loss": 0.8587, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.19744058500914077, | |
| "grad_norm": 2.4876911640167236, | |
| "learning_rate": 1.296e-05, | |
| "loss": 0.8425, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.19926873857404023, | |
| "grad_norm": 2.7102651596069336, | |
| "learning_rate": 1.308e-05, | |
| "loss": 0.9726, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.20109689213893966, | |
| "grad_norm": 2.375572919845581, | |
| "learning_rate": 1.32e-05, | |
| "loss": 0.8122, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.20292504570383912, | |
| "grad_norm": 2.485874652862549, | |
| "learning_rate": 1.3320000000000001e-05, | |
| "loss": 0.7726, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.20475319926873858, | |
| "grad_norm": 2.5263822078704834, | |
| "learning_rate": 1.344e-05, | |
| "loss": 0.9219, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.20658135283363802, | |
| "grad_norm": 2.5467567443847656, | |
| "learning_rate": 1.356e-05, | |
| "loss": 0.8116, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.20840950639853748, | |
| "grad_norm": 2.3540358543395996, | |
| "learning_rate": 1.3680000000000001e-05, | |
| "loss": 1.0343, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.21023765996343693, | |
| "grad_norm": 2.6379354000091553, | |
| "learning_rate": 1.3800000000000002e-05, | |
| "loss": 0.8242, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.21206581352833637, | |
| "grad_norm": 2.5178139209747314, | |
| "learning_rate": 1.392e-05, | |
| "loss": 0.8899, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.21389396709323583, | |
| "grad_norm": 2.802619695663452, | |
| "learning_rate": 1.4040000000000001e-05, | |
| "loss": 0.8031, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.21572212065813529, | |
| "grad_norm": 2.7448935508728027, | |
| "learning_rate": 1.416e-05, | |
| "loss": 0.8676, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.21755027422303475, | |
| "grad_norm": 2.626340627670288, | |
| "learning_rate": 1.428e-05, | |
| "loss": 0.9465, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.21937842778793418, | |
| "grad_norm": 2.5691044330596924, | |
| "learning_rate": 1.44e-05, | |
| "loss": 0.712, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.22120658135283364, | |
| "grad_norm": 2.877453565597534, | |
| "learning_rate": 1.452e-05, | |
| "loss": 0.8605, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.2230347349177331, | |
| "grad_norm": 2.409876585006714, | |
| "learning_rate": 1.464e-05, | |
| "loss": 0.8972, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.22486288848263253, | |
| "grad_norm": 2.517220973968506, | |
| "learning_rate": 1.4760000000000001e-05, | |
| "loss": 0.822, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.226691042047532, | |
| "grad_norm": 2.53521728515625, | |
| "learning_rate": 1.488e-05, | |
| "loss": 0.7721, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.22851919561243145, | |
| "grad_norm": 2.533579111099243, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7182, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.2303473491773309, | |
| "grad_norm": 2.8807780742645264, | |
| "learning_rate": 1.5120000000000001e-05, | |
| "loss": 0.8755, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.23217550274223034, | |
| "grad_norm": 2.8886823654174805, | |
| "learning_rate": 1.524e-05, | |
| "loss": 0.8119, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.2340036563071298, | |
| "grad_norm": 2.710432529449463, | |
| "learning_rate": 1.5360000000000002e-05, | |
| "loss": 0.7054, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.23583180987202926, | |
| "grad_norm": 2.3780925273895264, | |
| "learning_rate": 1.548e-05, | |
| "loss": 0.9101, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.2376599634369287, | |
| "grad_norm": 2.6293869018554688, | |
| "learning_rate": 1.56e-05, | |
| "loss": 0.7895, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.23948811700182815, | |
| "grad_norm": 2.584303617477417, | |
| "learning_rate": 1.5720000000000002e-05, | |
| "loss": 1.0317, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.2413162705667276, | |
| "grad_norm": 2.4637179374694824, | |
| "learning_rate": 1.584e-05, | |
| "loss": 0.7805, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.24314442413162707, | |
| "grad_norm": 2.4105379581451416, | |
| "learning_rate": 1.596e-05, | |
| "loss": 0.8044, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.2449725776965265, | |
| "grad_norm": 2.476205825805664, | |
| "learning_rate": 1.6080000000000002e-05, | |
| "loss": 0.7283, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.24680073126142596, | |
| "grad_norm": 2.620548725128174, | |
| "learning_rate": 1.62e-05, | |
| "loss": 0.8035, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.24862888482632542, | |
| "grad_norm": 2.4662225246429443, | |
| "learning_rate": 1.6320000000000003e-05, | |
| "loss": 0.8235, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.25045703839122485, | |
| "grad_norm": 2.405362367630005, | |
| "learning_rate": 1.6440000000000002e-05, | |
| "loss": 0.8681, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.2522851919561243, | |
| "grad_norm": 2.331638813018799, | |
| "learning_rate": 1.656e-05, | |
| "loss": 0.8784, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.25411334552102377, | |
| "grad_norm": 2.796093463897705, | |
| "learning_rate": 1.6680000000000003e-05, | |
| "loss": 0.9942, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.25594149908592323, | |
| "grad_norm": 2.3736331462860107, | |
| "learning_rate": 1.6800000000000002e-05, | |
| "loss": 0.7229, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.2577696526508227, | |
| "grad_norm": 2.4110031127929688, | |
| "learning_rate": 1.6919999999999997e-05, | |
| "loss": 0.8202, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.2595978062157221, | |
| "grad_norm": 2.3349928855895996, | |
| "learning_rate": 1.704e-05, | |
| "loss": 0.7966, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.26142595978062155, | |
| "grad_norm": 2.4862008094787598, | |
| "learning_rate": 1.716e-05, | |
| "loss": 0.8141, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.263254113345521, | |
| "grad_norm": 2.787587881088257, | |
| "learning_rate": 1.728e-05, | |
| "loss": 0.7861, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.26508226691042047, | |
| "grad_norm": 2.687865972518921, | |
| "learning_rate": 1.74e-05, | |
| "loss": 0.9085, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.26691042047531993, | |
| "grad_norm": 2.517024278640747, | |
| "learning_rate": 1.7519999999999998e-05, | |
| "loss": 0.8719, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.2687385740402194, | |
| "grad_norm": 2.4157791137695312, | |
| "learning_rate": 1.764e-05, | |
| "loss": 0.8469, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.27056672760511885, | |
| "grad_norm": 2.647015333175659, | |
| "learning_rate": 1.776e-05, | |
| "loss": 0.8133, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.27239488117001825, | |
| "grad_norm": 2.7705986499786377, | |
| "learning_rate": 1.7879999999999998e-05, | |
| "loss": 0.8819, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.2742230347349177, | |
| "grad_norm": 2.2369964122772217, | |
| "learning_rate": 1.8e-05, | |
| "loss": 0.88, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2760511882998172, | |
| "grad_norm": 2.239433765411377, | |
| "learning_rate": 1.812e-05, | |
| "loss": 0.7873, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.27787934186471663, | |
| "grad_norm": 2.493117332458496, | |
| "learning_rate": 1.824e-05, | |
| "loss": 0.8111, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.2797074954296161, | |
| "grad_norm": 2.5309877395629883, | |
| "learning_rate": 1.836e-05, | |
| "loss": 0.7235, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.28153564899451555, | |
| "grad_norm": 2.403522491455078, | |
| "learning_rate": 1.848e-05, | |
| "loss": 0.816, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.283363802559415, | |
| "grad_norm": 2.8262531757354736, | |
| "learning_rate": 1.86e-05, | |
| "loss": 0.9069, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.2851919561243144, | |
| "grad_norm": 2.51188588142395, | |
| "learning_rate": 1.872e-05, | |
| "loss": 0.8979, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.2870201096892139, | |
| "grad_norm": 2.493990659713745, | |
| "learning_rate": 1.884e-05, | |
| "loss": 0.798, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.28884826325411334, | |
| "grad_norm": 2.5412824153900146, | |
| "learning_rate": 1.896e-05, | |
| "loss": 0.7898, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.2906764168190128, | |
| "grad_norm": 2.4731011390686035, | |
| "learning_rate": 1.908e-05, | |
| "loss": 0.8854, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.29250457038391225, | |
| "grad_norm": 2.6185050010681152, | |
| "learning_rate": 1.9200000000000003e-05, | |
| "loss": 0.8163, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.2943327239488117, | |
| "grad_norm": 2.384073495864868, | |
| "learning_rate": 1.932e-05, | |
| "loss": 0.7888, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.2961608775137112, | |
| "grad_norm": 2.566452741622925, | |
| "learning_rate": 1.944e-05, | |
| "loss": 0.8344, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.2979890310786106, | |
| "grad_norm": 2.4498672485351562, | |
| "learning_rate": 1.9560000000000002e-05, | |
| "loss": 0.8288, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.29981718464351004, | |
| "grad_norm": 2.7561299800872803, | |
| "learning_rate": 1.968e-05, | |
| "loss": 0.8321, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.3016453382084095, | |
| "grad_norm": 2.5148916244506836, | |
| "learning_rate": 1.98e-05, | |
| "loss": 0.8343, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.30347349177330896, | |
| "grad_norm": 2.444960594177246, | |
| "learning_rate": 1.9920000000000002e-05, | |
| "loss": 0.6833, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.3053016453382084, | |
| "grad_norm": 2.5153768062591553, | |
| "learning_rate": 2.004e-05, | |
| "loss": 0.9192, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.3071297989031079, | |
| "grad_norm": 2.301560640335083, | |
| "learning_rate": 2.016e-05, | |
| "loss": 0.7864, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.30895795246800734, | |
| "grad_norm": 2.628103733062744, | |
| "learning_rate": 2.0280000000000002e-05, | |
| "loss": 0.8426, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.31078610603290674, | |
| "grad_norm": 2.4587066173553467, | |
| "learning_rate": 2.04e-05, | |
| "loss": 0.8344, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.3126142595978062, | |
| "grad_norm": 2.4356703758239746, | |
| "learning_rate": 2.0520000000000003e-05, | |
| "loss": 0.7558, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.31444241316270566, | |
| "grad_norm": 2.531304121017456, | |
| "learning_rate": 2.064e-05, | |
| "loss": 0.855, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.3162705667276051, | |
| "grad_norm": 2.2168610095977783, | |
| "learning_rate": 2.0759999999999998e-05, | |
| "loss": 0.8551, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.3180987202925046, | |
| "grad_norm": 2.4772465229034424, | |
| "learning_rate": 2.088e-05, | |
| "loss": 0.8782, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.31992687385740404, | |
| "grad_norm": 2.4406375885009766, | |
| "learning_rate": 2.1e-05, | |
| "loss": 0.775, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.3217550274223035, | |
| "grad_norm": 2.638505697250366, | |
| "learning_rate": 2.1119999999999998e-05, | |
| "loss": 0.9181, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.3235831809872029, | |
| "grad_norm": 2.452930212020874, | |
| "learning_rate": 2.124e-05, | |
| "loss": 0.8452, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.32541133455210236, | |
| "grad_norm": 2.370314836502075, | |
| "learning_rate": 2.136e-05, | |
| "loss": 1.0293, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.3272394881170018, | |
| "grad_norm": 2.4259750843048096, | |
| "learning_rate": 2.148e-05, | |
| "loss": 0.7744, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.3290676416819013, | |
| "grad_norm": 2.374286413192749, | |
| "learning_rate": 2.16e-05, | |
| "loss": 0.8336, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.33089579524680074, | |
| "grad_norm": 2.4372458457946777, | |
| "learning_rate": 2.172e-05, | |
| "loss": 0.9673, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.3327239488117002, | |
| "grad_norm": 2.6595754623413086, | |
| "learning_rate": 2.184e-05, | |
| "loss": 0.8805, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.33455210237659966, | |
| "grad_norm": 2.521261692047119, | |
| "learning_rate": 2.196e-05, | |
| "loss": 0.962, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.33638025594149906, | |
| "grad_norm": 2.559983015060425, | |
| "learning_rate": 2.208e-05, | |
| "loss": 0.8236, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.3382084095063985, | |
| "grad_norm": 2.5021865367889404, | |
| "learning_rate": 2.22e-05, | |
| "loss": 0.7696, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.340036563071298, | |
| "grad_norm": 2.389669418334961, | |
| "learning_rate": 2.232e-05, | |
| "loss": 0.9296, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.34186471663619744, | |
| "grad_norm": 2.8006410598754883, | |
| "learning_rate": 2.2440000000000002e-05, | |
| "loss": 1.1051, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.3436928702010969, | |
| "grad_norm": 2.246638774871826, | |
| "learning_rate": 2.256e-05, | |
| "loss": 0.67, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.34552102376599636, | |
| "grad_norm": 2.3323843479156494, | |
| "learning_rate": 2.268e-05, | |
| "loss": 0.7483, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.3473491773308958, | |
| "grad_norm": 2.599168539047241, | |
| "learning_rate": 2.2800000000000002e-05, | |
| "loss": 0.7095, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.3491773308957952, | |
| "grad_norm": 2.5335357189178467, | |
| "learning_rate": 2.292e-05, | |
| "loss": 0.7943, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.3510054844606947, | |
| "grad_norm": 2.523808717727661, | |
| "learning_rate": 2.304e-05, | |
| "loss": 0.8714, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.35283363802559414, | |
| "grad_norm": 2.3433940410614014, | |
| "learning_rate": 2.3160000000000002e-05, | |
| "loss": 0.7879, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.3546617915904936, | |
| "grad_norm": 2.5101304054260254, | |
| "learning_rate": 2.328e-05, | |
| "loss": 0.9299, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.35648994515539306, | |
| "grad_norm": 2.652029275894165, | |
| "learning_rate": 2.3400000000000003e-05, | |
| "loss": 0.813, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.3583180987202925, | |
| "grad_norm": 2.250645160675049, | |
| "learning_rate": 2.3520000000000002e-05, | |
| "loss": 0.9784, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.360146252285192, | |
| "grad_norm": 2.2848877906799316, | |
| "learning_rate": 2.364e-05, | |
| "loss": 0.9483, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.3619744058500914, | |
| "grad_norm": 2.4996519088745117, | |
| "learning_rate": 2.3760000000000003e-05, | |
| "loss": 0.8746, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.36380255941499084, | |
| "grad_norm": 2.451387882232666, | |
| "learning_rate": 2.3880000000000002e-05, | |
| "loss": 0.8514, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.3656307129798903, | |
| "grad_norm": 2.382949113845825, | |
| "learning_rate": 2.4e-05, | |
| "loss": 1.0895, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.36745886654478976, | |
| "grad_norm": 2.407252788543701, | |
| "learning_rate": 2.4120000000000003e-05, | |
| "loss": 0.9273, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.3692870201096892, | |
| "grad_norm": 2.554053544998169, | |
| "learning_rate": 2.4240000000000002e-05, | |
| "loss": 0.8187, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.3711151736745887, | |
| "grad_norm": 2.1548268795013428, | |
| "learning_rate": 2.4360000000000004e-05, | |
| "loss": 0.9683, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.37294332723948814, | |
| "grad_norm": 2.419849395751953, | |
| "learning_rate": 2.448e-05, | |
| "loss": 0.8276, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.37477148080438755, | |
| "grad_norm": 2.300262451171875, | |
| "learning_rate": 2.4599999999999998e-05, | |
| "loss": 0.8748, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.376599634369287, | |
| "grad_norm": 2.4870543479919434, | |
| "learning_rate": 2.472e-05, | |
| "loss": 0.8901, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.37842778793418647, | |
| "grad_norm": 2.703481435775757, | |
| "learning_rate": 2.484e-05, | |
| "loss": 0.871, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.3802559414990859, | |
| "grad_norm": 2.597571611404419, | |
| "learning_rate": 2.4959999999999998e-05, | |
| "loss": 0.747, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.3820840950639854, | |
| "grad_norm": 2.4933812618255615, | |
| "learning_rate": 2.508e-05, | |
| "loss": 0.7869, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.38391224862888484, | |
| "grad_norm": 2.566986322402954, | |
| "learning_rate": 2.52e-05, | |
| "loss": 0.9081, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.3857404021937843, | |
| "grad_norm": 2.4893436431884766, | |
| "learning_rate": 2.5319999999999998e-05, | |
| "loss": 0.866, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.3875685557586837, | |
| "grad_norm": 2.5950074195861816, | |
| "learning_rate": 2.544e-05, | |
| "loss": 0.8783, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.38939670932358317, | |
| "grad_norm": 2.3816328048706055, | |
| "learning_rate": 2.556e-05, | |
| "loss": 0.8963, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.3912248628884826, | |
| "grad_norm": 2.064539670944214, | |
| "learning_rate": 2.568e-05, | |
| "loss": 0.8979, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.3930530164533821, | |
| "grad_norm": 2.43748140335083, | |
| "learning_rate": 2.58e-05, | |
| "loss": 0.8466, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.39488117001828155, | |
| "grad_norm": 2.2571210861206055, | |
| "learning_rate": 2.592e-05, | |
| "loss": 0.8433, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.396709323583181, | |
| "grad_norm": 2.3223443031311035, | |
| "learning_rate": 2.604e-05, | |
| "loss": 0.7485, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.39853747714808047, | |
| "grad_norm": 2.435385227203369, | |
| "learning_rate": 2.616e-05, | |
| "loss": 0.8868, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.40036563071297987, | |
| "grad_norm": 2.4609930515289307, | |
| "learning_rate": 2.628e-05, | |
| "loss": 0.7649, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.40219378427787933, | |
| "grad_norm": 2.3334007263183594, | |
| "learning_rate": 2.64e-05, | |
| "loss": 0.8722, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.4040219378427788, | |
| "grad_norm": 2.4103660583496094, | |
| "learning_rate": 2.652e-05, | |
| "loss": 0.8687, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.40585009140767825, | |
| "grad_norm": 2.386665105819702, | |
| "learning_rate": 2.6640000000000002e-05, | |
| "loss": 0.9062, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.4076782449725777, | |
| "grad_norm": 2.420870065689087, | |
| "learning_rate": 2.676e-05, | |
| "loss": 0.9941, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.40950639853747717, | |
| "grad_norm": 2.643944025039673, | |
| "learning_rate": 2.688e-05, | |
| "loss": 0.8953, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.4113345521023766, | |
| "grad_norm": 2.400880813598633, | |
| "learning_rate": 2.7000000000000002e-05, | |
| "loss": 0.8583, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.41316270566727603, | |
| "grad_norm": 2.415785312652588, | |
| "learning_rate": 2.712e-05, | |
| "loss": 0.7549, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.4149908592321755, | |
| "grad_norm": 2.6550943851470947, | |
| "learning_rate": 2.724e-05, | |
| "loss": 0.9005, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.41681901279707495, | |
| "grad_norm": 2.31974720954895, | |
| "learning_rate": 2.7360000000000002e-05, | |
| "loss": 0.9962, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.4186471663619744, | |
| "grad_norm": 2.463061571121216, | |
| "learning_rate": 2.748e-05, | |
| "loss": 0.7754, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.42047531992687387, | |
| "grad_norm": 2.5701842308044434, | |
| "learning_rate": 2.7600000000000003e-05, | |
| "loss": 0.772, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.42230347349177333, | |
| "grad_norm": 2.3573224544525146, | |
| "learning_rate": 2.7720000000000002e-05, | |
| "loss": 0.8872, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.42413162705667273, | |
| "grad_norm": 2.345667600631714, | |
| "learning_rate": 2.784e-05, | |
| "loss": 0.7977, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.4259597806215722, | |
| "grad_norm": 2.583740234375, | |
| "learning_rate": 2.7960000000000003e-05, | |
| "loss": 0.9406, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.42778793418647165, | |
| "grad_norm": 2.51877760887146, | |
| "learning_rate": 2.8080000000000002e-05, | |
| "loss": 0.8245, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.4296160877513711, | |
| "grad_norm": 2.6624832153320312, | |
| "learning_rate": 2.8199999999999998e-05, | |
| "loss": 0.8747, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.43144424131627057, | |
| "grad_norm": 2.6126315593719482, | |
| "learning_rate": 2.832e-05, | |
| "loss": 0.881, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.43327239488117003, | |
| "grad_norm": 2.533567428588867, | |
| "learning_rate": 2.844e-05, | |
| "loss": 0.9505, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.4351005484460695, | |
| "grad_norm": 2.4115335941314697, | |
| "learning_rate": 2.856e-05, | |
| "loss": 0.9703, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.4369287020109689, | |
| "grad_norm": 2.2946977615356445, | |
| "learning_rate": 2.868e-05, | |
| "loss": 0.8025, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.43875685557586835, | |
| "grad_norm": 2.7821929454803467, | |
| "learning_rate": 2.88e-05, | |
| "loss": 0.8108, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.4405850091407678, | |
| "grad_norm": 2.5924153327941895, | |
| "learning_rate": 2.892e-05, | |
| "loss": 0.7716, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.4424131627056673, | |
| "grad_norm": 2.484504222869873, | |
| "learning_rate": 2.904e-05, | |
| "loss": 0.8917, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.44424131627056673, | |
| "grad_norm": 2.4044761657714844, | |
| "learning_rate": 2.916e-05, | |
| "loss": 0.9806, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.4460694698354662, | |
| "grad_norm": 2.3332765102386475, | |
| "learning_rate": 2.928e-05, | |
| "loss": 0.7616, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.44789762340036565, | |
| "grad_norm": 2.3703112602233887, | |
| "learning_rate": 2.94e-05, | |
| "loss": 0.8937, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.44972577696526506, | |
| "grad_norm": 2.3351054191589355, | |
| "learning_rate": 2.9520000000000002e-05, | |
| "loss": 0.83, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.4515539305301645, | |
| "grad_norm": 2.3738510608673096, | |
| "learning_rate": 2.964e-05, | |
| "loss": 0.904, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.453382084095064, | |
| "grad_norm": 2.5012619495391846, | |
| "learning_rate": 2.976e-05, | |
| "loss": 0.8809, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.45521023765996343, | |
| "grad_norm": 2.5719287395477295, | |
| "learning_rate": 2.9880000000000002e-05, | |
| "loss": 0.773, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.4570383912248629, | |
| "grad_norm": 2.3036999702453613, | |
| "learning_rate": 3e-05, | |
| "loss": 0.7487, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.4570383912248629, | |
| "eval_loss": 0.8340924382209778, | |
| "eval_runtime": 11.3221, | |
| "eval_samples_per_second": 98.215, | |
| "eval_steps_per_second": 3.091, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.45886654478976235, | |
| "grad_norm": 2.355015754699707, | |
| "learning_rate": 2.9999160841378727e-05, | |
| "loss": 0.7973, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.4606946983546618, | |
| "grad_norm": 2.296038866043091, | |
| "learning_rate": 2.9996643459406528e-05, | |
| "loss": 0.8632, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.4625228519195612, | |
| "grad_norm": 2.2504048347473145, | |
| "learning_rate": 2.999244813574778e-05, | |
| "loss": 0.704, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.4643510054844607, | |
| "grad_norm": 2.4145545959472656, | |
| "learning_rate": 2.9986575339808077e-05, | |
| "loss": 0.7892, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.46617915904936014, | |
| "grad_norm": 2.3196182250976562, | |
| "learning_rate": 2.997902572868174e-05, | |
| "loss": 0.9237, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.4680073126142596, | |
| "grad_norm": 2.5195236206054688, | |
| "learning_rate": 2.9969800147078265e-05, | |
| "loss": 0.8632, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.46983546617915906, | |
| "grad_norm": 2.3776962757110596, | |
| "learning_rate": 2.995889962722784e-05, | |
| "loss": 0.8948, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.4716636197440585, | |
| "grad_norm": 2.3582563400268555, | |
| "learning_rate": 2.9946325388765812e-05, | |
| "loss": 0.8258, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.473491773308958, | |
| "grad_norm": 2.4774725437164307, | |
| "learning_rate": 2.993207883859627e-05, | |
| "loss": 0.8687, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.4753199268738574, | |
| "grad_norm": 2.2049193382263184, | |
| "learning_rate": 2.99161615707346e-05, | |
| "loss": 0.9289, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.47714808043875684, | |
| "grad_norm": 2.2471542358398438, | |
| "learning_rate": 2.9898575366129145e-05, | |
| "loss": 0.8769, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.4789762340036563, | |
| "grad_norm": 2.2609918117523193, | |
| "learning_rate": 2.9879322192461932e-05, | |
| "loss": 1.0632, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.48080438756855576, | |
| "grad_norm": 2.3569087982177734, | |
| "learning_rate": 2.985840420392851e-05, | |
| "loss": 0.854, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.4826325411334552, | |
| "grad_norm": 2.398346185684204, | |
| "learning_rate": 2.9835823740996944e-05, | |
| "loss": 0.7765, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.4844606946983547, | |
| "grad_norm": 2.251390218734741, | |
| "learning_rate": 2.9811583330145915e-05, | |
| "loss": 0.8045, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.48628884826325414, | |
| "grad_norm": 2.3630456924438477, | |
| "learning_rate": 2.9785685683582057e-05, | |
| "loss": 0.8945, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.48811700182815354, | |
| "grad_norm": 2.259655714035034, | |
| "learning_rate": 2.975813369893649e-05, | |
| "loss": 0.7409, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.489945155393053, | |
| "grad_norm": 2.4072036743164062, | |
| "learning_rate": 2.97289304589406e-05, | |
| "loss": 0.8358, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.49177330895795246, | |
| "grad_norm": 2.3019490242004395, | |
| "learning_rate": 2.9698079231081144e-05, | |
| "loss": 0.8837, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.4936014625228519, | |
| "grad_norm": 2.3812527656555176, | |
| "learning_rate": 2.966558346723463e-05, | |
| "loss": 0.8772, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.4954296160877514, | |
| "grad_norm": 2.3249640464782715, | |
| "learning_rate": 2.963144680328111e-05, | |
| "loss": 0.7369, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.49725776965265084, | |
| "grad_norm": 2.431414842605591, | |
| "learning_rate": 2.959567305869736e-05, | |
| "loss": 0.8207, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.4990859232175503, | |
| "grad_norm": 2.3795621395111084, | |
| "learning_rate": 2.955826623612954e-05, | |
| "loss": 0.73, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.5009140767824497, | |
| "grad_norm": 2.426405906677246, | |
| "learning_rate": 2.9519230520945346e-05, | |
| "loss": 0.9324, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.5027422303473492, | |
| "grad_norm": 2.2649593353271484, | |
| "learning_rate": 2.947857028076569e-05, | |
| "loss": 0.8003, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.5045703839122486, | |
| "grad_norm": 2.481842041015625, | |
| "learning_rate": 2.943629006497606e-05, | |
| "loss": 0.7915, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.506398537477148, | |
| "grad_norm": 2.5210118293762207, | |
| "learning_rate": 2.939239460421746e-05, | |
| "loss": 0.7953, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.5082266910420475, | |
| "grad_norm": 2.3630707263946533, | |
| "learning_rate": 2.934688880985714e-05, | |
| "loss": 0.8232, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.5100548446069469, | |
| "grad_norm": 2.3418996334075928, | |
| "learning_rate": 2.9299777773439056e-05, | |
| "loss": 0.909, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.5118829981718465, | |
| "grad_norm": 2.34122633934021, | |
| "learning_rate": 2.925106676611418e-05, | |
| "loss": 0.7633, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.5137111517367459, | |
| "grad_norm": 2.499547243118286, | |
| "learning_rate": 2.9200761238050756e-05, | |
| "loss": 0.851, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.5155393053016454, | |
| "grad_norm": 2.456969738006592, | |
| "learning_rate": 2.9148866817824454e-05, | |
| "loss": 0.8803, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.5173674588665448, | |
| "grad_norm": 2.2602295875549316, | |
| "learning_rate": 2.9095389311788626e-05, | |
| "loss": 0.8049, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.5191956124314442, | |
| "grad_norm": 2.1520049571990967, | |
| "learning_rate": 2.9040334703424637e-05, | |
| "loss": 0.7233, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.5210237659963437, | |
| "grad_norm": 2.4685440063476562, | |
| "learning_rate": 2.8983709152672386e-05, | |
| "loss": 0.9514, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.5228519195612431, | |
| "grad_norm": 2.296013593673706, | |
| "learning_rate": 2.892551899524109e-05, | |
| "loss": 0.7938, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.5246800731261426, | |
| "grad_norm": 2.3713924884796143, | |
| "learning_rate": 2.8865770741900382e-05, | |
| "loss": 0.93, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.526508226691042, | |
| "grad_norm": 2.6389975547790527, | |
| "learning_rate": 2.8804471077751847e-05, | |
| "loss": 0.9036, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.5283363802559415, | |
| "grad_norm": 2.4582440853118896, | |
| "learning_rate": 2.8741626861481043e-05, | |
| "loss": 0.9437, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.5301645338208409, | |
| "grad_norm": 2.3008275032043457, | |
| "learning_rate": 2.8677245124590087e-05, | |
| "loss": 0.7939, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5319926873857403, | |
| "grad_norm": 2.319469928741455, | |
| "learning_rate": 2.8611333070610918e-05, | |
| "loss": 0.8535, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.5338208409506399, | |
| "grad_norm": 2.295746088027954, | |
| "learning_rate": 2.8543898074299322e-05, | |
| "loss": 0.736, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.5356489945155393, | |
| "grad_norm": 2.5527262687683105, | |
| "learning_rate": 2.8474947680809754e-05, | |
| "loss": 0.8192, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.5374771480804388, | |
| "grad_norm": 2.308958053588867, | |
| "learning_rate": 2.8404489604851186e-05, | |
| "loss": 0.9077, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.5393053016453382, | |
| "grad_norm": 2.524796724319458, | |
| "learning_rate": 2.8332531729823853e-05, | |
| "loss": 0.8038, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.5411334552102377, | |
| "grad_norm": 2.420640468597412, | |
| "learning_rate": 2.8259082106937255e-05, | |
| "loss": 0.7417, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.5429616087751371, | |
| "grad_norm": 2.364328384399414, | |
| "learning_rate": 2.8184148954309295e-05, | |
| "loss": 0.8791, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.5447897623400365, | |
| "grad_norm": 2.412336587905884, | |
| "learning_rate": 2.8107740656046775e-05, | |
| "loss": 0.83, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.546617915904936, | |
| "grad_norm": 2.5241622924804688, | |
| "learning_rate": 2.802986576130733e-05, | |
| "loss": 0.8886, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.5484460694698354, | |
| "grad_norm": 2.330146074295044, | |
| "learning_rate": 2.7950532983342863e-05, | |
| "loss": 0.8117, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5502742230347349, | |
| "grad_norm": 2.1738884449005127, | |
| "learning_rate": 2.7869751198524656e-05, | |
| "loss": 0.8588, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.5521023765996343, | |
| "grad_norm": 2.343388319015503, | |
| "learning_rate": 2.7787529445350192e-05, | |
| "loss": 0.7355, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.5539305301645339, | |
| "grad_norm": 2.2163190841674805, | |
| "learning_rate": 2.7703876923431882e-05, | |
| "loss": 0.8508, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.5557586837294333, | |
| "grad_norm": 2.1025807857513428, | |
| "learning_rate": 2.7618802992467718e-05, | |
| "loss": 0.7909, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.5575868372943327, | |
| "grad_norm": 2.4115538597106934, | |
| "learning_rate": 2.753231717119405e-05, | |
| "loss": 0.7964, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.5594149908592322, | |
| "grad_norm": 2.2953007221221924, | |
| "learning_rate": 2.744442913632054e-05, | |
| "loss": 0.8284, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.5612431444241316, | |
| "grad_norm": 2.4674270153045654, | |
| "learning_rate": 2.7355148721447492e-05, | |
| "loss": 0.9302, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.5630712979890311, | |
| "grad_norm": 2.447037935256958, | |
| "learning_rate": 2.7264485915965548e-05, | |
| "loss": 0.9281, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.5648994515539305, | |
| "grad_norm": 2.1784889698028564, | |
| "learning_rate": 2.717245086393801e-05, | |
| "loss": 0.7989, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.56672760511883, | |
| "grad_norm": 2.2562270164489746, | |
| "learning_rate": 2.707905386296588e-05, | |
| "loss": 0.8856, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.5685557586837294, | |
| "grad_norm": 2.272416591644287, | |
| "learning_rate": 2.6984305363035616e-05, | |
| "loss": 1.0322, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.5703839122486288, | |
| "grad_norm": 2.2202160358428955, | |
| "learning_rate": 2.6888215965349974e-05, | |
| "loss": 0.9454, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.5722120658135283, | |
| "grad_norm": 2.4724793434143066, | |
| "learning_rate": 2.6790796421141813e-05, | |
| "loss": 0.8584, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.5740402193784278, | |
| "grad_norm": 2.3383536338806152, | |
| "learning_rate": 2.6692057630471184e-05, | |
| "loss": 0.978, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.5758683729433273, | |
| "grad_norm": 2.173809766769409, | |
| "learning_rate": 2.6592010641005745e-05, | |
| "loss": 0.8318, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.5776965265082267, | |
| "grad_norm": 2.306762456893921, | |
| "learning_rate": 2.649066664678467e-05, | |
| "loss": 0.841, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.5795246800731262, | |
| "grad_norm": 2.038734197616577, | |
| "learning_rate": 2.638803698696615e-05, | |
| "loss": 0.8219, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.5813528336380256, | |
| "grad_norm": 2.2740612030029297, | |
| "learning_rate": 2.6284133144558697e-05, | |
| "loss": 0.8945, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.583180987202925, | |
| "grad_norm": 2.338181972503662, | |
| "learning_rate": 2.6178966745136322e-05, | |
| "loss": 1.0114, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.5850091407678245, | |
| "grad_norm": 2.357879877090454, | |
| "learning_rate": 2.60725495555378e-05, | |
| "loss": 0.7024, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.5868372943327239, | |
| "grad_norm": 2.271117925643921, | |
| "learning_rate": 2.5964893482550076e-05, | |
| "loss": 0.8802, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.5886654478976234, | |
| "grad_norm": 2.092961072921753, | |
| "learning_rate": 2.5856010571576052e-05, | |
| "loss": 0.8343, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.5904936014625228, | |
| "grad_norm": 2.297849655151367, | |
| "learning_rate": 2.574591300528686e-05, | |
| "loss": 0.8124, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.5923217550274223, | |
| "grad_norm": 2.293593645095825, | |
| "learning_rate": 2.563461310225875e-05, | |
| "loss": 0.7819, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.5941499085923218, | |
| "grad_norm": 2.2364585399627686, | |
| "learning_rate": 2.552212331559482e-05, | |
| "loss": 0.9649, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.5959780621572212, | |
| "grad_norm": 2.2145204544067383, | |
| "learning_rate": 2.5408456231531634e-05, | |
| "loss": 0.8959, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.5978062157221207, | |
| "grad_norm": 2.4612884521484375, | |
| "learning_rate": 2.5293624568031008e-05, | |
| "loss": 0.929, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.5996343692870201, | |
| "grad_norm": 2.4367892742156982, | |
| "learning_rate": 2.5177641173356985e-05, | |
| "loss": 0.7942, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.6014625228519196, | |
| "grad_norm": 2.5621209144592285, | |
| "learning_rate": 2.5060519024638312e-05, | |
| "loss": 0.9107, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.603290676416819, | |
| "grad_norm": 2.2086422443389893, | |
| "learning_rate": 2.4942271226416444e-05, | |
| "loss": 0.7485, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6051188299817185, | |
| "grad_norm": 2.4878604412078857, | |
| "learning_rate": 2.482291100917928e-05, | |
| "loss": 0.8663, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.6069469835466179, | |
| "grad_norm": 2.4622035026550293, | |
| "learning_rate": 2.4702451727880862e-05, | |
| "loss": 0.9976, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.6087751371115173, | |
| "grad_norm": 2.313488245010376, | |
| "learning_rate": 2.458090686044712e-05, | |
| "loss": 0.86, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.6106032906764168, | |
| "grad_norm": 2.495249032974243, | |
| "learning_rate": 2.445829000626784e-05, | |
| "loss": 0.7586, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.6124314442413162, | |
| "grad_norm": 2.2994625568389893, | |
| "learning_rate": 2.433461488467505e-05, | |
| "loss": 0.9011, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.6142595978062158, | |
| "grad_norm": 2.410585403442383, | |
| "learning_rate": 2.4209895333408028e-05, | |
| "loss": 0.7784, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.6160877513711152, | |
| "grad_norm": 2.371408462524414, | |
| "learning_rate": 2.4084145307065e-05, | |
| "loss": 0.9034, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.6179159049360147, | |
| "grad_norm": 2.2253592014312744, | |
| "learning_rate": 2.3957378875541795e-05, | |
| "loss": 0.8581, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.6197440585009141, | |
| "grad_norm": 2.18859601020813, | |
| "learning_rate": 2.382961022245759e-05, | |
| "loss": 0.8338, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.6215722120658135, | |
| "grad_norm": 2.1277389526367188, | |
| "learning_rate": 2.3700853643567973e-05, | |
| "loss": 0.7985, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.623400365630713, | |
| "grad_norm": 2.2631025314331055, | |
| "learning_rate": 2.3571123545165362e-05, | |
| "loss": 0.865, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.6252285191956124, | |
| "grad_norm": 2.4531781673431396, | |
| "learning_rate": 2.3440434442467155e-05, | |
| "loss": 0.8673, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.6270566727605119, | |
| "grad_norm": 2.3396685123443604, | |
| "learning_rate": 2.3308800957991657e-05, | |
| "loss": 0.868, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.6288848263254113, | |
| "grad_norm": 2.2110092639923096, | |
| "learning_rate": 2.3176237819921975e-05, | |
| "loss": 0.7553, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.6307129798903108, | |
| "grad_norm": 2.3857622146606445, | |
| "learning_rate": 2.3042759860458142e-05, | |
| "loss": 0.7463, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.6325411334552102, | |
| "grad_norm": 2.304614782333374, | |
| "learning_rate": 2.2908382014157536e-05, | |
| "loss": 0.939, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.6343692870201096, | |
| "grad_norm": 2.360813617706299, | |
| "learning_rate": 2.2773119316263935e-05, | |
| "loss": 0.7792, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.6361974405850092, | |
| "grad_norm": 2.41550612449646, | |
| "learning_rate": 2.2636986901025208e-05, | |
| "loss": 0.8776, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.6380255941499086, | |
| "grad_norm": 2.514841318130493, | |
| "learning_rate": 2.25e-05, | |
| "loss": 0.8356, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.6398537477148081, | |
| "grad_norm": 2.2054624557495117, | |
| "learning_rate": 2.2362173940353522e-05, | |
| "loss": 0.7899, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.6416819012797075, | |
| "grad_norm": 2.144213914871216, | |
| "learning_rate": 2.2223524143142595e-05, | |
| "loss": 0.8054, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.643510054844607, | |
| "grad_norm": 2.340751886367798, | |
| "learning_rate": 2.2084066121590242e-05, | |
| "loss": 0.8224, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.6453382084095064, | |
| "grad_norm": 2.3917925357818604, | |
| "learning_rate": 2.194381547934994e-05, | |
| "loss": 0.8739, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.6471663619744058, | |
| "grad_norm": 2.30846905708313, | |
| "learning_rate": 2.1802787908759767e-05, | |
| "loss": 0.866, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.6489945155393053, | |
| "grad_norm": 2.0527448654174805, | |
| "learning_rate": 2.1660999189086613e-05, | |
| "loss": 0.8253, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.6508226691042047, | |
| "grad_norm": 2.263025999069214, | |
| "learning_rate": 2.1518465184760686e-05, | |
| "loss": 0.8838, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.6526508226691042, | |
| "grad_norm": 2.3904080390930176, | |
| "learning_rate": 2.1375201843600452e-05, | |
| "loss": 0.9442, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.6544789762340036, | |
| "grad_norm": 2.1965222358703613, | |
| "learning_rate": 2.12312251950283e-05, | |
| "loss": 0.6803, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.6563071297989032, | |
| "grad_norm": 2.2777087688446045, | |
| "learning_rate": 2.108655134827701e-05, | |
| "loss": 0.8077, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.6581352833638026, | |
| "grad_norm": 2.2738406658172607, | |
| "learning_rate": 2.0941196490587352e-05, | |
| "loss": 0.855, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.659963436928702, | |
| "grad_norm": 2.04484486579895, | |
| "learning_rate": 2.0795176885396928e-05, | |
| "loss": 0.8816, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.6617915904936015, | |
| "grad_norm": 2.364666223526001, | |
| "learning_rate": 2.064850887052048e-05, | |
| "loss": 0.9707, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.6636197440585009, | |
| "grad_norm": 2.2735183238983154, | |
| "learning_rate": 2.0501208856321895e-05, | |
| "loss": 0.8226, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.6654478976234004, | |
| "grad_norm": 2.370248794555664, | |
| "learning_rate": 2.035329332387808e-05, | |
| "loss": 0.797, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.6672760511882998, | |
| "grad_norm": 2.614694595336914, | |
| "learning_rate": 2.0204778823134936e-05, | |
| "loss": 0.8665, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.6691042047531993, | |
| "grad_norm": 2.3441321849823, | |
| "learning_rate": 2.0055681971055626e-05, | |
| "loss": 0.8658, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.6709323583180987, | |
| "grad_norm": 2.3217623233795166, | |
| "learning_rate": 1.990601944976133e-05, | |
| "loss": 0.8256, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.6727605118829981, | |
| "grad_norm": 2.209233522415161, | |
| "learning_rate": 1.9755808004664702e-05, | |
| "loss": 0.7482, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.6745886654478976, | |
| "grad_norm": 2.4364049434661865, | |
| "learning_rate": 1.9605064442596316e-05, | |
| "loss": 0.8031, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.676416819012797, | |
| "grad_norm": 2.168339967727661, | |
| "learning_rate": 1.9453805629924126e-05, | |
| "loss": 0.8416, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.6782449725776966, | |
| "grad_norm": 2.428342580795288, | |
| "learning_rate": 1.9302048490666356e-05, | |
| "loss": 0.8554, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.680073126142596, | |
| "grad_norm": 1.9630411863327026, | |
| "learning_rate": 1.9149810004597906e-05, | |
| "loss": 0.7988, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.6819012797074955, | |
| "grad_norm": 2.591010570526123, | |
| "learning_rate": 1.8997107205350525e-05, | |
| "loss": 1.048, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.6837294332723949, | |
| "grad_norm": 2.476414442062378, | |
| "learning_rate": 1.884395717850694e-05, | |
| "loss": 0.8041, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.6855575868372943, | |
| "grad_norm": 2.514333486557007, | |
| "learning_rate": 1.8690377059689202e-05, | |
| "loss": 0.8906, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.6873857404021938, | |
| "grad_norm": 2.299752712249756, | |
| "learning_rate": 1.853638403264141e-05, | |
| "loss": 0.9203, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.6892138939670932, | |
| "grad_norm": 2.3039369583129883, | |
| "learning_rate": 1.8381995327307067e-05, | |
| "loss": 0.8833, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.6910420475319927, | |
| "grad_norm": 2.3373348712921143, | |
| "learning_rate": 1.822722821790126e-05, | |
| "loss": 0.7324, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.6928702010968921, | |
| "grad_norm": 2.774083137512207, | |
| "learning_rate": 1.807210002097786e-05, | |
| "loss": 0.8778, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.6946983546617916, | |
| "grad_norm": 2.214552402496338, | |
| "learning_rate": 1.791662809349206e-05, | |
| "loss": 0.8044, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.696526508226691, | |
| "grad_norm": 2.298497438430786, | |
| "learning_rate": 1.7760829830858305e-05, | |
| "loss": 0.8667, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.6983546617915904, | |
| "grad_norm": 2.23805570602417, | |
| "learning_rate": 1.760472266500396e-05, | |
| "loss": 0.7938, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.70018281535649, | |
| "grad_norm": 2.18110990524292, | |
| "learning_rate": 1.744832406241889e-05, | |
| "loss": 0.8147, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.7020109689213894, | |
| "grad_norm": 2.2718112468719482, | |
| "learning_rate": 1.7291651522201208e-05, | |
| "loss": 0.973, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.7038391224862889, | |
| "grad_norm": 2.254279375076294, | |
| "learning_rate": 1.713472257409928e-05, | |
| "loss": 0.7439, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.7056672760511883, | |
| "grad_norm": 2.268983840942383, | |
| "learning_rate": 1.6977554776550403e-05, | |
| "loss": 0.8309, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.7074954296160878, | |
| "grad_norm": 2.189608097076416, | |
| "learning_rate": 1.682016571471623e-05, | |
| "loss": 0.8748, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.7093235831809872, | |
| "grad_norm": 2.231454610824585, | |
| "learning_rate": 1.6662572998515166e-05, | |
| "loss": 0.8759, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.7111517367458866, | |
| "grad_norm": 2.324653148651123, | |
| "learning_rate": 1.6504794260652077e-05, | |
| "loss": 0.7731, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.7129798903107861, | |
| "grad_norm": 2.113718271255493, | |
| "learning_rate": 1.6346847154645376e-05, | |
| "loss": 0.7961, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.7148080438756855, | |
| "grad_norm": 2.413463830947876, | |
| "learning_rate": 1.6188749352851825e-05, | |
| "loss": 0.9315, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.716636197440585, | |
| "grad_norm": 2.175915002822876, | |
| "learning_rate": 1.6030518544489215e-05, | |
| "loss": 0.7061, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.7184643510054844, | |
| "grad_norm": 2.2238268852233887, | |
| "learning_rate": 1.587217243365714e-05, | |
| "loss": 0.8585, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.720292504570384, | |
| "grad_norm": 2.3010525703430176, | |
| "learning_rate": 1.5713728737356138e-05, | |
| "loss": 0.8064, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.7221206581352834, | |
| "grad_norm": 2.2713418006896973, | |
| "learning_rate": 1.555520518350537e-05, | |
| "loss": 0.8125, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.7239488117001828, | |
| "grad_norm": 2.311316967010498, | |
| "learning_rate": 1.5396619508959102e-05, | |
| "loss": 0.7494, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.7257769652650823, | |
| "grad_norm": 2.3094563484191895, | |
| "learning_rate": 1.523798945752212e-05, | |
| "loss": 0.8135, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.7276051188299817, | |
| "grad_norm": 2.1408050060272217, | |
| "learning_rate": 1.5079332777964467e-05, | |
| "loss": 0.8519, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.7294332723948812, | |
| "grad_norm": 2.196596622467041, | |
| "learning_rate": 1.4920667222035532e-05, | |
| "loss": 0.9019, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.7312614259597806, | |
| "grad_norm": 2.4077069759368896, | |
| "learning_rate": 1.4762010542477881e-05, | |
| "loss": 0.8437, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.7330895795246801, | |
| "grad_norm": 2.138925075531006, | |
| "learning_rate": 1.46033804910409e-05, | |
| "loss": 0.7867, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.7349177330895795, | |
| "grad_norm": 2.280134439468384, | |
| "learning_rate": 1.4444794816494629e-05, | |
| "loss": 1.0417, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.7367458866544789, | |
| "grad_norm": 2.484534502029419, | |
| "learning_rate": 1.4286271262643866e-05, | |
| "loss": 0.7929, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.7385740402193784, | |
| "grad_norm": 2.2009499073028564, | |
| "learning_rate": 1.4127827566342864e-05, | |
| "loss": 0.7963, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.7404021937842779, | |
| "grad_norm": 2.313990831375122, | |
| "learning_rate": 1.3969481455510787e-05, | |
| "loss": 0.9538, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.7422303473491774, | |
| "grad_norm": 2.1209707260131836, | |
| "learning_rate": 1.3811250647148172e-05, | |
| "loss": 0.8327, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.7440585009140768, | |
| "grad_norm": 2.3821375370025635, | |
| "learning_rate": 1.3653152845354625e-05, | |
| "loss": 0.8677, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.7458866544789763, | |
| "grad_norm": 2.179967164993286, | |
| "learning_rate": 1.3495205739347925e-05, | |
| "loss": 0.8095, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.7477148080438757, | |
| "grad_norm": 2.5116395950317383, | |
| "learning_rate": 1.3337427001484836e-05, | |
| "loss": 0.9218, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.7495429616087751, | |
| "grad_norm": 2.173802375793457, | |
| "learning_rate": 1.3179834285283773e-05, | |
| "loss": 0.7475, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.7513711151736746, | |
| "grad_norm": 2.0795040130615234, | |
| "learning_rate": 1.3022445223449596e-05, | |
| "loss": 0.8749, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.753199268738574, | |
| "grad_norm": 2.1474385261535645, | |
| "learning_rate": 1.2865277425900725e-05, | |
| "loss": 0.8277, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.7550274223034735, | |
| "grad_norm": 2.243417978286743, | |
| "learning_rate": 1.2708348477798795e-05, | |
| "loss": 0.8147, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.7568555758683729, | |
| "grad_norm": 2.3106589317321777, | |
| "learning_rate": 1.255167593758111e-05, | |
| "loss": 0.7848, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.7586837294332724, | |
| "grad_norm": 2.397627830505371, | |
| "learning_rate": 1.2395277334996045e-05, | |
| "loss": 0.9778, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.7605118829981719, | |
| "grad_norm": 2.3535757064819336, | |
| "learning_rate": 1.2239170169141696e-05, | |
| "loss": 0.7996, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.7623400365630713, | |
| "grad_norm": 2.224731922149658, | |
| "learning_rate": 1.2083371906507939e-05, | |
| "loss": 0.8442, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.7641681901279708, | |
| "grad_norm": 2.4303503036499023, | |
| "learning_rate": 1.1927899979022143e-05, | |
| "loss": 0.8317, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.7659963436928702, | |
| "grad_norm": 2.4696667194366455, | |
| "learning_rate": 1.1772771782098748e-05, | |
| "loss": 0.8581, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.7678244972577697, | |
| "grad_norm": 2.2766096591949463, | |
| "learning_rate": 1.1618004672692937e-05, | |
| "loss": 0.781, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.7696526508226691, | |
| "grad_norm": 2.2170205116271973, | |
| "learning_rate": 1.146361596735859e-05, | |
| "loss": 0.6847, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.7714808043875686, | |
| "grad_norm": 2.301888942718506, | |
| "learning_rate": 1.1309622940310798e-05, | |
| "loss": 0.9334, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.773308957952468, | |
| "grad_norm": 2.0786006450653076, | |
| "learning_rate": 1.1156042821493062e-05, | |
| "loss": 0.8339, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.7751371115173674, | |
| "grad_norm": 2.1867787837982178, | |
| "learning_rate": 1.1002892794649478e-05, | |
| "loss": 0.8398, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.7769652650822669, | |
| "grad_norm": 2.1924829483032227, | |
| "learning_rate": 1.0850189995402096e-05, | |
| "loss": 0.8241, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.7787934186471663, | |
| "grad_norm": 2.104240655899048, | |
| "learning_rate": 1.069795150933365e-05, | |
| "loss": 0.83, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.7806215722120659, | |
| "grad_norm": 2.301518201828003, | |
| "learning_rate": 1.0546194370075882e-05, | |
| "loss": 0.7494, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.7824497257769653, | |
| "grad_norm": 2.3547585010528564, | |
| "learning_rate": 1.0394935557403684e-05, | |
| "loss": 0.7907, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.7842778793418648, | |
| "grad_norm": 2.225034713745117, | |
| "learning_rate": 1.0244191995335299e-05, | |
| "loss": 0.8484, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.7861060329067642, | |
| "grad_norm": 2.3130884170532227, | |
| "learning_rate": 1.0093980550238676e-05, | |
| "loss": 0.8425, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.7879341864716636, | |
| "grad_norm": 2.425241708755493, | |
| "learning_rate": 9.944318028944374e-06, | |
| "loss": 0.9269, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.7897623400365631, | |
| "grad_norm": 2.1149165630340576, | |
| "learning_rate": 9.795221176865065e-06, | |
| "loss": 0.7503, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.7915904936014625, | |
| "grad_norm": 2.3856897354125977, | |
| "learning_rate": 9.646706676121924e-06, | |
| "loss": 0.8628, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.793418647166362, | |
| "grad_norm": 2.1912615299224854, | |
| "learning_rate": 9.49879114367811e-06, | |
| "loss": 0.8198, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.7952468007312614, | |
| "grad_norm": 2.1112685203552246, | |
| "learning_rate": 9.351491129479519e-06, | |
| "loss": 0.8933, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.7970749542961609, | |
| "grad_norm": 2.3817248344421387, | |
| "learning_rate": 9.20482311460307e-06, | |
| "loss": 0.8212, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.7989031078610603, | |
| "grad_norm": 2.216339349746704, | |
| "learning_rate": 9.058803509412647e-06, | |
| "loss": 0.7964, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.8007312614259597, | |
| "grad_norm": 2.2197396755218506, | |
| "learning_rate": 8.913448651722994e-06, | |
| "loss": 0.7535, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.8025594149908593, | |
| "grad_norm": 2.083980083465576, | |
| "learning_rate": 8.768774804971705e-06, | |
| "loss": 0.9009, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.8043875685557587, | |
| "grad_norm": 2.0909934043884277, | |
| "learning_rate": 8.624798156399554e-06, | |
| "loss": 0.8016, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.8062157221206582, | |
| "grad_norm": 2.4581222534179688, | |
| "learning_rate": 8.481534815239323e-06, | |
| "loss": 0.9227, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.8080438756855576, | |
| "grad_norm": 2.1503217220306396, | |
| "learning_rate": 8.339000810913388e-06, | |
| "loss": 0.7305, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.8098720292504571, | |
| "grad_norm": 1.9855475425720215, | |
| "learning_rate": 8.197212091240237e-06, | |
| "loss": 0.7195, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.8117001828153565, | |
| "grad_norm": 2.25361967086792, | |
| "learning_rate": 8.056184520650064e-06, | |
| "loss": 0.7594, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.8135283363802559, | |
| "grad_norm": 2.2054708003997803, | |
| "learning_rate": 7.915933878409762e-06, | |
| "loss": 0.7931, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.8153564899451554, | |
| "grad_norm": 2.134115219116211, | |
| "learning_rate": 7.776475856857409e-06, | |
| "loss": 0.7195, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.8171846435100548, | |
| "grad_norm": 1.9758131504058838, | |
| "learning_rate": 7.63782605964648e-06, | |
| "loss": 0.872, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.8190127970749543, | |
| "grad_norm": 2.291642904281616, | |
| "learning_rate": 7.500000000000004e-06, | |
| "loss": 0.8467, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.8208409506398537, | |
| "grad_norm": 2.2243387699127197, | |
| "learning_rate": 7.3630130989748e-06, | |
| "loss": 0.9038, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.8226691042047533, | |
| "grad_norm": 2.283393383026123, | |
| "learning_rate": 7.226880683736066e-06, | |
| "loss": 0.8102, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8244972577696527, | |
| "grad_norm": 2.078200101852417, | |
| "learning_rate": 7.091617985842463e-06, | |
| "loss": 0.761, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.8263254113345521, | |
| "grad_norm": 2.3057701587677, | |
| "learning_rate": 6.9572401395418615e-06, | |
| "loss": 0.8682, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.8281535648994516, | |
| "grad_norm": 2.171827793121338, | |
| "learning_rate": 6.8237621800780255e-06, | |
| "loss": 0.7561, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.829981718464351, | |
| "grad_norm": 2.3417348861694336, | |
| "learning_rate": 6.691199042008346e-06, | |
| "loss": 0.8277, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.8318098720292505, | |
| "grad_norm": 2.1309165954589844, | |
| "learning_rate": 6.559565557532847e-06, | |
| "loss": 0.8441, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.8336380255941499, | |
| "grad_norm": 2.3415029048919678, | |
| "learning_rate": 6.428876454834643e-06, | |
| "loss": 0.787, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.8354661791590493, | |
| "grad_norm": 2.2141568660736084, | |
| "learning_rate": 6.2991463564320296e-06, | |
| "loss": 0.8158, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.8372943327239488, | |
| "grad_norm": 2.0096514225006104, | |
| "learning_rate": 6.170389777542409e-06, | |
| "loss": 0.7489, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.8391224862888482, | |
| "grad_norm": 2.125929355621338, | |
| "learning_rate": 6.0426211244582105e-06, | |
| "loss": 0.8803, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.8409506398537477, | |
| "grad_norm": 2.0805740356445312, | |
| "learning_rate": 5.915854692935002e-06, | |
| "loss": 0.773, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.8427787934186471, | |
| "grad_norm": 2.357139825820923, | |
| "learning_rate": 5.790104666591974e-06, | |
| "loss": 0.7609, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.8446069469835467, | |
| "grad_norm": 2.277031898498535, | |
| "learning_rate": 5.665385115324954e-06, | |
| "loss": 0.8573, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.8464351005484461, | |
| "grad_norm": 2.2020912170410156, | |
| "learning_rate": 5.541709993732168e-06, | |
| "loss": 0.9261, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.8482632541133455, | |
| "grad_norm": 2.294649362564087, | |
| "learning_rate": 5.419093139552878e-06, | |
| "loss": 0.8164, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.850091407678245, | |
| "grad_norm": 2.047896385192871, | |
| "learning_rate": 5.297548272119138e-06, | |
| "loss": 0.8419, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.8519195612431444, | |
| "grad_norm": 2.4558777809143066, | |
| "learning_rate": 5.177088990820725e-06, | |
| "loss": 0.8319, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.8537477148080439, | |
| "grad_norm": 2.008725643157959, | |
| "learning_rate": 5.05772877358356e-06, | |
| "loss": 0.7503, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.8555758683729433, | |
| "grad_norm": 2.16011643409729, | |
| "learning_rate": 4.939480975361687e-06, | |
| "loss": 0.7007, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.8574040219378428, | |
| "grad_norm": 2.166571855545044, | |
| "learning_rate": 4.822358826643019e-06, | |
| "loss": 0.7383, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.8592321755027422, | |
| "grad_norm": 2.3428239822387695, | |
| "learning_rate": 4.706375431968998e-06, | |
| "loss": 0.792, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.8610603290676416, | |
| "grad_norm": 2.3133058547973633, | |
| "learning_rate": 4.591543768468364e-06, | |
| "loss": 0.7791, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.8628884826325411, | |
| "grad_norm": 2.227383852005005, | |
| "learning_rate": 4.4778766844051795e-06, | |
| "loss": 0.8838, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.8647166361974405, | |
| "grad_norm": 1.9852975606918335, | |
| "learning_rate": 4.365386897741249e-06, | |
| "loss": 0.8375, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.8665447897623401, | |
| "grad_norm": 2.151278018951416, | |
| "learning_rate": 4.254086994713141e-06, | |
| "loss": 0.7966, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.8683729433272395, | |
| "grad_norm": 2.355102777481079, | |
| "learning_rate": 4.1439894284239474e-06, | |
| "loss": 0.8264, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.870201096892139, | |
| "grad_norm": 2.390646457672119, | |
| "learning_rate": 4.035106517449926e-06, | |
| "loss": 0.8292, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.8720292504570384, | |
| "grad_norm": 2.1484568119049072, | |
| "learning_rate": 3.9274504444622025e-06, | |
| "loss": 0.8624, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.8738574040219378, | |
| "grad_norm": 2.134361505508423, | |
| "learning_rate": 3.82103325486368e-06, | |
| "loss": 0.8226, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.8756855575868373, | |
| "grad_norm": 2.1799209117889404, | |
| "learning_rate": 3.715866855441309e-06, | |
| "loss": 0.7563, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.8775137111517367, | |
| "grad_norm": 2.338834285736084, | |
| "learning_rate": 3.6119630130338537e-06, | |
| "loss": 0.8319, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.8793418647166362, | |
| "grad_norm": 2.032010555267334, | |
| "learning_rate": 3.5093333532153316e-06, | |
| "loss": 0.7693, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.8811700182815356, | |
| "grad_norm": 2.1978771686553955, | |
| "learning_rate": 3.4079893589942544e-06, | |
| "loss": 0.7642, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.8829981718464351, | |
| "grad_norm": 2.5220754146575928, | |
| "learning_rate": 3.3079423695288204e-06, | |
| "loss": 0.9182, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.8848263254113345, | |
| "grad_norm": 2.1148622035980225, | |
| "learning_rate": 3.2092035788581907e-06, | |
| "loss": 0.8411, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.886654478976234, | |
| "grad_norm": 2.1336936950683594, | |
| "learning_rate": 3.1117840346500287e-06, | |
| "loss": 0.7711, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.8884826325411335, | |
| "grad_norm": 2.175741672515869, | |
| "learning_rate": 3.0156946369643803e-06, | |
| "loss": 0.9526, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.8903107861060329, | |
| "grad_norm": 2.207550525665283, | |
| "learning_rate": 2.9209461370341204e-06, | |
| "loss": 0.7538, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.8921389396709324, | |
| "grad_norm": 2.0048232078552246, | |
| "learning_rate": 2.8275491360619875e-06, | |
| "loss": 0.8079, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.8939670932358318, | |
| "grad_norm": 2.2302756309509277, | |
| "learning_rate": 2.735514084034457e-06, | |
| "loss": 0.8385, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.8957952468007313, | |
| "grad_norm": 2.7533788681030273, | |
| "learning_rate": 2.64485127855251e-06, | |
| "loss": 0.7718, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.8976234003656307, | |
| "grad_norm": 2.3614344596862793, | |
| "learning_rate": 2.5555708636794594e-06, | |
| "loss": 0.7767, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.8994515539305301, | |
| "grad_norm": 2.726402521133423, | |
| "learning_rate": 2.467682828805956e-06, | |
| "loss": 0.7917, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.9012797074954296, | |
| "grad_norm": 2.2285687923431396, | |
| "learning_rate": 2.38119700753228e-06, | |
| "loss": 0.8958, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.903107861060329, | |
| "grad_norm": 2.1934146881103516, | |
| "learning_rate": 2.2961230765681158e-06, | |
| "loss": 0.7796, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.9049360146252285, | |
| "grad_norm": 2.349043607711792, | |
| "learning_rate": 2.212470554649805e-06, | |
| "loss": 0.8538, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.906764168190128, | |
| "grad_norm": 1.995997667312622, | |
| "learning_rate": 2.130248801475344e-06, | |
| "loss": 0.8433, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.9085923217550275, | |
| "grad_norm": 2.1767685413360596, | |
| "learning_rate": 2.0494670166571356e-06, | |
| "loss": 0.8276, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.9104204753199269, | |
| "grad_norm": 2.255619525909424, | |
| "learning_rate": 1.9701342386926712e-06, | |
| "loss": 0.7797, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.9122486288848263, | |
| "grad_norm": 2.3576643466949463, | |
| "learning_rate": 1.892259343953226e-06, | |
| "loss": 0.9015, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.9140767824497258, | |
| "grad_norm": 1.9980827569961548, | |
| "learning_rate": 1.815851045690708e-06, | |
| "loss": 0.6846, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9140767824497258, | |
| "eval_loss": 0.798653244972229, | |
| "eval_runtime": 11.4055, | |
| "eval_samples_per_second": 97.497, | |
| "eval_steps_per_second": 3.069, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9159049360146252, | |
| "grad_norm": 2.24575138092041, | |
| "learning_rate": 1.7409178930627473e-06, | |
| "loss": 0.8362, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.9177330895795247, | |
| "grad_norm": 2.058715343475342, | |
| "learning_rate": 1.6674682701761496e-06, | |
| "loss": 0.8225, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.9195612431444241, | |
| "grad_norm": 2.0738391876220703, | |
| "learning_rate": 1.5955103951488177e-06, | |
| "loss": 0.7747, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.9213893967093236, | |
| "grad_norm": 2.142606735229492, | |
| "learning_rate": 1.5250523191902455e-06, | |
| "loss": 0.8331, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.923217550274223, | |
| "grad_norm": 2.2022759914398193, | |
| "learning_rate": 1.456101925700684e-06, | |
| "loss": 0.8037, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.9250457038391224, | |
| "grad_norm": 2.1481759548187256, | |
| "learning_rate": 1.3886669293890837e-06, | |
| "loss": 0.7431, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.926873857404022, | |
| "grad_norm": 2.3185274600982666, | |
| "learning_rate": 1.322754875409915e-06, | |
| "loss": 0.7726, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.9287020109689214, | |
| "grad_norm": 2.315138816833496, | |
| "learning_rate": 1.2583731385189562e-06, | |
| "loss": 0.7026, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.9305301645338209, | |
| "grad_norm": 2.050353527069092, | |
| "learning_rate": 1.1955289222481513e-06, | |
| "loss": 0.7373, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.9323583180987203, | |
| "grad_norm": 2.3529744148254395, | |
| "learning_rate": 1.1342292580996195e-06, | |
| "loss": 0.8461, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.9341864716636198, | |
| "grad_norm": 2.264411687850952, | |
| "learning_rate": 1.0744810047589116e-06, | |
| "loss": 1.05, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.9360146252285192, | |
| "grad_norm": 2.2528390884399414, | |
| "learning_rate": 1.0162908473276133e-06, | |
| "loss": 0.8218, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.9378427787934186, | |
| "grad_norm": 2.23812198638916, | |
| "learning_rate": 9.596652965753632e-07, | |
| "loss": 0.8533, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.9396709323583181, | |
| "grad_norm": 2.4503235816955566, | |
| "learning_rate": 9.046106882113753e-07, | |
| "loss": 0.8821, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.9414990859232175, | |
| "grad_norm": 2.152954578399658, | |
| "learning_rate": 8.511331821755459e-07, | |
| "loss": 0.7932, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.943327239488117, | |
| "grad_norm": 2.1594455242156982, | |
| "learning_rate": 7.992387619492436e-07, | |
| "loss": 0.7988, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.9451553930530164, | |
| "grad_norm": 2.086651086807251, | |
| "learning_rate": 7.489332338858202e-07, | |
| "loss": 0.8552, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.946983546617916, | |
| "grad_norm": 2.134727954864502, | |
| "learning_rate": 7.002222265609476e-07, | |
| "loss": 0.8825, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.9488117001828154, | |
| "grad_norm": 2.169853448867798, | |
| "learning_rate": 6.53111190142861e-07, | |
| "loss": 0.8105, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.9506398537477148, | |
| "grad_norm": 2.000743865966797, | |
| "learning_rate": 6.076053957825411e-07, | |
| "loss": 0.6882, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.9524680073126143, | |
| "grad_norm": 2.1314992904663086, | |
| "learning_rate": 5.637099350239427e-07, | |
| "loss": 0.7354, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.9542961608775137, | |
| "grad_norm": 2.3546230792999268, | |
| "learning_rate": 5.214297192343104e-07, | |
| "loss": 0.8793, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.9561243144424132, | |
| "grad_norm": 2.054684638977051, | |
| "learning_rate": 4.807694790546563e-07, | |
| "loss": 0.8644, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.9579524680073126, | |
| "grad_norm": 2.0605905055999756, | |
| "learning_rate": 4.417337638704588e-07, | |
| "loss": 0.675, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.9597806215722121, | |
| "grad_norm": 2.196253776550293, | |
| "learning_rate": 4.043269413026429e-07, | |
| "loss": 0.8171, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.9616087751371115, | |
| "grad_norm": 2.239720582962036, | |
| "learning_rate": 3.6855319671889433e-07, | |
| "loss": 0.7863, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.9634369287020109, | |
| "grad_norm": 2.3303980827331543, | |
| "learning_rate": 3.3441653276537253e-07, | |
| "loss": 0.7169, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.9652650822669104, | |
| "grad_norm": 2.10151743888855, | |
| "learning_rate": 3.0192076891885745e-07, | |
| "loss": 0.8925, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.9670932358318098, | |
| "grad_norm": 2.475900411605835, | |
| "learning_rate": 2.710695410593994e-07, | |
| "loss": 0.8043, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.9689213893967094, | |
| "grad_norm": 2.0351574420928955, | |
| "learning_rate": 2.418663010635114e-07, | |
| "loss": 0.6677, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.9707495429616088, | |
| "grad_norm": 2.2573163509368896, | |
| "learning_rate": 2.1431431641794287e-07, | |
| "loss": 0.8685, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.9725776965265083, | |
| "grad_norm": 2.2806551456451416, | |
| "learning_rate": 1.8841666985408566e-07, | |
| "loss": 1.0264, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.9744058500914077, | |
| "grad_norm": 2.0971100330352783, | |
| "learning_rate": 1.6417625900305656e-07, | |
| "loss": 0.663, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.9762340036563071, | |
| "grad_norm": 2.1479334831237793, | |
| "learning_rate": 1.4159579607148976e-07, | |
| "loss": 0.7461, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.9780621572212066, | |
| "grad_norm": 2.1846601963043213, | |
| "learning_rate": 1.206778075380699e-07, | |
| "loss": 0.7843, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.979890310786106, | |
| "grad_norm": 2.18355131149292, | |
| "learning_rate": 1.0142463387085465e-07, | |
| "loss": 0.8233, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.9817184643510055, | |
| "grad_norm": 1.9972505569458008, | |
| "learning_rate": 8.38384292653993e-08, | |
| "loss": 0.6412, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.9835466179159049, | |
| "grad_norm": 2.2325432300567627, | |
| "learning_rate": 6.792116140373117e-08, | |
| "loss": 0.7315, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.9853747714808044, | |
| "grad_norm": 2.270096778869629, | |
| "learning_rate": 5.367461123419071e-08, | |
| "loss": 0.7166, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.9872029250457038, | |
| "grad_norm": 2.084451675415039, | |
| "learning_rate": 4.110037277216427e-08, | |
| "loss": 0.7703, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.9890310786106032, | |
| "grad_norm": 2.1931207180023193, | |
| "learning_rate": 3.0199852921735104e-08, | |
| "loss": 0.9388, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.9908592321755028, | |
| "grad_norm": 2.08048939704895, | |
| "learning_rate": 2.0974271318260907e-08, | |
| "loss": 0.669, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.9926873857404022, | |
| "grad_norm": 2.402120351791382, | |
| "learning_rate": 1.342466019192301e-08, | |
| "loss": 0.8257, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.9945155393053017, | |
| "grad_norm": 2.3177034854888916, | |
| "learning_rate": 7.551864252223762e-09, | |
| "loss": 0.8117, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.9963436928702011, | |
| "grad_norm": 2.3477089405059814, | |
| "learning_rate": 3.3565405934721237e-09, | |
| "loss": 0.8285, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.9981718464351006, | |
| "grad_norm": 2.4188199043273926, | |
| "learning_rate": 8.391586212741498e-10, | |
| "loss": 0.8643, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 2.4587948322296143, | |
| "learning_rate": 0.0, | |
| "loss": 0.8511, | |
| "step": 547 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 547, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 250, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.615833264128e+16, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |