| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.02869777005849354, | |
| "eval_steps": 100, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 5.739554011698708e-05, | |
| "grad_norm": 1.8802112340927124, | |
| "learning_rate": 5.9999999999999995e-05, | |
| "loss": 2.9438, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.00011479108023397416, | |
| "grad_norm": 1.9408955574035645, | |
| "learning_rate": 0.00011999999999999999, | |
| "loss": 2.9429, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.00017218662035096125, | |
| "grad_norm": 2.9192652702331543, | |
| "learning_rate": 0.00017999999999999998, | |
| "loss": 2.952, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.00022958216046794832, | |
| "grad_norm": 2.3403642177581787, | |
| "learning_rate": 0.00023999999999999998, | |
| "loss": 2.9307, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.00028697770058493544, | |
| "grad_norm": 2.134683847427368, | |
| "learning_rate": 0.0003, | |
| "loss": 2.8917, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0003443732407019225, | |
| "grad_norm": 1.5358260869979858, | |
| "learning_rate": 0.00035999999999999997, | |
| "loss": 2.9205, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0004017687808189096, | |
| "grad_norm": 0.9012013673782349, | |
| "learning_rate": 0.00041999999999999996, | |
| "loss": 2.8937, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.00045916432093589664, | |
| "grad_norm": 0.9427694082260132, | |
| "learning_rate": 0.00047999999999999996, | |
| "loss": 2.904, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0005165598610528837, | |
| "grad_norm": 1.662156105041504, | |
| "learning_rate": 0.00054, | |
| "loss": 2.9114, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.0005739554011698709, | |
| "grad_norm": 1.2877967357635498, | |
| "learning_rate": 0.0006, | |
| "loss": 2.9185, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.000631350941286858, | |
| "grad_norm": 1.3717082738876343, | |
| "learning_rate": 0.0005999969170437548, | |
| "loss": 2.899, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.000688746481403845, | |
| "grad_norm": 1.3706175088882446, | |
| "learning_rate": 0.0005999876683017478, | |
| "loss": 2.8522, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0007461420215208321, | |
| "grad_norm": 0.7431464791297913, | |
| "learning_rate": 0.0005999722541541584, | |
| "loss": 2.8894, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0008035375616378192, | |
| "grad_norm": 0.5839619040489197, | |
| "learning_rate": 0.0005999506752346019, | |
| "loss": 2.8866, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.0008609331017548062, | |
| "grad_norm": 0.5229901671409607, | |
| "learning_rate": 0.0005999229324301031, | |
| "loss": 2.8608, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.0009183286418717933, | |
| "grad_norm": 0.6879259943962097, | |
| "learning_rate": 0.00059988902688106, | |
| "loss": 2.8801, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0009757241819887805, | |
| "grad_norm": 0.4949502646923065, | |
| "learning_rate": 0.0005998489599811971, | |
| "loss": 2.8857, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.0010331197221057674, | |
| "grad_norm": 0.5659216642379761, | |
| "learning_rate": 0.0005998027333775077, | |
| "loss": 2.8172, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.0010905152622227546, | |
| "grad_norm": 0.43849167227745056, | |
| "learning_rate": 0.0005997503489701861, | |
| "loss": 2.8479, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.0011479108023397418, | |
| "grad_norm": 0.5036750435829163, | |
| "learning_rate": 0.0005996918089125504, | |
| "loss": 2.8957, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0012053063424567287, | |
| "grad_norm": 0.40093106031417847, | |
| "learning_rate": 0.000599627115610953, | |
| "loss": 2.8951, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.001262701882573716, | |
| "grad_norm": 0.3499244153499603, | |
| "learning_rate": 0.0005995562717246821, | |
| "loss": 2.8535, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.0013200974226907029, | |
| "grad_norm": 0.3672889769077301, | |
| "learning_rate": 0.0005994792801658526, | |
| "loss": 2.8507, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.00137749296280769, | |
| "grad_norm": 0.3307906985282898, | |
| "learning_rate": 0.0005993961440992859, | |
| "loss": 2.8597, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.001434888502924677, | |
| "grad_norm": 0.33352652192115784, | |
| "learning_rate": 0.0005993068669423797, | |
| "loss": 2.8023, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0014922840430416642, | |
| "grad_norm": 0.30308255553245544, | |
| "learning_rate": 0.0005992114523649685, | |
| "loss": 2.864, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.0015496795831586513, | |
| "grad_norm": 0.2800331711769104, | |
| "learning_rate": 0.000599109904289172, | |
| "loss": 2.8459, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.0016070751232756383, | |
| "grad_norm": 0.2467849850654602, | |
| "learning_rate": 0.0005990022268892337, | |
| "loss": 2.8298, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.0016644706633926255, | |
| "grad_norm": 0.25928932428359985, | |
| "learning_rate": 0.0005988884245913497, | |
| "loss": 2.8061, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.0017218662035096124, | |
| "grad_norm": 0.2770285904407501, | |
| "learning_rate": 0.0005987685020734869, | |
| "loss": 2.8363, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0017792617436265996, | |
| "grad_norm": 0.2888840436935425, | |
| "learning_rate": 0.0005986424642651901, | |
| "loss": 2.847, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.0018366572837435866, | |
| "grad_norm": 0.3389260172843933, | |
| "learning_rate": 0.0005985103163473802, | |
| "loss": 2.8185, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.0018940528238605737, | |
| "grad_norm": 0.3043622672557831, | |
| "learning_rate": 0.0005983720637521404, | |
| "loss": 2.8073, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.001951448363977561, | |
| "grad_norm": 0.2626359760761261, | |
| "learning_rate": 0.0005982277121624933, | |
| "loss": 2.8278, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.002008843904094548, | |
| "grad_norm": 0.2601317763328552, | |
| "learning_rate": 0.0005980772675121675, | |
| "loss": 2.8293, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.002066239444211535, | |
| "grad_norm": 0.2932066023349762, | |
| "learning_rate": 0.0005979207359853532, | |
| "loss": 2.842, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.002123634984328522, | |
| "grad_norm": 0.3828963041305542, | |
| "learning_rate": 0.0005977581240164485, | |
| "loss": 2.8383, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.002181030524445509, | |
| "grad_norm": 0.2928522527217865, | |
| "learning_rate": 0.0005975894382897944, | |
| "loss": 2.8291, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.0022384260645624964, | |
| "grad_norm": 0.2287234663963318, | |
| "learning_rate": 0.0005974146857394005, | |
| "loss": 2.8422, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.0022958216046794835, | |
| "grad_norm": 0.2722682058811188, | |
| "learning_rate": 0.0005972338735486597, | |
| "loss": 2.8217, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0023532171447964703, | |
| "grad_norm": 0.21170516312122345, | |
| "learning_rate": 0.0005970470091500531, | |
| "loss": 2.831, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.0024106126849134575, | |
| "grad_norm": 0.22243160009384155, | |
| "learning_rate": 0.0005968541002248439, | |
| "loss": 2.862, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.0024680082250304446, | |
| "grad_norm": 0.18485133349895477, | |
| "learning_rate": 0.0005966551547027627, | |
| "loss": 2.8531, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.002525403765147432, | |
| "grad_norm": 0.21640127897262573, | |
| "learning_rate": 0.0005964501807616806, | |
| "loss": 2.8245, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.0025827993052644185, | |
| "grad_norm": 0.2716100513935089, | |
| "learning_rate": 0.0005962391868272735, | |
| "loss": 2.8093, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.0026401948453814057, | |
| "grad_norm": 0.19726517796516418, | |
| "learning_rate": 0.0005960221815726757, | |
| "loss": 2.8214, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.002697590385498393, | |
| "grad_norm": 0.2424098700284958, | |
| "learning_rate": 0.0005957991739181231, | |
| "loss": 2.818, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.00275498592561538, | |
| "grad_norm": 0.2414388209581375, | |
| "learning_rate": 0.0005955701730305872, | |
| "loss": 2.8491, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.0028123814657323673, | |
| "grad_norm": 0.25403571128845215, | |
| "learning_rate": 0.0005953351883233972, | |
| "loss": 2.8321, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.002869777005849354, | |
| "grad_norm": 0.30923786759376526, | |
| "learning_rate": 0.0005950942294558544, | |
| "loss": 2.8298, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.002927172545966341, | |
| "grad_norm": 0.22294141352176666, | |
| "learning_rate": 0.0005948473063328338, | |
| "loss": 2.8015, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.0029845680860833283, | |
| "grad_norm": 0.2882789075374603, | |
| "learning_rate": 0.0005945944291043779, | |
| "loss": 2.8256, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.0030419636262003155, | |
| "grad_norm": 0.25416064262390137, | |
| "learning_rate": 0.0005943356081652793, | |
| "loss": 2.8211, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.0030993591663173027, | |
| "grad_norm": 0.2488490343093872, | |
| "learning_rate": 0.0005940708541546529, | |
| "loss": 2.8618, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.0031567547064342894, | |
| "grad_norm": 0.27515849471092224, | |
| "learning_rate": 0.000593800177955499, | |
| "loss": 2.802, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.0032141502465512766, | |
| "grad_norm": 0.2030380666255951, | |
| "learning_rate": 0.0005935235906942563, | |
| "loss": 2.8229, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.003271545786668264, | |
| "grad_norm": 0.2384052276611328, | |
| "learning_rate": 0.0005932411037403436, | |
| "loss": 2.8122, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.003328941326785251, | |
| "grad_norm": 0.2543489336967468, | |
| "learning_rate": 0.000592952728705693, | |
| "loss": 2.8302, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.003386336866902238, | |
| "grad_norm": 0.2387794405221939, | |
| "learning_rate": 0.000592658477444273, | |
| "loss": 2.835, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.003443732407019225, | |
| "grad_norm": 0.2748169004917145, | |
| "learning_rate": 0.0005923583620516003, | |
| "loss": 2.834, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.003501127947136212, | |
| "grad_norm": 0.2565017640590668, | |
| "learning_rate": 0.0005920523948642431, | |
| "loss": 2.8452, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.0035585234872531992, | |
| "grad_norm": 0.25502678751945496, | |
| "learning_rate": 0.0005917405884593144, | |
| "loss": 2.8345, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.0036159190273701864, | |
| "grad_norm": 0.22830121219158173, | |
| "learning_rate": 0.0005914229556539538, | |
| "loss": 2.7989, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.003673314567487173, | |
| "grad_norm": 0.3146669268608093, | |
| "learning_rate": 0.0005910995095048024, | |
| "loss": 2.845, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.0037307101076041603, | |
| "grad_norm": 0.2924383580684662, | |
| "learning_rate": 0.000590770263307464, | |
| "loss": 2.8303, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.0037881056477211475, | |
| "grad_norm": 0.2577711343765259, | |
| "learning_rate": 0.0005904352305959605, | |
| "loss": 2.8156, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.0038455011878381347, | |
| "grad_norm": 0.2631978988647461, | |
| "learning_rate": 0.0005900944251421745, | |
| "loss": 2.833, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.003902896727955122, | |
| "grad_norm": 0.21994397044181824, | |
| "learning_rate": 0.000589747860955283, | |
| "loss": 2.8136, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.003960292268072109, | |
| "grad_norm": 0.3000943064689636, | |
| "learning_rate": 0.0005893955522811827, | |
| "loss": 2.8415, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.004017687808189096, | |
| "grad_norm": 0.24310976266860962, | |
| "learning_rate": 0.0005890375136019032, | |
| "loss": 2.8148, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.004075083348306083, | |
| "grad_norm": 0.24616850912570953, | |
| "learning_rate": 0.0005886737596350122, | |
| "loss": 2.8329, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.00413247888842307, | |
| "grad_norm": 0.2714521884918213, | |
| "learning_rate": 0.0005883043053330105, | |
| "loss": 2.8356, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.004189874428540057, | |
| "grad_norm": 0.2601388096809387, | |
| "learning_rate": 0.0005879291658827176, | |
| "loss": 2.8228, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.004247269968657044, | |
| "grad_norm": 0.22764116525650024, | |
| "learning_rate": 0.0005875483567046467, | |
| "loss": 2.801, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.004304665508774032, | |
| "grad_norm": 0.22346433997154236, | |
| "learning_rate": 0.0005871618934523719, | |
| "loss": 2.7948, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.004362061048891018, | |
| "grad_norm": 0.18839874863624573, | |
| "learning_rate": 0.0005867697920118835, | |
| "loss": 2.8341, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.004419456589008005, | |
| "grad_norm": 0.25794312357902527, | |
| "learning_rate": 0.0005863720685009362, | |
| "loss": 2.815, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.004476852129124993, | |
| "grad_norm": 0.2352106124162674, | |
| "learning_rate": 0.0005859687392683856, | |
| "loss": 2.8169, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.0045342476692419795, | |
| "grad_norm": 0.28784099221229553, | |
| "learning_rate": 0.0005855598208935169, | |
| "loss": 2.8506, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.004591643209358967, | |
| "grad_norm": 0.22999855875968933, | |
| "learning_rate": 0.0005851453301853628, | |
| "loss": 2.8377, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.004649038749475954, | |
| "grad_norm": 0.21411263942718506, | |
| "learning_rate": 0.0005847252841820128, | |
| "loss": 2.8137, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.0047064342895929406, | |
| "grad_norm": 0.2420736700296402, | |
| "learning_rate": 0.0005842997001499129, | |
| "loss": 2.7929, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.004763829829709928, | |
| "grad_norm": 0.24426190555095673, | |
| "learning_rate": 0.0005838685955831558, | |
| "loss": 2.8273, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.004821225369826915, | |
| "grad_norm": 0.20297811925411224, | |
| "learning_rate": 0.0005834319882027617, | |
| "loss": 2.7993, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.0048786209099439025, | |
| "grad_norm": 0.2474389523267746, | |
| "learning_rate": 0.00058298989595595, | |
| "loss": 2.8252, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.004936016450060889, | |
| "grad_norm": 0.22601982951164246, | |
| "learning_rate": 0.0005825423370154012, | |
| "loss": 2.8421, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.004993411990177876, | |
| "grad_norm": 0.24997788667678833, | |
| "learning_rate": 0.0005820893297785106, | |
| "loss": 2.8485, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.005050807530294864, | |
| "grad_norm": 0.19994623959064484, | |
| "learning_rate": 0.0005816308928666314, | |
| "loss": 2.8456, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.00510820307041185, | |
| "grad_norm": 0.19206245243549347, | |
| "learning_rate": 0.0005811670451243093, | |
| "loss": 2.8035, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.005165598610528837, | |
| "grad_norm": 0.2515026032924652, | |
| "learning_rate": 0.0005806978056185083, | |
| "loss": 2.8232, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.005222994150645825, | |
| "grad_norm": 0.22921022772789001, | |
| "learning_rate": 0.0005802231936378267, | |
| "loss": 2.8366, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.0052803896907628114, | |
| "grad_norm": 0.248809352517128, | |
| "learning_rate": 0.000579743228691704, | |
| "loss": 2.8331, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.005337785230879799, | |
| "grad_norm": 0.18247073888778687, | |
| "learning_rate": 0.0005792579305096191, | |
| "loss": 2.8249, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.005395180770996786, | |
| "grad_norm": 0.2440440058708191, | |
| "learning_rate": 0.0005787673190402799, | |
| "loss": 2.837, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.0054525763111137725, | |
| "grad_norm": 0.21160444617271423, | |
| "learning_rate": 0.0005782714144508019, | |
| "loss": 2.7864, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.00550997185123076, | |
| "grad_norm": 0.21344538033008575, | |
| "learning_rate": 0.0005777702371258806, | |
| "loss": 2.847, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.005567367391347747, | |
| "grad_norm": 0.24861139059066772, | |
| "learning_rate": 0.0005772638076669529, | |
| "loss": 2.8267, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.0056247629314647345, | |
| "grad_norm": 0.290520042181015, | |
| "learning_rate": 0.0005767521468913501, | |
| "loss": 2.827, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.005682158471581721, | |
| "grad_norm": 0.20536312460899353, | |
| "learning_rate": 0.0005762352758314429, | |
| "loss": 2.8476, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.005739554011698708, | |
| "grad_norm": 0.21782469749450684, | |
| "learning_rate": 0.000575713215733776, | |
| "loss": 2.844, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.005739554011698708, | |
| "eval_loss": 2.7509028911590576, | |
| "eval_runtime": 85.2068, | |
| "eval_samples_per_second": 50.641, | |
| "eval_steps_per_second": 12.663, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.005796949551815696, | |
| "grad_norm": 0.2523731291294098, | |
| "learning_rate": 0.0005751859880581954, | |
| "loss": 2.8125, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.005854345091932682, | |
| "grad_norm": 0.30107325315475464, | |
| "learning_rate": 0.0005746536144769656, | |
| "loss": 2.8108, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.00591174063204967, | |
| "grad_norm": 0.24103832244873047, | |
| "learning_rate": 0.0005741161168738794, | |
| "loss": 2.8282, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.005969136172166657, | |
| "grad_norm": 0.31273001432418823, | |
| "learning_rate": 0.0005735735173433582, | |
| "loss": 2.8104, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.006026531712283643, | |
| "grad_norm": 0.19059035181999207, | |
| "learning_rate": 0.0005730258381895433, | |
| "loss": 2.8186, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.006083927252400631, | |
| "grad_norm": 0.25082021951675415, | |
| "learning_rate": 0.0005724731019253797, | |
| "loss": 2.8154, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.006141322792517618, | |
| "grad_norm": 0.23254480957984924, | |
| "learning_rate": 0.0005719153312716904, | |
| "loss": 2.8121, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.006198718332634605, | |
| "grad_norm": 0.24095705151557922, | |
| "learning_rate": 0.0005713525491562421, | |
| "loss": 2.8361, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.006256113872751592, | |
| "grad_norm": 0.17760275304317474, | |
| "learning_rate": 0.0005707847787128034, | |
| "loss": 2.8396, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.006313509412868579, | |
| "grad_norm": 0.20905229449272156, | |
| "learning_rate": 0.0005702120432801934, | |
| "loss": 2.8284, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.0063709049529855665, | |
| "grad_norm": 0.19538630545139313, | |
| "learning_rate": 0.0005696343664013227, | |
| "loss": 2.8417, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.006428300493102553, | |
| "grad_norm": 0.2408672571182251, | |
| "learning_rate": 0.0005690517718222248, | |
| "loss": 2.8416, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.006485696033219541, | |
| "grad_norm": 0.19618412852287292, | |
| "learning_rate": 0.0005684642834910813, | |
| "loss": 2.8683, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.006543091573336528, | |
| "grad_norm": 0.17854906618595123, | |
| "learning_rate": 0.0005678719255572363, | |
| "loss": 2.8232, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.006600487113453514, | |
| "grad_norm": 0.2527766227722168, | |
| "learning_rate": 0.0005672747223702044, | |
| "loss": 2.8219, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.006657882653570502, | |
| "grad_norm": 0.21465440094470978, | |
| "learning_rate": 0.0005666726984786695, | |
| "loss": 2.8308, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.006715278193687489, | |
| "grad_norm": 0.2080729454755783, | |
| "learning_rate": 0.000566065878629476, | |
| "loss": 2.8369, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.006772673733804476, | |
| "grad_norm": 0.18979360163211823, | |
| "learning_rate": 0.0005654542877666108, | |
| "loss": 2.7997, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.006830069273921463, | |
| "grad_norm": 0.20258580148220062, | |
| "learning_rate": 0.0005648379510301792, | |
| "loss": 2.846, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.00688746481403845, | |
| "grad_norm": 0.2112026810646057, | |
| "learning_rate": 0.0005642168937553701, | |
| "loss": 2.8521, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.006944860354155437, | |
| "grad_norm": 0.25105029344558716, | |
| "learning_rate": 0.0005635911414714158, | |
| "loss": 2.8081, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.007002255894272424, | |
| "grad_norm": 0.21830224990844727, | |
| "learning_rate": 0.0005629607199005416, | |
| "loss": 2.8161, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.007059651434389411, | |
| "grad_norm": 0.19216330349445343, | |
| "learning_rate": 0.0005623256549569091, | |
| "loss": 2.805, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.0071170469745063985, | |
| "grad_norm": 0.19969609379768372, | |
| "learning_rate": 0.000561685972745551, | |
| "loss": 2.7859, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.007174442514623385, | |
| "grad_norm": 0.22093947231769562, | |
| "learning_rate": 0.0005610416995612973, | |
| "loss": 2.8194, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.007231838054740373, | |
| "grad_norm": 0.2148187905550003, | |
| "learning_rate": 0.0005603928618876952, | |
| "loss": 2.8565, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.0072892335948573595, | |
| "grad_norm": 0.18277674913406372, | |
| "learning_rate": 0.0005597394863959201, | |
| "loss": 2.8187, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.007346629134974346, | |
| "grad_norm": 0.22607837617397308, | |
| "learning_rate": 0.0005590815999436795, | |
| "loss": 2.8607, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.007404024675091334, | |
| "grad_norm": 0.22417186200618744, | |
| "learning_rate": 0.0005584192295741086, | |
| "loss": 2.8198, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.007461420215208321, | |
| "grad_norm": 0.229670912027359, | |
| "learning_rate": 0.0005577524025146591, | |
| "loss": 2.8477, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.007518815755325308, | |
| "grad_norm": 0.1985808163881302, | |
| "learning_rate": 0.0005570811461759794, | |
| "loss": 2.8058, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.007576211295442295, | |
| "grad_norm": 0.22260330617427826, | |
| "learning_rate": 0.0005564054881507886, | |
| "loss": 2.8369, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.007633606835559282, | |
| "grad_norm": 0.20925524830818176, | |
| "learning_rate": 0.0005557254562127417, | |
| "loss": 2.8205, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.007691002375676269, | |
| "grad_norm": 0.26581674814224243, | |
| "learning_rate": 0.0005550410783152882, | |
| "loss": 2.8164, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.007748397915793256, | |
| "grad_norm": 0.2182077318429947, | |
| "learning_rate": 0.0005543523825905229, | |
| "loss": 2.8279, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.007805793455910244, | |
| "grad_norm": 0.24468722939491272, | |
| "learning_rate": 0.0005536593973480297, | |
| "loss": 2.8281, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.007863188996027231, | |
| "grad_norm": 0.22021321952342987, | |
| "learning_rate": 0.0005529621510737175, | |
| "loss": 2.8028, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.007920584536144217, | |
| "grad_norm": 0.20566654205322266, | |
| "learning_rate": 0.0005522606724286498, | |
| "loss": 2.7937, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.007977980076261205, | |
| "grad_norm": 0.1960543841123581, | |
| "learning_rate": 0.0005515549902478665, | |
| "loss": 2.8089, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.008035375616378192, | |
| "grad_norm": 0.2689999043941498, | |
| "learning_rate": 0.0005508451335391975, | |
| "loss": 2.7959, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.008092771156495178, | |
| "grad_norm": 0.19776718318462372, | |
| "learning_rate": 0.0005501311314820721, | |
| "loss": 2.8442, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.008150166696612166, | |
| "grad_norm": 0.2156287282705307, | |
| "learning_rate": 0.0005494130134263184, | |
| "loss": 2.8224, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.008207562236729153, | |
| "grad_norm": 0.17528703808784485, | |
| "learning_rate": 0.0005486908088909568, | |
| "loss": 2.8659, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.00826495777684614, | |
| "grad_norm": 0.1757359504699707, | |
| "learning_rate": 0.0005479645475629872, | |
| "loss": 2.8119, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.008322353316963127, | |
| "grad_norm": 0.1916513890028, | |
| "learning_rate": 0.0005472342592961683, | |
| "loss": 2.8069, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.008379748857080115, | |
| "grad_norm": 0.19162799417972565, | |
| "learning_rate": 0.0005464999741097901, | |
| "loss": 2.8211, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.0084371443971971, | |
| "grad_norm": 0.1881379634141922, | |
| "learning_rate": 0.0005457617221874408, | |
| "loss": 2.7954, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.008494539937314088, | |
| "grad_norm": 0.22305060923099518, | |
| "learning_rate": 0.0005450195338757654, | |
| "loss": 2.8447, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.008551935477431076, | |
| "grad_norm": 0.25081732869148254, | |
| "learning_rate": 0.0005442734396832185, | |
| "loss": 2.8205, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.008609331017548063, | |
| "grad_norm": 0.24046167731285095, | |
| "learning_rate": 0.00054352347027881, | |
| "loss": 2.8246, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.00866672655766505, | |
| "grad_norm": 0.20985569059848785, | |
| "learning_rate": 0.0005427696564908447, | |
| "loss": 2.8384, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.008724122097782037, | |
| "grad_norm": 0.18979063630104065, | |
| "learning_rate": 0.000542012029305655, | |
| "loss": 2.8261, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.008781517637899024, | |
| "grad_norm": 0.21513347327709198, | |
| "learning_rate": 0.0005412506198663268, | |
| "loss": 2.8197, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.00883891317801601, | |
| "grad_norm": 0.25432831048965454, | |
| "learning_rate": 0.0005404854594714204, | |
| "loss": 2.8091, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.008896308718132998, | |
| "grad_norm": 0.261273592710495, | |
| "learning_rate": 0.0005397165795736823, | |
| "loss": 2.8324, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.008953704258249985, | |
| "grad_norm": 0.22144336998462677, | |
| "learning_rate": 0.0005389440117787538, | |
| "loss": 2.8459, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.009011099798366971, | |
| "grad_norm": 0.1860560178756714, | |
| "learning_rate": 0.000538167787843871, | |
| "loss": 2.8552, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.009068495338483959, | |
| "grad_norm": 0.2402401566505432, | |
| "learning_rate": 0.0005373879396765593, | |
| "loss": 2.8229, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.009125890878600947, | |
| "grad_norm": 0.2112584114074707, | |
| "learning_rate": 0.0005366044993333228, | |
| "loss": 2.823, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.009183286418717934, | |
| "grad_norm": 0.24757996201515198, | |
| "learning_rate": 0.0005358174990183254, | |
| "loss": 2.8458, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.00924068195883492, | |
| "grad_norm": 0.20984984934329987, | |
| "learning_rate": 0.0005350269710820675, | |
| "loss": 2.8375, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.009298077498951908, | |
| "grad_norm": 0.22329501807689667, | |
| "learning_rate": 0.0005342329480200562, | |
| "loss": 2.815, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.009355473039068895, | |
| "grad_norm": 0.26144203543663025, | |
| "learning_rate": 0.0005334354624714697, | |
| "loss": 2.8286, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.009412868579185881, | |
| "grad_norm": 0.20015327632427216, | |
| "learning_rate": 0.0005326345472178154, | |
| "loss": 2.8304, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.009470264119302869, | |
| "grad_norm": 0.29256758093833923, | |
| "learning_rate": 0.0005318302351815823, | |
| "loss": 2.7884, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.009527659659419856, | |
| "grad_norm": 0.22914084792137146, | |
| "learning_rate": 0.000531022559424888, | |
| "loss": 2.8253, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.009585055199536842, | |
| "grad_norm": 0.2677003741264343, | |
| "learning_rate": 0.0005302115531481195, | |
| "loss": 2.8084, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.00964245073965383, | |
| "grad_norm": 0.2672327756881714, | |
| "learning_rate": 0.000529397249688568, | |
| "loss": 2.8351, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.009699846279770817, | |
| "grad_norm": 0.21281464397907257, | |
| "learning_rate": 0.0005285796825190598, | |
| "loss": 2.8463, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.009757241819887805, | |
| "grad_norm": 0.22858156263828278, | |
| "learning_rate": 0.0005277588852465788, | |
| "loss": 2.8156, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.009814637360004791, | |
| "grad_norm": 0.20694582164287567, | |
| "learning_rate": 0.0005269348916108859, | |
| "loss": 2.8392, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.009872032900121779, | |
| "grad_norm": 0.22438685595989227, | |
| "learning_rate": 0.0005261077354831322, | |
| "loss": 2.8336, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.009929428440238766, | |
| "grad_norm": 0.2279587984085083, | |
| "learning_rate": 0.0005252774508644666, | |
| "loss": 2.7972, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.009986823980355752, | |
| "grad_norm": 0.21278439462184906, | |
| "learning_rate": 0.0005244440718846375, | |
| "loss": 2.7946, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.01004421952047274, | |
| "grad_norm": 0.23399871587753296, | |
| "learning_rate": 0.0005236076328005906, | |
| "loss": 2.8648, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.010101615060589727, | |
| "grad_norm": 0.2649572193622589, | |
| "learning_rate": 0.0005227681679950607, | |
| "loss": 2.8453, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.010159010600706713, | |
| "grad_norm": 0.21067285537719727, | |
| "learning_rate": 0.0005219257119751581, | |
| "loss": 2.8357, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.0102164061408237, | |
| "grad_norm": 0.22862860560417175, | |
| "learning_rate": 0.0005210802993709497, | |
| "loss": 2.8235, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.010273801680940688, | |
| "grad_norm": 0.22179283201694489, | |
| "learning_rate": 0.0005202319649340369, | |
| "loss": 2.82, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.010331197221057674, | |
| "grad_norm": 0.16690605878829956, | |
| "learning_rate": 0.0005193807435361252, | |
| "loss": 2.8237, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.010388592761174662, | |
| "grad_norm": 0.21572506427764893, | |
| "learning_rate": 0.0005185266701675927, | |
| "loss": 2.8403, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.01044598830129165, | |
| "grad_norm": 0.1778525710105896, | |
| "learning_rate": 0.0005176697799360502, | |
| "loss": 2.8204, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.010503383841408637, | |
| "grad_norm": 0.18887534737586975, | |
| "learning_rate": 0.0005168101080648989, | |
| "loss": 2.8146, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.010560779381525623, | |
| "grad_norm": 0.18108077347278595, | |
| "learning_rate": 0.0005159476898918823, | |
| "loss": 2.853, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.01061817492164261, | |
| "grad_norm": 0.1870754212141037, | |
| "learning_rate": 0.0005150825608676336, | |
| "loss": 2.8537, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.010675570461759598, | |
| "grad_norm": 0.16484060883522034, | |
| "learning_rate": 0.0005142147565542188, | |
| "loss": 2.8194, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.010732966001876584, | |
| "grad_norm": 0.18527449667453766, | |
| "learning_rate": 0.0005133443126236739, | |
| "loss": 2.8402, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.010790361541993572, | |
| "grad_norm": 0.17674389481544495, | |
| "learning_rate": 0.0005124712648565398, | |
| "loss": 2.8412, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.01084775708211056, | |
| "grad_norm": 0.2521503269672394, | |
| "learning_rate": 0.0005115956491403907, | |
| "loss": 2.8348, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.010905152622227545, | |
| "grad_norm": 0.17621657252311707, | |
| "learning_rate": 0.000510717501468359, | |
| "loss": 2.8293, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.010962548162344533, | |
| "grad_norm": 0.2621336579322815, | |
| "learning_rate": 0.0005098368579376563, | |
| "loss": 2.8164, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.01101994370246152, | |
| "grad_norm": 0.18950189650058746, | |
| "learning_rate": 0.0005089537547480885, | |
| "loss": 2.7976, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.011077339242578508, | |
| "grad_norm": 0.24857239425182343, | |
| "learning_rate": 0.0005080682282005692, | |
| "loss": 2.8323, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.011134734782695494, | |
| "grad_norm": 0.16708490252494812, | |
| "learning_rate": 0.0005071803146956262, | |
| "loss": 2.801, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.011192130322812481, | |
| "grad_norm": 0.24443359673023224, | |
| "learning_rate": 0.000506290050731906, | |
| "loss": 2.8121, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.011249525862929469, | |
| "grad_norm": 0.2458924949169159, | |
| "learning_rate": 0.0005053974729046734, | |
| "loss": 2.8325, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.011306921403046455, | |
| "grad_norm": 0.2034812569618225, | |
| "learning_rate": 0.0005045026179043067, | |
| "loss": 2.8123, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.011364316943163442, | |
| "grad_norm": 0.2774895429611206, | |
| "learning_rate": 0.0005036055225147901, | |
| "loss": 2.8324, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.01142171248328043, | |
| "grad_norm": 0.22201013565063477, | |
| "learning_rate": 0.0005027062236122014, | |
| "loss": 2.8195, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.011479108023397416, | |
| "grad_norm": 0.1893691122531891, | |
| "learning_rate": 0.0005018047581631961, | |
| "loss": 2.8177, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.011479108023397416, | |
| "eval_loss": 2.749150037765503, | |
| "eval_runtime": 85.2258, | |
| "eval_samples_per_second": 50.63, | |
| "eval_steps_per_second": 12.66, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.011536503563514404, | |
| "grad_norm": 0.2689765691757202, | |
| "learning_rate": 0.0005009011632234881, | |
| "loss": 2.8438, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.011593899103631391, | |
| "grad_norm": 0.2234533727169037, | |
| "learning_rate": 0.0004999954759363262, | |
| "loss": 2.8103, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.011651294643748379, | |
| "grad_norm": 0.25140801072120667, | |
| "learning_rate": 0.0004990877335309675, | |
| "loss": 2.8178, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.011708690183865365, | |
| "grad_norm": 0.3070688843727112, | |
| "learning_rate": 0.0004981779733211468, | |
| "loss": 2.8518, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.011766085723982352, | |
| "grad_norm": 0.25637757778167725, | |
| "learning_rate": 0.0004972662327035431, | |
| "loss": 2.8578, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.01182348126409934, | |
| "grad_norm": 0.2551119923591614, | |
| "learning_rate": 0.0004963525491562421, | |
| "loss": 2.8237, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.011880876804216326, | |
| "grad_norm": 0.2416735738515854, | |
| "learning_rate": 0.0004954369602371958, | |
| "loss": 2.8195, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.011938272344333313, | |
| "grad_norm": 0.3950039744377136, | |
| "learning_rate": 0.0004945195035826785, | |
| "loss": 2.8087, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.011995667884450301, | |
| "grad_norm": 0.16370531916618347, | |
| "learning_rate": 0.00049360021690574, | |
| "loss": 2.8464, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.012053063424567287, | |
| "grad_norm": 0.28070008754730225, | |
| "learning_rate": 0.0004926791379946549, | |
| "loss": 2.8377, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.012110458964684274, | |
| "grad_norm": 0.1902085244655609, | |
| "learning_rate": 0.0004917563047113695, | |
| "loss": 2.8279, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.012167854504801262, | |
| "grad_norm": 0.27748385071754456, | |
| "learning_rate": 0.0004908317549899456, | |
| "loss": 2.837, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.012225250044918248, | |
| "grad_norm": 0.18437190353870392, | |
| "learning_rate": 0.0004899055268350012, | |
| "loss": 2.8301, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.012282645585035236, | |
| "grad_norm": 0.22971947491168976, | |
| "learning_rate": 0.0004889776583201479, | |
| "loss": 2.8051, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.012340041125152223, | |
| "grad_norm": 0.238089457154274, | |
| "learning_rate": 0.0004880481875864261, | |
| "loss": 2.8162, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.01239743666526921, | |
| "grad_norm": 0.24253320693969727, | |
| "learning_rate": 0.0004871171528407371, | |
| "loss": 2.8181, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.012454832205386197, | |
| "grad_norm": 0.2351958006620407, | |
| "learning_rate": 0.0004861845923542728, | |
| "loss": 2.8136, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.012512227745503184, | |
| "grad_norm": 0.23203608393669128, | |
| "learning_rate": 0.0004852505444609422, | |
| "loss": 2.804, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.012569623285620172, | |
| "grad_norm": 0.1896822452545166, | |
| "learning_rate": 0.00048431504755579575, | |
| "loss": 2.8118, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.012627018825737158, | |
| "grad_norm": 0.18357349932193756, | |
| "learning_rate": 0.0004833781400934471, | |
| "loss": 2.8205, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.012684414365854145, | |
| "grad_norm": 0.23723295331001282, | |
| "learning_rate": 0.00048243986058649246, | |
| "loss": 2.8291, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.012741809905971133, | |
| "grad_norm": 0.1937919706106186, | |
| "learning_rate": 0.0004815002476039273, | |
| "loss": 2.8416, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.012799205446088119, | |
| "grad_norm": 0.19754467904567719, | |
| "learning_rate": 0.0004805593397695613, | |
| "loss": 2.7963, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.012856600986205106, | |
| "grad_norm": 0.1592610776424408, | |
| "learning_rate": 0.00047961717576043, | |
| "loss": 2.8264, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.012913996526322094, | |
| "grad_norm": 0.2083783745765686, | |
| "learning_rate": 0.00047867379430520585, | |
| "loss": 2.8348, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.012971392066439082, | |
| "grad_norm": 0.1895647495985031, | |
| "learning_rate": 0.00047772923418260525, | |
| "loss": 2.8212, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.013028787606556068, | |
| "grad_norm": 0.2173570841550827, | |
| "learning_rate": 0.0004767835342197954, | |
| "loss": 2.8098, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.013086183146673055, | |
| "grad_norm": 0.1693475991487503, | |
| "learning_rate": 0.0004758367332907978, | |
| "loss": 2.796, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.013143578686790043, | |
| "grad_norm": 0.21635355055332184, | |
| "learning_rate": 0.00047488887031489017, | |
| "loss": 2.843, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.013200974226907029, | |
| "grad_norm": 0.18521156907081604, | |
| "learning_rate": 0.0004739399842550068, | |
| "loss": 2.8296, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.013258369767024016, | |
| "grad_norm": 0.22925664484500885, | |
| "learning_rate": 0.00047299011411613734, | |
| "loss": 2.8287, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.013315765307141004, | |
| "grad_norm": 0.24881386756896973, | |
| "learning_rate": 0.00047203929894372264, | |
| "loss": 2.8257, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.01337316084725799, | |
| "grad_norm": 0.20801618695259094, | |
| "learning_rate": 0.00047108757782205043, | |
| "loss": 2.8241, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.013430556387374977, | |
| "grad_norm": 0.199665367603302, | |
| "learning_rate": 0.0004701349898726483, | |
| "loss": 2.7916, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.013487951927491965, | |
| "grad_norm": 0.25221607089042664, | |
| "learning_rate": 0.00046918157425267584, | |
| "loss": 2.8233, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.013545347467608953, | |
| "grad_norm": 0.1931813657283783, | |
| "learning_rate": 0.00046822737015331505, | |
| "loss": 2.8016, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.013602743007725938, | |
| "grad_norm": 0.17353369295597076, | |
| "learning_rate": 0.00046727241679815894, | |
| "loss": 2.8125, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.013660138547842926, | |
| "grad_norm": 0.22225958108901978, | |
| "learning_rate": 0.0004663167534415996, | |
| "loss": 2.824, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.013717534087959914, | |
| "grad_norm": 0.17010116577148438, | |
| "learning_rate": 0.0004653604193672147, | |
| "loss": 2.8425, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.0137749296280769, | |
| "grad_norm": 0.2103683203458786, | |
| "learning_rate": 0.00046440345388615225, | |
| "loss": 2.8641, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.013832325168193887, | |
| "grad_norm": 0.17934557795524597, | |
| "learning_rate": 0.00046344589633551497, | |
| "loss": 2.8069, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.013889720708310875, | |
| "grad_norm": 0.2116999328136444, | |
| "learning_rate": 0.0004624877860767434, | |
| "loss": 2.8601, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.01394711624842786, | |
| "grad_norm": 0.20861205458641052, | |
| "learning_rate": 0.0004615291624939975, | |
| "loss": 2.8232, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.014004511788544848, | |
| "grad_norm": 0.24393285810947418, | |
| "learning_rate": 0.0004605700649925381, | |
| "loss": 2.8041, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.014061907328661836, | |
| "grad_norm": 0.2089577168226242, | |
| "learning_rate": 0.0004596105329971069, | |
| "loss": 2.8351, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.014119302868778822, | |
| "grad_norm": 0.20232421159744263, | |
| "learning_rate": 0.00045865060595030616, | |
| "loss": 2.8171, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.01417669840889581, | |
| "grad_norm": 0.22081732749938965, | |
| "learning_rate": 0.00045769032331097686, | |
| "loss": 2.8202, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.014234093949012797, | |
| "grad_norm": 0.17081516981124878, | |
| "learning_rate": 0.00045672972455257723, | |
| "loss": 2.8358, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.014291489489129785, | |
| "grad_norm": 0.3317008316516876, | |
| "learning_rate": 0.0004557688491615597, | |
| "loss": 2.8302, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.01434888502924677, | |
| "grad_norm": 0.23239760100841522, | |
| "learning_rate": 0.0004548077366357483, | |
| "loss": 2.8191, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.014406280569363758, | |
| "grad_norm": 0.22138993442058563, | |
| "learning_rate": 0.0004538464264827143, | |
| "loss": 2.8096, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.014463676109480746, | |
| "grad_norm": 0.23655574023723602, | |
| "learning_rate": 0.000452884958218153, | |
| "loss": 2.8295, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.014521071649597731, | |
| "grad_norm": 0.2227945327758789, | |
| "learning_rate": 0.000451923371364259, | |
| "loss": 2.8158, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.014578467189714719, | |
| "grad_norm": 0.20443300902843475, | |
| "learning_rate": 0.0004509617054481017, | |
| "loss": 2.83, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.014635862729831707, | |
| "grad_norm": 0.22221451997756958, | |
| "learning_rate": 0.00045, | |
| "loss": 2.8253, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.014693258269948693, | |
| "grad_norm": 0.1941068023443222, | |
| "learning_rate": 0.00044903829455189825, | |
| "loss": 2.83, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.01475065381006568, | |
| "grad_norm": 0.1914331614971161, | |
| "learning_rate": 0.0004480766286357409, | |
| "loss": 2.8162, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.014808049350182668, | |
| "grad_norm": 0.21014779806137085, | |
| "learning_rate": 0.0004471150417818469, | |
| "loss": 2.7993, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.014865444890299655, | |
| "grad_norm": 0.2057676762342453, | |
| "learning_rate": 0.00044615357351728566, | |
| "loss": 2.8223, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.014922840430416641, | |
| "grad_norm": 0.19875939190387726, | |
| "learning_rate": 0.00044519226336425165, | |
| "loss": 2.8016, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.014980235970533629, | |
| "grad_norm": 0.23691999912261963, | |
| "learning_rate": 0.0004442311508384402, | |
| "loss": 2.8373, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.015037631510650616, | |
| "grad_norm": 0.1729947328567505, | |
| "learning_rate": 0.0004432702754474228, | |
| "loss": 2.8233, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.015095027050767602, | |
| "grad_norm": 0.18821187317371368, | |
| "learning_rate": 0.00044230967668902306, | |
| "loss": 2.8128, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.01515242259088459, | |
| "grad_norm": 0.2283882200717926, | |
| "learning_rate": 0.00044134939404969387, | |
| "loss": 2.8178, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.015209818131001578, | |
| "grad_norm": 0.16724412143230438, | |
| "learning_rate": 0.000440389467002893, | |
| "loss": 2.8249, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.015267213671118563, | |
| "grad_norm": 0.18209712207317352, | |
| "learning_rate": 0.00043942993500746183, | |
| "loss": 2.8095, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.015324609211235551, | |
| "grad_norm": 0.1857995092868805, | |
| "learning_rate": 0.00043847083750600253, | |
| "loss": 2.806, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.015382004751352539, | |
| "grad_norm": 0.20734605193138123, | |
| "learning_rate": 0.0004375122139232566, | |
| "loss": 2.8695, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.015439400291469526, | |
| "grad_norm": 0.23138895630836487, | |
| "learning_rate": 0.00043655410366448495, | |
| "loss": 2.8033, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.015496795831586512, | |
| "grad_norm": 0.20481987297534943, | |
| "learning_rate": 0.0004355965461138477, | |
| "loss": 2.8269, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.0155541913717035, | |
| "grad_norm": 0.2318529337644577, | |
| "learning_rate": 0.00043463958063278524, | |
| "loss": 2.8332, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.015611586911820487, | |
| "grad_norm": 0.2501411736011505, | |
| "learning_rate": 0.00043368324655840035, | |
| "loss": 2.8445, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.015668982451937475, | |
| "grad_norm": 0.26137158274650574, | |
| "learning_rate": 0.0004327275832018411, | |
| "loss": 2.8279, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.015726377992054463, | |
| "grad_norm": 0.19074887037277222, | |
| "learning_rate": 0.0004317726298466849, | |
| "loss": 2.8132, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.015783773532171447, | |
| "grad_norm": 0.26000818610191345, | |
| "learning_rate": 0.0004308184257473241, | |
| "loss": 2.8091, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.015841169072288434, | |
| "grad_norm": 0.16060984134674072, | |
| "learning_rate": 0.0004298650101273517, | |
| "loss": 2.8206, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.015898564612405422, | |
| "grad_norm": 0.284445583820343, | |
| "learning_rate": 0.00042891242217794954, | |
| "loss": 2.7867, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.01595596015252241, | |
| "grad_norm": 0.15903466939926147, | |
| "learning_rate": 0.0004279607010562773, | |
| "loss": 2.83, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.016013355692639397, | |
| "grad_norm": 0.24330751597881317, | |
| "learning_rate": 0.0004270098858838626, | |
| "loss": 2.817, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.016070751232756385, | |
| "grad_norm": 0.1687777042388916, | |
| "learning_rate": 0.0004260600157449931, | |
| "loss": 2.8112, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.01612814677287337, | |
| "grad_norm": 0.18230785429477692, | |
| "learning_rate": 0.0004251111296851098, | |
| "loss": 2.8394, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.016185542312990357, | |
| "grad_norm": 0.1889660507440567, | |
| "learning_rate": 0.00042416326670920217, | |
| "loss": 2.8109, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.016242937853107344, | |
| "grad_norm": 0.16135123372077942, | |
| "learning_rate": 0.0004232164657802045, | |
| "loss": 2.7953, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.016300333393224332, | |
| "grad_norm": 0.15787218511104584, | |
| "learning_rate": 0.00042227076581739467, | |
| "loss": 2.7921, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.01635772893334132, | |
| "grad_norm": 0.16313977539539337, | |
| "learning_rate": 0.0004213262056947942, | |
| "loss": 2.8107, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.016415124473458307, | |
| "grad_norm": 0.18806132674217224, | |
| "learning_rate": 0.0004203828242395699, | |
| "loss": 2.8451, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.016472520013575295, | |
| "grad_norm": 0.17279674112796783, | |
| "learning_rate": 0.00041944066023043866, | |
| "loss": 2.8333, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.01652991555369228, | |
| "grad_norm": 0.17451834678649902, | |
| "learning_rate": 0.00041849975239607255, | |
| "loss": 2.7798, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.016587311093809266, | |
| "grad_norm": 0.1943039745092392, | |
| "learning_rate": 0.00041756013941350747, | |
| "loss": 2.8011, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.016644706633926254, | |
| "grad_norm": 0.1578904092311859, | |
| "learning_rate": 0.0004166218599065528, | |
| "loss": 2.852, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.01670210217404324, | |
| "grad_norm": 0.20066620409488678, | |
| "learning_rate": 0.0004156849524442042, | |
| "loss": 2.7876, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.01675949771416023, | |
| "grad_norm": 0.18306495249271393, | |
| "learning_rate": 0.0004147494555390577, | |
| "loss": 2.817, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.016816893254277217, | |
| "grad_norm": 0.1622687727212906, | |
| "learning_rate": 0.0004138154076457271, | |
| "loss": 2.815, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.0168742887943942, | |
| "grad_norm": 0.2056518942117691, | |
| "learning_rate": 0.0004128828471592628, | |
| "loss": 2.8131, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.01693168433451119, | |
| "grad_norm": 0.17123937606811523, | |
| "learning_rate": 0.00041195181241357383, | |
| "loss": 2.8025, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.016989079874628176, | |
| "grad_norm": 0.2233334332704544, | |
| "learning_rate": 0.00041102234167985204, | |
| "loss": 2.8347, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.017046475414745164, | |
| "grad_norm": 0.20740529894828796, | |
| "learning_rate": 0.0004100944731649987, | |
| "loss": 2.8099, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.01710387095486215, | |
| "grad_norm": 0.20391066372394562, | |
| "learning_rate": 0.0004091682450100543, | |
| "loss": 2.8363, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.01716126649497914, | |
| "grad_norm": 0.17306548357009888, | |
| "learning_rate": 0.0004082436952886305, | |
| "loss": 2.8211, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.017218662035096127, | |
| "grad_norm": 0.24933576583862305, | |
| "learning_rate": 0.0004073208620053451, | |
| "loss": 2.8048, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.017218662035096127, | |
| "eval_loss": 2.7432332038879395, | |
| "eval_runtime": 85.2508, | |
| "eval_samples_per_second": 50.615, | |
| "eval_steps_per_second": 12.657, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.01727605757521311, | |
| "grad_norm": 0.231708824634552, | |
| "learning_rate": 0.00040639978309425995, | |
| "loss": 2.8025, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.0173334531153301, | |
| "grad_norm": 0.15970614552497864, | |
| "learning_rate": 0.00040548049641732137, | |
| "loss": 2.8392, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.017390848655447086, | |
| "grad_norm": 0.20457029342651367, | |
| "learning_rate": 0.0004045630397628042, | |
| "loss": 2.8247, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.017448244195564074, | |
| "grad_norm": 0.1734900325536728, | |
| "learning_rate": 0.00040364745084375787, | |
| "loss": 2.7979, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.01750563973568106, | |
| "grad_norm": 0.19265452027320862, | |
| "learning_rate": 0.00040273376729645685, | |
| "loss": 2.8033, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.01756303527579805, | |
| "grad_norm": 0.19174844026565552, | |
| "learning_rate": 0.00040182202667885317, | |
| "loss": 2.8354, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.017620430815915036, | |
| "grad_norm": 0.27793413400650024, | |
| "learning_rate": 0.00040091226646903245, | |
| "loss": 2.797, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.01767782635603202, | |
| "grad_norm": 0.1806309074163437, | |
| "learning_rate": 0.00040000452406367367, | |
| "loss": 2.8046, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.017735221896149008, | |
| "grad_norm": 0.2249089479446411, | |
| "learning_rate": 0.0003990988367765118, | |
| "loss": 2.8125, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.017792617436265996, | |
| "grad_norm": 0.27839699387550354, | |
| "learning_rate": 0.00039819524183680384, | |
| "loss": 2.8183, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.017850012976382983, | |
| "grad_norm": 0.1877232789993286, | |
| "learning_rate": 0.00039729377638779857, | |
| "loss": 2.7989, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.01790740851649997, | |
| "grad_norm": 0.25160273909568787, | |
| "learning_rate": 0.00039639447748520985, | |
| "loss": 2.8536, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.01796480405661696, | |
| "grad_norm": 0.23843353986740112, | |
| "learning_rate": 0.0003954973820956932, | |
| "loss": 2.8064, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.018022199596733943, | |
| "grad_norm": 0.2549470365047455, | |
| "learning_rate": 0.00039460252709532656, | |
| "loss": 2.8415, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.01807959513685093, | |
| "grad_norm": 0.39248892664909363, | |
| "learning_rate": 0.0003937099492680938, | |
| "loss": 2.8137, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.018136990676967918, | |
| "grad_norm": 0.24034982919692993, | |
| "learning_rate": 0.0003928196853043737, | |
| "loss": 2.8301, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.018194386217084905, | |
| "grad_norm": 0.29434794187545776, | |
| "learning_rate": 0.00039193177179943083, | |
| "loss": 2.8288, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.018251781757201893, | |
| "grad_norm": 0.21636317670345306, | |
| "learning_rate": 0.0003910462452519114, | |
| "loss": 2.8121, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.01830917729731888, | |
| "grad_norm": 0.2217407375574112, | |
| "learning_rate": 0.0003901631420623437, | |
| "loss": 2.8551, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.01836657283743587, | |
| "grad_norm": 0.20126426219940186, | |
| "learning_rate": 0.0003892824985316409, | |
| "loss": 2.7812, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.018423968377552852, | |
| "grad_norm": 0.20343463122844696, | |
| "learning_rate": 0.0003884043508596093, | |
| "loss": 2.7959, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.01848136391766984, | |
| "grad_norm": 0.22265484929084778, | |
| "learning_rate": 0.00038752873514346015, | |
| "loss": 2.8254, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.018538759457786828, | |
| "grad_norm": 0.20545947551727295, | |
| "learning_rate": 0.000386655687376326, | |
| "loss": 2.8166, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.018596154997903815, | |
| "grad_norm": 0.17015507817268372, | |
| "learning_rate": 0.00038578524344578115, | |
| "loss": 2.806, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.018653550538020803, | |
| "grad_norm": 0.19378258287906647, | |
| "learning_rate": 0.00038491743913236624, | |
| "loss": 2.7979, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.01871094607813779, | |
| "grad_norm": 0.2112617790699005, | |
| "learning_rate": 0.0003840523101081177, | |
| "loss": 2.8149, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.018768341618254775, | |
| "grad_norm": 0.18846029043197632, | |
| "learning_rate": 0.0003831898919351011, | |
| "loss": 2.8334, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.018825737158371762, | |
| "grad_norm": 0.20672033727169037, | |
| "learning_rate": 0.00038233022006394976, | |
| "loss": 2.8061, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.01888313269848875, | |
| "grad_norm": 0.2700256109237671, | |
| "learning_rate": 0.00038147332983240717, | |
| "loss": 2.8101, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.018940528238605737, | |
| "grad_norm": 0.16990099847316742, | |
| "learning_rate": 0.00038061925646387467, | |
| "loss": 2.8227, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.018997923778722725, | |
| "grad_norm": 0.2140357792377472, | |
| "learning_rate": 0.0003797680350659631, | |
| "loss": 2.8018, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.019055319318839713, | |
| "grad_norm": 0.2538260221481323, | |
| "learning_rate": 0.0003789197006290502, | |
| "loss": 2.7725, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.0191127148589567, | |
| "grad_norm": 0.1694011092185974, | |
| "learning_rate": 0.0003780742880248419, | |
| "loss": 2.7973, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.019170110399073684, | |
| "grad_norm": 0.2092764526605606, | |
| "learning_rate": 0.0003772318320049391, | |
| "loss": 2.8256, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.019227505939190672, | |
| "grad_norm": 0.22675682604312897, | |
| "learning_rate": 0.0003763923671994093, | |
| "loss": 2.8092, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.01928490147930766, | |
| "grad_norm": 0.20571155846118927, | |
| "learning_rate": 0.0003755559281153625, | |
| "loss": 2.8176, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.019342297019424647, | |
| "grad_norm": 0.18606650829315186, | |
| "learning_rate": 0.0003747225491355334, | |
| "loss": 2.8019, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.019399692559541635, | |
| "grad_norm": 0.19859890639781952, | |
| "learning_rate": 0.00037389226451686763, | |
| "loss": 2.8036, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.019457088099658622, | |
| "grad_norm": 0.1632896512746811, | |
| "learning_rate": 0.00037306510838911404, | |
| "loss": 2.797, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.01951448363977561, | |
| "grad_norm": 0.17494754493236542, | |
| "learning_rate": 0.00037224111475342116, | |
| "loss": 2.8152, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.019571879179892594, | |
| "grad_norm": 0.20659732818603516, | |
| "learning_rate": 0.00037142031748094016, | |
| "loss": 2.8061, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.019629274720009582, | |
| "grad_norm": 0.18716713786125183, | |
| "learning_rate": 0.00037060275031143184, | |
| "loss": 2.8419, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.01968667026012657, | |
| "grad_norm": 0.2575749158859253, | |
| "learning_rate": 0.0003697884468518805, | |
| "loss": 2.7814, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.019744065800243557, | |
| "grad_norm": 0.19076134264469147, | |
| "learning_rate": 0.0003689774405751119, | |
| "loss": 2.797, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.019801461340360545, | |
| "grad_norm": 0.19563442468643188, | |
| "learning_rate": 0.00036816976481841764, | |
| "loss": 2.8269, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.019858856880477532, | |
| "grad_norm": 0.1790810525417328, | |
| "learning_rate": 0.0003673654527821846, | |
| "loss": 2.7856, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.019916252420594516, | |
| "grad_norm": 0.2125868797302246, | |
| "learning_rate": 0.00036656453752853025, | |
| "loss": 2.7973, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.019973647960711504, | |
| "grad_norm": 0.1454995572566986, | |
| "learning_rate": 0.00036576705197994376, | |
| "loss": 2.7869, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.02003104350082849, | |
| "grad_norm": 0.2808379530906677, | |
| "learning_rate": 0.00036497302891793255, | |
| "loss": 2.7923, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.02008843904094548, | |
| "grad_norm": 0.1776140034198761, | |
| "learning_rate": 0.0003641825009816745, | |
| "loss": 2.8194, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.020145834581062467, | |
| "grad_norm": 0.22207793593406677, | |
| "learning_rate": 0.0003633955006666771, | |
| "loss": 2.8234, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.020203230121179454, | |
| "grad_norm": 0.24642404913902283, | |
| "learning_rate": 0.0003626120603234406, | |
| "loss": 2.8351, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.020260625661296442, | |
| "grad_norm": 0.24731726944446564, | |
| "learning_rate": 0.000361832212156129, | |
| "loss": 2.7983, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.020318021201413426, | |
| "grad_norm": 0.21677981317043304, | |
| "learning_rate": 0.0003610559882212461, | |
| "loss": 2.8372, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.020375416741530414, | |
| "grad_norm": 0.28350090980529785, | |
| "learning_rate": 0.00036028342042631755, | |
| "loss": 2.8138, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.0204328122816474, | |
| "grad_norm": 0.22418756783008575, | |
| "learning_rate": 0.00035951454052857954, | |
| "loss": 2.7897, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.02049020782176439, | |
| "grad_norm": 0.27765804529190063, | |
| "learning_rate": 0.000358749380133673, | |
| "loss": 2.8139, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.020547603361881377, | |
| "grad_norm": 0.2694258391857147, | |
| "learning_rate": 0.000357987970694345, | |
| "loss": 2.7881, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.020604998901998364, | |
| "grad_norm": 0.3746117055416107, | |
| "learning_rate": 0.00035723034350915525, | |
| "loss": 2.8108, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.02066239444211535, | |
| "grad_norm": 0.22864773869514465, | |
| "learning_rate": 0.00035647652972119, | |
| "loss": 2.8102, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.020719789982232336, | |
| "grad_norm": 0.2728801369667053, | |
| "learning_rate": 0.0003557265603167814, | |
| "loss": 2.8046, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.020777185522349324, | |
| "grad_norm": 0.2561710774898529, | |
| "learning_rate": 0.0003549804661242345, | |
| "loss": 2.8242, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.02083458106246631, | |
| "grad_norm": 0.26235631108283997, | |
| "learning_rate": 0.00035423827781255914, | |
| "loss": 2.847, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.0208919766025833, | |
| "grad_norm": 0.24725806713104248, | |
| "learning_rate": 0.0003535000258902099, | |
| "loss": 2.7873, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.020949372142700286, | |
| "grad_norm": 0.2562279999256134, | |
| "learning_rate": 0.0003527657407038317, | |
| "loss": 2.799, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.021006767682817274, | |
| "grad_norm": 0.20368199050426483, | |
| "learning_rate": 0.00035203545243701266, | |
| "loss": 2.8011, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.021064163222934258, | |
| "grad_norm": 0.25594958662986755, | |
| "learning_rate": 0.0003513091911090431, | |
| "loss": 2.8099, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.021121558763051246, | |
| "grad_norm": 0.20084761083126068, | |
| "learning_rate": 0.00035058698657368154, | |
| "loss": 2.8249, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.021178954303168233, | |
| "grad_norm": 0.24110020697116852, | |
| "learning_rate": 0.00034986886851792775, | |
| "loss": 2.8058, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.02123634984328522, | |
| "grad_norm": 0.2016633003950119, | |
| "learning_rate": 0.0003491548664608024, | |
| "loss": 2.7935, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.02129374538340221, | |
| "grad_norm": 0.2722468376159668, | |
| "learning_rate": 0.0003484450097521336, | |
| "loss": 2.8146, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.021351140923519196, | |
| "grad_norm": 0.2089434564113617, | |
| "learning_rate": 0.0003477393275713501, | |
| "loss": 2.8231, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.021408536463636184, | |
| "grad_norm": 0.24770453572273254, | |
| "learning_rate": 0.0003470378489262824, | |
| "loss": 2.7994, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.021465932003753168, | |
| "grad_norm": 0.21104897558689117, | |
| "learning_rate": 0.00034634060265197026, | |
| "loss": 2.8189, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.021523327543870156, | |
| "grad_norm": 0.23374824225902557, | |
| "learning_rate": 0.000345647617409477, | |
| "loss": 2.783, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.021580723083987143, | |
| "grad_norm": 0.24334168434143066, | |
| "learning_rate": 0.00034495892168471176, | |
| "loss": 2.8092, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.02163811862410413, | |
| "grad_norm": 0.22772932052612305, | |
| "learning_rate": 0.00034427454378725827, | |
| "loss": 2.8178, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.02169551416422112, | |
| "grad_norm": 0.22545067965984344, | |
| "learning_rate": 0.00034359451184921125, | |
| "loss": 2.7961, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.021752909704338106, | |
| "grad_norm": 0.2873929738998413, | |
| "learning_rate": 0.00034291885382402044, | |
| "loss": 2.8408, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.02181030524445509, | |
| "grad_norm": 0.2099824994802475, | |
| "learning_rate": 0.00034224759748534083, | |
| "loss": 2.782, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.021867700784572078, | |
| "grad_norm": 0.32221996784210205, | |
| "learning_rate": 0.0003415807704258913, | |
| "loss": 2.8337, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.021925096324689065, | |
| "grad_norm": 0.2531490623950958, | |
| "learning_rate": 0.0003409184000563204, | |
| "loss": 2.8273, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.021982491864806053, | |
| "grad_norm": 0.3075484037399292, | |
| "learning_rate": 0.00034026051360407973, | |
| "loss": 2.7805, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.02203988740492304, | |
| "grad_norm": 0.2366313338279724, | |
| "learning_rate": 0.0003396071381123047, | |
| "loss": 2.8278, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.022097282945040028, | |
| "grad_norm": 0.2348204106092453, | |
| "learning_rate": 0.00033895830043870266, | |
| "loss": 2.7922, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.022154678485157016, | |
| "grad_norm": 0.28124627470970154, | |
| "learning_rate": 0.00033831402725444896, | |
| "loss": 2.8065, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.022212074025274, | |
| "grad_norm": 0.1927008032798767, | |
| "learning_rate": 0.0003376743450430907, | |
| "loss": 2.7958, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.022269469565390988, | |
| "grad_norm": 0.26325997710227966, | |
| "learning_rate": 0.0003370392800994583, | |
| "loss": 2.8313, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.022326865105507975, | |
| "grad_norm": 0.23394963145256042, | |
| "learning_rate": 0.0003364088585285842, | |
| "loss": 2.8126, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.022384260645624963, | |
| "grad_norm": 0.26055994629859924, | |
| "learning_rate": 0.00033578310624462983, | |
| "loss": 2.787, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.02244165618574195, | |
| "grad_norm": 0.2207145392894745, | |
| "learning_rate": 0.0003351620489698208, | |
| "loss": 2.796, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.022499051725858938, | |
| "grad_norm": 0.34231698513031006, | |
| "learning_rate": 0.0003345457122333891, | |
| "loss": 2.7951, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.022556447265975922, | |
| "grad_norm": 0.22361671924591064, | |
| "learning_rate": 0.00033393412137052396, | |
| "loss": 2.8251, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.02261384280609291, | |
| "grad_norm": 0.24573372304439545, | |
| "learning_rate": 0.0003333273015213304, | |
| "loss": 2.7899, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.022671238346209897, | |
| "grad_norm": 0.22109688818454742, | |
| "learning_rate": 0.0003327252776297955, | |
| "loss": 2.8178, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.022728633886326885, | |
| "grad_norm": 0.22289875149726868, | |
| "learning_rate": 0.00033212807444276364, | |
| "loss": 2.8053, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.022786029426443873, | |
| "grad_norm": 0.21445147693157196, | |
| "learning_rate": 0.00033153571650891865, | |
| "loss": 2.7998, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.02284342496656086, | |
| "grad_norm": 0.25061139464378357, | |
| "learning_rate": 0.00033094822817777514, | |
| "loss": 2.8055, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.022900820506677848, | |
| "grad_norm": 0.24680854380130768, | |
| "learning_rate": 0.0003303656335986773, | |
| "loss": 2.8143, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.022958216046794832, | |
| "grad_norm": 0.16644932329654694, | |
| "learning_rate": 0.0003297879567198065, | |
| "loss": 2.8192, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.022958216046794832, | |
| "eval_loss": 2.738191604614258, | |
| "eval_runtime": 85.3252, | |
| "eval_samples_per_second": 50.571, | |
| "eval_steps_per_second": 12.646, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.02301561158691182, | |
| "grad_norm": 0.2816384434700012, | |
| "learning_rate": 0.00032921522128719657, | |
| "loss": 2.8209, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.023073007127028807, | |
| "grad_norm": 0.20395685732364655, | |
| "learning_rate": 0.00032864745084375783, | |
| "loss": 2.8021, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.023130402667145795, | |
| "grad_norm": 0.24216794967651367, | |
| "learning_rate": 0.00032808466872830957, | |
| "loss": 2.8447, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.023187798207262782, | |
| "grad_norm": 0.2526738941669464, | |
| "learning_rate": 0.00032752689807462017, | |
| "loss": 2.7906, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.02324519374737977, | |
| "grad_norm": 0.21725283563137054, | |
| "learning_rate": 0.0003269741618104566, | |
| "loss": 2.7943, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.023302589287496758, | |
| "grad_norm": 0.2765718102455139, | |
| "learning_rate": 0.00032642648265664175, | |
| "loss": 2.8109, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.02335998482761374, | |
| "grad_norm": 0.20015880465507507, | |
| "learning_rate": 0.00032588388312612053, | |
| "loss": 2.8239, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.02341738036773073, | |
| "grad_norm": 0.26865240931510925, | |
| "learning_rate": 0.0003253463855230344, | |
| "loss": 2.8279, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.023474775907847717, | |
| "grad_norm": 0.23522211611270905, | |
| "learning_rate": 0.0003248140119418046, | |
| "loss": 2.8123, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.023532171447964705, | |
| "grad_norm": 0.2388644963502884, | |
| "learning_rate": 0.0003242867842662239, | |
| "loss": 2.8057, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.023589566988081692, | |
| "grad_norm": 0.18323197960853577, | |
| "learning_rate": 0.00032376472416855703, | |
| "loss": 2.8193, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.02364696252819868, | |
| "grad_norm": 0.24734856188297272, | |
| "learning_rate": 0.00032324785310864983, | |
| "loss": 2.7924, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.023704358068315664, | |
| "grad_norm": 0.1722363829612732, | |
| "learning_rate": 0.0003227361923330471, | |
| "loss": 2.8242, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.02376175360843265, | |
| "grad_norm": 0.2052358090877533, | |
| "learning_rate": 0.00032222976287411934, | |
| "loss": 2.8129, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.02381914914854964, | |
| "grad_norm": 0.2536105811595917, | |
| "learning_rate": 0.00032172858554919807, | |
| "loss": 2.8207, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.023876544688666627, | |
| "grad_norm": 0.23084022104740143, | |
| "learning_rate": 0.00032123268095972005, | |
| "loss": 2.8156, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.023933940228783614, | |
| "grad_norm": 0.28741586208343506, | |
| "learning_rate": 0.00032074206949038073, | |
| "loss": 2.8008, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.023991335768900602, | |
| "grad_norm": 0.2419297993183136, | |
| "learning_rate": 0.0003202567713082959, | |
| "loss": 2.8112, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.02404873130901759, | |
| "grad_norm": 0.19744537770748138, | |
| "learning_rate": 0.0003197768063621732, | |
| "loss": 2.7894, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.024106126849134574, | |
| "grad_norm": 0.22780993580818176, | |
| "learning_rate": 0.0003193021943814916, | |
| "loss": 2.8019, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.02416352238925156, | |
| "grad_norm": 0.2176397144794464, | |
| "learning_rate": 0.00031883295487569063, | |
| "loss": 2.8183, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.02422091792936855, | |
| "grad_norm": 0.23891203105449677, | |
| "learning_rate": 0.00031836910713336857, | |
| "loss": 2.8022, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.024278313469485537, | |
| "grad_norm": 0.18507017195224762, | |
| "learning_rate": 0.0003179106702214893, | |
| "loss": 2.8013, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.024335709009602524, | |
| "grad_norm": 0.20408926904201508, | |
| "learning_rate": 0.0003174576629845987, | |
| "loss": 2.8085, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.024393104549719512, | |
| "grad_norm": 0.18055075407028198, | |
| "learning_rate": 0.00031701010404404996, | |
| "loss": 2.8341, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.024450500089836496, | |
| "grad_norm": 0.22974956035614014, | |
| "learning_rate": 0.0003165680117972382, | |
| "loss": 2.8044, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.024507895629953484, | |
| "grad_norm": 0.17688511312007904, | |
| "learning_rate": 0.00031613140441684413, | |
| "loss": 2.7866, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.02456529117007047, | |
| "grad_norm": 0.22350828349590302, | |
| "learning_rate": 0.000315700299850087, | |
| "loss": 2.7939, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.02462268671018746, | |
| "grad_norm": 0.2138863056898117, | |
| "learning_rate": 0.0003152747158179871, | |
| "loss": 2.8112, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.024680082250304446, | |
| "grad_norm": 0.1666262447834015, | |
| "learning_rate": 0.0003148546698146371, | |
| "loss": 2.8464, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.024737477790421434, | |
| "grad_norm": 0.23217864334583282, | |
| "learning_rate": 0.00031444017910648293, | |
| "loss": 2.8154, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.02479487333053842, | |
| "grad_norm": 0.23967209458351135, | |
| "learning_rate": 0.00031403126073161424, | |
| "loss": 2.8068, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.024852268870655406, | |
| "grad_norm": 0.2363416850566864, | |
| "learning_rate": 0.0003136279314990637, | |
| "loss": 2.832, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.024909664410772393, | |
| "grad_norm": 0.20204566419124603, | |
| "learning_rate": 0.00031323020798811643, | |
| "loss": 2.8118, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.02496705995088938, | |
| "grad_norm": 0.2645012438297272, | |
| "learning_rate": 0.00031283810654762816, | |
| "loss": 2.7988, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.02502445549100637, | |
| "grad_norm": 0.31096434593200684, | |
| "learning_rate": 0.0003124516432953532, | |
| "loss": 2.8021, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.025081851031123356, | |
| "grad_norm": 0.25740697979927063, | |
| "learning_rate": 0.00031207083411728236, | |
| "loss": 2.828, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.025139246571240344, | |
| "grad_norm": 0.24895477294921875, | |
| "learning_rate": 0.00031169569466698937, | |
| "loss": 2.8073, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.02519664211135733, | |
| "grad_norm": 0.2860502004623413, | |
| "learning_rate": 0.00031132624036498774, | |
| "loss": 2.8275, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.025254037651474315, | |
| "grad_norm": 0.3134096562862396, | |
| "learning_rate": 0.00031096248639809674, | |
| "loss": 2.816, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.025311433191591303, | |
| "grad_norm": 0.2185070812702179, | |
| "learning_rate": 0.0003106044477188172, | |
| "loss": 2.7799, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.02536882873170829, | |
| "grad_norm": 0.3582714796066284, | |
| "learning_rate": 0.0003102521390447169, | |
| "loss": 2.7923, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.02542622427182528, | |
| "grad_norm": 0.19494207203388214, | |
| "learning_rate": 0.00030990557485782553, | |
| "loss": 2.7999, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.025483619811942266, | |
| "grad_norm": 0.2574940025806427, | |
| "learning_rate": 0.0003095647694040394, | |
| "loss": 2.8087, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.025541015352059254, | |
| "grad_norm": 0.17501215636730194, | |
| "learning_rate": 0.0003092297366925359, | |
| "loss": 2.7817, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.025598410892176238, | |
| "grad_norm": 0.4073377251625061, | |
| "learning_rate": 0.0003089004904951976, | |
| "loss": 2.813, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.025655806432293225, | |
| "grad_norm": 0.21654489636421204, | |
| "learning_rate": 0.000308577044346046, | |
| "loss": 2.8165, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.025713201972410213, | |
| "grad_norm": 0.26500189304351807, | |
| "learning_rate": 0.0003082594115406856, | |
| "loss": 2.8229, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.0257705975125272, | |
| "grad_norm": 0.188262477517128, | |
| "learning_rate": 0.00030794760513575675, | |
| "loss": 2.8112, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.025827993052644188, | |
| "grad_norm": 0.3432970643043518, | |
| "learning_rate": 0.00030764163794839966, | |
| "loss": 2.8241, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.025885388592761176, | |
| "grad_norm": 0.23415225744247437, | |
| "learning_rate": 0.0003073415225557269, | |
| "loss": 2.8039, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.025942784132878163, | |
| "grad_norm": 0.2670385241508484, | |
| "learning_rate": 0.0003070472712943069, | |
| "loss": 2.8215, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.026000179672995147, | |
| "grad_norm": 0.17434735596179962, | |
| "learning_rate": 0.00030675889625965646, | |
| "loss": 2.8352, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.026057575213112135, | |
| "grad_norm": 0.2789264917373657, | |
| "learning_rate": 0.0003064764093057437, | |
| "loss": 2.7856, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.026114970753229123, | |
| "grad_norm": 0.2666022479534149, | |
| "learning_rate": 0.0003061998220445009, | |
| "loss": 2.8063, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.02617236629334611, | |
| "grad_norm": 0.22438260912895203, | |
| "learning_rate": 0.00030592914584534706, | |
| "loss": 2.7783, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.026229761833463098, | |
| "grad_norm": 0.2177169770002365, | |
| "learning_rate": 0.00030566439183472063, | |
| "loss": 2.786, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.026287157373580086, | |
| "grad_norm": 0.22771142423152924, | |
| "learning_rate": 0.000305405570895622, | |
| "loss": 2.7881, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.02634455291369707, | |
| "grad_norm": 0.29228097200393677, | |
| "learning_rate": 0.00030515269366716613, | |
| "loss": 2.7876, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.026401948453814057, | |
| "grad_norm": 0.18204721808433533, | |
| "learning_rate": 0.00030490577054414553, | |
| "loss": 2.8153, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.026459343993931045, | |
| "grad_norm": 0.19830970466136932, | |
| "learning_rate": 0.0003046648116766027, | |
| "loss": 2.7884, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.026516739534048032, | |
| "grad_norm": 0.17311398684978485, | |
| "learning_rate": 0.00030442982696941276, | |
| "loss": 2.8055, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.02657413507416502, | |
| "grad_norm": 0.21194536983966827, | |
| "learning_rate": 0.0003042008260818768, | |
| "loss": 2.815, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.026631530614282008, | |
| "grad_norm": 0.22366400063037872, | |
| "learning_rate": 0.0003039778184273243, | |
| "loss": 2.7994, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.026688926154398995, | |
| "grad_norm": 0.17785237729549408, | |
| "learning_rate": 0.00030376081317272645, | |
| "loss": 2.8049, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.02674632169451598, | |
| "grad_norm": 0.2285715490579605, | |
| "learning_rate": 0.00030354981923831934, | |
| "loss": 2.8105, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.026803717234632967, | |
| "grad_norm": 0.17985928058624268, | |
| "learning_rate": 0.0003033448452972373, | |
| "loss": 2.8246, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.026861112774749955, | |
| "grad_norm": 0.2026437669992447, | |
| "learning_rate": 0.000303145899775156, | |
| "loss": 2.8192, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.026918508314866942, | |
| "grad_norm": 0.2605213522911072, | |
| "learning_rate": 0.0003029529908499469, | |
| "loss": 2.826, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.02697590385498393, | |
| "grad_norm": 0.22592206299304962, | |
| "learning_rate": 0.00030276612645134017, | |
| "loss": 2.7987, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.027033299395100917, | |
| "grad_norm": 0.2988434433937073, | |
| "learning_rate": 0.0003025853142605994, | |
| "loss": 2.826, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.027090694935217905, | |
| "grad_norm": 0.2247052788734436, | |
| "learning_rate": 0.0003024105617102055, | |
| "loss": 2.815, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.02714809047533489, | |
| "grad_norm": 0.26565778255462646, | |
| "learning_rate": 0.00030224187598355145, | |
| "loss": 2.8283, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.027205486015451877, | |
| "grad_norm": 0.2834932804107666, | |
| "learning_rate": 0.00030207926401464675, | |
| "loss": 2.8088, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.027262881555568864, | |
| "grad_norm": 0.2396688312292099, | |
| "learning_rate": 0.0003019227324878324, | |
| "loss": 2.8024, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.027320277095685852, | |
| "grad_norm": 0.2600051760673523, | |
| "learning_rate": 0.0003017722878375066, | |
| "loss": 2.8258, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.02737767263580284, | |
| "grad_norm": 0.26368406414985657, | |
| "learning_rate": 0.00030162793624785957, | |
| "loss": 2.7875, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.027435068175919827, | |
| "grad_norm": 0.389852911233902, | |
| "learning_rate": 0.0003014896836526197, | |
| "loss": 2.8166, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.02749246371603681, | |
| "grad_norm": 0.23984675109386444, | |
| "learning_rate": 0.0003013575357348098, | |
| "loss": 2.8025, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.0275498592561538, | |
| "grad_norm": 0.24591901898384094, | |
| "learning_rate": 0.00030123149792651307, | |
| "loss": 2.7898, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.027607254796270787, | |
| "grad_norm": 0.24797213077545166, | |
| "learning_rate": 0.00030111157540865026, | |
| "loss": 2.8291, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.027664650336387774, | |
| "grad_norm": 0.2542579770088196, | |
| "learning_rate": 0.0003009977731107663, | |
| "loss": 2.7868, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.027722045876504762, | |
| "grad_norm": 0.21780452132225037, | |
| "learning_rate": 0.00030089009571082794, | |
| "loss": 2.8051, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.02777944141662175, | |
| "grad_norm": 0.2790198028087616, | |
| "learning_rate": 0.0003007885476350314, | |
| "loss": 2.8004, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.027836836956738737, | |
| "grad_norm": 0.2793212831020355, | |
| "learning_rate": 0.00030069313305762025, | |
| "loss": 2.8077, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.02789423249685572, | |
| "grad_norm": 0.2663847506046295, | |
| "learning_rate": 0.0003006038559007141, | |
| "loss": 2.805, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.02795162803697271, | |
| "grad_norm": 0.2695571482181549, | |
| "learning_rate": 0.0003005207198341473, | |
| "loss": 2.8102, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.028009023577089696, | |
| "grad_norm": 0.3027716875076294, | |
| "learning_rate": 0.0003004437282753177, | |
| "loss": 2.7944, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.028066419117206684, | |
| "grad_norm": 0.25220444798469543, | |
| "learning_rate": 0.0003003728843890469, | |
| "loss": 2.781, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.02812381465732367, | |
| "grad_norm": 0.2733742594718933, | |
| "learning_rate": 0.0003003081910874495, | |
| "loss": 2.8138, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.02818121019744066, | |
| "grad_norm": 0.23873530328273773, | |
| "learning_rate": 0.00030024965102981387, | |
| "loss": 2.8017, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.028238605737557643, | |
| "grad_norm": 0.29158100485801697, | |
| "learning_rate": 0.0003001972666224923, | |
| "loss": 2.8084, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.02829600127767463, | |
| "grad_norm": 0.3079324960708618, | |
| "learning_rate": 0.00030015104001880274, | |
| "loss": 2.8061, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.02835339681779162, | |
| "grad_norm": 0.2448122203350067, | |
| "learning_rate": 0.00030011097311893984, | |
| "loss": 2.7817, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.028410792357908606, | |
| "grad_norm": 0.3495275378227234, | |
| "learning_rate": 0.00030007706756989683, | |
| "loss": 2.8053, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.028468187898025594, | |
| "grad_norm": 0.19935691356658936, | |
| "learning_rate": 0.000300049324765398, | |
| "loss": 2.7985, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.02852558343814258, | |
| "grad_norm": 0.30157798528671265, | |
| "learning_rate": 0.0003000277458458415, | |
| "loss": 2.8271, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.02858297897825957, | |
| "grad_norm": 0.23343823850154877, | |
| "learning_rate": 0.00030001233169825214, | |
| "loss": 2.807, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.028640374518376553, | |
| "grad_norm": 0.25404173135757446, | |
| "learning_rate": 0.0003000030829562451, | |
| "loss": 2.8072, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.02869777005849354, | |
| "grad_norm": 0.28863540291786194, | |
| "learning_rate": 0.0003, | |
| "loss": 2.8088, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.02869777005849354, | |
| "eval_loss": 2.735079288482666, | |
| "eval_runtime": 85.4355, | |
| "eval_samples_per_second": 50.506, | |
| "eval_steps_per_second": 12.629, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 150, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.69922551431168e+17, | |
| "train_batch_size": 22, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |