| { | |
| "best_metric": 0.540250366102693, | |
| "best_model_checkpoint": "/data0/checkpoints/Qwen2.5-Math-7B-ScalePRM-v3.0/checkpoint-600", | |
| "epoch": 0.757934628138323, | |
| "eval_steps": 100, | |
| "global_step": 600, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0012632243802305385, | |
| "grad_norm": 2.429877281188965, | |
| "learning_rate": 7e-06, | |
| "loss": 1.1465, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.002526448760461077, | |
| "grad_norm": 2.2121567726135254, | |
| "learning_rate": 7e-06, | |
| "loss": 1.0451, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0037896731406916154, | |
| "grad_norm": 1.3855836391448975, | |
| "learning_rate": 7e-06, | |
| "loss": 0.735, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.005052897520922154, | |
| "grad_norm": 0.3500981330871582, | |
| "learning_rate": 7e-06, | |
| "loss": 0.5603, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0063161219011526925, | |
| "grad_norm": 0.40845438838005066, | |
| "learning_rate": 7e-06, | |
| "loss": 1.0627, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.007579346281383231, | |
| "grad_norm": 0.9035907983779907, | |
| "learning_rate": 7e-06, | |
| "loss": 5.0569, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.00884257066161377, | |
| "grad_norm": 0.6223624348640442, | |
| "learning_rate": 7e-06, | |
| "loss": 9.4706, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.010105795041844308, | |
| "grad_norm": 0.5229220390319824, | |
| "learning_rate": 7e-06, | |
| "loss": 5.6685, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.011369019422074847, | |
| "grad_norm": 0.5141741633415222, | |
| "learning_rate": 7e-06, | |
| "loss": 5.8831, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.012632243802305385, | |
| "grad_norm": 0.6420879364013672, | |
| "learning_rate": 7e-06, | |
| "loss": 4.6915, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.013895468182535923, | |
| "grad_norm": 0.47964179515838623, | |
| "learning_rate": 7e-06, | |
| "loss": 3.9531, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.015158692562766462, | |
| "grad_norm": 0.40398040413856506, | |
| "learning_rate": 7e-06, | |
| "loss": 1.9501, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.016421916942996998, | |
| "grad_norm": 0.5141711235046387, | |
| "learning_rate": 7e-06, | |
| "loss": 1.9378, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.01768514132322754, | |
| "grad_norm": 0.24602794647216797, | |
| "learning_rate": 7e-06, | |
| "loss": 1.535, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.018948365703458078, | |
| "grad_norm": 0.21208855509757996, | |
| "learning_rate": 7e-06, | |
| "loss": 1.5177, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.020211590083688617, | |
| "grad_norm": 0.6067216992378235, | |
| "learning_rate": 7e-06, | |
| "loss": 0.9104, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.021474814463919155, | |
| "grad_norm": 0.422442227602005, | |
| "learning_rate": 7e-06, | |
| "loss": 0.9266, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.022738038844149693, | |
| "grad_norm": 0.659572958946228, | |
| "learning_rate": 7e-06, | |
| "loss": 0.7499, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.02400126322438023, | |
| "grad_norm": 0.4817348122596741, | |
| "learning_rate": 7e-06, | |
| "loss": 0.7436, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.02526448760461077, | |
| "grad_norm": 0.20682591199874878, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4786, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.026527711984841308, | |
| "grad_norm": 0.2613360583782196, | |
| "learning_rate": 7e-06, | |
| "loss": 0.8464, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.027790936365071846, | |
| "grad_norm": 0.2720305621623993, | |
| "learning_rate": 7e-06, | |
| "loss": 0.7255, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.029054160745302385, | |
| "grad_norm": 0.25043392181396484, | |
| "learning_rate": 7e-06, | |
| "loss": 0.9661, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.030317385125532923, | |
| "grad_norm": 0.2801963686943054, | |
| "learning_rate": 7e-06, | |
| "loss": 0.9602, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.03158060950576346, | |
| "grad_norm": 0.2137051522731781, | |
| "learning_rate": 7e-06, | |
| "loss": 0.8132, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.032843833885993996, | |
| "grad_norm": 0.13553065061569214, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4873, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.03410705826622454, | |
| "grad_norm": 0.1350618302822113, | |
| "learning_rate": 7e-06, | |
| "loss": 0.5186, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.03537028264645508, | |
| "grad_norm": 0.1236298605799675, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3401, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.036633507026685615, | |
| "grad_norm": 0.15515856444835663, | |
| "learning_rate": 7e-06, | |
| "loss": 1.2493, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.037896731406916156, | |
| "grad_norm": 0.09012973308563232, | |
| "learning_rate": 7e-06, | |
| "loss": 0.7651, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03915995578714669, | |
| "grad_norm": 0.14378102123737335, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4005, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.04042318016737723, | |
| "grad_norm": 0.2546883523464203, | |
| "learning_rate": 7e-06, | |
| "loss": 0.5304, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.04168640454760777, | |
| "grad_norm": 0.1023496687412262, | |
| "learning_rate": 7e-06, | |
| "loss": 0.493, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.04294962892783831, | |
| "grad_norm": 0.1719491183757782, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3707, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.044212853308068845, | |
| "grad_norm": 0.08337250351905823, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3143, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.045476077688299386, | |
| "grad_norm": 0.09040359407663345, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3942, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.04673930206852992, | |
| "grad_norm": 0.10850965231657028, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4453, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.04800252644876046, | |
| "grad_norm": 0.08887636661529541, | |
| "learning_rate": 7e-06, | |
| "loss": 1.2015, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.049265750828991, | |
| "grad_norm": 0.1864442229270935, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3895, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.05052897520922154, | |
| "grad_norm": 0.175123393535614, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3626, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.051792199589452075, | |
| "grad_norm": 0.10572918504476547, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3335, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.053055423969682616, | |
| "grad_norm": 0.09624486416578293, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3065, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.05431864834991315, | |
| "grad_norm": 0.13604743778705597, | |
| "learning_rate": 7e-06, | |
| "loss": 0.317, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.05558187273014369, | |
| "grad_norm": 0.15408551692962646, | |
| "learning_rate": 7e-06, | |
| "loss": 0.7709, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.05684509711037423, | |
| "grad_norm": 0.09676961600780487, | |
| "learning_rate": 7e-06, | |
| "loss": 0.5114, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.05810832149060477, | |
| "grad_norm": 0.11936207115650177, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2785, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.059371545870835304, | |
| "grad_norm": 0.1744876503944397, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2689, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.060634770251065846, | |
| "grad_norm": 0.17397810518741608, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3316, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.06189799463129638, | |
| "grad_norm": 0.1329212635755539, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2853, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.06316121901152692, | |
| "grad_norm": 0.09555013477802277, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3182, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.06442444339175746, | |
| "grad_norm": 0.15529152750968933, | |
| "learning_rate": 7e-06, | |
| "loss": 0.5497, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.06568766777198799, | |
| "grad_norm": 0.09599810838699341, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3102, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.06695089215221854, | |
| "grad_norm": 0.12325876951217651, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2788, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.06821411653244908, | |
| "grad_norm": 0.2820286154747009, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3934, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.06947734091267961, | |
| "grad_norm": 0.17912541329860687, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3709, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.07074056529291016, | |
| "grad_norm": 0.14083553850650787, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4105, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.0720037896731407, | |
| "grad_norm": 0.09743569046258926, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2376, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.07326701405337123, | |
| "grad_norm": 0.10704771429300308, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2714, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.07453023843360176, | |
| "grad_norm": 0.11463718861341476, | |
| "learning_rate": 7e-06, | |
| "loss": 0.456, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.07579346281383231, | |
| "grad_norm": 0.12085901200771332, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3099, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.07705668719406285, | |
| "grad_norm": 0.10744248330593109, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2312, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.07831991157429338, | |
| "grad_norm": 0.08374691009521484, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2685, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.07958313595452392, | |
| "grad_norm": 0.10826320946216583, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3069, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.08084636033475447, | |
| "grad_norm": 0.17864489555358887, | |
| "learning_rate": 7e-06, | |
| "loss": 0.5024, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.082109584714985, | |
| "grad_norm": 0.11988472938537598, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3164, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.08337280909521554, | |
| "grad_norm": 0.1612488180398941, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3295, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.08463603347544607, | |
| "grad_norm": 0.13754408061504364, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2406, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.08589925785567662, | |
| "grad_norm": 0.11351214349269867, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3149, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.08716248223590715, | |
| "grad_norm": 0.07585523277521133, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2706, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.08842570661613769, | |
| "grad_norm": 0.0744984969496727, | |
| "learning_rate": 7e-06, | |
| "loss": 0.214, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.08968893099636822, | |
| "grad_norm": 0.2244742512702942, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4242, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.09095215537659877, | |
| "grad_norm": 0.08662209659814835, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2691, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.09221537975682931, | |
| "grad_norm": 0.10564761608839035, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3228, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.09347860413705984, | |
| "grad_norm": 0.18067984282970428, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3384, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.09474182851729038, | |
| "grad_norm": 0.07762212306261063, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2377, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.09600505289752093, | |
| "grad_norm": 0.07793518900871277, | |
| "learning_rate": 7e-06, | |
| "loss": 0.248, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.09726827727775146, | |
| "grad_norm": 0.1307854801416397, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2667, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.098531501657982, | |
| "grad_norm": 0.09771443158388138, | |
| "learning_rate": 7e-06, | |
| "loss": 0.318, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.09979472603821253, | |
| "grad_norm": 0.10437527298927307, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3303, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.10105795041844308, | |
| "grad_norm": 0.11160580813884735, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2845, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.10232117479867361, | |
| "grad_norm": 0.0809980109333992, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2736, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.10358439917890415, | |
| "grad_norm": 0.10574865341186523, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3012, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.1048476235591347, | |
| "grad_norm": 0.07807318866252899, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2283, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.10611084793936523, | |
| "grad_norm": 0.10281991213560104, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2544, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.10737407231959577, | |
| "grad_norm": 0.12749870121479034, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2973, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.1086372966998263, | |
| "grad_norm": 0.13138003647327423, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3399, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.10990052108005685, | |
| "grad_norm": 0.10815514624118805, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3221, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.11116374546028739, | |
| "grad_norm": 0.13537508249282837, | |
| "learning_rate": 7e-06, | |
| "loss": 0.308, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.11242696984051792, | |
| "grad_norm": 0.09689060598611832, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2511, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.11369019422074846, | |
| "grad_norm": 0.08782925456762314, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2936, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.114953418600979, | |
| "grad_norm": 0.12655287981033325, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4158, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.11621664298120954, | |
| "grad_norm": 0.11866717785596848, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4059, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.11747986736144007, | |
| "grad_norm": 0.12691305577754974, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2453, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.11874309174167061, | |
| "grad_norm": 0.11844722181558609, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2249, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.12000631612190116, | |
| "grad_norm": 0.07606595754623413, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2789, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.12126954050213169, | |
| "grad_norm": 0.11529266834259033, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2654, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.12253276488236223, | |
| "grad_norm": 0.12648285925388336, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2279, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.12379598926259276, | |
| "grad_norm": 0.1504458636045456, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4048, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.1250592136428233, | |
| "grad_norm": 0.09578829258680344, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2912, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.12632243802305385, | |
| "grad_norm": 0.10936733335256577, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2644, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.12632243802305385, | |
| "eval_correct_accuracy": 0.5708227311280747, | |
| "eval_error_accuracy": 0.45610085547050877, | |
| "eval_f1": 0.5070537660000148, | |
| "eval_loss": 0.43133699893951416, | |
| "eval_runtime": 35.3366, | |
| "eval_samples_per_second": 96.217, | |
| "eval_steps_per_second": 6.028, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1275856624032844, | |
| "grad_norm": 0.1165054589509964, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2693, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.12884888678351492, | |
| "grad_norm": 0.08343573659658432, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2388, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.13011211116374546, | |
| "grad_norm": 0.10629656910896301, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2603, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.13137533554397599, | |
| "grad_norm": 0.07509850710630417, | |
| "learning_rate": 7e-06, | |
| "loss": 0.253, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.13263855992420653, | |
| "grad_norm": 0.08039335906505585, | |
| "learning_rate": 7e-06, | |
| "loss": 0.224, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.13390178430443708, | |
| "grad_norm": 0.10666981339454651, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3945, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.1351650086846676, | |
| "grad_norm": 0.16490086913108826, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3087, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.13642823306489815, | |
| "grad_norm": 0.09013114124536514, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3355, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.1376914574451287, | |
| "grad_norm": 0.1580226719379425, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2433, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.13895468182535922, | |
| "grad_norm": 0.09130299836397171, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1928, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.14021790620558977, | |
| "grad_norm": 0.07702811807394028, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2319, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.14148113058582032, | |
| "grad_norm": 0.14257381856441498, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2496, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.14274435496605084, | |
| "grad_norm": 0.11546823382377625, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2592, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.1440075793462814, | |
| "grad_norm": 0.12595829367637634, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2539, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.1452708037265119, | |
| "grad_norm": 0.10172153264284134, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2728, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.14653402810674246, | |
| "grad_norm": 0.10145121812820435, | |
| "learning_rate": 7e-06, | |
| "loss": 0.163, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.147797252486973, | |
| "grad_norm": 0.15631917119026184, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2526, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.14906047686720353, | |
| "grad_norm": 0.13442394137382507, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2591, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.15032370124743408, | |
| "grad_norm": 0.08642445504665375, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2505, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.15158692562766463, | |
| "grad_norm": 0.13054709136486053, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2704, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.15285015000789515, | |
| "grad_norm": 0.19653519988059998, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2384, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.1541133743881257, | |
| "grad_norm": 0.20973946154117584, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2385, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.15537659876835622, | |
| "grad_norm": 0.096860371530056, | |
| "learning_rate": 7e-06, | |
| "loss": 0.241, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.15663982314858677, | |
| "grad_norm": 0.10356521606445312, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4737, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.15790304752881731, | |
| "grad_norm": 0.17340725660324097, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2673, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.15916627190904784, | |
| "grad_norm": 0.3000679612159729, | |
| "learning_rate": 7e-06, | |
| "loss": 0.431, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.16042949628927838, | |
| "grad_norm": 0.11215244233608246, | |
| "learning_rate": 7e-06, | |
| "loss": 0.206, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.16169272066950893, | |
| "grad_norm": 0.07078877836465836, | |
| "learning_rate": 7e-06, | |
| "loss": 0.201, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.16295594504973945, | |
| "grad_norm": 0.16037459671497345, | |
| "learning_rate": 7e-06, | |
| "loss": 0.268, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.16421916942997, | |
| "grad_norm": 0.20243118703365326, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2503, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.16548239381020052, | |
| "grad_norm": 0.1389663964509964, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2517, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.16674561819043107, | |
| "grad_norm": 0.12263572961091995, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2359, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.16800884257066162, | |
| "grad_norm": 0.14491412043571472, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3347, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.16927206695089214, | |
| "grad_norm": 0.1378932148218155, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2638, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.1705352913311227, | |
| "grad_norm": 0.07053989171981812, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2299, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.17179851571135324, | |
| "grad_norm": 0.19610151648521423, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2789, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.17306174009158376, | |
| "grad_norm": 0.1290581375360489, | |
| "learning_rate": 7e-06, | |
| "loss": 0.209, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.1743249644718143, | |
| "grad_norm": 0.1481819599866867, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2723, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.17558818885204486, | |
| "grad_norm": 0.1427401453256607, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2778, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.17685141323227538, | |
| "grad_norm": 0.0666273981332779, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2008, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.17811463761250593, | |
| "grad_norm": 0.13182522356510162, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2441, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.17937786199273645, | |
| "grad_norm": 0.08374546468257904, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2603, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.180641086372967, | |
| "grad_norm": 0.10638394951820374, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2354, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.18190431075319755, | |
| "grad_norm": 0.10801179707050323, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2875, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.18316753513342807, | |
| "grad_norm": 0.13121351599693298, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2304, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.18443075951365862, | |
| "grad_norm": 0.10176476836204529, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2311, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.18569398389388916, | |
| "grad_norm": 0.10199464112520218, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2522, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.18695720827411969, | |
| "grad_norm": 0.09650130569934845, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2351, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.18822043265435023, | |
| "grad_norm": 0.12842021882534027, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2244, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.18948365703458075, | |
| "grad_norm": 0.1237226277589798, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2706, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.1907468814148113, | |
| "grad_norm": 0.12939125299453735, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2445, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.19201010579504185, | |
| "grad_norm": 0.11460690945386887, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2601, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.19327333017527237, | |
| "grad_norm": 0.18108275532722473, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3465, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.19453655455550292, | |
| "grad_norm": 0.0727877989411354, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1878, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.19579977893573347, | |
| "grad_norm": 0.12313497066497803, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2311, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.197063003315964, | |
| "grad_norm": 0.1377153992652893, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2573, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.19832622769619454, | |
| "grad_norm": 0.08758647739887238, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2156, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.19958945207642506, | |
| "grad_norm": 0.11441980302333832, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2801, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.2008526764566556, | |
| "grad_norm": 0.12151770293712616, | |
| "learning_rate": 7e-06, | |
| "loss": 0.242, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.20211590083688616, | |
| "grad_norm": 0.159256711602211, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2612, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.20337912521711668, | |
| "grad_norm": 0.08577941358089447, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2115, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.20464234959734723, | |
| "grad_norm": 0.1190810427069664, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2434, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.20590557397757778, | |
| "grad_norm": 0.09624910354614258, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2438, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.2071687983578083, | |
| "grad_norm": 0.16024184226989746, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2088, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.20843202273803885, | |
| "grad_norm": 0.1891951858997345, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2751, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.2096952471182694, | |
| "grad_norm": 0.08837898820638657, | |
| "learning_rate": 7e-06, | |
| "loss": 0.212, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.21095847149849992, | |
| "grad_norm": 0.0905027762055397, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2189, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.21222169587873047, | |
| "grad_norm": 0.07917249947786331, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2324, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.213484920258961, | |
| "grad_norm": 0.13524577021598816, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2143, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.21474814463919153, | |
| "grad_norm": 0.13222923874855042, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2983, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.21601136901942208, | |
| "grad_norm": 0.1525893360376358, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2408, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.2172745933996526, | |
| "grad_norm": 0.08309401571750641, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1722, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.21853781777988315, | |
| "grad_norm": 0.08370368182659149, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1981, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.2198010421601137, | |
| "grad_norm": 0.11228370666503906, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2336, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.22106426654034422, | |
| "grad_norm": 0.19010692834854126, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3069, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.22232749092057477, | |
| "grad_norm": 0.08182361721992493, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2549, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.2235907153008053, | |
| "grad_norm": 0.1046992763876915, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2458, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.22485393968103584, | |
| "grad_norm": 0.11583778262138367, | |
| "learning_rate": 7e-06, | |
| "loss": 0.269, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.2261171640612664, | |
| "grad_norm": 0.07805290818214417, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1784, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.2273803884414969, | |
| "grad_norm": 0.11022092401981354, | |
| "learning_rate": 7e-06, | |
| "loss": 0.232, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.22864361282172746, | |
| "grad_norm": 0.1311209499835968, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2603, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.229906837201958, | |
| "grad_norm": 0.08558022975921631, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2524, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.23117006158218853, | |
| "grad_norm": 0.0957944467663765, | |
| "learning_rate": 7e-06, | |
| "loss": 0.281, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.23243328596241908, | |
| "grad_norm": 0.086683489382267, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2112, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.2336965103426496, | |
| "grad_norm": 0.09485982358455658, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2146, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.23495973472288015, | |
| "grad_norm": 0.14843790233135223, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2036, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.2362229591031107, | |
| "grad_norm": 0.09375383704900742, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2386, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.23748618348334122, | |
| "grad_norm": 0.10639740526676178, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2202, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.23874940786357177, | |
| "grad_norm": 0.10205169022083282, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2297, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.24001263224380232, | |
| "grad_norm": 0.1138874888420105, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2511, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.24127585662403284, | |
| "grad_norm": 0.12742598354816437, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2247, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.24253908100426338, | |
| "grad_norm": 0.14605408906936646, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2366, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.24380230538449393, | |
| "grad_norm": 0.10053393989801407, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4711, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.24506552976472445, | |
| "grad_norm": 0.08829181641340256, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1501, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.246328754144955, | |
| "grad_norm": 0.1484231799840927, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2063, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.24759197852518552, | |
| "grad_norm": 0.17242765426635742, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2317, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.24885520290541607, | |
| "grad_norm": 0.12016981095075607, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2272, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.2501184272856466, | |
| "grad_norm": 0.1021333634853363, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2402, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.25138165166587717, | |
| "grad_norm": 0.11179149895906448, | |
| "learning_rate": 7e-06, | |
| "loss": 0.246, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.2526448760461077, | |
| "grad_norm": 0.10811345279216766, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2125, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2526448760461077, | |
| "eval_correct_accuracy": 0.5988125530110263, | |
| "eval_error_accuracy": 0.4524988743809095, | |
| "eval_f1": 0.5154742907624302, | |
| "eval_loss": 0.4177984297275543, | |
| "eval_runtime": 35.0506, | |
| "eval_samples_per_second": 97.003, | |
| "eval_steps_per_second": 6.077, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2539081004263382, | |
| "grad_norm": 0.12190552800893784, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2372, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.2551713248065688, | |
| "grad_norm": 0.07629604637622833, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1976, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.2564345491867993, | |
| "grad_norm": 0.10825781524181366, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2169, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.25769777356702983, | |
| "grad_norm": 0.09181591868400574, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2225, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.2589609979472604, | |
| "grad_norm": 0.1266108900308609, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1858, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.26022422232749093, | |
| "grad_norm": 0.11106186360120773, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2443, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.26148744670772145, | |
| "grad_norm": 0.11874532699584961, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2224, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.26275067108795197, | |
| "grad_norm": 0.06901393085718155, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1683, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.26401389546818255, | |
| "grad_norm": 0.1774539351463318, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2588, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.26527711984841307, | |
| "grad_norm": 0.06564710289239883, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1966, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.2665403442286436, | |
| "grad_norm": 0.1348266899585724, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2094, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.26780356860887417, | |
| "grad_norm": 0.10280844569206238, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2208, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.2690667929891047, | |
| "grad_norm": 0.09777519851922989, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2259, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.2703300173693352, | |
| "grad_norm": 0.11480893194675446, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2402, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.2715932417495658, | |
| "grad_norm": 0.17719541490077972, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2692, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.2728564661297963, | |
| "grad_norm": 0.07069459557533264, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1781, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.2741196905100268, | |
| "grad_norm": 0.06251855194568634, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1819, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.2753829148902574, | |
| "grad_norm": 0.1753867119550705, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2362, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.2766461392704879, | |
| "grad_norm": 0.1843274086713791, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2638, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.27790936365071844, | |
| "grad_norm": 0.18026292324066162, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2274, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.279172588030949, | |
| "grad_norm": 0.0640600174665451, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3739, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.28043581241117954, | |
| "grad_norm": 0.091743104159832, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2274, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.28169903679141006, | |
| "grad_norm": 0.10185891389846802, | |
| "learning_rate": 7e-06, | |
| "loss": 0.471, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.28296226117164064, | |
| "grad_norm": 0.08672218769788742, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2171, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.28422548555187116, | |
| "grad_norm": 0.11758771538734436, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2211, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.2854887099321017, | |
| "grad_norm": 0.07176447659730911, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1967, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.2867519343123322, | |
| "grad_norm": 0.1037454828619957, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2457, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.2880151586925628, | |
| "grad_norm": 0.07262658327817917, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2026, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.2892783830727933, | |
| "grad_norm": 0.13171784579753876, | |
| "learning_rate": 7e-06, | |
| "loss": 0.209, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.2905416074530238, | |
| "grad_norm": 0.08208411931991577, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1964, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2918048318332544, | |
| "grad_norm": 0.10370495170354843, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2202, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.2930680562134849, | |
| "grad_norm": 0.26831239461898804, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2651, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.29433128059371544, | |
| "grad_norm": 0.12230344116687775, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2265, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.295594504973946, | |
| "grad_norm": 0.08064734190702438, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1711, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.29685772935417654, | |
| "grad_norm": 0.10691053420305252, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1753, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.29812095373440706, | |
| "grad_norm": 0.08961788564920425, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2682, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.29938417811463763, | |
| "grad_norm": 0.2417578548192978, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2622, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.30064740249486815, | |
| "grad_norm": 0.09739197045564651, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1747, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.3019106268750987, | |
| "grad_norm": 0.15415729582309723, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2289, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.30317385125532925, | |
| "grad_norm": 0.08798956125974655, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2076, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.3044370756355598, | |
| "grad_norm": 0.09532306343317032, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3761, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.3057003000157903, | |
| "grad_norm": 0.06419141590595245, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2308, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.30696352439602087, | |
| "grad_norm": 0.13766047358512878, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2203, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.3082267487762514, | |
| "grad_norm": 0.09225375950336456, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2023, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.3094899731564819, | |
| "grad_norm": 0.1266135275363922, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2823, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.31075319753671243, | |
| "grad_norm": 0.17997467517852783, | |
| "learning_rate": 7e-06, | |
| "loss": 0.225, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.312016421916943, | |
| "grad_norm": 0.12776713073253632, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1906, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.31327964629717353, | |
| "grad_norm": 0.14866380393505096, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2119, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.31454287067740405, | |
| "grad_norm": 0.11824511736631393, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2219, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.31580609505763463, | |
| "grad_norm": 0.14409460127353668, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2116, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.31706931943786515, | |
| "grad_norm": 0.10304541140794754, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2073, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.31833254381809567, | |
| "grad_norm": 0.09163326770067215, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1882, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.31959576819832625, | |
| "grad_norm": 0.12692378461360931, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2386, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.32085899257855677, | |
| "grad_norm": 0.1747879534959793, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2054, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.3221222169587873, | |
| "grad_norm": 0.12346009910106659, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2397, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.32338544133901787, | |
| "grad_norm": 0.1731298863887787, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2575, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.3246486657192484, | |
| "grad_norm": 0.08011125028133392, | |
| "learning_rate": 7e-06, | |
| "loss": 0.215, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.3259118900994789, | |
| "grad_norm": 0.13160613179206848, | |
| "learning_rate": 7e-06, | |
| "loss": 0.222, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.3271751144797095, | |
| "grad_norm": 0.18522977828979492, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2548, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.32843833885994, | |
| "grad_norm": 0.14212659001350403, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3002, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.3297015632401705, | |
| "grad_norm": 0.13445697724819183, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2351, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.33096478762040105, | |
| "grad_norm": 0.11636935919523239, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2106, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.3322280120006316, | |
| "grad_norm": 0.14159604907035828, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2531, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.33349123638086214, | |
| "grad_norm": 0.10319356620311737, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2346, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.33475446076109266, | |
| "grad_norm": 0.09796885401010513, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2059, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.33601768514132324, | |
| "grad_norm": 0.1082499697804451, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1863, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.33728090952155376, | |
| "grad_norm": 0.12492396682500839, | |
| "learning_rate": 7e-06, | |
| "loss": 0.215, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.3385441339017843, | |
| "grad_norm": 0.06617411971092224, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1782, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.33980735828201486, | |
| "grad_norm": 0.15060101449489594, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3509, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.3410705826622454, | |
| "grad_norm": 0.11944282054901123, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1862, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.3423338070424759, | |
| "grad_norm": 0.12389136850833893, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2409, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.3435970314227065, | |
| "grad_norm": 0.11000983417034149, | |
| "learning_rate": 7e-06, | |
| "loss": 0.223, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.344860255802937, | |
| "grad_norm": 0.09012436866760254, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2068, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.3461234801831675, | |
| "grad_norm": 0.27014490962028503, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2087, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.3473867045633981, | |
| "grad_norm": 0.08035814762115479, | |
| "learning_rate": 7e-06, | |
| "loss": 0.262, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.3486499289436286, | |
| "grad_norm": 0.09129905700683594, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2082, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.34991315332385914, | |
| "grad_norm": 0.11665099114179611, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1841, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.3511763777040897, | |
| "grad_norm": 0.08812276273965836, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2164, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.35243960208432024, | |
| "grad_norm": 0.1272403746843338, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1927, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.35370282646455076, | |
| "grad_norm": 0.11256379634141922, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2991, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.3549660508447813, | |
| "grad_norm": 0.15795424580574036, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2363, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.35622927522501185, | |
| "grad_norm": 0.22632326185703278, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2088, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.3574924996052424, | |
| "grad_norm": 0.18535131216049194, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2746, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.3587557239854729, | |
| "grad_norm": 0.08579732477664948, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1899, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.3600189483657035, | |
| "grad_norm": 0.10859379917383194, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2067, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.361282172745934, | |
| "grad_norm": 0.07765299826860428, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1761, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.3625453971261645, | |
| "grad_norm": 0.17053595185279846, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2373, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.3638086215063951, | |
| "grad_norm": 0.09873699396848679, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2176, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.3650718458866256, | |
| "grad_norm": 0.07418286055326462, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1797, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.36633507026685613, | |
| "grad_norm": 0.11981359124183655, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1988, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.3675982946470867, | |
| "grad_norm": 0.06424502283334732, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2121, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.36886151902731723, | |
| "grad_norm": 0.09006607532501221, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1945, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.37012474340754775, | |
| "grad_norm": 0.10973497480154037, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2046, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.37138796778777833, | |
| "grad_norm": 0.09228470921516418, | |
| "learning_rate": 7e-06, | |
| "loss": 0.207, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.37265119216800885, | |
| "grad_norm": 0.10961271822452545, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2128, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.37391441654823937, | |
| "grad_norm": 0.09072300046682358, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4585, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.37517764092846995, | |
| "grad_norm": 0.08374742418527603, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2178, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.37644086530870047, | |
| "grad_norm": 0.05344458296895027, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1595, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.377704089688931, | |
| "grad_norm": 0.07841549813747406, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2306, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.3789673140691615, | |
| "grad_norm": 0.09865035116672516, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2274, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.3789673140691615, | |
| "eval_correct_accuracy": 0.5818490245971162, | |
| "eval_error_accuracy": 0.494822152183701, | |
| "eval_f1": 0.5348184158843582, | |
| "eval_loss": 0.41273096203804016, | |
| "eval_runtime": 35.0595, | |
| "eval_samples_per_second": 96.978, | |
| "eval_steps_per_second": 6.075, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.3802305384493921, | |
| "grad_norm": 0.11520479619503021, | |
| "learning_rate": 7e-06, | |
| "loss": 0.194, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.3814937628296226, | |
| "grad_norm": 0.17391149699687958, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2053, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.38275698720985313, | |
| "grad_norm": 0.08927040547132492, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1885, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.3840202115900837, | |
| "grad_norm": 0.10747874528169632, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2357, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.3852834359703142, | |
| "grad_norm": 0.0821816474199295, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2017, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.38654666035054475, | |
| "grad_norm": 0.08718965202569962, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2333, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.3878098847307753, | |
| "grad_norm": 0.14753767848014832, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2501, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.38907310911100584, | |
| "grad_norm": 0.12474358081817627, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1895, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.39033633349123636, | |
| "grad_norm": 0.14409278333187103, | |
| "learning_rate": 7e-06, | |
| "loss": 0.208, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.39159955787146694, | |
| "grad_norm": 0.06918184459209442, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1817, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.39286278225169746, | |
| "grad_norm": 0.08502199500799179, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1832, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.394126006631928, | |
| "grad_norm": 0.06989938765764236, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1741, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.39538923101215856, | |
| "grad_norm": 0.08131398260593414, | |
| "learning_rate": 7e-06, | |
| "loss": 0.226, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.3966524553923891, | |
| "grad_norm": 0.16150841116905212, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2081, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.3979156797726196, | |
| "grad_norm": 0.10033854097127914, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1757, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.3991789041528501, | |
| "grad_norm": 0.2944275438785553, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2039, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.4004421285330807, | |
| "grad_norm": 0.09300543367862701, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2403, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.4017053529133112, | |
| "grad_norm": 0.089630626142025, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2457, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.40296857729354174, | |
| "grad_norm": 0.06648046523332596, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2155, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.4042318016737723, | |
| "grad_norm": 0.18262338638305664, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2087, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.40549502605400284, | |
| "grad_norm": 0.0919061154127121, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2062, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.40675825043423336, | |
| "grad_norm": 0.113703154027462, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1859, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.40802147481446394, | |
| "grad_norm": 0.20705194771289825, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1769, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.40928469919469446, | |
| "grad_norm": 0.11209185421466827, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1819, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.410547923574925, | |
| "grad_norm": 0.05803574621677399, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1852, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.41181114795515555, | |
| "grad_norm": 0.16077323257923126, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2117, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.4130743723353861, | |
| "grad_norm": 0.10078177601099014, | |
| "learning_rate": 7e-06, | |
| "loss": 0.193, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.4143375967156166, | |
| "grad_norm": 0.09989168494939804, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2053, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.4156008210958472, | |
| "grad_norm": 0.13987579941749573, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2678, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.4168640454760777, | |
| "grad_norm": 0.13039669394493103, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1998, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.4181272698563082, | |
| "grad_norm": 0.1029522716999054, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2337, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.4193904942365388, | |
| "grad_norm": 0.08752740174531937, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1854, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.4206537186167693, | |
| "grad_norm": 0.07876112312078476, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1909, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.42191694299699983, | |
| "grad_norm": 0.2126246988773346, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2535, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.42318016737723035, | |
| "grad_norm": 0.11913909763097763, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2184, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.42444339175746093, | |
| "grad_norm": 0.1513642817735672, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1994, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.42570661613769145, | |
| "grad_norm": 0.1306588500738144, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2126, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.426969840517922, | |
| "grad_norm": 0.1171175092458725, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1994, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.42823306489815255, | |
| "grad_norm": 0.05895727127790451, | |
| "learning_rate": 7e-06, | |
| "loss": 0.155, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.42949628927838307, | |
| "grad_norm": 0.08570288121700287, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1986, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.4307595136586136, | |
| "grad_norm": 0.0765470489859581, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1696, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.43202273803884417, | |
| "grad_norm": 0.08286664634943008, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1626, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.4332859624190747, | |
| "grad_norm": 0.2282284051179886, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1877, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.4345491867993052, | |
| "grad_norm": 0.11943413317203522, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1897, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.4358124111795358, | |
| "grad_norm": 0.10935524851083755, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1828, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.4370756355597663, | |
| "grad_norm": 0.07996437698602676, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1991, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.4383388599399968, | |
| "grad_norm": 0.13088780641555786, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3873, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.4396020843202274, | |
| "grad_norm": 0.15082432329654694, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2091, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.4408653087004579, | |
| "grad_norm": 0.132376566529274, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2471, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.44212853308068845, | |
| "grad_norm": 0.07796452194452286, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1751, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.443391757460919, | |
| "grad_norm": 0.12849055230617523, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3155, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.44465498184114954, | |
| "grad_norm": 0.06422396749258041, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1486, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.44591820622138006, | |
| "grad_norm": 0.1800646334886551, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1993, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.4471814306016106, | |
| "grad_norm": 0.15747664868831635, | |
| "learning_rate": 7e-06, | |
| "loss": 0.209, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.44844465498184116, | |
| "grad_norm": 0.11023043096065521, | |
| "learning_rate": 7e-06, | |
| "loss": 0.21, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.4497078793620717, | |
| "grad_norm": 0.0927424430847168, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1745, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.4509711037423022, | |
| "grad_norm": 0.08278126269578934, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3105, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.4522343281225328, | |
| "grad_norm": 0.08794251829385757, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1979, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.4534975525027633, | |
| "grad_norm": 0.11653570830821991, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1828, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.4547607768829938, | |
| "grad_norm": 0.11114069074392319, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1826, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.4560240012632244, | |
| "grad_norm": 0.2608173191547394, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2304, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.4572872256434549, | |
| "grad_norm": 0.08441725373268127, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1757, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.45855045002368544, | |
| "grad_norm": 0.10891429334878922, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2122, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.459813674403916, | |
| "grad_norm": 0.07106776535511017, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1721, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.46107689878414654, | |
| "grad_norm": 0.08842181414365768, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2226, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.46234012316437706, | |
| "grad_norm": 0.0870131179690361, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2474, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.46360334754460764, | |
| "grad_norm": 0.14521507918834686, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2615, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.46486657192483816, | |
| "grad_norm": 0.09553767740726471, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1791, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.4661297963050687, | |
| "grad_norm": 0.11010967195034027, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1874, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.4673930206852992, | |
| "grad_norm": 0.09533923864364624, | |
| "learning_rate": 7e-06, | |
| "loss": 0.228, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.4686562450655298, | |
| "grad_norm": 0.0890774354338646, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2345, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.4699194694457603, | |
| "grad_norm": 0.12173017859458923, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2082, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.4711826938259908, | |
| "grad_norm": 0.0602993369102478, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1893, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.4724459182062214, | |
| "grad_norm": 0.13122287392616272, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2178, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.4737091425864519, | |
| "grad_norm": 0.07299527525901794, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1888, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.47497236696668244, | |
| "grad_norm": 0.08244926482439041, | |
| "learning_rate": 7e-06, | |
| "loss": 0.174, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.476235591346913, | |
| "grad_norm": 0.08397851884365082, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2108, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.47749881572714353, | |
| "grad_norm": 0.07320383936166763, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1804, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.47876204010737405, | |
| "grad_norm": 0.0849589854478836, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1829, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.48002526448760463, | |
| "grad_norm": 0.10207744687795639, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2174, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.48128848886783515, | |
| "grad_norm": 0.07175120711326599, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1847, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.4825517132480657, | |
| "grad_norm": 0.10446271300315857, | |
| "learning_rate": 7e-06, | |
| "loss": 0.216, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.48381493762829625, | |
| "grad_norm": 0.23799718916416168, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2136, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.48507816200852677, | |
| "grad_norm": 0.11531874537467957, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2357, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.4863413863887573, | |
| "grad_norm": 0.10034700483083725, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2258, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.48760461076898787, | |
| "grad_norm": 0.0934348776936531, | |
| "learning_rate": 7e-06, | |
| "loss": 0.284, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.4888678351492184, | |
| "grad_norm": 0.165315181016922, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2264, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.4901310595294489, | |
| "grad_norm": 0.1086471751332283, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2028, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.49139428390967943, | |
| "grad_norm": 0.14764176309108734, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1775, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.49265750828991, | |
| "grad_norm": 0.21734580397605896, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1924, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.4939207326701405, | |
| "grad_norm": 0.0923137441277504, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2031, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.49518395705037105, | |
| "grad_norm": 0.06933951377868652, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1739, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.4964471814306016, | |
| "grad_norm": 0.0930216833949089, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2083, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.49771040581083215, | |
| "grad_norm": 0.08797884732484818, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2636, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.49897363019106267, | |
| "grad_norm": 0.0919070690870285, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2154, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.5002368545712932, | |
| "grad_norm": 0.07787168025970459, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2207, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.5015000789515237, | |
| "grad_norm": 0.11572758853435516, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2002, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.5027633033317543, | |
| "grad_norm": 0.08295108377933502, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2614, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.5040265277119849, | |
| "grad_norm": 0.0625801831483841, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1644, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.5052897520922154, | |
| "grad_norm": 0.07405094802379608, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2234, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5052897520922154, | |
| "eval_correct_accuracy": 0.5852417302798982, | |
| "eval_error_accuracy": 0.4709590274651058, | |
| "eval_f1": 0.5219175883059916, | |
| "eval_loss": 0.4229665994644165, | |
| "eval_runtime": 35.2003, | |
| "eval_samples_per_second": 96.59, | |
| "eval_steps_per_second": 6.051, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5065529764724459, | |
| "grad_norm": 0.09175197780132294, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2027, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.5078162008526764, | |
| "grad_norm": 0.1550239622592926, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2559, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.509079425232907, | |
| "grad_norm": 0.139438658952713, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1762, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.5103426496131376, | |
| "grad_norm": 0.11481575667858124, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1623, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.5116058739933681, | |
| "grad_norm": 0.05404340475797653, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1961, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.5128690983735986, | |
| "grad_norm": 0.14743672311306, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2279, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.5141323227538291, | |
| "grad_norm": 0.11647465080022812, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2001, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.5153955471340597, | |
| "grad_norm": 0.08203577995300293, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1752, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.5166587715142902, | |
| "grad_norm": 0.11073414981365204, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3686, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.5179219958945208, | |
| "grad_norm": 0.11331301182508469, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3378, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5191852202747513, | |
| "grad_norm": 0.09435959905385971, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1562, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.5204484446549819, | |
| "grad_norm": 0.08365237712860107, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1926, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.5217116690352124, | |
| "grad_norm": 0.08092326670885086, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1931, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.5229748934154429, | |
| "grad_norm": 0.07763849943876266, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2038, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.5242381177956734, | |
| "grad_norm": 0.1350603550672531, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2392, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.5255013421759039, | |
| "grad_norm": 0.10287491232156754, | |
| "learning_rate": 7e-06, | |
| "loss": 0.196, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.5267645665561346, | |
| "grad_norm": 0.0719987079501152, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2016, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.5280277909363651, | |
| "grad_norm": 0.22227227687835693, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2131, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.5292910153165956, | |
| "grad_norm": 0.06136275455355644, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2044, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.5305542396968261, | |
| "grad_norm": 0.0627446323633194, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1793, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5318174640770567, | |
| "grad_norm": 0.20960237085819244, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1982, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.5330806884572872, | |
| "grad_norm": 0.11971580237150192, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2587, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.5343439128375178, | |
| "grad_norm": 0.0932474359869957, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1947, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.5356071372177483, | |
| "grad_norm": 0.09686949849128723, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1907, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.5368703615979789, | |
| "grad_norm": 0.07940957695245743, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2535, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.5381335859782094, | |
| "grad_norm": 0.09676375240087509, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2038, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.5393968103584399, | |
| "grad_norm": 0.09989267587661743, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1976, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.5406600347386704, | |
| "grad_norm": 0.0823327898979187, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1708, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.541923259118901, | |
| "grad_norm": 0.10633084177970886, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1619, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.5431864834991316, | |
| "grad_norm": 0.08448205143213272, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1854, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.5444497078793621, | |
| "grad_norm": 0.07697522640228271, | |
| "learning_rate": 7e-06, | |
| "loss": 0.188, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.5457129322595926, | |
| "grad_norm": 0.1970750391483307, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2479, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.5469761566398231, | |
| "grad_norm": 0.08660274744033813, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1536, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.5482393810200537, | |
| "grad_norm": 0.0931171253323555, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2169, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.5495026054002842, | |
| "grad_norm": 0.07317376136779785, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1941, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.5507658297805148, | |
| "grad_norm": 0.07506151497364044, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1761, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.5520290541607453, | |
| "grad_norm": 0.059854380786418915, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3068, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.5532922785409758, | |
| "grad_norm": 0.1609865128993988, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1857, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.5545555029212064, | |
| "grad_norm": 0.08996118605136871, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1805, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.5558187273014369, | |
| "grad_norm": 0.12089324742555618, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1826, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.5570819516816674, | |
| "grad_norm": 0.08772964775562286, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1791, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.558345176061898, | |
| "grad_norm": 0.10977458208799362, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2084, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.5596084004421286, | |
| "grad_norm": 0.09188458323478699, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3045, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.5608716248223591, | |
| "grad_norm": 0.07033522427082062, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1926, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.5621348492025896, | |
| "grad_norm": 0.0652671530842781, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1998, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.5633980735828201, | |
| "grad_norm": 0.07860173285007477, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2172, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.5646612979630506, | |
| "grad_norm": 0.0679745227098465, | |
| "learning_rate": 7e-06, | |
| "loss": 0.179, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.5659245223432813, | |
| "grad_norm": 0.10545714199542999, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1547, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.5671877467235118, | |
| "grad_norm": 0.07516340911388397, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1643, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.5684509711037423, | |
| "grad_norm": 0.06046690791845322, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1624, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.5697141954839728, | |
| "grad_norm": 0.07889428734779358, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1808, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.5709774198642034, | |
| "grad_norm": 0.08698045462369919, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2257, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.5722406442444339, | |
| "grad_norm": 0.08498376607894897, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1773, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.5735038686246644, | |
| "grad_norm": 0.12781842052936554, | |
| "learning_rate": 7e-06, | |
| "loss": 0.258, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.574767093004895, | |
| "grad_norm": 0.11314232647418976, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1608, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.5760303173851256, | |
| "grad_norm": 0.3507859408855438, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1656, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.5772935417653561, | |
| "grad_norm": 0.08430968970060349, | |
| "learning_rate": 7e-06, | |
| "loss": 0.201, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.5785567661455866, | |
| "grad_norm": 0.09361864626407623, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2034, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.5798199905258171, | |
| "grad_norm": 0.09698746353387833, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2444, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.5810832149060476, | |
| "grad_norm": 0.16023226082324982, | |
| "learning_rate": 7e-06, | |
| "loss": 0.19, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.5823464392862783, | |
| "grad_norm": 0.08157742023468018, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2149, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.5836096636665088, | |
| "grad_norm": 0.09342104941606522, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2022, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.5848728880467393, | |
| "grad_norm": 0.07538167387247086, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2516, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.5861361124269698, | |
| "grad_norm": 0.12720584869384766, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2208, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.5873993368072004, | |
| "grad_norm": 0.08613109588623047, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2193, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.5886625611874309, | |
| "grad_norm": 0.08249358087778091, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1676, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.5899257855676615, | |
| "grad_norm": 0.1288759857416153, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3238, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.591189009947892, | |
| "grad_norm": 0.05164247751235962, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1418, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.5924522343281226, | |
| "grad_norm": 0.0994580090045929, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2238, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.5937154587083531, | |
| "grad_norm": 0.09253129363059998, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2006, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.5949786830885836, | |
| "grad_norm": 0.12927457690238953, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2027, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.5962419074688141, | |
| "grad_norm": 0.11283280700445175, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2169, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.5975051318490446, | |
| "grad_norm": 0.174880713224411, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1746, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.5987683562292753, | |
| "grad_norm": 0.11614017933607101, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2968, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.6000315806095058, | |
| "grad_norm": 0.07818127423524857, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1643, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.6012948049897363, | |
| "grad_norm": 0.08300397545099258, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1712, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.6025580293699668, | |
| "grad_norm": 0.1014489009976387, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2089, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.6038212537501974, | |
| "grad_norm": 0.11591055244207382, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2231, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.6050844781304279, | |
| "grad_norm": 0.13137224316596985, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1844, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.6063477025106585, | |
| "grad_norm": 0.09693000465631485, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2033, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.607610926890889, | |
| "grad_norm": 0.1250012218952179, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2219, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.6088741512711195, | |
| "grad_norm": 0.18828216195106506, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2427, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.6101373756513501, | |
| "grad_norm": 0.11057613044977188, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2193, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.6114006000315806, | |
| "grad_norm": 0.15523040294647217, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1883, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.6126638244118111, | |
| "grad_norm": 0.16174635291099548, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2067, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.6139270487920417, | |
| "grad_norm": 0.2738276422023773, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2292, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.6151902731722723, | |
| "grad_norm": 0.05995164066553116, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1956, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.6164534975525028, | |
| "grad_norm": 0.05519471690058708, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1501, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.6177167219327333, | |
| "grad_norm": 0.08133929967880249, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2224, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.6189799463129638, | |
| "grad_norm": 0.12239203602075577, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2503, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.6202431706931943, | |
| "grad_norm": 0.11004896461963654, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2796, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.6215063950734249, | |
| "grad_norm": 0.1722228229045868, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1807, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.6227696194536555, | |
| "grad_norm": 0.10695190727710724, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1782, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.624032843833886, | |
| "grad_norm": 0.08578750491142273, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2389, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.6252960682141165, | |
| "grad_norm": 0.10670057684183121, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1801, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.6265592925943471, | |
| "grad_norm": 0.04314388707280159, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1366, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.6278225169745776, | |
| "grad_norm": 0.11937737464904785, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2017, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.6290857413548081, | |
| "grad_norm": 0.08274619281291962, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1721, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.6303489657350387, | |
| "grad_norm": 0.07380262762308121, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1962, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.6316121901152693, | |
| "grad_norm": 0.04727354645729065, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1509, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6316121901152693, | |
| "eval_correct_accuracy": 0.7056827820186599, | |
| "eval_error_accuracy": 0.4376407023863125, | |
| "eval_f1": 0.5402416946684214, | |
| "eval_loss": 0.362473726272583, | |
| "eval_runtime": 35.262, | |
| "eval_samples_per_second": 96.421, | |
| "eval_steps_per_second": 6.04, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6328754144954998, | |
| "grad_norm": 0.08614058047533035, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1706, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.6341386388757303, | |
| "grad_norm": 0.06968270987272263, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2028, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.6354018632559608, | |
| "grad_norm": 0.13758571445941925, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2296, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.6366650876361913, | |
| "grad_norm": 0.07083171606063843, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1807, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.637928312016422, | |
| "grad_norm": 0.06689167022705078, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1997, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.6391915363966525, | |
| "grad_norm": 0.07969733327627182, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1803, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.640454760776883, | |
| "grad_norm": 0.095677949488163, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1874, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.6417179851571135, | |
| "grad_norm": 0.10759231448173523, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1656, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.6429812095373441, | |
| "grad_norm": 0.13282425701618195, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2538, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.6442444339175746, | |
| "grad_norm": 0.09664168953895569, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1768, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.6455076582978051, | |
| "grad_norm": 0.11897934973239899, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2023, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.6467708826780357, | |
| "grad_norm": 0.05450622737407684, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1277, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.6480341070582663, | |
| "grad_norm": 0.07977665215730667, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2231, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.6492973314384968, | |
| "grad_norm": 0.19492259621620178, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2253, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.6505605558187273, | |
| "grad_norm": 0.09466379135847092, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3611, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.6518237801989578, | |
| "grad_norm": 0.17244236171245575, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2149, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.6530870045791883, | |
| "grad_norm": 0.08291974663734436, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1848, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.654350228959419, | |
| "grad_norm": 0.06109621748328209, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1435, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.6556134533396495, | |
| "grad_norm": 0.06171726807951927, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1749, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.65687667771988, | |
| "grad_norm": 0.09645943343639374, | |
| "learning_rate": 7e-06, | |
| "loss": 0.197, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.6581399021001105, | |
| "grad_norm": 0.09050124883651733, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1609, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.659403126480341, | |
| "grad_norm": 0.09600576758384705, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1826, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.6606663508605716, | |
| "grad_norm": 0.1261880248785019, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1875, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.6619295752408021, | |
| "grad_norm": 0.13587896525859833, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1905, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.6631927996210327, | |
| "grad_norm": 0.12359704077243805, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2087, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.6644560240012632, | |
| "grad_norm": 0.10092345625162125, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2041, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.6657192483814938, | |
| "grad_norm": 0.12595926225185394, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1928, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.6669824727617243, | |
| "grad_norm": 0.08753985911607742, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1444, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.6682456971419548, | |
| "grad_norm": 0.08193645626306534, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1545, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.6695089215221853, | |
| "grad_norm": 0.07170840352773666, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1652, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.670772145902416, | |
| "grad_norm": 0.18759992718696594, | |
| "learning_rate": 7e-06, | |
| "loss": 0.232, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.6720353702826465, | |
| "grad_norm": 0.13691110908985138, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1905, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.673298594662877, | |
| "grad_norm": 0.06453829258680344, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2283, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.6745618190431075, | |
| "grad_norm": 0.12694236636161804, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2013, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.675825043423338, | |
| "grad_norm": 0.06403839588165283, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1585, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.6770882678035686, | |
| "grad_norm": 0.13636727631092072, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2156, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.6783514921837992, | |
| "grad_norm": 0.12285730242729187, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1967, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.6796147165640297, | |
| "grad_norm": 0.0780211091041565, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1751, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.6808779409442602, | |
| "grad_norm": 0.09688100218772888, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2141, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.6821411653244908, | |
| "grad_norm": 0.07864505052566528, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2138, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.6834043897047213, | |
| "grad_norm": 0.060981281101703644, | |
| "learning_rate": 7e-06, | |
| "loss": 0.187, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.6846676140849518, | |
| "grad_norm": 0.06510937958955765, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1771, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.6859308384651823, | |
| "grad_norm": 0.07638704031705856, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2129, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.687194062845413, | |
| "grad_norm": 0.11518476903438568, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1854, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.6884572872256435, | |
| "grad_norm": 0.06868738681077957, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1581, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.689720511605874, | |
| "grad_norm": 0.09059899300336838, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2245, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.6909837359861045, | |
| "grad_norm": 0.06422233581542969, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1559, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.692246960366335, | |
| "grad_norm": 0.10189103335142136, | |
| "learning_rate": 7e-06, | |
| "loss": 0.193, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.6935101847465656, | |
| "grad_norm": 0.08199501782655716, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1908, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.6947734091267962, | |
| "grad_norm": 0.20546898245811462, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2011, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.6960366335070267, | |
| "grad_norm": 0.14664340019226074, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1892, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 0.6972998578872572, | |
| "grad_norm": 0.08695843815803528, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1871, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.6985630822674878, | |
| "grad_norm": 0.08112246543169022, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1616, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.6998263066477183, | |
| "grad_norm": 0.08381661772727966, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2231, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.7010895310279488, | |
| "grad_norm": 0.09177428483963013, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1956, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.7023527554081794, | |
| "grad_norm": 0.08766631782054901, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1871, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.70361597978841, | |
| "grad_norm": 0.07755694538354874, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2718, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 0.7048792041686405, | |
| "grad_norm": 0.08710070699453354, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1909, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.706142428548871, | |
| "grad_norm": 0.07648595422506332, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1948, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 0.7074056529291015, | |
| "grad_norm": 0.10871299356222153, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2093, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.708668877309332, | |
| "grad_norm": 0.07032714784145355, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1699, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 0.7099321016895626, | |
| "grad_norm": 0.0873897522687912, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1372, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 0.7111953260697932, | |
| "grad_norm": 0.07188841700553894, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1794, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 0.7124585504500237, | |
| "grad_norm": 0.07733464986085892, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2043, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 0.7137217748302542, | |
| "grad_norm": 0.07270821928977966, | |
| "learning_rate": 7e-06, | |
| "loss": 0.177, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.7149849992104848, | |
| "grad_norm": 0.1570441722869873, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2411, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 0.7162482235907153, | |
| "grad_norm": 0.2707260549068451, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2307, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 0.7175114479709458, | |
| "grad_norm": 0.07656281441450119, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1759, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 0.7187746723511764, | |
| "grad_norm": 0.09973770380020142, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1965, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 0.720037896731407, | |
| "grad_norm": 0.06791306287050247, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1749, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.7213011211116375, | |
| "grad_norm": 0.17801041901111603, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1941, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 0.722564345491868, | |
| "grad_norm": 0.18452543020248413, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2446, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 0.7238275698720985, | |
| "grad_norm": 0.12178942561149597, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1583, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 0.725090794252329, | |
| "grad_norm": 0.13167473673820496, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2324, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 0.7263540186325597, | |
| "grad_norm": 0.05255408585071564, | |
| "learning_rate": 7e-06, | |
| "loss": 0.195, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.7276172430127902, | |
| "grad_norm": 0.09154222905635834, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1878, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.7288804673930207, | |
| "grad_norm": 0.0887879729270935, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2052, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 0.7301436917732512, | |
| "grad_norm": 0.1336040198802948, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1952, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 0.7314069161534817, | |
| "grad_norm": 0.08207479119300842, | |
| "learning_rate": 7e-06, | |
| "loss": 0.185, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 0.7326701405337123, | |
| "grad_norm": 0.05941140279173851, | |
| "learning_rate": 7e-06, | |
| "loss": 0.204, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.7339333649139428, | |
| "grad_norm": 0.06899949908256531, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1408, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 0.7351965892941734, | |
| "grad_norm": 0.09259360283613205, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1823, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 0.7364598136744039, | |
| "grad_norm": 0.1346062421798706, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2174, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 0.7377230380546345, | |
| "grad_norm": 0.1547420769929886, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1864, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 0.738986262434865, | |
| "grad_norm": 0.10551164299249649, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1554, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.7402494868150955, | |
| "grad_norm": 0.08826129138469696, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2044, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 0.741512711195326, | |
| "grad_norm": 0.07170785963535309, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1777, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 0.7427759355755567, | |
| "grad_norm": 0.1085812896490097, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2822, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.7440391599557872, | |
| "grad_norm": 0.08545360714197159, | |
| "learning_rate": 7e-06, | |
| "loss": 0.19, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 0.7453023843360177, | |
| "grad_norm": 0.05576294660568237, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1826, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.7465656087162482, | |
| "grad_norm": 0.056626636534929276, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2191, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 0.7478288330964787, | |
| "grad_norm": 0.06961087882518768, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2004, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 0.7490920574767093, | |
| "grad_norm": 0.09317582845687866, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1465, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 0.7503552818569399, | |
| "grad_norm": 0.13993658125400543, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1876, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 0.7516185062371704, | |
| "grad_norm": 0.06080286204814911, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2012, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.7528817306174009, | |
| "grad_norm": 0.060514189302921295, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1658, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 0.7541449549976315, | |
| "grad_norm": 0.09004813432693481, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3195, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 0.755408179377862, | |
| "grad_norm": 0.07283802330493927, | |
| "learning_rate": 7e-06, | |
| "loss": 0.2021, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 0.7566714037580925, | |
| "grad_norm": 0.08824078738689423, | |
| "learning_rate": 7e-06, | |
| "loss": 0.1941, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 0.757934628138323, | |
| "grad_norm": 0.09339006245136261, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4697, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.757934628138323, | |
| "eval_correct_accuracy": 0.6098388464800678, | |
| "eval_error_accuracy": 0.48491670418730304, | |
| "eval_f1": 0.540250366102693, | |
| "eval_loss": 0.42047378420829773, | |
| "eval_runtime": 35.1541, | |
| "eval_samples_per_second": 96.717, | |
| "eval_steps_per_second": 6.059, | |
| "step": 600 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 791, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.7774891537268736e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |