| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 240, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0125, |
| "grad_norm": 10.18331757027076, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.8960644006729126, |
| "num_input_tokens_seen": 0, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 9.16263025394979, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.8488595485687256, |
| "num_input_tokens_seen": 0, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 8.92834992115862, |
| "learning_rate": 3e-06, |
| "loss": 0.8538516759872437, |
| "num_input_tokens_seen": 0, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 9.282181600009132, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.856260359287262, |
| "num_input_tokens_seen": 0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 5.998755609205955, |
| "learning_rate": 5e-06, |
| "loss": 0.7300517559051514, |
| "num_input_tokens_seen": 0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 4.585755656488519, |
| "learning_rate": 6e-06, |
| "loss": 0.6810914278030396, |
| "num_input_tokens_seen": 0, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.0875, |
| "grad_norm": 3.5581298112309763, |
| "learning_rate": 7e-06, |
| "loss": 0.6221330761909485, |
| "num_input_tokens_seen": 0, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 4.186415400005925, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.5942986011505127, |
| "num_input_tokens_seen": 0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.1125, |
| "grad_norm": 2.0815923930377633, |
| "learning_rate": 9e-06, |
| "loss": 0.5573145151138306, |
| "num_input_tokens_seen": 0, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 1.538737138740704, |
| "learning_rate": 1e-05, |
| "loss": 0.5063657164573669, |
| "num_input_tokens_seen": 0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.1375, |
| "grad_norm": 1.3167709252541377, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 0.5080946087837219, |
| "num_input_tokens_seen": 0, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 1.0093046488935542, |
| "learning_rate": 1.2e-05, |
| "loss": 0.5118667483329773, |
| "num_input_tokens_seen": 0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.1625, |
| "grad_norm": 1.0176825029370344, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 0.4826335608959198, |
| "num_input_tokens_seen": 0, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 0.868692377639643, |
| "learning_rate": 1.4e-05, |
| "loss": 0.4561425447463989, |
| "num_input_tokens_seen": 0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.1875, |
| "grad_norm": 0.7576207830400876, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 0.46192091703414917, |
| "num_input_tokens_seen": 0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.6607030167581055, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.436120867729187, |
| "num_input_tokens_seen": 0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.2125, |
| "grad_norm": 0.6388324060478595, |
| "learning_rate": 1.7e-05, |
| "loss": 0.43788450956344604, |
| "num_input_tokens_seen": 0, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 0.6522444786796912, |
| "learning_rate": 1.8e-05, |
| "loss": 0.42514723539352417, |
| "num_input_tokens_seen": 0, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.2375, |
| "grad_norm": 0.5939148285210124, |
| "learning_rate": 1.9e-05, |
| "loss": 0.41062527894973755, |
| "num_input_tokens_seen": 0, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.6228429094034043, |
| "learning_rate": 2e-05, |
| "loss": 0.3994959890842438, |
| "num_input_tokens_seen": 0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.2625, |
| "grad_norm": 0.5940088612229671, |
| "learning_rate": 1.9999658256641746e-05, |
| "loss": 0.41216734051704407, |
| "num_input_tokens_seen": 0, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.275, |
| "grad_norm": 0.5920054647245636, |
| "learning_rate": 1.9998633049924693e-05, |
| "loss": 0.41810792684555054, |
| "num_input_tokens_seen": 0, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.2875, |
| "grad_norm": 0.5480649368192407, |
| "learning_rate": 1.999692444992035e-05, |
| "loss": 0.4036347568035126, |
| "num_input_tokens_seen": 0, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.5565757348287612, |
| "learning_rate": 1.999453257340926e-05, |
| "loss": 0.419705331325531, |
| "num_input_tokens_seen": 0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.3125, |
| "grad_norm": 0.5548087155409337, |
| "learning_rate": 1.999145758387301e-05, |
| "loss": 0.40048182010650635, |
| "num_input_tokens_seen": 0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.325, |
| "grad_norm": 0.5338888290743757, |
| "learning_rate": 1.998769969148305e-05, |
| "loss": 0.40444415807724, |
| "num_input_tokens_seen": 0, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.3375, |
| "grad_norm": 0.5018287159150987, |
| "learning_rate": 1.9983259153086328e-05, |
| "loss": 0.3908202052116394, |
| "num_input_tokens_seen": 0, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.5666363779267786, |
| "learning_rate": 1.9978136272187745e-05, |
| "loss": 0.3917597532272339, |
| "num_input_tokens_seen": 0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.3625, |
| "grad_norm": 0.5311731782089377, |
| "learning_rate": 1.997233139892941e-05, |
| "loss": 0.38186532258987427, |
| "num_input_tokens_seen": 0, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.375, |
| "grad_norm": 0.5017076164418427, |
| "learning_rate": 1.99658449300667e-05, |
| "loss": 0.3953409492969513, |
| "num_input_tokens_seen": 0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.3875, |
| "grad_norm": 0.5191236527322374, |
| "learning_rate": 1.995867730894114e-05, |
| "loss": 0.3810531198978424, |
| "num_input_tokens_seen": 0, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.479389601325034, |
| "learning_rate": 1.9950829025450116e-05, |
| "loss": 0.3787350058555603, |
| "num_input_tokens_seen": 0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.4125, |
| "grad_norm": 0.5052682473625102, |
| "learning_rate": 1.9942300616013378e-05, |
| "loss": 0.3784908056259155, |
| "num_input_tokens_seen": 0, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.425, |
| "grad_norm": 0.5093747529791427, |
| "learning_rate": 1.9933092663536384e-05, |
| "loss": 0.377264142036438, |
| "num_input_tokens_seen": 0, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.4375, |
| "grad_norm": 0.5496127833092357, |
| "learning_rate": 1.992320579737045e-05, |
| "loss": 0.4013134837150574, |
| "num_input_tokens_seen": 0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.5194182976027281, |
| "learning_rate": 1.9912640693269754e-05, |
| "loss": 0.3868390917778015, |
| "num_input_tokens_seen": 0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.4625, |
| "grad_norm": 0.4772201831075947, |
| "learning_rate": 1.990139807334512e-05, |
| "loss": 0.3554729223251343, |
| "num_input_tokens_seen": 0, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.475, |
| "grad_norm": 0.5411145901050805, |
| "learning_rate": 1.9889478706014687e-05, |
| "loss": 0.3768259882926941, |
| "num_input_tokens_seen": 0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.4875, |
| "grad_norm": 0.47151248168784593, |
| "learning_rate": 1.9876883405951378e-05, |
| "loss": 0.3637596368789673, |
| "num_input_tokens_seen": 0, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.4900043722490537, |
| "learning_rate": 1.9863613034027224e-05, |
| "loss": 0.3731005787849426, |
| "num_input_tokens_seen": 0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.5125, |
| "grad_norm": 0.4740418320125202, |
| "learning_rate": 1.984966849725452e-05, |
| "loss": 0.36029529571533203, |
| "num_input_tokens_seen": 0, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.525, |
| "grad_norm": 0.5282467622205699, |
| "learning_rate": 1.9835050748723826e-05, |
| "loss": 0.371197909116745, |
| "num_input_tokens_seen": 0, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.5375, |
| "grad_norm": 0.4755646348335201, |
| "learning_rate": 1.981976078753884e-05, |
| "loss": 0.34178704023361206, |
| "num_input_tokens_seen": 0, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.507685264776996, |
| "learning_rate": 1.9803799658748096e-05, |
| "loss": 0.3663065731525421, |
| "num_input_tokens_seen": 0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.5625, |
| "grad_norm": 0.5265705406067266, |
| "learning_rate": 1.9787168453273546e-05, |
| "loss": 0.3737153708934784, |
| "num_input_tokens_seen": 0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.575, |
| "grad_norm": 0.5571656899248943, |
| "learning_rate": 1.9769868307835996e-05, |
| "loss": 0.3628424406051636, |
| "num_input_tokens_seen": 0, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.5875, |
| "grad_norm": 0.569279834268425, |
| "learning_rate": 1.97519004048774e-05, |
| "loss": 0.37988948822021484, |
| "num_input_tokens_seen": 0, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.4969114483932452, |
| "learning_rate": 1.973326597248006e-05, |
| "loss": 0.3636181950569153, |
| "num_input_tokens_seen": 0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.6125, |
| "grad_norm": 0.5439018148028109, |
| "learning_rate": 1.9713966284282677e-05, |
| "loss": 0.3687742352485657, |
| "num_input_tokens_seen": 0, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.625, |
| "grad_norm": 0.47096960108233016, |
| "learning_rate": 1.9694002659393306e-05, |
| "loss": 0.3656526207923889, |
| "num_input_tokens_seen": 0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.6375, |
| "grad_norm": 0.49069042740477836, |
| "learning_rate": 1.9673376462299186e-05, |
| "loss": 0.3719867467880249, |
| "num_input_tokens_seen": 0, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 0.5230240054949966, |
| "learning_rate": 1.9652089102773487e-05, |
| "loss": 0.3670026361942291, |
| "num_input_tokens_seen": 0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.6625, |
| "grad_norm": 0.539667408085974, |
| "learning_rate": 1.963014203577896e-05, |
| "loss": 0.3777502775192261, |
| "num_input_tokens_seen": 0, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.675, |
| "grad_norm": 0.5303927925423055, |
| "learning_rate": 1.9607536761368484e-05, |
| "loss": 0.360956609249115, |
| "num_input_tokens_seen": 0, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.6875, |
| "grad_norm": 0.5669762895801259, |
| "learning_rate": 1.958427482458253e-05, |
| "loss": 0.3712082505226135, |
| "num_input_tokens_seen": 0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.4924233132486405, |
| "learning_rate": 1.9560357815343577e-05, |
| "loss": 0.34069931507110596, |
| "num_input_tokens_seen": 0, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.7125, |
| "grad_norm": 0.4939181685825231, |
| "learning_rate": 1.9535787368347444e-05, |
| "loss": 0.3445311188697815, |
| "num_input_tokens_seen": 0, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.725, |
| "grad_norm": 0.5132891832051437, |
| "learning_rate": 1.9510565162951538e-05, |
| "loss": 0.352506160736084, |
| "num_input_tokens_seen": 0, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.7375, |
| "grad_norm": 0.5228074950046403, |
| "learning_rate": 1.9484692923060095e-05, |
| "loss": 0.352910578250885, |
| "num_input_tokens_seen": 0, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.48331453046149664, |
| "learning_rate": 1.9458172417006347e-05, |
| "loss": 0.3518715500831604, |
| "num_input_tokens_seen": 0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.7625, |
| "grad_norm": 0.4971643896288794, |
| "learning_rate": 1.9431005457431654e-05, |
| "loss": 0.3532984256744385, |
| "num_input_tokens_seen": 0, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.775, |
| "grad_norm": 0.5331008024051833, |
| "learning_rate": 1.9403193901161614e-05, |
| "loss": 0.3648609220981598, |
| "num_input_tokens_seen": 0, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.7875, |
| "grad_norm": 1.2839239169737044, |
| "learning_rate": 1.9374739649079155e-05, |
| "loss": 0.3517247438430786, |
| "num_input_tokens_seen": 0, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.5877379242744403, |
| "learning_rate": 1.934564464599461e-05, |
| "loss": 0.3657301962375641, |
| "num_input_tokens_seen": 0, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.8125, |
| "grad_norm": 0.4867340909281986, |
| "learning_rate": 1.9315910880512792e-05, |
| "loss": 0.36049771308898926, |
| "num_input_tokens_seen": 0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.825, |
| "grad_norm": 0.49356286585265735, |
| "learning_rate": 1.9285540384897073e-05, |
| "loss": 0.34778010845184326, |
| "num_input_tokens_seen": 0, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.8375, |
| "grad_norm": 0.5535250380039742, |
| "learning_rate": 1.9254535234930486e-05, |
| "loss": 0.36141228675842285, |
| "num_input_tokens_seen": 0, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.5243933357988406, |
| "learning_rate": 1.922289754977385e-05, |
| "loss": 0.3524037301540375, |
| "num_input_tokens_seen": 0, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.8625, |
| "grad_norm": 0.5324496372479985, |
| "learning_rate": 1.919062949182091e-05, |
| "loss": 0.36538389325141907, |
| "num_input_tokens_seen": 0, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.875, |
| "grad_norm": 0.49491533066436794, |
| "learning_rate": 1.9157733266550577e-05, |
| "loss": 0.35084018111228943, |
| "num_input_tokens_seen": 0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.8875, |
| "grad_norm": 0.5287693804586798, |
| "learning_rate": 1.9124211122376138e-05, |
| "loss": 0.3459678292274475, |
| "num_input_tokens_seen": 0, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.4675049739386483, |
| "learning_rate": 1.909006535049163e-05, |
| "loss": 0.34027597308158875, |
| "num_input_tokens_seen": 0, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.9125, |
| "grad_norm": 0.5068359141834208, |
| "learning_rate": 1.9055298284715192e-05, |
| "loss": 0.34234559535980225, |
| "num_input_tokens_seen": 0, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.925, |
| "grad_norm": 0.5454693950983837, |
| "learning_rate": 1.9019912301329593e-05, |
| "loss": 0.3697267770767212, |
| "num_input_tokens_seen": 0, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.9375, |
| "grad_norm": 0.48781498197167644, |
| "learning_rate": 1.898390981891979e-05, |
| "loss": 0.3399868607521057, |
| "num_input_tokens_seen": 0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.467529345150941, |
| "learning_rate": 1.8947293298207637e-05, |
| "loss": 0.3481365442276001, |
| "num_input_tokens_seen": 0, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.9625, |
| "grad_norm": 0.5173617613436983, |
| "learning_rate": 1.891006524188368e-05, |
| "loss": 0.35163643956184387, |
| "num_input_tokens_seen": 0, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.975, |
| "grad_norm": 0.4964553718237553, |
| "learning_rate": 1.887222819443612e-05, |
| "loss": 0.3495897650718689, |
| "num_input_tokens_seen": 0, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.9875, |
| "grad_norm": 0.5266061782006969, |
| "learning_rate": 1.883378474197689e-05, |
| "loss": 0.3525278568267822, |
| "num_input_tokens_seen": 0, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.48903026352892565, |
| "learning_rate": 1.879473751206489e-05, |
| "loss": 0.3540067672729492, |
| "num_input_tokens_seen": 0, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.0125, |
| "grad_norm": 0.8791291129207356, |
| "learning_rate": 1.875508917352643e-05, |
| "loss": 0.26043254137039185, |
| "num_input_tokens_seen": 0, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.025, |
| "grad_norm": 0.6487054540761681, |
| "learning_rate": 1.8714842436272774e-05, |
| "loss": 0.2440587878227234, |
| "num_input_tokens_seen": 0, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.0375, |
| "grad_norm": 0.6795282145993521, |
| "learning_rate": 1.8674000051114953e-05, |
| "loss": 0.2597944140434265, |
| "num_input_tokens_seen": 0, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.05, |
| "grad_norm": 0.6670449632773737, |
| "learning_rate": 1.863256480957574e-05, |
| "loss": 0.23764365911483765, |
| "num_input_tokens_seen": 0, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.0625, |
| "grad_norm": 0.8445000033015495, |
| "learning_rate": 1.8590539543698852e-05, |
| "loss": 0.22128596901893616, |
| "num_input_tokens_seen": 0, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.075, |
| "grad_norm": 0.7319387611539306, |
| "learning_rate": 1.854792712585539e-05, |
| "loss": 0.22451584041118622, |
| "num_input_tokens_seen": 0, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.0875, |
| "grad_norm": 0.7347266658746741, |
| "learning_rate": 1.8504730468547508e-05, |
| "loss": 0.23484283685684204, |
| "num_input_tokens_seen": 0, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 0.684429737748084, |
| "learning_rate": 1.8460952524209355e-05, |
| "loss": 0.22682486474514008, |
| "num_input_tokens_seen": 0, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.1125, |
| "grad_norm": 0.6253196189027981, |
| "learning_rate": 1.8416596285005274e-05, |
| "loss": 0.23638179898262024, |
| "num_input_tokens_seen": 0, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.125, |
| "grad_norm": 0.7140008070317728, |
| "learning_rate": 1.8371664782625287e-05, |
| "loss": 0.22379158437252045, |
| "num_input_tokens_seen": 0, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.1375, |
| "grad_norm": 0.5609948172492557, |
| "learning_rate": 1.8326161088077905e-05, |
| "loss": 0.23043715953826904, |
| "num_input_tokens_seen": 0, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 0.5762843622395748, |
| "learning_rate": 1.8280088311480203e-05, |
| "loss": 0.22419373691082, |
| "num_input_tokens_seen": 0, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.1625, |
| "grad_norm": 0.7331777727867734, |
| "learning_rate": 1.823344960184526e-05, |
| "loss": 0.22210514545440674, |
| "num_input_tokens_seen": 0, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.175, |
| "grad_norm": 0.6205497447918622, |
| "learning_rate": 1.8186248146866928e-05, |
| "loss": 0.23843634128570557, |
| "num_input_tokens_seen": 0, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.1875, |
| "grad_norm": 0.645067422698941, |
| "learning_rate": 1.813848717270195e-05, |
| "loss": 0.22561757266521454, |
| "num_input_tokens_seen": 0, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 0.5305489735340998, |
| "learning_rate": 1.8090169943749477e-05, |
| "loss": 0.23090419173240662, |
| "num_input_tokens_seen": 0, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.2125, |
| "grad_norm": 0.652421080289407, |
| "learning_rate": 1.804129976242792e-05, |
| "loss": 0.21994651854038239, |
| "num_input_tokens_seen": 0, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.225, |
| "grad_norm": 0.546464902083412, |
| "learning_rate": 1.7991879968949248e-05, |
| "loss": 0.2290341556072235, |
| "num_input_tokens_seen": 0, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.2375, |
| "grad_norm": 0.6476920457708351, |
| "learning_rate": 1.7941913941090712e-05, |
| "loss": 0.24561816453933716, |
| "num_input_tokens_seen": 0, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.5782245623531915, |
| "learning_rate": 1.789140509396394e-05, |
| "loss": 0.23482097685337067, |
| "num_input_tokens_seen": 0, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.2625, |
| "grad_norm": 0.6348611730944753, |
| "learning_rate": 1.784035687978153e-05, |
| "loss": 0.21998700499534607, |
| "num_input_tokens_seen": 0, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.275, |
| "grad_norm": 0.5855684501933186, |
| "learning_rate": 1.7788772787621126e-05, |
| "loss": 0.22731050848960876, |
| "num_input_tokens_seen": 0, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.2875, |
| "grad_norm": 0.5849783819104052, |
| "learning_rate": 1.7736656343186897e-05, |
| "loss": 0.21847045421600342, |
| "num_input_tokens_seen": 0, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 0.6468451663174432, |
| "learning_rate": 1.7684011108568593e-05, |
| "loss": 0.23889131844043732, |
| "num_input_tokens_seen": 0, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.3125, |
| "grad_norm": 0.5379228455435396, |
| "learning_rate": 1.7630840681998068e-05, |
| "loss": 0.22917379438877106, |
| "num_input_tokens_seen": 0, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.325, |
| "grad_norm": 0.6386415797352409, |
| "learning_rate": 1.757714869760335e-05, |
| "loss": 0.22476345300674438, |
| "num_input_tokens_seen": 0, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.3375, |
| "grad_norm": 0.5432607518100792, |
| "learning_rate": 1.752293882516025e-05, |
| "loss": 0.23026630282402039, |
| "num_input_tokens_seen": 0, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.35, |
| "grad_norm": 0.6251676560462321, |
| "learning_rate": 1.7468214769841542e-05, |
| "loss": 0.22793394327163696, |
| "num_input_tokens_seen": 0, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.3625, |
| "grad_norm": 0.5786323989132912, |
| "learning_rate": 1.7412980271963712e-05, |
| "loss": 0.24610519409179688, |
| "num_input_tokens_seen": 0, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.375, |
| "grad_norm": 0.5735661628137917, |
| "learning_rate": 1.735723910673132e-05, |
| "loss": 0.21196551620960236, |
| "num_input_tokens_seen": 0, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.3875, |
| "grad_norm": 0.5881031218701362, |
| "learning_rate": 1.7300995083978965e-05, |
| "loss": 0.22360339760780334, |
| "num_input_tokens_seen": 0, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 0.5603841737931269, |
| "learning_rate": 1.7244252047910893e-05, |
| "loss": 0.22254161536693573, |
| "num_input_tokens_seen": 0, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.4125, |
| "grad_norm": 0.566304468507664, |
| "learning_rate": 1.718701387683824e-05, |
| "loss": 0.21939656138420105, |
| "num_input_tokens_seen": 0, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.425, |
| "grad_norm": 0.532198517126917, |
| "learning_rate": 1.7129284482913973e-05, |
| "loss": 0.2217104434967041, |
| "num_input_tokens_seen": 0, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.4375, |
| "grad_norm": 0.5486267196897048, |
| "learning_rate": 1.7071067811865477e-05, |
| "loss": 0.22763556241989136, |
| "num_input_tokens_seen": 0, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.45, |
| "grad_norm": 0.5318598905653756, |
| "learning_rate": 1.7012367842724887e-05, |
| "loss": 0.22226908802986145, |
| "num_input_tokens_seen": 0, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.4625, |
| "grad_norm": 0.5219241851069808, |
| "learning_rate": 1.6953188587557122e-05, |
| "loss": 0.2251712530851364, |
| "num_input_tokens_seen": 0, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.475, |
| "grad_norm": 0.5576204648175208, |
| "learning_rate": 1.6893534091185658e-05, |
| "loss": 0.2300577610731125, |
| "num_input_tokens_seen": 0, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.4875, |
| "grad_norm": 0.5157156531813916, |
| "learning_rate": 1.6833408430916085e-05, |
| "loss": 0.23422931134700775, |
| "num_input_tokens_seen": 0, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 0.5525368224210166, |
| "learning_rate": 1.6772815716257414e-05, |
| "loss": 0.23097172379493713, |
| "num_input_tokens_seen": 0, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.5125, |
| "grad_norm": 0.5671209735467264, |
| "learning_rate": 1.6711760088641197e-05, |
| "loss": 0.22700801491737366, |
| "num_input_tokens_seen": 0, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.525, |
| "grad_norm": 0.49340662518168044, |
| "learning_rate": 1.6650245721138483e-05, |
| "loss": 0.21679332852363586, |
| "num_input_tokens_seen": 0, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.5375, |
| "grad_norm": 0.5241279113240838, |
| "learning_rate": 1.658827681817458e-05, |
| "loss": 0.23128211498260498, |
| "num_input_tokens_seen": 0, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.55, |
| "grad_norm": 0.5605937616757047, |
| "learning_rate": 1.6525857615241686e-05, |
| "loss": 0.2341003566980362, |
| "num_input_tokens_seen": 0, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.5625, |
| "grad_norm": 0.5731377051701606, |
| "learning_rate": 1.646299237860941e-05, |
| "loss": 0.24362272024154663, |
| "num_input_tokens_seen": 0, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.575, |
| "grad_norm": 0.4973323542505726, |
| "learning_rate": 1.6399685405033168e-05, |
| "loss": 0.21342068910598755, |
| "num_input_tokens_seen": 0, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.5875, |
| "grad_norm": 0.5599176772369371, |
| "learning_rate": 1.6335941021460507e-05, |
| "loss": 0.2271297574043274, |
| "num_input_tokens_seen": 0, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 0.5203662408979232, |
| "learning_rate": 1.6271763584735373e-05, |
| "loss": 0.22479626536369324, |
| "num_input_tokens_seen": 0, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.6125, |
| "grad_norm": 0.5095863344896252, |
| "learning_rate": 1.6207157481300315e-05, |
| "loss": 0.22810155153274536, |
| "num_input_tokens_seen": 0, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.625, |
| "grad_norm": 0.5317196453792732, |
| "learning_rate": 1.6142127126896682e-05, |
| "loss": 0.21053889393806458, |
| "num_input_tokens_seen": 0, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.6375, |
| "grad_norm": 0.5425148287230596, |
| "learning_rate": 1.6076676966262815e-05, |
| "loss": 0.23062941431999207, |
| "num_input_tokens_seen": 0, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.65, |
| "grad_norm": 0.6023208751142557, |
| "learning_rate": 1.6010811472830253e-05, |
| "loss": 0.2463783472776413, |
| "num_input_tokens_seen": 0, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.6625, |
| "grad_norm": 0.5550292129482939, |
| "learning_rate": 1.5944535148417982e-05, |
| "loss": 0.23075446486473083, |
| "num_input_tokens_seen": 0, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.675, |
| "grad_norm": 0.5643030708792179, |
| "learning_rate": 1.5877852522924733e-05, |
| "loss": 0.22998251020908356, |
| "num_input_tokens_seen": 0, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.6875, |
| "grad_norm": 0.6739206936197925, |
| "learning_rate": 1.5810768154019386e-05, |
| "loss": 0.23184943199157715, |
| "num_input_tokens_seen": 0, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.7, |
| "grad_norm": 0.5322077543412791, |
| "learning_rate": 1.5743286626829437e-05, |
| "loss": 0.24749401211738586, |
| "num_input_tokens_seen": 0, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.7125, |
| "grad_norm": 0.6117204460062114, |
| "learning_rate": 1.5675412553627638e-05, |
| "loss": 0.22619368135929108, |
| "num_input_tokens_seen": 0, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.725, |
| "grad_norm": 0.5683821158495919, |
| "learning_rate": 1.560715057351673e-05, |
| "loss": 0.2415420114994049, |
| "num_input_tokens_seen": 0, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.7375, |
| "grad_norm": 0.5531774081547891, |
| "learning_rate": 1.5538505352112373e-05, |
| "loss": 0.23251904547214508, |
| "num_input_tokens_seen": 0, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 0.5656996064712688, |
| "learning_rate": 1.5469481581224274e-05, |
| "loss": 0.2148549109697342, |
| "num_input_tokens_seen": 0, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.7625, |
| "grad_norm": 0.5342290575425301, |
| "learning_rate": 1.5400083978535475e-05, |
| "loss": 0.221043199300766, |
| "num_input_tokens_seen": 0, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.775, |
| "grad_norm": 0.5312824045538664, |
| "learning_rate": 1.533031728727994e-05, |
| "loss": 0.22295251488685608, |
| "num_input_tokens_seen": 0, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.7875, |
| "grad_norm": 0.5532281742783289, |
| "learning_rate": 1.526018627591834e-05, |
| "loss": 0.23230503499507904, |
| "num_input_tokens_seen": 0, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 0.5486342376292719, |
| "learning_rate": 1.5189695737812153e-05, |
| "loss": 0.22794640064239502, |
| "num_input_tokens_seen": 0, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.8125, |
| "grad_norm": 0.5282686765938053, |
| "learning_rate": 1.5118850490896012e-05, |
| "loss": 0.23277482390403748, |
| "num_input_tokens_seen": 0, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.825, |
| "grad_norm": 0.5221771559817829, |
| "learning_rate": 1.504765537734844e-05, |
| "loss": 0.23067018389701843, |
| "num_input_tokens_seen": 0, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.8375, |
| "grad_norm": 0.5641718599884086, |
| "learning_rate": 1.4976115263260876e-05, |
| "loss": 0.21835649013519287, |
| "num_input_tokens_seen": 0, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.85, |
| "grad_norm": 0.5505224831003699, |
| "learning_rate": 1.4904235038305084e-05, |
| "loss": 0.2256629765033722, |
| "num_input_tokens_seen": 0, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.8625, |
| "grad_norm": 0.5686783195373294, |
| "learning_rate": 1.4832019615398962e-05, |
| "loss": 0.215658038854599, |
| "num_input_tokens_seen": 0, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.875, |
| "grad_norm": 0.5570580083339357, |
| "learning_rate": 1.4759473930370738e-05, |
| "loss": 0.23637929558753967, |
| "num_input_tokens_seen": 0, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.8875, |
| "grad_norm": 0.5613938118868574, |
| "learning_rate": 1.4686602941621618e-05, |
| "loss": 0.2198466658592224, |
| "num_input_tokens_seen": 0, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.9, |
| "grad_norm": 0.5554584703399501, |
| "learning_rate": 1.461341162978688e-05, |
| "loss": 0.2246859222650528, |
| "num_input_tokens_seen": 0, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.9125, |
| "grad_norm": 0.5909451217927424, |
| "learning_rate": 1.4539904997395468e-05, |
| "loss": 0.24911729991436005, |
| "num_input_tokens_seen": 0, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.925, |
| "grad_norm": 0.5842777476606686, |
| "learning_rate": 1.4466088068528068e-05, |
| "loss": 0.23788337409496307, |
| "num_input_tokens_seen": 0, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.9375, |
| "grad_norm": 0.4981326579110442, |
| "learning_rate": 1.4391965888473705e-05, |
| "loss": 0.2209474742412567, |
| "num_input_tokens_seen": 0, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.95, |
| "grad_norm": 0.5397167568674605, |
| "learning_rate": 1.4317543523384928e-05, |
| "loss": 0.23926502466201782, |
| "num_input_tokens_seen": 0, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.9625, |
| "grad_norm": 0.5359703981598113, |
| "learning_rate": 1.4242826059931538e-05, |
| "loss": 0.2312660813331604, |
| "num_input_tokens_seen": 0, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.975, |
| "grad_norm": 0.5386500827778109, |
| "learning_rate": 1.4167818604952906e-05, |
| "loss": 0.2199496328830719, |
| "num_input_tokens_seen": 0, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.9875, |
| "grad_norm": 0.5538059936709024, |
| "learning_rate": 1.409252628510894e-05, |
| "loss": 0.23412078619003296, |
| "num_input_tokens_seen": 0, |
| "step": 159 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.540376667893175, |
| "learning_rate": 1.4016954246529697e-05, |
| "loss": 0.2249927967786789, |
| "num_input_tokens_seen": 0, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.0125, |
| "grad_norm": 1.0198244871776765, |
| "learning_rate": 1.3941107654463619e-05, |
| "loss": 0.12236860394477844, |
| "num_input_tokens_seen": 0, |
| "step": 161 |
| }, |
| { |
| "epoch": 2.025, |
| "grad_norm": 0.8629260127173042, |
| "learning_rate": 1.3864991692924524e-05, |
| "loss": 0.11816570162773132, |
| "num_input_tokens_seen": 0, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.0375, |
| "grad_norm": 0.6404376890511378, |
| "learning_rate": 1.3788611564337277e-05, |
| "loss": 0.10550174862146378, |
| "num_input_tokens_seen": 0, |
| "step": 163 |
| }, |
| { |
| "epoch": 2.05, |
| "grad_norm": 1.140833870667296, |
| "learning_rate": 1.3711972489182208e-05, |
| "loss": 0.10521072149276733, |
| "num_input_tokens_seen": 0, |
| "step": 164 |
| }, |
| { |
| "epoch": 2.0625, |
| "grad_norm": 1.004887415099318, |
| "learning_rate": 1.3635079705638298e-05, |
| "loss": 0.10304014384746552, |
| "num_input_tokens_seen": 0, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.075, |
| "grad_norm": 0.7658325943968483, |
| "learning_rate": 1.3557938469225167e-05, |
| "loss": 0.10292299091815948, |
| "num_input_tokens_seen": 0, |
| "step": 166 |
| }, |
| { |
| "epoch": 2.0875, |
| "grad_norm": 0.7338849003327649, |
| "learning_rate": 1.3480554052443847e-05, |
| "loss": 0.10219870507717133, |
| "num_input_tokens_seen": 0, |
| "step": 167 |
| }, |
| { |
| "epoch": 2.1, |
| "grad_norm": 0.7745676732651274, |
| "learning_rate": 1.3402931744416432e-05, |
| "loss": 0.10130374133586884, |
| "num_input_tokens_seen": 0, |
| "step": 168 |
| }, |
| { |
| "epoch": 2.1125, |
| "grad_norm": 0.6584333483269132, |
| "learning_rate": 1.332507685052457e-05, |
| "loss": 0.10410261154174805, |
| "num_input_tokens_seen": 0, |
| "step": 169 |
| }, |
| { |
| "epoch": 2.125, |
| "grad_norm": 0.6821199419154345, |
| "learning_rate": 1.3246994692046837e-05, |
| "loss": 0.09977763891220093, |
| "num_input_tokens_seen": 0, |
| "step": 170 |
| }, |
| { |
| "epoch": 2.1375, |
| "grad_norm": 0.6358679555892452, |
| "learning_rate": 1.3168690605795044e-05, |
| "loss": 0.09835107624530792, |
| "num_input_tokens_seen": 0, |
| "step": 171 |
| }, |
| { |
| "epoch": 2.15, |
| "grad_norm": 0.6632837484026789, |
| "learning_rate": 1.3090169943749475e-05, |
| "loss": 0.09917747974395752, |
| "num_input_tokens_seen": 0, |
| "step": 172 |
| }, |
| { |
| "epoch": 2.1625, |
| "grad_norm": 0.6342553931938368, |
| "learning_rate": 1.3011438072693077e-05, |
| "loss": 0.09428848326206207, |
| "num_input_tokens_seen": 0, |
| "step": 173 |
| }, |
| { |
| "epoch": 2.175, |
| "grad_norm": 0.6387577881812592, |
| "learning_rate": 1.293250037384465e-05, |
| "loss": 0.0958920419216156, |
| "num_input_tokens_seen": 0, |
| "step": 174 |
| }, |
| { |
| "epoch": 2.1875, |
| "grad_norm": 0.6954184373882712, |
| "learning_rate": 1.2853362242491054e-05, |
| "loss": 0.09917563199996948, |
| "num_input_tokens_seen": 0, |
| "step": 175 |
| }, |
| { |
| "epoch": 2.2, |
| "grad_norm": 0.5974385656105545, |
| "learning_rate": 1.2774029087618448e-05, |
| "loss": 0.09344319999217987, |
| "num_input_tokens_seen": 0, |
| "step": 176 |
| }, |
| { |
| "epoch": 2.2125, |
| "grad_norm": 0.6472498757039972, |
| "learning_rate": 1.269450633154258e-05, |
| "loss": 0.10768131911754608, |
| "num_input_tokens_seen": 0, |
| "step": 177 |
| }, |
| { |
| "epoch": 2.225, |
| "grad_norm": 0.6685276002639324, |
| "learning_rate": 1.26147994095382e-05, |
| "loss": 0.10777394473552704, |
| "num_input_tokens_seen": 0, |
| "step": 178 |
| }, |
| { |
| "epoch": 2.2375, |
| "grad_norm": 0.6340976446307695, |
| "learning_rate": 1.253491376946754e-05, |
| "loss": 0.10363772511482239, |
| "num_input_tokens_seen": 0, |
| "step": 179 |
| }, |
| { |
| "epoch": 2.25, |
| "grad_norm": 0.5829966761174877, |
| "learning_rate": 1.2454854871407993e-05, |
| "loss": 0.09860391169786453, |
| "num_input_tokens_seen": 0, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.2625, |
| "grad_norm": 0.6211718617191284, |
| "learning_rate": 1.2374628187278888e-05, |
| "loss": 0.09785199165344238, |
| "num_input_tokens_seen": 0, |
| "step": 181 |
| }, |
| { |
| "epoch": 2.275, |
| "grad_norm": 0.5692692956767395, |
| "learning_rate": 1.2294239200467516e-05, |
| "loss": 0.09915725886821747, |
| "num_input_tokens_seen": 0, |
| "step": 182 |
| }, |
| { |
| "epoch": 2.2875, |
| "grad_norm": 0.6193608649665566, |
| "learning_rate": 1.2213693405454345e-05, |
| "loss": 0.09728467464447021, |
| "num_input_tokens_seen": 0, |
| "step": 183 |
| }, |
| { |
| "epoch": 2.3, |
| "grad_norm": 0.6644514294612655, |
| "learning_rate": 1.213299630743747e-05, |
| "loss": 0.10535012185573578, |
| "num_input_tokens_seen": 0, |
| "step": 184 |
| }, |
| { |
| "epoch": 2.3125, |
| "grad_norm": 0.5826634498573604, |
| "learning_rate": 1.2052153421956343e-05, |
| "loss": 0.09781628847122192, |
| "num_input_tokens_seen": 0, |
| "step": 185 |
| }, |
| { |
| "epoch": 2.325, |
| "grad_norm": 0.6951420866310761, |
| "learning_rate": 1.1971170274514802e-05, |
| "loss": 0.09797348082065582, |
| "num_input_tokens_seen": 0, |
| "step": 186 |
| }, |
| { |
| "epoch": 2.3375, |
| "grad_norm": 0.6449424096000549, |
| "learning_rate": 1.1890052400203405e-05, |
| "loss": 0.11403355002403259, |
| "num_input_tokens_seen": 0, |
| "step": 187 |
| }, |
| { |
| "epoch": 2.35, |
| "grad_norm": 0.6977055112415091, |
| "learning_rate": 1.1808805343321102e-05, |
| "loss": 0.09498105198144913, |
| "num_input_tokens_seen": 0, |
| "step": 188 |
| }, |
| { |
| "epoch": 2.3625, |
| "grad_norm": 0.6309628403695112, |
| "learning_rate": 1.1727434656996306e-05, |
| "loss": 0.0992412120103836, |
| "num_input_tokens_seen": 0, |
| "step": 189 |
| }, |
| { |
| "epoch": 2.375, |
| "grad_norm": 0.6731776940075981, |
| "learning_rate": 1.164594590280734e-05, |
| "loss": 0.11293548345565796, |
| "num_input_tokens_seen": 0, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.3875, |
| "grad_norm": 0.6741092344709629, |
| "learning_rate": 1.156434465040231e-05, |
| "loss": 0.094619981944561, |
| "num_input_tokens_seen": 0, |
| "step": 191 |
| }, |
| { |
| "epoch": 2.4, |
| "grad_norm": 0.5967386003242268, |
| "learning_rate": 1.148263647711842e-05, |
| "loss": 0.09347110986709595, |
| "num_input_tokens_seen": 0, |
| "step": 192 |
| }, |
| { |
| "epoch": 2.4125, |
| "grad_norm": 0.7159915413192838, |
| "learning_rate": 1.140082696760078e-05, |
| "loss": 0.10074266791343689, |
| "num_input_tokens_seen": 0, |
| "step": 193 |
| }, |
| { |
| "epoch": 2.425, |
| "grad_norm": 0.5968881913694516, |
| "learning_rate": 1.1318921713420691e-05, |
| "loss": 0.10465800762176514, |
| "num_input_tokens_seen": 0, |
| "step": 194 |
| }, |
| { |
| "epoch": 2.4375, |
| "grad_norm": 0.6589415029348668, |
| "learning_rate": 1.123692631269348e-05, |
| "loss": 0.10594508051872253, |
| "num_input_tokens_seen": 0, |
| "step": 195 |
| }, |
| { |
| "epoch": 2.45, |
| "grad_norm": 0.663988394877507, |
| "learning_rate": 1.1154846369695864e-05, |
| "loss": 0.09538954496383667, |
| "num_input_tokens_seen": 0, |
| "step": 196 |
| }, |
| { |
| "epoch": 2.4625, |
| "grad_norm": 0.6228831383495524, |
| "learning_rate": 1.107268749448292e-05, |
| "loss": 0.09765101969242096, |
| "num_input_tokens_seen": 0, |
| "step": 197 |
| }, |
| { |
| "epoch": 2.475, |
| "grad_norm": 0.61772733625651, |
| "learning_rate": 1.099045530250463e-05, |
| "loss": 0.09396866708993912, |
| "num_input_tokens_seen": 0, |
| "step": 198 |
| }, |
| { |
| "epoch": 2.4875, |
| "grad_norm": 0.5929697542542042, |
| "learning_rate": 1.0908155414222083e-05, |
| "loss": 0.09863300621509552, |
| "num_input_tokens_seen": 0, |
| "step": 199 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 0.6899961462901135, |
| "learning_rate": 1.0825793454723325e-05, |
| "loss": 0.09633656591176987, |
| "num_input_tokens_seen": 0, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.5125, |
| "grad_norm": 0.5997938148312979, |
| "learning_rate": 1.0743375053338879e-05, |
| "loss": 0.10011985152959824, |
| "num_input_tokens_seen": 0, |
| "step": 201 |
| }, |
| { |
| "epoch": 2.525, |
| "grad_norm": 0.6332752356735514, |
| "learning_rate": 1.0660905843256995e-05, |
| "loss": 0.09651218354701996, |
| "num_input_tokens_seen": 0, |
| "step": 202 |
| }, |
| { |
| "epoch": 2.5375, |
| "grad_norm": 0.6204192543359215, |
| "learning_rate": 1.0578391461138642e-05, |
| "loss": 0.0994209498167038, |
| "num_input_tokens_seen": 0, |
| "step": 203 |
| }, |
| { |
| "epoch": 2.55, |
| "grad_norm": 0.6132942561027718, |
| "learning_rate": 1.0495837546732224e-05, |
| "loss": 0.10166719555854797, |
| "num_input_tokens_seen": 0, |
| "step": 204 |
| }, |
| { |
| "epoch": 2.5625, |
| "grad_norm": 0.5964143279459994, |
| "learning_rate": 1.0413249742488132e-05, |
| "loss": 0.10079332441091537, |
| "num_input_tokens_seen": 0, |
| "step": 205 |
| }, |
| { |
| "epoch": 2.575, |
| "grad_norm": 0.6090535510158961, |
| "learning_rate": 1.0330633693173083e-05, |
| "loss": 0.10682345926761627, |
| "num_input_tokens_seen": 0, |
| "step": 206 |
| }, |
| { |
| "epoch": 2.5875, |
| "grad_norm": 0.5757143060836244, |
| "learning_rate": 1.0247995045484303e-05, |
| "loss": 0.10448747873306274, |
| "num_input_tokens_seen": 0, |
| "step": 207 |
| }, |
| { |
| "epoch": 2.6, |
| "grad_norm": 0.5318320587544387, |
| "learning_rate": 1.0165339447663586e-05, |
| "loss": 0.09677817672491074, |
| "num_input_tokens_seen": 0, |
| "step": 208 |
| }, |
| { |
| "epoch": 2.6125, |
| "grad_norm": 0.5653047610868341, |
| "learning_rate": 1.008267254911125e-05, |
| "loss": 0.10041716694831848, |
| "num_input_tokens_seen": 0, |
| "step": 209 |
| }, |
| { |
| "epoch": 2.625, |
| "grad_norm": 0.6709564219802151, |
| "learning_rate": 1e-05, |
| "loss": 0.11834404617547989, |
| "num_input_tokens_seen": 0, |
| "step": 210 |
| }, |
| { |
| "epoch": 2.6375, |
| "grad_norm": 0.59364936824183, |
| "learning_rate": 9.917327450888751e-06, |
| "loss": 0.10398055613040924, |
| "num_input_tokens_seen": 0, |
| "step": 211 |
| }, |
| { |
| "epoch": 2.65, |
| "grad_norm": 0.6135322123850998, |
| "learning_rate": 9.834660552336415e-06, |
| "loss": 0.09955007582902908, |
| "num_input_tokens_seen": 0, |
| "step": 212 |
| }, |
| { |
| "epoch": 2.6625, |
| "grad_norm": 0.5771675846362164, |
| "learning_rate": 9.7520049545157e-06, |
| "loss": 0.09840433299541473, |
| "num_input_tokens_seen": 0, |
| "step": 213 |
| }, |
| { |
| "epoch": 2.675, |
| "grad_norm": 0.6231353540589576, |
| "learning_rate": 9.669366306826919e-06, |
| "loss": 0.10380570590496063, |
| "num_input_tokens_seen": 0, |
| "step": 214 |
| }, |
| { |
| "epoch": 2.6875, |
| "grad_norm": 0.5948673639371316, |
| "learning_rate": 9.586750257511868e-06, |
| "loss": 0.0970538780093193, |
| "num_input_tokens_seen": 0, |
| "step": 215 |
| }, |
| { |
| "epoch": 2.7, |
| "grad_norm": 0.6060511739998478, |
| "learning_rate": 9.504162453267776e-06, |
| "loss": 0.10509540140628815, |
| "num_input_tokens_seen": 0, |
| "step": 216 |
| }, |
| { |
| "epoch": 2.7125, |
| "grad_norm": 0.5510731357722317, |
| "learning_rate": 9.421608538861361e-06, |
| "loss": 0.09432905912399292, |
| "num_input_tokens_seen": 0, |
| "step": 217 |
| }, |
| { |
| "epoch": 2.725, |
| "grad_norm": 0.6639639280503086, |
| "learning_rate": 9.339094156743007e-06, |
| "loss": 0.09837593883275986, |
| "num_input_tokens_seen": 0, |
| "step": 218 |
| }, |
| { |
| "epoch": 2.7375, |
| "grad_norm": 0.6270361275419634, |
| "learning_rate": 9.256624946661126e-06, |
| "loss": 0.10134841501712799, |
| "num_input_tokens_seen": 0, |
| "step": 219 |
| }, |
| { |
| "epoch": 2.75, |
| "grad_norm": 0.6771440915713338, |
| "learning_rate": 9.174206545276678e-06, |
| "loss": 0.11078701913356781, |
| "num_input_tokens_seen": 0, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.7625, |
| "grad_norm": 0.705959945496674, |
| "learning_rate": 9.091844585777919e-06, |
| "loss": 0.1001749038696289, |
| "num_input_tokens_seen": 0, |
| "step": 221 |
| }, |
| { |
| "epoch": 2.775, |
| "grad_norm": 0.60937291192483, |
| "learning_rate": 9.009544697495373e-06, |
| "loss": 0.10557325184345245, |
| "num_input_tokens_seen": 0, |
| "step": 222 |
| }, |
| { |
| "epoch": 2.7875, |
| "grad_norm": 0.5893171675572725, |
| "learning_rate": 8.927312505517086e-06, |
| "loss": 0.10320694744586945, |
| "num_input_tokens_seen": 0, |
| "step": 223 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 0.6133977099413181, |
| "learning_rate": 8.84515363030414e-06, |
| "loss": 0.09662497788667679, |
| "num_input_tokens_seen": 0, |
| "step": 224 |
| }, |
| { |
| "epoch": 2.8125, |
| "grad_norm": 0.6010610432879008, |
| "learning_rate": 8.763073687306523e-06, |
| "loss": 0.10481660068035126, |
| "num_input_tokens_seen": 0, |
| "step": 225 |
| }, |
| { |
| "epoch": 2.825, |
| "grad_norm": 0.6248420832412895, |
| "learning_rate": 8.68107828657931e-06, |
| "loss": 0.10239937901496887, |
| "num_input_tokens_seen": 0, |
| "step": 226 |
| }, |
| { |
| "epoch": 2.8375, |
| "grad_norm": 0.5868088627329976, |
| "learning_rate": 8.599173032399222e-06, |
| "loss": 0.10687953978776932, |
| "num_input_tokens_seen": 0, |
| "step": 227 |
| }, |
| { |
| "epoch": 2.85, |
| "grad_norm": 0.6268898716250957, |
| "learning_rate": 8.51736352288158e-06, |
| "loss": 0.10268527269363403, |
| "num_input_tokens_seen": 0, |
| "step": 228 |
| }, |
| { |
| "epoch": 2.8625, |
| "grad_norm": 0.6191681347772287, |
| "learning_rate": 8.43565534959769e-06, |
| "loss": 0.10145796835422516, |
| "num_input_tokens_seen": 0, |
| "step": 229 |
| }, |
| { |
| "epoch": 2.875, |
| "grad_norm": 0.6058846171541453, |
| "learning_rate": 8.35405409719266e-06, |
| "loss": 0.104601189494133, |
| "num_input_tokens_seen": 0, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.8875, |
| "grad_norm": 0.5661519092091898, |
| "learning_rate": 8.2725653430037e-06, |
| "loss": 0.09631498157978058, |
| "num_input_tokens_seen": 0, |
| "step": 231 |
| }, |
| { |
| "epoch": 2.9, |
| "grad_norm": 0.5902711791447997, |
| "learning_rate": 8.191194656678905e-06, |
| "loss": 0.0936303585767746, |
| "num_input_tokens_seen": 0, |
| "step": 232 |
| }, |
| { |
| "epoch": 2.9125, |
| "grad_norm": 0.5638822103591541, |
| "learning_rate": 8.109947599796599e-06, |
| "loss": 0.1050085574388504, |
| "num_input_tokens_seen": 0, |
| "step": 233 |
| }, |
| { |
| "epoch": 2.925, |
| "grad_norm": 0.5703156532416298, |
| "learning_rate": 8.0288297254852e-06, |
| "loss": 0.09568509459495544, |
| "num_input_tokens_seen": 0, |
| "step": 234 |
| }, |
| { |
| "epoch": 2.9375, |
| "grad_norm": 0.6177421968590858, |
| "learning_rate": 7.947846578043658e-06, |
| "loss": 0.10282063484191895, |
| "num_input_tokens_seen": 0, |
| "step": 235 |
| }, |
| { |
| "epoch": 2.95, |
| "grad_norm": 0.5724279751966526, |
| "learning_rate": 7.867003692562533e-06, |
| "loss": 0.10039152950048447, |
| "num_input_tokens_seen": 0, |
| "step": 236 |
| }, |
| { |
| "epoch": 2.9625, |
| "grad_norm": 0.665494938328959, |
| "learning_rate": 7.786306594545658e-06, |
| "loss": 0.09484650194644928, |
| "num_input_tokens_seen": 0, |
| "step": 237 |
| }, |
| { |
| "epoch": 2.975, |
| "grad_norm": 0.5909403346328018, |
| "learning_rate": 7.705760799532485e-06, |
| "loss": 0.0970713198184967, |
| "num_input_tokens_seen": 0, |
| "step": 238 |
| }, |
| { |
| "epoch": 2.9875, |
| "grad_norm": 0.5930301687493515, |
| "learning_rate": 7.625371812721115e-06, |
| "loss": 0.1040472462773323, |
| "num_input_tokens_seen": 0, |
| "step": 239 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.5975944410229318, |
| "learning_rate": 7.545145128592009e-06, |
| "loss": 0.10229268670082092, |
| "num_input_tokens_seen": 0, |
| "step": 240 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 400, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 200924346515456.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|