{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 284.2696901095738,
      "learning_rate": 2e-07,
      "loss": 7.1138,
      "step": 1
    },
    {
      "epoch": 0.02,
      "grad_norm": 280.0890600259792,
      "learning_rate": 4e-07,
      "loss": 6.8466,
      "step": 2
    },
    {
      "epoch": 0.03,
      "grad_norm": 296.98995011993617,
      "learning_rate": 6e-07,
      "loss": 6.8936,
      "step": 3
    },
    {
      "epoch": 0.04,
      "grad_norm": 284.92519297592935,
      "learning_rate": 8e-07,
      "loss": 7.1196,
      "step": 4
    },
    {
      "epoch": 0.05,
      "grad_norm": 296.67655171044885,
      "learning_rate": 1e-06,
      "loss": 6.7063,
      "step": 5
    },
    {
      "epoch": 0.06,
      "grad_norm": 281.91608332740634,
      "learning_rate": 9.99726628670463e-07,
      "loss": 6.028,
      "step": 6
    },
    {
      "epoch": 0.07,
      "grad_norm": 245.30165342232291,
      "learning_rate": 9.989068136093872e-07,
      "loss": 5.593,
      "step": 7
    },
    {
      "epoch": 0.08,
      "grad_norm": 171.71633417584104,
      "learning_rate": 9.975414512725056e-07,
      "loss": 4.1199,
      "step": 8
    },
    {
      "epoch": 0.09,
      "grad_norm": 153.69431185375615,
      "learning_rate": 9.956320346634875e-07,
      "loss": 3.8539,
      "step": 9
    },
    {
      "epoch": 0.1,
      "grad_norm": 138.4099703250891,
      "learning_rate": 9.931806517013612e-07,
      "loss": 3.364,
      "step": 10
    },
    {
      "epoch": 0.11,
      "grad_norm": 121.96592995402922,
      "learning_rate": 9.901899829374047e-07,
      "loss": 3.2359,
      "step": 11
    },
    {
      "epoch": 0.12,
      "grad_norm": 83.33555123373614,
      "learning_rate": 9.866632986240029e-07,
      "loss": 2.0694,
      "step": 12
    },
    {
      "epoch": 0.13,
      "grad_norm": 69.8343923660202,
      "learning_rate": 9.826044551386742e-07,
      "loss": 1.8041,
      "step": 13
    },
    {
      "epoch": 0.14,
      "grad_norm": 61.02444391130647,
      "learning_rate": 9.780178907671788e-07,
      "loss": 1.8221,
      "step": 14
    },
    {
      "epoch": 0.15,
      "grad_norm": 51.06086209218741,
      "learning_rate": 9.729086208503173e-07,
      "loss": 1.5577,
      "step": 15
    },
    {
      "epoch": 0.16,
      "grad_norm": 39.386178814833706,
      "learning_rate": 9.672822322997304e-07,
      "loss": 1.3898,
      "step": 16
    },
    {
      "epoch": 0.17,
      "grad_norm": 33.769199410950286,
      "learning_rate": 9.611448774886923e-07,
      "loss": 1.2839,
      "step": 17
    },
    {
      "epoch": 0.18,
      "grad_norm": 32.43709972193044,
      "learning_rate": 9.545032675245813e-07,
      "loss": 1.1126,
      "step": 18
    },
    {
      "epoch": 0.19,
      "grad_norm": 29.12905700014587,
      "learning_rate": 9.473646649103817e-07,
      "loss": 0.9637,
      "step": 19
    },
    {
      "epoch": 0.2,
      "grad_norm": 19.116333899951297,
      "learning_rate": 9.397368756032444e-07,
      "loss": 0.6976,
      "step": 20
    },
    {
      "epoch": 0.21,
      "grad_norm": 17.362547626796633,
      "learning_rate": 9.316282404787869e-07,
      "loss": 0.6686,
      "step": 21
    },
    {
      "epoch": 0.22,
      "grad_norm": 12.641692912258486,
      "learning_rate": 9.230476262104676e-07,
      "loss": 0.6055,
      "step": 22
    },
    {
      "epoch": 0.23,
      "grad_norm": 9.423373458222406,
      "learning_rate": 9.1400441557401e-07,
      "loss": 0.5312,
      "step": 23
    },
    {
      "epoch": 0.24,
      "grad_norm": 7.714449953085253,
      "learning_rate": 9.045084971874737e-07,
      "loss": 0.5334,
      "step": 24
    },
    {
      "epoch": 0.25,
      "grad_norm": 6.3626526485098545,
      "learning_rate": 8.945702546981968e-07,
      "loss": 0.5246,
      "step": 25
    },
    {
      "epoch": 0.26,
      "grad_norm": 5.787541205113461,
      "learning_rate": 8.842005554284295e-07,
      "loss": 0.4586,
      "step": 26
    },
    {
      "epoch": 0.27,
      "grad_norm": 5.217072482502457,
      "learning_rate": 8.734107384920769e-07,
      "loss": 0.514,
      "step": 27
    },
    {
      "epoch": 0.28,
      "grad_norm": 5.69075014149454,
      "learning_rate": 8.622126023955445e-07,
      "loss": 0.469,
      "step": 28
    },
    {
      "epoch": 0.29,
      "grad_norm": 5.767332317747505,
      "learning_rate": 8.506183921362442e-07,
      "loss": 0.4928,
      "step": 29
    },
    {
      "epoch": 0.3,
      "grad_norm": 5.29694390512729,
      "learning_rate": 8.386407858128706e-07,
      "loss": 0.416,
      "step": 30
    },
    {
      "epoch": 0.31,
      "grad_norm": 5.4751365453564524,
      "learning_rate": 8.262928807620843e-07,
      "loss": 0.4065,
      "step": 31
    },
    {
      "epoch": 0.32,
      "grad_norm": 4.210968327443881,
      "learning_rate": 8.135881792367685e-07,
      "loss": 0.4302,
      "step": 32
    },
    {
      "epoch": 0.33,
      "grad_norm": 4.889628663080109,
      "learning_rate": 8.005405736415125e-07,
      "loss": 0.475,
      "step": 33
    },
    {
      "epoch": 0.34,
      "grad_norm": 5.01223684022235,
      "learning_rate": 7.871643313414718e-07,
      "loss": 0.4418,
      "step": 34
    },
    {
      "epoch": 0.35,
      "grad_norm": 4.362633576491621,
      "learning_rate": 7.734740790612136e-07,
      "loss": 0.4074,
      "step": 35
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.9179637860680874,
      "learning_rate": 7.594847868906076e-07,
      "loss": 0.4006,
      "step": 36
    },
    {
      "epoch": 0.37,
      "grad_norm": 3.8386018601431067,
      "learning_rate": 7.452117519152541e-07,
      "loss": 0.4099,
      "step": 37
    },
    {
      "epoch": 0.38,
      "grad_norm": 4.465384708590031,
      "learning_rate": 7.306705814893439e-07,
      "loss": 0.4258,
      "step": 38
    },
    {
      "epoch": 0.39,
      "grad_norm": 4.869675336079214,
      "learning_rate": 7.158771761692464e-07,
      "loss": 0.4066,
      "step": 39
    },
    {
      "epoch": 0.4,
      "grad_norm": 4.6892092384872255,
      "learning_rate": 7.008477123264847e-07,
      "loss": 0.4272,
      "step": 40
    },
    {
      "epoch": 0.41,
      "grad_norm": 4.282400513219925,
      "learning_rate": 6.855986244591103e-07,
      "loss": 0.4166,
      "step": 41
    },
    {
      "epoch": 0.42,
      "grad_norm": 4.44179530817891,
      "learning_rate": 6.701465872208216e-07,
      "loss": 0.3928,
      "step": 42
    },
    {
      "epoch": 0.43,
      "grad_norm": 4.753857420255081,
      "learning_rate": 6.545084971874736e-07,
      "loss": 0.4291,
      "step": 43
    },
    {
      "epoch": 0.44,
      "grad_norm": 5.121989871402109,
      "learning_rate": 6.387014543809223e-07,
      "loss": 0.3841,
      "step": 44
    },
    {
      "epoch": 0.45,
      "grad_norm": 4.306307235348373,
      "learning_rate": 6.227427435703995e-07,
      "loss": 0.3897,
      "step": 45
    },
    {
      "epoch": 0.46,
      "grad_norm": 4.1819998879575095,
      "learning_rate": 6.066498153718734e-07,
      "loss": 0.3938,
      "step": 46
    },
    {
      "epoch": 0.47,
      "grad_norm": 3.9858764933030493,
      "learning_rate": 5.90440267166055e-07,
      "loss": 0.3722,
      "step": 47
    },
    {
      "epoch": 0.48,
      "grad_norm": 3.768016055450793,
      "learning_rate": 5.741318238559209e-07,
      "loss": 0.4295,
      "step": 48
    },
    {
      "epoch": 0.49,
      "grad_norm": 4.155664350209782,
      "learning_rate": 5.577423184847931e-07,
      "loss": 0.3834,
      "step": 49
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.79515186565417,
      "learning_rate": 5.412896727361662e-07,
      "loss": 0.3391,
      "step": 50
    },
    {
      "epoch": 0.51,
      "grad_norm": 3.4230723188666476,
      "learning_rate": 5.247918773366111e-07,
      "loss": 0.3942,
      "step": 51
    },
    {
      "epoch": 0.52,
      "grad_norm": 3.0931484903099773,
      "learning_rate": 5.082669723831793e-07,
      "loss": 0.3848,
      "step": 52
    },
    {
      "epoch": 0.53,
      "grad_norm": 4.693387026086119,
      "learning_rate": 4.917330276168208e-07,
      "loss": 0.385,
      "step": 53
    },
    {
      "epoch": 0.54,
      "grad_norm": 3.531161429609241,
      "learning_rate": 4.752081226633888e-07,
      "loss": 0.3694,
      "step": 54
    },
    {
      "epoch": 0.55,
      "grad_norm": 3.696655191494645,
      "learning_rate": 4.5871032726383385e-07,
      "loss": 0.3615,
      "step": 55
    },
    {
      "epoch": 0.56,
      "grad_norm": 3.798666905310865,
      "learning_rate": 4.4225768151520694e-07,
      "loss": 0.3257,
      "step": 56
    },
    {
      "epoch": 0.57,
      "grad_norm": 3.595957261529934,
      "learning_rate": 4.258681761440789e-07,
      "loss": 0.351,
      "step": 57
    },
    {
      "epoch": 0.58,
      "grad_norm": 3.443958917780827,
      "learning_rate": 4.095597328339452e-07,
      "loss": 0.3964,
      "step": 58
    },
    {
      "epoch": 0.59,
      "grad_norm": 4.222996925336396,
      "learning_rate": 3.9335018462812664e-07,
      "loss": 0.3783,
      "step": 59
    },
    {
      "epoch": 0.6,
      "grad_norm": 4.141091647019225,
      "learning_rate": 3.772572564296004e-07,
      "loss": 0.3617,
      "step": 60
    },
    {
      "epoch": 0.61,
      "grad_norm": 4.207602140662548,
      "learning_rate": 3.612985456190778e-07,
      "loss": 0.3997,
      "step": 61
    },
    {
      "epoch": 0.62,
      "grad_norm": 3.4580694579960918,
      "learning_rate": 3.454915028125263e-07,
      "loss": 0.3582,
      "step": 62
    },
    {
      "epoch": 0.63,
      "grad_norm": 3.422220201677041,
      "learning_rate": 3.2985341277917846e-07,
      "loss": 0.3804,
      "step": 63
    },
    {
      "epoch": 0.64,
      "grad_norm": 3.273283148367399,
      "learning_rate": 3.1440137554088953e-07,
      "loss": 0.3646,
      "step": 64
    },
    {
      "epoch": 0.65,
      "grad_norm": 3.099164792571154,
      "learning_rate": 2.9915228767351535e-07,
      "loss": 0.3387,
      "step": 65
    },
    {
      "epoch": 0.66,
      "grad_norm": 3.6768199519226314,
      "learning_rate": 2.841228238307536e-07,
      "loss": 0.3839,
      "step": 66
    },
    {
      "epoch": 0.67,
      "grad_norm": 3.684496636177031,
      "learning_rate": 2.6932941851065615e-07,
      "loss": 0.3635,
      "step": 67
    },
    {
      "epoch": 0.68,
      "grad_norm": 2.8715160468944236,
      "learning_rate": 2.547882480847461e-07,
      "loss": 0.3905,
      "step": 68
    },
    {
      "epoch": 0.69,
      "grad_norm": 4.542266866557925,
      "learning_rate": 2.4051521310939254e-07,
      "loss": 0.3666,
      "step": 69
    },
    {
      "epoch": 0.7,
      "grad_norm": 3.721830078063186,
      "learning_rate": 2.2652592093878665e-07,
      "loss": 0.3317,
      "step": 70
    },
    {
      "epoch": 0.71,
      "grad_norm": 3.7314666594735724,
      "learning_rate": 2.128356686585282e-07,
      "loss": 0.3231,
      "step": 71
    },
    {
      "epoch": 0.72,
      "grad_norm": 3.283124086872181,
      "learning_rate": 1.9945942635848745e-07,
      "loss": 0.3447,
      "step": 72
    },
    {
      "epoch": 0.73,
      "grad_norm": 3.9554685048457183,
      "learning_rate": 1.8641182076323148e-07,
      "loss": 0.3284,
      "step": 73
    },
    {
      "epoch": 0.74,
      "grad_norm": 3.8402484907207435,
      "learning_rate": 1.7370711923791564e-07,
      "loss": 0.3591,
      "step": 74
    },
    {
      "epoch": 0.75,
      "grad_norm": 3.4683454793599884,
      "learning_rate": 1.6135921418712955e-07,
      "loss": 0.3444,
      "step": 75
    },
    {
      "epoch": 0.76,
      "grad_norm": 3.837469581874028,
      "learning_rate": 1.493816078637557e-07,
      "loss": 0.4039,
      "step": 76
    },
    {
      "epoch": 0.77,
      "grad_norm": 3.498682351133943,
      "learning_rate": 1.3778739760445552e-07,
      "loss": 0.3203,
      "step": 77
    },
    {
      "epoch": 0.78,
      "grad_norm": 4.16268013662858,
      "learning_rate": 1.2658926150792322e-07,
      "loss": 0.3715,
      "step": 78
    },
    {
      "epoch": 0.79,
      "grad_norm": 3.404009699611626,
      "learning_rate": 1.1579944457157059e-07,
      "loss": 0.3226,
      "step": 79
    },
    {
      "epoch": 0.8,
      "grad_norm": 3.8135862465406904,
      "learning_rate": 1.0542974530180327e-07,
      "loss": 0.3627,
      "step": 80
    },
    {
      "epoch": 0.81,
      "grad_norm": 3.3969260525552616,
      "learning_rate": 9.549150281252632e-08,
      "loss": 0.3646,
      "step": 81
    },
    {
      "epoch": 0.82,
      "grad_norm": 2.9007273599583057,
      "learning_rate": 8.599558442598998e-08,
      "loss": 0.3506,
      "step": 82
    },
    {
      "epoch": 0.83,
      "grad_norm": 2.8266227536670523,
      "learning_rate": 7.695237378953224e-08,
      "loss": 0.3471,
      "step": 83
    },
    {
      "epoch": 0.84,
      "grad_norm": 2.8680227347397893,
      "learning_rate": 6.837175952121304e-08,
      "loss": 0.3362,
      "step": 84
    },
    {
      "epoch": 0.85,
      "grad_norm": 3.017390726560224,
      "learning_rate": 6.026312439675551e-08,
      "loss": 0.354,
      "step": 85
    },
    {
      "epoch": 0.86,
      "grad_norm": 2.75220355066635,
      "learning_rate": 5.263533508961826e-08,
      "loss": 0.3589,
      "step": 86
    },
    {
      "epoch": 0.87,
      "grad_norm": 2.5609091595597784,
      "learning_rate": 4.549673247541874e-08,
      "loss": 0.3209,
      "step": 87
    },
    {
      "epoch": 0.88,
      "grad_norm": 3.366788037939117,
      "learning_rate": 3.8855122511307626e-08,
      "loss": 0.3707,
      "step": 88
    },
    {
      "epoch": 0.89,
      "grad_norm": 2.998733006487237,
      "learning_rate": 3.271776770026963e-08,
      "loss": 0.3314,
      "step": 89
    },
    {
      "epoch": 0.9,
      "grad_norm": 3.38338292899199,
      "learning_rate": 2.7091379149682682e-08,
      "loss": 0.3634,
      "step": 90
    },
    {
      "epoch": 0.91,
      "grad_norm": 3.62894701397729,
      "learning_rate": 2.1982109232821176e-08,
      "loss": 0.3762,
      "step": 91
    },
    {
      "epoch": 0.92,
      "grad_norm": 3.6241096556456243,
      "learning_rate": 1.7395544861325718e-08,
      "loss": 0.3393,
      "step": 92
    },
    {
      "epoch": 0.93,
      "grad_norm": 3.681150537295675,
      "learning_rate": 1.3336701375997127e-08,
      "loss": 0.3408,
      "step": 93
    },
    {
      "epoch": 0.94,
      "grad_norm": 3.573864183770881,
      "learning_rate": 9.810017062595321e-09,
      "loss": 0.3388,
      "step": 94
    },
    {
      "epoch": 0.95,
      "grad_norm": 3.1628676346413824,
      "learning_rate": 6.819348298638839e-09,
      "loss": 0.3396,
      "step": 95
    },
    {
      "epoch": 0.96,
      "grad_norm": 4.253309468483455,
      "learning_rate": 4.367965336512403e-09,
      "loss": 0.3292,
      "step": 96
    },
    {
      "epoch": 0.97,
      "grad_norm": 3.3389999287279166,
      "learning_rate": 2.458548727494292e-09,
      "loss": 0.3415,
      "step": 97
    },
    {
      "epoch": 0.98,
      "grad_norm": 3.0295989312166327,
      "learning_rate": 1.0931863906127325e-09,
      "loss": 0.3261,
      "step": 98
    },
    {
      "epoch": 0.99,
      "grad_norm": 2.9789770023523388,
      "learning_rate": 2.733713295369755e-10,
      "loss": 0.3191,
      "step": 99
    },
    {
      "epoch": 1.0,
      "grad_norm": 3.6932194833157013,
      "learning_rate": 0.0,
      "loss": 0.355,
      "step": 100
    },
    {
      "epoch": 1.0,
      "step": 100,
      "total_flos": 6384023396352.0,
      "train_loss": 1.047203412950039,
      "train_runtime": 2682.144,
      "train_samples_per_second": 9.545,
      "train_steps_per_second": 0.037
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6384023396352.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}