{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.8,
  "eval_steps": 500,
  "global_step": 19000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 5.518036842346191,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.8359,
      "step": 100
    },
    {
      "epoch": 0.04,
      "grad_norm": 9.091249465942383,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.6874,
      "step": 200
    },
    {
      "epoch": 0.06,
      "grad_norm": 4.175647735595703,
      "learning_rate": 1.2e-05,
      "loss": 1.6864,
      "step": 300
    },
    {
      "epoch": 0.08,
      "grad_norm": 3.6093075275421143,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.6442,
      "step": 400
    },
    {
      "epoch": 0.1,
      "grad_norm": 3.347723960876465,
      "learning_rate": 2e-05,
      "loss": 1.6588,
      "step": 500
    },
    {
      "epoch": 0.1,
      "eval_loss": 1.6333842277526855,
      "eval_runtime": 32.0898,
      "eval_samples_per_second": 31.163,
      "eval_steps_per_second": 7.791,
      "step": 500
    },
    {
      "epoch": 0.12,
      "grad_norm": 3.886333465576172,
      "learning_rate": 1.9932203389830512e-05,
      "loss": 1.6738,
      "step": 600
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.807443618774414,
      "learning_rate": 1.986440677966102e-05,
      "loss": 1.625,
      "step": 700
    },
    {
      "epoch": 0.16,
      "grad_norm": 3.290576219558716,
      "learning_rate": 1.9796610169491527e-05,
      "loss": 1.623,
      "step": 800
    },
    {
      "epoch": 0.18,
      "grad_norm": 5.103792667388916,
      "learning_rate": 1.9728813559322034e-05,
      "loss": 1.6119,
      "step": 900
    },
    {
      "epoch": 0.2,
      "grad_norm": 4.206387996673584,
      "learning_rate": 1.9661016949152545e-05,
      "loss": 1.5908,
      "step": 1000
    },
    {
      "epoch": 0.2,
      "eval_loss": 1.5665849447250366,
      "eval_runtime": 32.0387,
      "eval_samples_per_second": 31.212,
      "eval_steps_per_second": 7.803,
      "step": 1000
    },
    {
      "epoch": 0.22,
      "grad_norm": 4.045094013214111,
      "learning_rate": 1.9593220338983052e-05,
      "loss": 1.5673,
      "step": 1100
    },
    {
      "epoch": 0.24,
      "grad_norm": 3.8652350902557373,
      "learning_rate": 1.9525423728813562e-05,
      "loss": 1.5724,
      "step": 1200
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.9019522666931152,
      "learning_rate": 1.945762711864407e-05,
      "loss": 1.5576,
      "step": 1300
    },
    {
      "epoch": 0.28,
      "grad_norm": 4.4180498123168945,
      "learning_rate": 1.938983050847458e-05,
      "loss": 1.5675,
      "step": 1400
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.4532687664031982,
      "learning_rate": 1.9322033898305087e-05,
      "loss": 1.5344,
      "step": 1500
    },
    {
      "epoch": 0.3,
      "eval_loss": 1.5171211957931519,
      "eval_runtime": 32.0626,
      "eval_samples_per_second": 31.189,
      "eval_steps_per_second": 7.797,
      "step": 1500
    },
    {
      "epoch": 0.32,
      "grad_norm": 3.2281365394592285,
      "learning_rate": 1.9254237288135595e-05,
      "loss": 1.5391,
      "step": 1600
    },
    {
      "epoch": 0.34,
      "grad_norm": 3.9512276649475098,
      "learning_rate": 1.91871186440678e-05,
      "loss": 1.5185,
      "step": 1700
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.428534984588623,
      "learning_rate": 1.9119322033898308e-05,
      "loss": 1.5175,
      "step": 1800
    },
    {
      "epoch": 0.38,
      "grad_norm": 3.495711088180542,
      "learning_rate": 1.9051525423728815e-05,
      "loss": 1.5164,
      "step": 1900
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.2846789360046387,
      "learning_rate": 1.8983728813559322e-05,
      "loss": 1.5111,
      "step": 2000
    },
    {
      "epoch": 0.4,
      "eval_loss": 1.52150297164917,
      "eval_runtime": 32.0907,
      "eval_samples_per_second": 31.162,
      "eval_steps_per_second": 7.79,
      "step": 2000
    },
    {
      "epoch": 0.42,
      "grad_norm": 3.574172019958496,
      "learning_rate": 1.8915932203389833e-05,
      "loss": 1.5017,
      "step": 2100
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.372629165649414,
      "learning_rate": 1.884813559322034e-05,
      "loss": 1.4921,
      "step": 2200
    },
    {
      "epoch": 0.46,
      "grad_norm": 3.576235771179199,
      "learning_rate": 1.878033898305085e-05,
      "loss": 1.4955,
      "step": 2300
    },
    {
      "epoch": 0.48,
      "grad_norm": 3.2708258628845215,
      "learning_rate": 1.8712542372881358e-05,
      "loss": 1.4876,
      "step": 2400
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.395094394683838,
      "learning_rate": 1.8644745762711865e-05,
      "loss": 1.4824,
      "step": 2500
    },
    {
      "epoch": 0.5,
      "eval_loss": 1.5081515312194824,
      "eval_runtime": 32.105,
      "eval_samples_per_second": 31.148,
      "eval_steps_per_second": 7.787,
      "step": 2500
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.276524066925049,
      "learning_rate": 1.8576949152542373e-05,
      "loss": 1.5011,
      "step": 2600
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.002469539642334,
      "learning_rate": 1.8509152542372883e-05,
      "loss": 1.4885,
      "step": 2700
    },
    {
      "epoch": 0.56,
      "grad_norm": 3.197190523147583,
      "learning_rate": 1.844135593220339e-05,
      "loss": 1.4658,
      "step": 2800
    },
    {
      "epoch": 0.58,
      "grad_norm": 3.1256494522094727,
      "learning_rate": 1.83735593220339e-05,
      "loss": 1.4664,
      "step": 2900
    },
    {
      "epoch": 0.6,
      "grad_norm": 3.351365804672241,
      "learning_rate": 1.8305762711864408e-05,
      "loss": 1.4346,
      "step": 3000
    },
    {
      "epoch": 0.6,
      "eval_loss": 1.452864408493042,
      "eval_runtime": 32.1316,
      "eval_samples_per_second": 31.122,
      "eval_steps_per_second": 7.781,
      "step": 3000
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.3482844829559326,
      "learning_rate": 1.823796610169492e-05,
      "loss": 1.4468,
      "step": 3100
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.369398355484009,
      "learning_rate": 1.8170169491525426e-05,
      "loss": 1.4347,
      "step": 3200
    },
    {
      "epoch": 0.66,
      "grad_norm": 4.017729759216309,
      "learning_rate": 1.8102372881355933e-05,
      "loss": 1.4521,
      "step": 3300
    },
    {
      "epoch": 0.68,
      "grad_norm": 3.2789974212646484,
      "learning_rate": 1.803457627118644e-05,
      "loss": 1.4646,
      "step": 3400
    },
    {
      "epoch": 0.7,
      "grad_norm": 3.6428675651550293,
      "learning_rate": 1.796677966101695e-05,
      "loss": 1.4449,
      "step": 3500
    },
    {
      "epoch": 0.7,
      "eval_loss": 1.4620351791381836,
      "eval_runtime": 32.1148,
      "eval_samples_per_second": 31.138,
      "eval_steps_per_second": 7.785,
      "step": 3500
    },
    {
      "epoch": 0.72,
      "grad_norm": 2.8244452476501465,
      "learning_rate": 1.789898305084746e-05,
      "loss": 1.4519,
      "step": 3600
    },
    {
      "epoch": 0.74,
      "grad_norm": 2.510248899459839,
      "learning_rate": 1.783118644067797e-05,
      "loss": 1.4344,
      "step": 3700
    },
    {
      "epoch": 0.76,
      "grad_norm": 3.1063272953033447,
      "learning_rate": 1.7763389830508476e-05,
      "loss": 1.4184,
      "step": 3800
    },
    {
      "epoch": 0.78,
      "grad_norm": 3.453012228012085,
      "learning_rate": 1.7695593220338983e-05,
      "loss": 1.4423,
      "step": 3900
    },
    {
      "epoch": 0.8,
      "grad_norm": 3.2033214569091797,
      "learning_rate": 1.762779661016949e-05,
      "loss": 1.4131,
      "step": 4000
    },
    {
      "epoch": 0.8,
      "eval_loss": 1.4358395338058472,
      "eval_runtime": 32.0793,
      "eval_samples_per_second": 31.173,
      "eval_steps_per_second": 7.793,
      "step": 4000
    },
    {
      "epoch": 0.82,
      "grad_norm": 3.898287057876587,
      "learning_rate": 1.756e-05,
      "loss": 1.4205,
      "step": 4100
    },
    {
      "epoch": 0.84,
      "grad_norm": 2.821258068084717,
      "learning_rate": 1.749220338983051e-05,
      "loss": 1.4089,
      "step": 4200
    },
    {
      "epoch": 0.86,
      "grad_norm": 3.3444740772247314,
      "learning_rate": 1.742440677966102e-05,
      "loss": 1.412,
      "step": 4300
    },
    {
      "epoch": 0.88,
      "grad_norm": 2.98468017578125,
      "learning_rate": 1.7356610169491526e-05,
      "loss": 1.4194,
      "step": 4400
    },
    {
      "epoch": 0.9,
      "grad_norm": 3.4117178916931152,
      "learning_rate": 1.7288813559322034e-05,
      "loss": 1.3851,
      "step": 4500
    },
    {
      "epoch": 0.9,
      "eval_loss": 1.3892821073532104,
      "eval_runtime": 32.0401,
      "eval_samples_per_second": 31.211,
      "eval_steps_per_second": 7.803,
      "step": 4500
    },
    {
      "epoch": 0.92,
      "grad_norm": 4.513248443603516,
      "learning_rate": 1.7221016949152544e-05,
      "loss": 1.3933,
      "step": 4600
    },
    {
      "epoch": 0.94,
      "grad_norm": 2.634674310684204,
      "learning_rate": 1.715322033898305e-05,
      "loss": 1.4078,
      "step": 4700
    },
    {
      "epoch": 0.96,
      "grad_norm": 2.2215662002563477,
      "learning_rate": 1.7085423728813562e-05,
      "loss": 1.4148,
      "step": 4800
    },
    {
      "epoch": 0.98,
      "grad_norm": 2.4833333492279053,
      "learning_rate": 1.7018305084745765e-05,
      "loss": 1.3975,
      "step": 4900
    },
    {
      "epoch": 1.0,
      "grad_norm": 9.368002891540527,
      "learning_rate": 1.6950508474576272e-05,
      "loss": 1.3656,
      "step": 5000
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.3485517501831055,
      "eval_runtime": 32.0552,
      "eval_samples_per_second": 31.196,
      "eval_steps_per_second": 7.799,
      "step": 5000
    },
    {
      "epoch": 1.02,
      "grad_norm": 3.2558579444885254,
      "learning_rate": 1.688271186440678e-05,
      "loss": 1.3784,
      "step": 5100
    },
    {
      "epoch": 1.04,
      "grad_norm": 2.6388778686523438,
      "learning_rate": 1.681491525423729e-05,
      "loss": 1.3518,
      "step": 5200
    },
    {
      "epoch": 1.06,
      "grad_norm": 3.5066235065460205,
      "learning_rate": 1.67471186440678e-05,
      "loss": 1.3667,
      "step": 5300
    },
    {
      "epoch": 1.08,
      "grad_norm": 2.646228551864624,
      "learning_rate": 1.6679322033898308e-05,
      "loss": 1.3561,
      "step": 5400
    },
    {
      "epoch": 1.1,
      "grad_norm": 5.225134372711182,
      "learning_rate": 1.6611525423728815e-05,
      "loss": 1.3852,
      "step": 5500
    },
    {
      "epoch": 1.1,
      "eval_loss": 1.3907063007354736,
      "eval_runtime": 32.0688,
      "eval_samples_per_second": 31.183,
      "eval_steps_per_second": 7.796,
      "step": 5500
    },
    {
      "epoch": 1.12,
      "grad_norm": 3.219271421432495,
      "learning_rate": 1.6543728813559322e-05,
      "loss": 1.3582,
      "step": 5600
    },
    {
      "epoch": 1.14,
      "grad_norm": 3.149339199066162,
      "learning_rate": 1.6475932203389833e-05,
      "loss": 1.3529,
      "step": 5700
    },
    {
      "epoch": 1.16,
      "grad_norm": 2.8628132343292236,
      "learning_rate": 1.640813559322034e-05,
      "loss": 1.3508,
      "step": 5800
    },
    {
      "epoch": 1.18,
      "grad_norm": 2.588643789291382,
      "learning_rate": 1.634033898305085e-05,
      "loss": 1.3409,
      "step": 5900
    },
    {
      "epoch": 1.2,
      "grad_norm": 2.781371831893921,
      "learning_rate": 1.6272542372881358e-05,
      "loss": 1.3321,
      "step": 6000
    },
    {
      "epoch": 1.2,
      "eval_loss": 1.352632761001587,
      "eval_runtime": 32.0388,
      "eval_samples_per_second": 31.212,
      "eval_steps_per_second": 7.803,
      "step": 6000
    },
    {
      "epoch": 1.22,
      "grad_norm": 2.8276853561401367,
      "learning_rate": 1.6204745762711865e-05,
      "loss": 1.3299,
      "step": 6100
    },
    {
      "epoch": 1.24,
      "grad_norm": 2.248138189315796,
      "learning_rate": 1.6136949152542372e-05,
      "loss": 1.3344,
      "step": 6200
    },
    {
      "epoch": 1.26,
      "grad_norm": 2.4552414417266846,
      "learning_rate": 1.6069152542372883e-05,
      "loss": 1.3461,
      "step": 6300
    },
    {
      "epoch": 1.28,
      "grad_norm": 2.599705457687378,
      "learning_rate": 1.600135593220339e-05,
      "loss": 1.336,
      "step": 6400
    },
    {
      "epoch": 1.3,
      "grad_norm": 2.9430196285247803,
      "learning_rate": 1.59335593220339e-05,
      "loss": 1.326,
      "step": 6500
    },
    {
      "epoch": 1.3,
      "eval_loss": 1.3490904569625854,
      "eval_runtime": 32.0447,
      "eval_samples_per_second": 31.206,
      "eval_steps_per_second": 7.802,
      "step": 6500
    },
    {
      "epoch": 1.32,
      "grad_norm": 2.6514320373535156,
      "learning_rate": 1.5865762711864408e-05,
      "loss": 1.3307,
      "step": 6600
    },
    {
      "epoch": 1.34,
      "grad_norm": 3.654257297515869,
      "learning_rate": 1.579796610169492e-05,
      "loss": 1.3384,
      "step": 6700
    },
    {
      "epoch": 1.36,
      "grad_norm": 2.250063896179199,
      "learning_rate": 1.5730169491525426e-05,
      "loss": 1.3345,
      "step": 6800
    },
    {
      "epoch": 1.38,
      "grad_norm": 3.3941681385040283,
      "learning_rate": 1.5662372881355933e-05,
      "loss": 1.3365,
      "step": 6900
    },
    {
      "epoch": 1.4,
      "grad_norm": 3.2504653930664062,
      "learning_rate": 1.559525423728814e-05,
      "loss": 1.3341,
      "step": 7000
    },
    {
      "epoch": 1.4,
      "eval_loss": 1.3799883127212524,
      "eval_runtime": 32.0499,
      "eval_samples_per_second": 31.201,
      "eval_steps_per_second": 7.8,
      "step": 7000
    },
    {
      "epoch": 1.42,
      "grad_norm": 4.476578712463379,
      "learning_rate": 1.5527457627118646e-05,
      "loss": 1.3137,
      "step": 7100
    },
    {
      "epoch": 1.44,
      "grad_norm": 2.3496310710906982,
      "learning_rate": 1.5459661016949153e-05,
      "loss": 1.3091,
      "step": 7200
    },
    {
      "epoch": 1.46,
      "grad_norm": 2.8387935161590576,
      "learning_rate": 1.539186440677966e-05,
      "loss": 1.3111,
      "step": 7300
    },
    {
      "epoch": 1.48,
      "grad_norm": 2.771179437637329,
      "learning_rate": 1.532406779661017e-05,
      "loss": 1.3237,
      "step": 7400
    },
    {
      "epoch": 1.5,
      "grad_norm": 6.6864800453186035,
      "learning_rate": 1.5256271186440678e-05,
      "loss": 1.2966,
      "step": 7500
    },
    {
      "epoch": 1.5,
      "eval_loss": 1.3431659936904907,
      "eval_runtime": 32.03,
      "eval_samples_per_second": 31.221,
      "eval_steps_per_second": 7.805,
      "step": 7500
    },
    {
      "epoch": 1.52,
      "grad_norm": 3.6726136207580566,
      "learning_rate": 1.5188474576271189e-05,
      "loss": 1.3214,
      "step": 7600
    },
    {
      "epoch": 1.54,
      "grad_norm": 2.3472280502319336,
      "learning_rate": 1.5120677966101696e-05,
      "loss": 1.2976,
      "step": 7700
    },
    {
      "epoch": 1.56,
      "grad_norm": 4.14886474609375,
      "learning_rate": 1.5052881355932205e-05,
      "loss": 1.3035,
      "step": 7800
    },
    {
      "epoch": 1.58,
      "grad_norm": 2.4956181049346924,
      "learning_rate": 1.4985084745762712e-05,
      "loss": 1.291,
      "step": 7900
    },
    {
      "epoch": 1.6,
      "grad_norm": 2.7873871326446533,
      "learning_rate": 1.4917288135593221e-05,
      "loss": 1.3038,
      "step": 8000
    },
    {
      "epoch": 1.6,
      "eval_loss": 1.2782970666885376,
      "eval_runtime": 32.0401,
      "eval_samples_per_second": 31.211,
      "eval_steps_per_second": 7.803,
      "step": 8000
    },
    {
      "epoch": 1.62,
      "grad_norm": 3.488408327102661,
      "learning_rate": 1.4849491525423729e-05,
      "loss": 1.2882,
      "step": 8100
    },
    {
      "epoch": 1.64,
      "grad_norm": 2.6660850048065186,
      "learning_rate": 1.478169491525424e-05,
      "loss": 1.2943,
      "step": 8200
    },
    {
      "epoch": 1.66,
      "grad_norm": 2.9623501300811768,
      "learning_rate": 1.4713898305084746e-05,
      "loss": 1.3037,
      "step": 8300
    },
    {
      "epoch": 1.68,
      "grad_norm": 3.1543288230895996,
      "learning_rate": 1.4646101694915255e-05,
      "loss": 1.2855,
      "step": 8400
    },
    {
      "epoch": 1.7,
      "grad_norm": 4.273654460906982,
      "learning_rate": 1.4578305084745763e-05,
      "loss": 1.2909,
      "step": 8500
    },
    {
      "epoch": 1.7,
      "eval_loss": 1.2626874446868896,
      "eval_runtime": 32.031,
      "eval_samples_per_second": 31.22,
      "eval_steps_per_second": 7.805,
      "step": 8500
    },
    {
      "epoch": 1.72,
      "grad_norm": 3.6739509105682373,
      "learning_rate": 1.4510508474576273e-05,
      "loss": 1.2998,
      "step": 8600
    },
    {
      "epoch": 1.74,
      "grad_norm": 3.7686262130737305,
      "learning_rate": 1.444271186440678e-05,
      "loss": 1.2705,
      "step": 8700
    },
    {
      "epoch": 1.76,
      "grad_norm": 2.919701099395752,
      "learning_rate": 1.437491525423729e-05,
      "loss": 1.2868,
      "step": 8800
    },
    {
      "epoch": 1.78,
      "grad_norm": 2.2622547149658203,
      "learning_rate": 1.4307118644067797e-05,
      "loss": 1.2878,
      "step": 8900
    },
    {
      "epoch": 1.8,
      "grad_norm": 3.142400026321411,
      "learning_rate": 1.4239322033898306e-05,
      "loss": 1.2917,
      "step": 9000
    },
    {
      "epoch": 1.8,
      "eval_loss": 1.2586073875427246,
      "eval_runtime": 32.0452,
      "eval_samples_per_second": 31.206,
      "eval_steps_per_second": 7.801,
      "step": 9000
    },
    {
      "epoch": 1.82,
      "grad_norm": 3.069307565689087,
      "learning_rate": 1.4171525423728816e-05,
      "loss": 1.2823,
      "step": 9100
    },
    {
      "epoch": 1.84,
      "grad_norm": 2.5960686206817627,
      "learning_rate": 1.4103728813559323e-05,
      "loss": 1.2857,
      "step": 9200
    },
    {
      "epoch": 1.86,
      "grad_norm": 2.7897732257843018,
      "learning_rate": 1.4035932203389832e-05,
      "loss": 1.2802,
      "step": 9300
    },
    {
      "epoch": 1.88,
      "grad_norm": 2.958104133605957,
      "learning_rate": 1.396813559322034e-05,
      "loss": 1.2557,
      "step": 9400
    },
    {
      "epoch": 1.9,
      "grad_norm": 10.324545860290527,
      "learning_rate": 1.390033898305085e-05,
      "loss": 1.2593,
      "step": 9500
    },
    {
      "epoch": 1.9,
      "eval_loss": 1.2821636199951172,
      "eval_runtime": 32.0428,
      "eval_samples_per_second": 31.208,
      "eval_steps_per_second": 7.802,
      "step": 9500
    },
    {
      "epoch": 1.92,
      "grad_norm": 2.709005832672119,
      "learning_rate": 1.3833220338983051e-05,
      "loss": 1.2662,
      "step": 9600
    },
    {
      "epoch": 1.94,
      "grad_norm": 3.7861385345458984,
      "learning_rate": 1.376542372881356e-05,
      "loss": 1.2771,
      "step": 9700
    },
    {
      "epoch": 1.96,
      "grad_norm": 3.6657748222351074,
      "learning_rate": 1.3697627118644067e-05,
      "loss": 1.2507,
      "step": 9800
    },
    {
      "epoch": 1.98,
      "grad_norm": 2.3001210689544678,
      "learning_rate": 1.3629830508474578e-05,
      "loss": 1.2707,
      "step": 9900
    },
    {
      "epoch": 2.0,
      "grad_norm": 3.042156219482422,
      "learning_rate": 1.3562033898305085e-05,
      "loss": 1.2603,
      "step": 10000
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.328155755996704,
      "eval_runtime": 32.083,
      "eval_samples_per_second": 31.169,
      "eval_steps_per_second": 7.792,
      "step": 10000
    },
    {
      "epoch": 2.02,
      "grad_norm": 3.44785213470459,
      "learning_rate": 1.3494237288135594e-05,
      "loss": 1.2366,
      "step": 10100
    },
    {
      "epoch": 2.04,
      "grad_norm": 2.8859453201293945,
      "learning_rate": 1.3426440677966105e-05,
      "loss": 1.2456,
      "step": 10200
    },
    {
      "epoch": 2.06,
      "grad_norm": 4.074487686157227,
      "learning_rate": 1.3359322033898305e-05,
      "loss": 1.2381,
      "step": 10300
    },
    {
      "epoch": 2.08,
      "grad_norm": 2.731782913208008,
      "learning_rate": 1.3291525423728814e-05,
      "loss": 1.2455,
      "step": 10400
    },
    {
      "epoch": 2.1,
      "grad_norm": 3.386629343032837,
      "learning_rate": 1.3223728813559322e-05,
      "loss": 1.2106,
      "step": 10500
    },
    {
      "epoch": 2.1,
      "eval_loss": 1.2644318342208862,
      "eval_runtime": 32.0624,
      "eval_samples_per_second": 31.189,
      "eval_steps_per_second": 7.797,
      "step": 10500
    },
    {
      "epoch": 2.12,
      "grad_norm": 3.1178646087646484,
      "learning_rate": 1.3155932203389832e-05,
      "loss": 1.2235,
      "step": 10600
    },
    {
      "epoch": 2.14,
      "grad_norm": 4.623811721801758,
      "learning_rate": 1.308813559322034e-05,
      "loss": 1.2485,
      "step": 10700
    },
    {
      "epoch": 2.16,
      "grad_norm": 2.8791615962982178,
      "learning_rate": 1.3020338983050848e-05,
      "loss": 1.223,
      "step": 10800
    },
    {
      "epoch": 2.18,
      "grad_norm": 3.4094972610473633,
      "learning_rate": 1.2952542372881356e-05,
      "loss": 1.2052,
      "step": 10900
    },
    {
      "epoch": 2.2,
      "grad_norm": 3.0983850955963135,
      "learning_rate": 1.2884745762711866e-05,
      "loss": 1.2365,
      "step": 11000
    },
    {
      "epoch": 2.2,
      "eval_loss": 1.2809499502182007,
      "eval_runtime": 32.0686,
      "eval_samples_per_second": 31.183,
      "eval_steps_per_second": 7.796,
      "step": 11000
    },
    {
      "epoch": 2.22,
      "grad_norm": 3.0877022743225098,
      "learning_rate": 1.2816949152542375e-05,
      "loss": 1.2395,
      "step": 11100
    },
    {
      "epoch": 2.24,
      "grad_norm": 3.992199182510376,
      "learning_rate": 1.2749152542372882e-05,
      "loss": 1.2363,
      "step": 11200
    },
    {
      "epoch": 2.26,
      "grad_norm": 2.398602247238159,
      "learning_rate": 1.2681355932203391e-05,
      "loss": 1.2013,
      "step": 11300
    },
    {
      "epoch": 2.28,
      "grad_norm": 2.6053998470306396,
      "learning_rate": 1.2613559322033899e-05,
      "loss": 1.2072,
      "step": 11400
    },
    {
      "epoch": 2.3,
      "grad_norm": 2.841458559036255,
      "learning_rate": 1.2545762711864409e-05,
      "loss": 1.2342,
      "step": 11500
    },
    {
      "epoch": 2.3,
      "eval_loss": 1.2798084020614624,
      "eval_runtime": 32.0506,
      "eval_samples_per_second": 31.201,
      "eval_steps_per_second": 7.8,
      "step": 11500
    },
    {
      "epoch": 2.32,
      "grad_norm": 7.328969478607178,
      "learning_rate": 1.2477966101694916e-05,
      "loss": 1.2122,
      "step": 11600
    },
    {
      "epoch": 2.34,
      "grad_norm": 3.2942442893981934,
      "learning_rate": 1.2410169491525425e-05,
      "loss": 1.2196,
      "step": 11700
    },
    {
      "epoch": 2.36,
      "grad_norm": 3.7863004207611084,
      "learning_rate": 1.2342372881355933e-05,
      "loss": 1.2078,
      "step": 11800
    },
    {
      "epoch": 2.38,
      "grad_norm": 4.496535778045654,
      "learning_rate": 1.2274576271186443e-05,
      "loss": 1.2191,
      "step": 11900
    },
    {
      "epoch": 2.4,
      "grad_norm": 3.3873374462127686,
      "learning_rate": 1.220677966101695e-05,
      "loss": 1.2232,
      "step": 12000
    },
    {
      "epoch": 2.4,
      "eval_loss": 1.233884334564209,
      "eval_runtime": 32.1206,
      "eval_samples_per_second": 31.133,
      "eval_steps_per_second": 7.783,
      "step": 12000
    },
    {
      "epoch": 2.42,
      "grad_norm": 3.0606558322906494,
      "learning_rate": 1.213898305084746e-05,
      "loss": 1.2159,
      "step": 12100
    },
    {
      "epoch": 2.44,
      "grad_norm": 3.141958475112915,
      "learning_rate": 1.2071186440677967e-05,
      "loss": 1.1979,
      "step": 12200
    },
    {
      "epoch": 2.46,
      "grad_norm": 2.942594051361084,
      "learning_rate": 1.2003389830508475e-05,
      "loss": 1.2046,
      "step": 12300
    },
    {
      "epoch": 2.48,
      "grad_norm": 4.217489719390869,
      "learning_rate": 1.1935593220338983e-05,
      "loss": 1.2257,
      "step": 12400
    },
    {
      "epoch": 2.5,
      "grad_norm": 2.2574400901794434,
      "learning_rate": 1.1867796610169493e-05,
      "loss": 1.2101,
      "step": 12500
    },
    {
      "epoch": 2.5,
      "eval_loss": 1.2290363311767578,
      "eval_runtime": 32.0859,
      "eval_samples_per_second": 31.166,
      "eval_steps_per_second": 7.792,
      "step": 12500
    },
    {
      "epoch": 2.52,
      "grad_norm": 3.426748752593994,
      "learning_rate": 1.18e-05,
      "loss": 1.1919,
      "step": 12600
    },
    {
      "epoch": 2.54,
      "grad_norm": 3.2718164920806885,
      "learning_rate": 1.173220338983051e-05,
      "loss": 1.1847,
      "step": 12700
    },
    {
      "epoch": 2.56,
      "grad_norm": 2.755702018737793,
      "learning_rate": 1.1664406779661017e-05,
      "loss": 1.1958,
      "step": 12800
    },
    {
      "epoch": 2.58,
      "grad_norm": 2.5422780513763428,
      "learning_rate": 1.1596610169491527e-05,
      "loss": 1.2087,
      "step": 12900
    },
    {
      "epoch": 2.6,
      "grad_norm": 2.8023762702941895,
      "learning_rate": 1.1528813559322035e-05,
      "loss": 1.2158,
      "step": 13000
    },
    {
      "epoch": 2.6,
      "eval_loss": 1.2218210697174072,
      "eval_runtime": 32.0875,
      "eval_samples_per_second": 31.165,
      "eval_steps_per_second": 7.791,
      "step": 13000
    },
    {
      "epoch": 2.62,
      "grad_norm": 2.298729181289673,
      "learning_rate": 1.1461016949152543e-05,
      "loss": 1.2015,
      "step": 13100
    },
    {
      "epoch": 2.64,
      "grad_norm": 3.4919259548187256,
      "learning_rate": 1.1393898305084748e-05,
      "loss": 1.2073,
      "step": 13200
    },
    {
      "epoch": 2.66,
      "grad_norm": 2.8417234420776367,
      "learning_rate": 1.1326101694915255e-05,
      "loss": 1.1753,
      "step": 13300
    },
    {
      "epoch": 2.68,
      "grad_norm": 2.5789356231689453,
      "learning_rate": 1.1258305084745764e-05,
      "loss": 1.2063,
      "step": 13400
    },
    {
      "epoch": 2.7,
      "grad_norm": 2.643751859664917,
      "learning_rate": 1.1190508474576271e-05,
      "loss": 1.1954,
      "step": 13500
    },
    {
      "epoch": 2.7,
      "eval_loss": 1.2146854400634766,
      "eval_runtime": 32.1302,
      "eval_samples_per_second": 31.123,
      "eval_steps_per_second": 7.781,
      "step": 13500
    },
    {
      "epoch": 2.72,
      "grad_norm": 3.573153495788574,
      "learning_rate": 1.1122711864406782e-05,
      "loss": 1.2029,
      "step": 13600
    },
    {
      "epoch": 2.74,
      "grad_norm": 3.4587175846099854,
      "learning_rate": 1.1054915254237289e-05,
      "loss": 1.1863,
      "step": 13700
    },
    {
      "epoch": 2.76,
      "grad_norm": 3.2643883228302,
      "learning_rate": 1.0987118644067798e-05,
      "loss": 1.2034,
      "step": 13800
    },
    {
      "epoch": 2.78,
      "grad_norm": 2.882018804550171,
      "learning_rate": 1.0919322033898305e-05,
      "loss": 1.1962,
      "step": 13900
    },
    {
      "epoch": 2.8,
      "grad_norm": 2.8615987300872803,
      "learning_rate": 1.0851525423728814e-05,
      "loss": 1.1839,
      "step": 14000
    },
    {
      "epoch": 2.8,
      "eval_loss": 1.224142074584961,
      "eval_runtime": 32.1391,
      "eval_samples_per_second": 31.115,
      "eval_steps_per_second": 7.779,
      "step": 14000
    },
    {
      "epoch": 2.82,
      "grad_norm": 2.88732647895813,
      "learning_rate": 1.0783728813559321e-05,
      "loss": 1.1939,
      "step": 14100
    },
    {
      "epoch": 2.84,
      "grad_norm": 2.5806539058685303,
      "learning_rate": 1.0715932203389832e-05,
      "loss": 1.1893,
      "step": 14200
    },
    {
      "epoch": 2.86,
      "grad_norm": 2.975961446762085,
      "learning_rate": 1.0648135593220339e-05,
      "loss": 1.1679,
      "step": 14300
    },
    {
      "epoch": 2.88,
      "grad_norm": 3.987508773803711,
      "learning_rate": 1.0580338983050848e-05,
      "loss": 1.1939,
      "step": 14400
    },
    {
      "epoch": 2.9,
      "grad_norm": 2.954615354537964,
      "learning_rate": 1.0512542372881355e-05,
      "loss": 1.1823,
      "step": 14500
    },
    {
      "epoch": 2.9,
      "eval_loss": 1.1981159448623657,
      "eval_runtime": 32.1026,
      "eval_samples_per_second": 31.15,
      "eval_steps_per_second": 7.788,
      "step": 14500
    },
    {
      "epoch": 2.92,
      "grad_norm": 2.9803731441497803,
      "learning_rate": 1.0444745762711866e-05,
      "loss": 1.1792,
      "step": 14600
    },
    {
      "epoch": 2.94,
      "grad_norm": 3.527334451675415,
      "learning_rate": 1.0376949152542373e-05,
      "loss": 1.1786,
      "step": 14700
    },
    {
      "epoch": 2.96,
      "grad_norm": 2.8651585578918457,
      "learning_rate": 1.0309152542372882e-05,
      "loss": 1.177,
      "step": 14800
    },
    {
      "epoch": 2.98,
      "grad_norm": 2.527571201324463,
      "learning_rate": 1.0241355932203391e-05,
      "loss": 1.1863,
      "step": 14900
    },
    {
      "epoch": 3.0,
      "grad_norm": 4.754082202911377,
      "learning_rate": 1.01735593220339e-05,
      "loss": 1.2066,
      "step": 15000
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.197231650352478,
      "eval_runtime": 32.0595,
      "eval_samples_per_second": 31.192,
      "eval_steps_per_second": 7.798,
      "step": 15000
    },
    {
      "epoch": 3.02,
      "grad_norm": 2.30944561958313,
      "learning_rate": 1.0105762711864409e-05,
      "loss": 1.1471,
      "step": 15100
    },
    {
      "epoch": 3.04,
      "grad_norm": 2.808654308319092,
      "learning_rate": 1.0037966101694916e-05,
      "loss": 1.1552,
      "step": 15200
    },
    {
      "epoch": 3.06,
      "grad_norm": 2.997007369995117,
      "learning_rate": 9.970169491525425e-06,
      "loss": 1.1387,
      "step": 15300
    },
    {
      "epoch": 3.08,
      "grad_norm": 2.888899564743042,
      "learning_rate": 9.902372881355932e-06,
      "loss": 1.1245,
      "step": 15400
    },
    {
      "epoch": 3.1,
      "grad_norm": 5.218863010406494,
      "learning_rate": 9.834576271186441e-06,
      "loss": 1.1395,
      "step": 15500
    },
    {
      "epoch": 3.1,
      "eval_loss": 1.1928576231002808,
      "eval_runtime": 32.0387,
      "eval_samples_per_second": 31.212,
      "eval_steps_per_second": 7.803,
      "step": 15500
    },
    {
      "epoch": 3.12,
      "grad_norm": 3.4590179920196533,
      "learning_rate": 9.76677966101695e-06,
      "loss": 1.1391,
      "step": 15600
    },
    {
      "epoch": 3.14,
      "grad_norm": 3.658998966217041,
      "learning_rate": 9.698983050847457e-06,
      "loss": 1.1298,
      "step": 15700
    },
    {
      "epoch": 3.16,
      "grad_norm": 3.6307895183563232,
      "learning_rate": 9.631186440677966e-06,
      "loss": 1.1395,
      "step": 15800
    },
    {
      "epoch": 3.18,
      "grad_norm": 4.277059078216553,
      "learning_rate": 9.563389830508475e-06,
      "loss": 1.144,
      "step": 15900
    },
    {
      "epoch": 3.2,
      "grad_norm": 3.627415657043457,
      "learning_rate": 9.495593220338984e-06,
      "loss": 1.1276,
      "step": 16000
    },
    {
      "epoch": 3.2,
      "eval_loss": 1.1989972591400146,
      "eval_runtime": 32.1164,
      "eval_samples_per_second": 31.137,
      "eval_steps_per_second": 7.784,
      "step": 16000
    },
    {
      "epoch": 3.22,
      "grad_norm": 3.0596442222595215,
      "learning_rate": 9.427796610169491e-06,
      "loss": 1.1137,
      "step": 16100
    },
    {
      "epoch": 3.24,
      "grad_norm": 3.6272132396698,
      "learning_rate": 9.360000000000002e-06,
      "loss": 1.1315,
      "step": 16200
    },
    {
      "epoch": 3.26,
      "grad_norm": 4.7155232429504395,
      "learning_rate": 9.292203389830509e-06,
      "loss": 1.1109,
      "step": 16300
    },
    {
      "epoch": 3.28,
      "grad_norm": 3.0606753826141357,
      "learning_rate": 9.224406779661018e-06,
      "loss": 1.1356,
      "step": 16400
    },
    {
      "epoch": 3.3,
      "grad_norm": 3.105088233947754,
      "learning_rate": 9.156610169491527e-06,
      "loss": 1.12,
      "step": 16500
    },
    {
      "epoch": 3.3,
      "eval_loss": 1.2401061058044434,
      "eval_runtime": 32.03,
      "eval_samples_per_second": 31.221,
      "eval_steps_per_second": 7.805,
      "step": 16500
    },
    {
      "epoch": 3.32,
      "grad_norm": 4.067813873291016,
      "learning_rate": 9.088813559322036e-06,
      "loss": 1.1264,
      "step": 16600
    },
    {
      "epoch": 3.34,
      "grad_norm": 3.445667028427124,
      "learning_rate": 9.021016949152543e-06,
      "loss": 1.1131,
      "step": 16700
    },
    {
      "epoch": 3.36,
      "grad_norm": 3.6178576946258545,
      "learning_rate": 8.953220338983052e-06,
      "loss": 1.1286,
      "step": 16800
    },
    {
      "epoch": 3.38,
      "grad_norm": 3.9732513427734375,
      "learning_rate": 8.885423728813561e-06,
      "loss": 1.1324,
      "step": 16900
    },
    {
      "epoch": 3.4,
      "grad_norm": 2.9241085052490234,
      "learning_rate": 8.817627118644068e-06,
      "loss": 1.1403,
      "step": 17000
    },
    {
      "epoch": 3.4,
      "eval_loss": 1.1982412338256836,
      "eval_runtime": 32.0231,
      "eval_samples_per_second": 31.227,
      "eval_steps_per_second": 7.807,
      "step": 17000
    },
    {
      "epoch": 3.42,
      "grad_norm": 3.081540822982788,
      "learning_rate": 8.749830508474577e-06,
      "loss": 1.1308,
      "step": 17100
    },
    {
      "epoch": 3.44,
      "grad_norm": 2.958272695541382,
      "learning_rate": 8.682033898305086e-06,
      "loss": 1.1305,
      "step": 17200
    },
    {
      "epoch": 3.46,
      "grad_norm": 3.6438255310058594,
      "learning_rate": 8.614237288135593e-06,
      "loss": 1.1224,
      "step": 17300
    },
    {
      "epoch": 3.48,
      "grad_norm": 3.043936014175415,
      "learning_rate": 8.546440677966102e-06,
      "loss": 1.1257,
      "step": 17400
    },
    {
      "epoch": 3.5,
      "grad_norm": 3.0439295768737793,
      "learning_rate": 8.479322033898306e-06,
      "loss": 1.1277,
      "step": 17500
    },
    {
      "epoch": 3.5,
      "eval_loss": 1.2347404956817627,
      "eval_runtime": 32.0794,
      "eval_samples_per_second": 31.173,
      "eval_steps_per_second": 7.793,
      "step": 17500
    },
    {
      "epoch": 3.52,
      "grad_norm": 3.439828872680664,
      "learning_rate": 8.411525423728815e-06,
      "loss": 1.1214,
      "step": 17600
    },
    {
      "epoch": 3.54,
      "grad_norm": 3.504438638687134,
      "learning_rate": 8.343728813559323e-06,
      "loss": 1.1288,
      "step": 17700
    },
    {
      "epoch": 3.56,
      "grad_norm": 3.479522943496704,
      "learning_rate": 8.275932203389832e-06,
      "loss": 1.1066,
      "step": 17800
    },
    {
      "epoch": 3.58,
      "grad_norm": 4.671799182891846,
      "learning_rate": 8.20813559322034e-06,
      "loss": 1.113,
      "step": 17900
    },
    {
      "epoch": 3.6,
      "grad_norm": 3.815126895904541,
      "learning_rate": 8.140338983050848e-06,
      "loss": 1.1067,
      "step": 18000
    },
    {
      "epoch": 3.6,
      "eval_loss": 1.14591646194458,
      "eval_runtime": 32.0262,
      "eval_samples_per_second": 31.224,
      "eval_steps_per_second": 7.806,
      "step": 18000
    },
    {
      "epoch": 3.62,
      "grad_norm": 3.5889601707458496,
      "learning_rate": 8.072542372881357e-06,
      "loss": 1.1344,
      "step": 18100
    },
    {
      "epoch": 3.64,
      "grad_norm": 3.246824264526367,
      "learning_rate": 8.004745762711866e-06,
      "loss": 1.1067,
      "step": 18200
    },
    {
      "epoch": 3.66,
      "grad_norm": 3.164020538330078,
      "learning_rate": 7.936949152542374e-06,
      "loss": 1.1187,
      "step": 18300
    },
    {
      "epoch": 3.68,
      "grad_norm": 3.3035717010498047,
      "learning_rate": 7.869152542372882e-06,
      "loss": 1.0953,
      "step": 18400
    },
    {
      "epoch": 3.7,
      "grad_norm": 4.341372013092041,
      "learning_rate": 7.80135593220339e-06,
      "loss": 1.111,
      "step": 18500
    },
    {
      "epoch": 3.7,
      "eval_loss": 1.1732285022735596,
      "eval_runtime": 32.0886,
      "eval_samples_per_second": 31.164,
      "eval_steps_per_second": 7.791,
      "step": 18500
    },
    {
      "epoch": 3.72,
      "grad_norm": 3.3150527477264404,
      "learning_rate": 7.7335593220339e-06,
      "loss": 1.1056,
      "step": 18600
    },
    {
      "epoch": 3.74,
      "grad_norm": 3.855264663696289,
      "learning_rate": 7.665762711864407e-06,
      "loss": 1.1424,
      "step": 18700
    },
    {
      "epoch": 3.76,
      "grad_norm": 3.5808959007263184,
      "learning_rate": 7.597966101694916e-06,
      "loss": 1.0958,
      "step": 18800
    },
    {
      "epoch": 3.78,
      "grad_norm": 4.313356876373291,
      "learning_rate": 7.530169491525425e-06,
      "loss": 1.1009,
      "step": 18900
    },
    {
      "epoch": 3.8,
      "grad_norm": 3.271247625350952,
      "learning_rate": 7.462372881355933e-06,
      "loss": 1.121,
      "step": 19000
    },
    {
      "epoch": 3.8,
      "eval_loss": 1.1872472763061523,
      "eval_runtime": 32.0758,
      "eval_samples_per_second": 31.176,
      "eval_steps_per_second": 7.794,
      "step": 19000
    }
  ],
  "logging_steps": 100,
  "max_steps": 30000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 500,
  "total_flos": 1.793641609691136e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}