{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.8, "eval_steps": 500, "global_step": 19000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 5.518036842346191, "learning_rate": 4.000000000000001e-06, "loss": 1.8359, "step": 100 }, { "epoch": 0.04, "grad_norm": 9.091249465942383, "learning_rate": 8.000000000000001e-06, "loss": 1.6874, "step": 200 }, { "epoch": 0.06, "grad_norm": 4.175647735595703, "learning_rate": 1.2e-05, "loss": 1.6864, "step": 300 }, { "epoch": 0.08, "grad_norm": 3.6093075275421143, "learning_rate": 1.6000000000000003e-05, "loss": 1.6442, "step": 400 }, { "epoch": 0.1, "grad_norm": 3.347723960876465, "learning_rate": 2e-05, "loss": 1.6588, "step": 500 }, { "epoch": 0.1, "eval_loss": 1.6333842277526855, "eval_runtime": 32.0898, "eval_samples_per_second": 31.163, "eval_steps_per_second": 7.791, "step": 500 }, { "epoch": 0.12, "grad_norm": 3.886333465576172, "learning_rate": 1.9932203389830512e-05, "loss": 1.6738, "step": 600 }, { "epoch": 0.14, "grad_norm": 2.807443618774414, "learning_rate": 1.986440677966102e-05, "loss": 1.625, "step": 700 }, { "epoch": 0.16, "grad_norm": 3.290576219558716, "learning_rate": 1.9796610169491527e-05, "loss": 1.623, "step": 800 }, { "epoch": 0.18, "grad_norm": 5.103792667388916, "learning_rate": 1.9728813559322034e-05, "loss": 1.6119, "step": 900 }, { "epoch": 0.2, "grad_norm": 4.206387996673584, "learning_rate": 1.9661016949152545e-05, "loss": 1.5908, "step": 1000 }, { "epoch": 0.2, "eval_loss": 1.5665849447250366, "eval_runtime": 32.0387, "eval_samples_per_second": 31.212, "eval_steps_per_second": 7.803, "step": 1000 }, { "epoch": 0.22, "grad_norm": 4.045094013214111, "learning_rate": 1.9593220338983052e-05, "loss": 1.5673, "step": 1100 }, { "epoch": 0.24, "grad_norm": 3.8652350902557373, "learning_rate": 1.9525423728813562e-05, "loss": 1.5724, "step": 1200 }, { "epoch": 0.26, "grad_norm": 2.9019522666931152, "learning_rate": 1.945762711864407e-05, "loss": 1.5576, "step": 1300 }, { "epoch": 0.28, "grad_norm": 4.4180498123168945, "learning_rate": 1.938983050847458e-05, "loss": 1.5675, "step": 1400 }, { "epoch": 0.3, "grad_norm": 2.4532687664031982, "learning_rate": 1.9322033898305087e-05, "loss": 1.5344, "step": 1500 }, { "epoch": 0.3, "eval_loss": 1.5171211957931519, "eval_runtime": 32.0626, "eval_samples_per_second": 31.189, "eval_steps_per_second": 7.797, "step": 1500 }, { "epoch": 0.32, "grad_norm": 3.2281365394592285, "learning_rate": 1.9254237288135595e-05, "loss": 1.5391, "step": 1600 }, { "epoch": 0.34, "grad_norm": 3.9512276649475098, "learning_rate": 1.91871186440678e-05, "loss": 1.5185, "step": 1700 }, { "epoch": 0.36, "grad_norm": 3.428534984588623, "learning_rate": 1.9119322033898308e-05, "loss": 1.5175, "step": 1800 }, { "epoch": 0.38, "grad_norm": 3.495711088180542, "learning_rate": 1.9051525423728815e-05, "loss": 1.5164, "step": 1900 }, { "epoch": 0.4, "grad_norm": 2.2846789360046387, "learning_rate": 1.8983728813559322e-05, "loss": 1.5111, "step": 2000 }, { "epoch": 0.4, "eval_loss": 1.52150297164917, "eval_runtime": 32.0907, "eval_samples_per_second": 31.162, "eval_steps_per_second": 7.79, "step": 2000 }, { "epoch": 0.42, "grad_norm": 3.574172019958496, "learning_rate": 1.8915932203389833e-05, "loss": 1.5017, "step": 2100 }, { "epoch": 0.44, "grad_norm": 2.372629165649414, "learning_rate": 1.884813559322034e-05, "loss": 1.4921, "step": 2200 }, { "epoch": 0.46, "grad_norm": 3.576235771179199, "learning_rate": 1.878033898305085e-05, "loss": 1.4955, "step": 2300 }, { "epoch": 0.48, "grad_norm": 3.2708258628845215, "learning_rate": 1.8712542372881358e-05, "loss": 1.4876, "step": 2400 }, { "epoch": 0.5, "grad_norm": 3.395094394683838, "learning_rate": 1.8644745762711865e-05, "loss": 1.4824, "step": 2500 }, { "epoch": 0.5, "eval_loss": 1.5081515312194824, "eval_runtime": 32.105, "eval_samples_per_second": 31.148, "eval_steps_per_second": 7.787, "step": 2500 }, { "epoch": 0.52, "grad_norm": 2.276524066925049, "learning_rate": 1.8576949152542373e-05, "loss": 1.5011, "step": 2600 }, { "epoch": 0.54, "grad_norm": 2.002469539642334, "learning_rate": 1.8509152542372883e-05, "loss": 1.4885, "step": 2700 }, { "epoch": 0.56, "grad_norm": 3.197190523147583, "learning_rate": 1.844135593220339e-05, "loss": 1.4658, "step": 2800 }, { "epoch": 0.58, "grad_norm": 3.1256494522094727, "learning_rate": 1.83735593220339e-05, "loss": 1.4664, "step": 2900 }, { "epoch": 0.6, "grad_norm": 3.351365804672241, "learning_rate": 1.8305762711864408e-05, "loss": 1.4346, "step": 3000 }, { "epoch": 0.6, "eval_loss": 1.452864408493042, "eval_runtime": 32.1316, "eval_samples_per_second": 31.122, "eval_steps_per_second": 7.781, "step": 3000 }, { "epoch": 0.62, "grad_norm": 2.3482844829559326, "learning_rate": 1.823796610169492e-05, "loss": 1.4468, "step": 3100 }, { "epoch": 0.64, "grad_norm": 2.369398355484009, "learning_rate": 1.8170169491525426e-05, "loss": 1.4347, "step": 3200 }, { "epoch": 0.66, "grad_norm": 4.017729759216309, "learning_rate": 1.8102372881355933e-05, "loss": 1.4521, "step": 3300 }, { "epoch": 0.68, "grad_norm": 3.2789974212646484, "learning_rate": 1.803457627118644e-05, "loss": 1.4646, "step": 3400 }, { "epoch": 0.7, "grad_norm": 3.6428675651550293, "learning_rate": 1.796677966101695e-05, "loss": 1.4449, "step": 3500 }, { "epoch": 0.7, "eval_loss": 1.4620351791381836, "eval_runtime": 32.1148, "eval_samples_per_second": 31.138, "eval_steps_per_second": 7.785, "step": 3500 }, { "epoch": 0.72, "grad_norm": 2.8244452476501465, "learning_rate": 1.789898305084746e-05, "loss": 1.4519, "step": 3600 }, { "epoch": 0.74, "grad_norm": 2.510248899459839, "learning_rate": 1.783118644067797e-05, "loss": 1.4344, "step": 3700 }, { "epoch": 0.76, "grad_norm": 3.1063272953033447, "learning_rate": 1.7763389830508476e-05, "loss": 1.4184, "step": 3800 }, { "epoch": 0.78, "grad_norm": 3.453012228012085, "learning_rate": 1.7695593220338983e-05, "loss": 1.4423, "step": 3900 }, { "epoch": 0.8, "grad_norm": 3.2033214569091797, "learning_rate": 1.762779661016949e-05, "loss": 1.4131, "step": 4000 }, { "epoch": 0.8, "eval_loss": 1.4358395338058472, "eval_runtime": 32.0793, "eval_samples_per_second": 31.173, "eval_steps_per_second": 7.793, "step": 4000 }, { "epoch": 0.82, "grad_norm": 3.898287057876587, "learning_rate": 1.756e-05, "loss": 1.4205, "step": 4100 }, { "epoch": 0.84, "grad_norm": 2.821258068084717, "learning_rate": 1.749220338983051e-05, "loss": 1.4089, "step": 4200 }, { "epoch": 0.86, "grad_norm": 3.3444740772247314, "learning_rate": 1.742440677966102e-05, "loss": 1.412, "step": 4300 }, { "epoch": 0.88, "grad_norm": 2.98468017578125, "learning_rate": 1.7356610169491526e-05, "loss": 1.4194, "step": 4400 }, { "epoch": 0.9, "grad_norm": 3.4117178916931152, "learning_rate": 1.7288813559322034e-05, "loss": 1.3851, "step": 4500 }, { "epoch": 0.9, "eval_loss": 1.3892821073532104, "eval_runtime": 32.0401, "eval_samples_per_second": 31.211, "eval_steps_per_second": 7.803, "step": 4500 }, { "epoch": 0.92, "grad_norm": 4.513248443603516, "learning_rate": 1.7221016949152544e-05, "loss": 1.3933, "step": 4600 }, { "epoch": 0.94, "grad_norm": 2.634674310684204, "learning_rate": 1.715322033898305e-05, "loss": 1.4078, "step": 4700 }, { "epoch": 0.96, "grad_norm": 2.2215662002563477, "learning_rate": 1.7085423728813562e-05, "loss": 1.4148, "step": 4800 }, { "epoch": 0.98, "grad_norm": 2.4833333492279053, "learning_rate": 1.7018305084745765e-05, "loss": 1.3975, "step": 4900 }, { "epoch": 1.0, "grad_norm": 9.368002891540527, "learning_rate": 1.6950508474576272e-05, "loss": 1.3656, "step": 5000 }, { "epoch": 1.0, "eval_loss": 1.3485517501831055, "eval_runtime": 32.0552, "eval_samples_per_second": 31.196, "eval_steps_per_second": 7.799, "step": 5000 }, { "epoch": 1.02, "grad_norm": 3.2558579444885254, "learning_rate": 1.688271186440678e-05, "loss": 1.3784, "step": 5100 }, { "epoch": 1.04, "grad_norm": 2.6388778686523438, "learning_rate": 1.681491525423729e-05, "loss": 1.3518, "step": 5200 }, { "epoch": 1.06, "grad_norm": 3.5066235065460205, "learning_rate": 1.67471186440678e-05, "loss": 1.3667, "step": 5300 }, { "epoch": 1.08, "grad_norm": 2.646228551864624, "learning_rate": 1.6679322033898308e-05, "loss": 1.3561, "step": 5400 }, { "epoch": 1.1, "grad_norm": 5.225134372711182, "learning_rate": 1.6611525423728815e-05, "loss": 1.3852, "step": 5500 }, { "epoch": 1.1, "eval_loss": 1.3907063007354736, "eval_runtime": 32.0688, "eval_samples_per_second": 31.183, "eval_steps_per_second": 7.796, "step": 5500 }, { "epoch": 1.12, "grad_norm": 3.219271421432495, "learning_rate": 1.6543728813559322e-05, "loss": 1.3582, "step": 5600 }, { "epoch": 1.14, "grad_norm": 3.149339199066162, "learning_rate": 1.6475932203389833e-05, "loss": 1.3529, "step": 5700 }, { "epoch": 1.16, "grad_norm": 2.8628132343292236, "learning_rate": 1.640813559322034e-05, "loss": 1.3508, "step": 5800 }, { "epoch": 1.18, "grad_norm": 2.588643789291382, "learning_rate": 1.634033898305085e-05, "loss": 1.3409, "step": 5900 }, { "epoch": 1.2, "grad_norm": 2.781371831893921, "learning_rate": 1.6272542372881358e-05, "loss": 1.3321, "step": 6000 }, { "epoch": 1.2, "eval_loss": 1.352632761001587, "eval_runtime": 32.0388, "eval_samples_per_second": 31.212, "eval_steps_per_second": 7.803, "step": 6000 }, { "epoch": 1.22, "grad_norm": 2.8276853561401367, "learning_rate": 1.6204745762711865e-05, "loss": 1.3299, "step": 6100 }, { "epoch": 1.24, "grad_norm": 2.248138189315796, "learning_rate": 1.6136949152542372e-05, "loss": 1.3344, "step": 6200 }, { "epoch": 1.26, "grad_norm": 2.4552414417266846, "learning_rate": 1.6069152542372883e-05, "loss": 1.3461, "step": 6300 }, { "epoch": 1.28, "grad_norm": 2.599705457687378, "learning_rate": 1.600135593220339e-05, "loss": 1.336, "step": 6400 }, { "epoch": 1.3, "grad_norm": 2.9430196285247803, "learning_rate": 1.59335593220339e-05, "loss": 1.326, "step": 6500 }, { "epoch": 1.3, "eval_loss": 1.3490904569625854, "eval_runtime": 32.0447, "eval_samples_per_second": 31.206, "eval_steps_per_second": 7.802, "step": 6500 }, { "epoch": 1.32, "grad_norm": 2.6514320373535156, "learning_rate": 1.5865762711864408e-05, "loss": 1.3307, "step": 6600 }, { "epoch": 1.34, "grad_norm": 3.654257297515869, "learning_rate": 1.579796610169492e-05, "loss": 1.3384, "step": 6700 }, { "epoch": 1.36, "grad_norm": 2.250063896179199, "learning_rate": 1.5730169491525426e-05, "loss": 1.3345, "step": 6800 }, { "epoch": 1.38, "grad_norm": 3.3941681385040283, "learning_rate": 1.5662372881355933e-05, "loss": 1.3365, "step": 6900 }, { "epoch": 1.4, "grad_norm": 3.2504653930664062, "learning_rate": 1.559525423728814e-05, "loss": 1.3341, "step": 7000 }, { "epoch": 1.4, "eval_loss": 1.3799883127212524, "eval_runtime": 32.0499, "eval_samples_per_second": 31.201, "eval_steps_per_second": 7.8, "step": 7000 }, { "epoch": 1.42, "grad_norm": 4.476578712463379, "learning_rate": 1.5527457627118646e-05, "loss": 1.3137, "step": 7100 }, { "epoch": 1.44, "grad_norm": 2.3496310710906982, "learning_rate": 1.5459661016949153e-05, "loss": 1.3091, "step": 7200 }, { "epoch": 1.46, "grad_norm": 2.8387935161590576, "learning_rate": 1.539186440677966e-05, "loss": 1.3111, "step": 7300 }, { "epoch": 1.48, "grad_norm": 2.771179437637329, "learning_rate": 1.532406779661017e-05, "loss": 1.3237, "step": 7400 }, { "epoch": 1.5, "grad_norm": 6.6864800453186035, "learning_rate": 1.5256271186440678e-05, "loss": 1.2966, "step": 7500 }, { "epoch": 1.5, "eval_loss": 1.3431659936904907, "eval_runtime": 32.03, "eval_samples_per_second": 31.221, "eval_steps_per_second": 7.805, "step": 7500 }, { "epoch": 1.52, "grad_norm": 3.6726136207580566, "learning_rate": 1.5188474576271189e-05, "loss": 1.3214, "step": 7600 }, { "epoch": 1.54, "grad_norm": 2.3472280502319336, "learning_rate": 1.5120677966101696e-05, "loss": 1.2976, "step": 7700 }, { "epoch": 1.56, "grad_norm": 4.14886474609375, "learning_rate": 1.5052881355932205e-05, "loss": 1.3035, "step": 7800 }, { "epoch": 1.58, "grad_norm": 2.4956181049346924, "learning_rate": 1.4985084745762712e-05, "loss": 1.291, "step": 7900 }, { "epoch": 1.6, "grad_norm": 2.7873871326446533, "learning_rate": 1.4917288135593221e-05, "loss": 1.3038, "step": 8000 }, { "epoch": 1.6, "eval_loss": 1.2782970666885376, "eval_runtime": 32.0401, "eval_samples_per_second": 31.211, "eval_steps_per_second": 7.803, "step": 8000 }, { "epoch": 1.62, "grad_norm": 3.488408327102661, "learning_rate": 1.4849491525423729e-05, "loss": 1.2882, "step": 8100 }, { "epoch": 1.64, "grad_norm": 2.6660850048065186, "learning_rate": 1.478169491525424e-05, "loss": 1.2943, "step": 8200 }, { "epoch": 1.66, "grad_norm": 2.9623501300811768, "learning_rate": 1.4713898305084746e-05, "loss": 1.3037, "step": 8300 }, { "epoch": 1.68, "grad_norm": 3.1543288230895996, "learning_rate": 1.4646101694915255e-05, "loss": 1.2855, "step": 8400 }, { "epoch": 1.7, "grad_norm": 4.273654460906982, "learning_rate": 1.4578305084745763e-05, "loss": 1.2909, "step": 8500 }, { "epoch": 1.7, "eval_loss": 1.2626874446868896, "eval_runtime": 32.031, "eval_samples_per_second": 31.22, "eval_steps_per_second": 7.805, "step": 8500 }, { "epoch": 1.72, "grad_norm": 3.6739509105682373, "learning_rate": 1.4510508474576273e-05, "loss": 1.2998, "step": 8600 }, { "epoch": 1.74, "grad_norm": 3.7686262130737305, "learning_rate": 1.444271186440678e-05, "loss": 1.2705, "step": 8700 }, { "epoch": 1.76, "grad_norm": 2.919701099395752, "learning_rate": 1.437491525423729e-05, "loss": 1.2868, "step": 8800 }, { "epoch": 1.78, "grad_norm": 2.2622547149658203, "learning_rate": 1.4307118644067797e-05, "loss": 1.2878, "step": 8900 }, { "epoch": 1.8, "grad_norm": 3.142400026321411, "learning_rate": 1.4239322033898306e-05, "loss": 1.2917, "step": 9000 }, { "epoch": 1.8, "eval_loss": 1.2586073875427246, "eval_runtime": 32.0452, "eval_samples_per_second": 31.206, "eval_steps_per_second": 7.801, "step": 9000 }, { "epoch": 1.82, "grad_norm": 3.069307565689087, "learning_rate": 1.4171525423728816e-05, "loss": 1.2823, "step": 9100 }, { "epoch": 1.84, "grad_norm": 2.5960686206817627, "learning_rate": 1.4103728813559323e-05, "loss": 1.2857, "step": 9200 }, { "epoch": 1.86, "grad_norm": 2.7897732257843018, "learning_rate": 1.4035932203389832e-05, "loss": 1.2802, "step": 9300 }, { "epoch": 1.88, "grad_norm": 2.958104133605957, "learning_rate": 1.396813559322034e-05, "loss": 1.2557, "step": 9400 }, { "epoch": 1.9, "grad_norm": 10.324545860290527, "learning_rate": 1.390033898305085e-05, "loss": 1.2593, "step": 9500 }, { "epoch": 1.9, "eval_loss": 1.2821636199951172, "eval_runtime": 32.0428, "eval_samples_per_second": 31.208, "eval_steps_per_second": 7.802, "step": 9500 }, { "epoch": 1.92, "grad_norm": 2.709005832672119, "learning_rate": 1.3833220338983051e-05, "loss": 1.2662, "step": 9600 }, { "epoch": 1.94, "grad_norm": 3.7861385345458984, "learning_rate": 1.376542372881356e-05, "loss": 1.2771, "step": 9700 }, { "epoch": 1.96, "grad_norm": 3.6657748222351074, "learning_rate": 1.3697627118644067e-05, "loss": 1.2507, "step": 9800 }, { "epoch": 1.98, "grad_norm": 2.3001210689544678, "learning_rate": 1.3629830508474578e-05, "loss": 1.2707, "step": 9900 }, { "epoch": 2.0, "grad_norm": 3.042156219482422, "learning_rate": 1.3562033898305085e-05, "loss": 1.2603, "step": 10000 }, { "epoch": 2.0, "eval_loss": 1.328155755996704, "eval_runtime": 32.083, "eval_samples_per_second": 31.169, "eval_steps_per_second": 7.792, "step": 10000 }, { "epoch": 2.02, "grad_norm": 3.44785213470459, "learning_rate": 1.3494237288135594e-05, "loss": 1.2366, "step": 10100 }, { "epoch": 2.04, "grad_norm": 2.8859453201293945, "learning_rate": 1.3426440677966105e-05, "loss": 1.2456, "step": 10200 }, { "epoch": 2.06, "grad_norm": 4.074487686157227, "learning_rate": 1.3359322033898305e-05, "loss": 1.2381, "step": 10300 }, { "epoch": 2.08, "grad_norm": 2.731782913208008, "learning_rate": 1.3291525423728814e-05, "loss": 1.2455, "step": 10400 }, { "epoch": 2.1, "grad_norm": 3.386629343032837, "learning_rate": 1.3223728813559322e-05, "loss": 1.2106, "step": 10500 }, { "epoch": 2.1, "eval_loss": 1.2644318342208862, "eval_runtime": 32.0624, "eval_samples_per_second": 31.189, "eval_steps_per_second": 7.797, "step": 10500 }, { "epoch": 2.12, "grad_norm": 3.1178646087646484, "learning_rate": 1.3155932203389832e-05, "loss": 1.2235, "step": 10600 }, { "epoch": 2.14, "grad_norm": 4.623811721801758, "learning_rate": 1.308813559322034e-05, "loss": 1.2485, "step": 10700 }, { "epoch": 2.16, "grad_norm": 2.8791615962982178, "learning_rate": 1.3020338983050848e-05, "loss": 1.223, "step": 10800 }, { "epoch": 2.18, "grad_norm": 3.4094972610473633, "learning_rate": 1.2952542372881356e-05, "loss": 1.2052, "step": 10900 }, { "epoch": 2.2, "grad_norm": 3.0983850955963135, "learning_rate": 1.2884745762711866e-05, "loss": 1.2365, "step": 11000 }, { "epoch": 2.2, "eval_loss": 1.2809499502182007, "eval_runtime": 32.0686, "eval_samples_per_second": 31.183, "eval_steps_per_second": 7.796, "step": 11000 }, { "epoch": 2.22, "grad_norm": 3.0877022743225098, "learning_rate": 1.2816949152542375e-05, "loss": 1.2395, "step": 11100 }, { "epoch": 2.24, "grad_norm": 3.992199182510376, "learning_rate": 1.2749152542372882e-05, "loss": 1.2363, "step": 11200 }, { "epoch": 2.26, "grad_norm": 2.398602247238159, "learning_rate": 1.2681355932203391e-05, "loss": 1.2013, "step": 11300 }, { "epoch": 2.28, "grad_norm": 2.6053998470306396, "learning_rate": 1.2613559322033899e-05, "loss": 1.2072, "step": 11400 }, { "epoch": 2.3, "grad_norm": 2.841458559036255, "learning_rate": 1.2545762711864409e-05, "loss": 1.2342, "step": 11500 }, { "epoch": 2.3, "eval_loss": 1.2798084020614624, "eval_runtime": 32.0506, "eval_samples_per_second": 31.201, "eval_steps_per_second": 7.8, "step": 11500 }, { "epoch": 2.32, "grad_norm": 7.328969478607178, "learning_rate": 1.2477966101694916e-05, "loss": 1.2122, "step": 11600 }, { "epoch": 2.34, "grad_norm": 3.2942442893981934, "learning_rate": 1.2410169491525425e-05, "loss": 1.2196, "step": 11700 }, { "epoch": 2.36, "grad_norm": 3.7863004207611084, "learning_rate": 1.2342372881355933e-05, "loss": 1.2078, "step": 11800 }, { "epoch": 2.38, "grad_norm": 4.496535778045654, "learning_rate": 1.2274576271186443e-05, "loss": 1.2191, "step": 11900 }, { "epoch": 2.4, "grad_norm": 3.3873374462127686, "learning_rate": 1.220677966101695e-05, "loss": 1.2232, "step": 12000 }, { "epoch": 2.4, "eval_loss": 1.233884334564209, "eval_runtime": 32.1206, "eval_samples_per_second": 31.133, "eval_steps_per_second": 7.783, "step": 12000 }, { "epoch": 2.42, "grad_norm": 3.0606558322906494, "learning_rate": 1.213898305084746e-05, "loss": 1.2159, "step": 12100 }, { "epoch": 2.44, "grad_norm": 3.141958475112915, "learning_rate": 1.2071186440677967e-05, "loss": 1.1979, "step": 12200 }, { "epoch": 2.46, "grad_norm": 2.942594051361084, "learning_rate": 1.2003389830508475e-05, "loss": 1.2046, "step": 12300 }, { "epoch": 2.48, "grad_norm": 4.217489719390869, "learning_rate": 1.1935593220338983e-05, "loss": 1.2257, "step": 12400 }, { "epoch": 2.5, "grad_norm": 2.2574400901794434, "learning_rate": 1.1867796610169493e-05, "loss": 1.2101, "step": 12500 }, { "epoch": 2.5, "eval_loss": 1.2290363311767578, "eval_runtime": 32.0859, "eval_samples_per_second": 31.166, "eval_steps_per_second": 7.792, "step": 12500 }, { "epoch": 2.52, "grad_norm": 3.426748752593994, "learning_rate": 1.18e-05, "loss": 1.1919, "step": 12600 }, { "epoch": 2.54, "grad_norm": 3.2718164920806885, "learning_rate": 1.173220338983051e-05, "loss": 1.1847, "step": 12700 }, { "epoch": 2.56, "grad_norm": 2.755702018737793, "learning_rate": 1.1664406779661017e-05, "loss": 1.1958, "step": 12800 }, { "epoch": 2.58, "grad_norm": 2.5422780513763428, "learning_rate": 1.1596610169491527e-05, "loss": 1.2087, "step": 12900 }, { "epoch": 2.6, "grad_norm": 2.8023762702941895, "learning_rate": 1.1528813559322035e-05, "loss": 1.2158, "step": 13000 }, { "epoch": 2.6, "eval_loss": 1.2218210697174072, "eval_runtime": 32.0875, "eval_samples_per_second": 31.165, "eval_steps_per_second": 7.791, "step": 13000 }, { "epoch": 2.62, "grad_norm": 2.298729181289673, "learning_rate": 1.1461016949152543e-05, "loss": 1.2015, "step": 13100 }, { "epoch": 2.64, "grad_norm": 3.4919259548187256, "learning_rate": 1.1393898305084748e-05, "loss": 1.2073, "step": 13200 }, { "epoch": 2.66, "grad_norm": 2.8417234420776367, "learning_rate": 1.1326101694915255e-05, "loss": 1.1753, "step": 13300 }, { "epoch": 2.68, "grad_norm": 2.5789356231689453, "learning_rate": 1.1258305084745764e-05, "loss": 1.2063, "step": 13400 }, { "epoch": 2.7, "grad_norm": 2.643751859664917, "learning_rate": 1.1190508474576271e-05, "loss": 1.1954, "step": 13500 }, { "epoch": 2.7, "eval_loss": 1.2146854400634766, "eval_runtime": 32.1302, "eval_samples_per_second": 31.123, "eval_steps_per_second": 7.781, "step": 13500 }, { "epoch": 2.72, "grad_norm": 3.573153495788574, "learning_rate": 1.1122711864406782e-05, "loss": 1.2029, "step": 13600 }, { "epoch": 2.74, "grad_norm": 3.4587175846099854, "learning_rate": 1.1054915254237289e-05, "loss": 1.1863, "step": 13700 }, { "epoch": 2.76, "grad_norm": 3.2643883228302, "learning_rate": 1.0987118644067798e-05, "loss": 1.2034, "step": 13800 }, { "epoch": 2.78, "grad_norm": 2.882018804550171, "learning_rate": 1.0919322033898305e-05, "loss": 1.1962, "step": 13900 }, { "epoch": 2.8, "grad_norm": 2.8615987300872803, "learning_rate": 1.0851525423728814e-05, "loss": 1.1839, "step": 14000 }, { "epoch": 2.8, "eval_loss": 1.224142074584961, "eval_runtime": 32.1391, "eval_samples_per_second": 31.115, "eval_steps_per_second": 7.779, "step": 14000 }, { "epoch": 2.82, "grad_norm": 2.88732647895813, "learning_rate": 1.0783728813559321e-05, "loss": 1.1939, "step": 14100 }, { "epoch": 2.84, "grad_norm": 2.5806539058685303, "learning_rate": 1.0715932203389832e-05, "loss": 1.1893, "step": 14200 }, { "epoch": 2.86, "grad_norm": 2.975961446762085, "learning_rate": 1.0648135593220339e-05, "loss": 1.1679, "step": 14300 }, { "epoch": 2.88, "grad_norm": 3.987508773803711, "learning_rate": 1.0580338983050848e-05, "loss": 1.1939, "step": 14400 }, { "epoch": 2.9, "grad_norm": 2.954615354537964, "learning_rate": 1.0512542372881355e-05, "loss": 1.1823, "step": 14500 }, { "epoch": 2.9, "eval_loss": 1.1981159448623657, "eval_runtime": 32.1026, "eval_samples_per_second": 31.15, "eval_steps_per_second": 7.788, "step": 14500 }, { "epoch": 2.92, "grad_norm": 2.9803731441497803, "learning_rate": 1.0444745762711866e-05, "loss": 1.1792, "step": 14600 }, { "epoch": 2.94, "grad_norm": 3.527334451675415, "learning_rate": 1.0376949152542373e-05, "loss": 1.1786, "step": 14700 }, { "epoch": 2.96, "grad_norm": 2.8651585578918457, "learning_rate": 1.0309152542372882e-05, "loss": 1.177, "step": 14800 }, { "epoch": 2.98, "grad_norm": 2.527571201324463, "learning_rate": 1.0241355932203391e-05, "loss": 1.1863, "step": 14900 }, { "epoch": 3.0, "grad_norm": 4.754082202911377, "learning_rate": 1.01735593220339e-05, "loss": 1.2066, "step": 15000 }, { "epoch": 3.0, "eval_loss": 1.197231650352478, "eval_runtime": 32.0595, "eval_samples_per_second": 31.192, "eval_steps_per_second": 7.798, "step": 15000 }, { "epoch": 3.02, "grad_norm": 2.30944561958313, "learning_rate": 1.0105762711864409e-05, "loss": 1.1471, "step": 15100 }, { "epoch": 3.04, "grad_norm": 2.808654308319092, "learning_rate": 1.0037966101694916e-05, "loss": 1.1552, "step": 15200 }, { "epoch": 3.06, "grad_norm": 2.997007369995117, "learning_rate": 9.970169491525425e-06, "loss": 1.1387, "step": 15300 }, { "epoch": 3.08, "grad_norm": 2.888899564743042, "learning_rate": 9.902372881355932e-06, "loss": 1.1245, "step": 15400 }, { "epoch": 3.1, "grad_norm": 5.218863010406494, "learning_rate": 9.834576271186441e-06, "loss": 1.1395, "step": 15500 }, { "epoch": 3.1, "eval_loss": 1.1928576231002808, "eval_runtime": 32.0387, "eval_samples_per_second": 31.212, "eval_steps_per_second": 7.803, "step": 15500 }, { "epoch": 3.12, "grad_norm": 3.4590179920196533, "learning_rate": 9.76677966101695e-06, "loss": 1.1391, "step": 15600 }, { "epoch": 3.14, "grad_norm": 3.658998966217041, "learning_rate": 9.698983050847457e-06, "loss": 1.1298, "step": 15700 }, { "epoch": 3.16, "grad_norm": 3.6307895183563232, "learning_rate": 9.631186440677966e-06, "loss": 1.1395, "step": 15800 }, { "epoch": 3.18, "grad_norm": 4.277059078216553, "learning_rate": 9.563389830508475e-06, "loss": 1.144, "step": 15900 }, { "epoch": 3.2, "grad_norm": 3.627415657043457, "learning_rate": 9.495593220338984e-06, "loss": 1.1276, "step": 16000 }, { "epoch": 3.2, "eval_loss": 1.1989972591400146, "eval_runtime": 32.1164, "eval_samples_per_second": 31.137, "eval_steps_per_second": 7.784, "step": 16000 }, { "epoch": 3.22, "grad_norm": 3.0596442222595215, "learning_rate": 9.427796610169491e-06, "loss": 1.1137, "step": 16100 }, { "epoch": 3.24, "grad_norm": 3.6272132396698, "learning_rate": 9.360000000000002e-06, "loss": 1.1315, "step": 16200 }, { "epoch": 3.26, "grad_norm": 4.7155232429504395, "learning_rate": 9.292203389830509e-06, "loss": 1.1109, "step": 16300 }, { "epoch": 3.28, "grad_norm": 3.0606753826141357, "learning_rate": 9.224406779661018e-06, "loss": 1.1356, "step": 16400 }, { "epoch": 3.3, "grad_norm": 3.105088233947754, "learning_rate": 9.156610169491527e-06, "loss": 1.12, "step": 16500 }, { "epoch": 3.3, "eval_loss": 1.2401061058044434, "eval_runtime": 32.03, "eval_samples_per_second": 31.221, "eval_steps_per_second": 7.805, "step": 16500 }, { "epoch": 3.32, "grad_norm": 4.067813873291016, "learning_rate": 9.088813559322036e-06, "loss": 1.1264, "step": 16600 }, { "epoch": 3.34, "grad_norm": 3.445667028427124, "learning_rate": 9.021016949152543e-06, "loss": 1.1131, "step": 16700 }, { "epoch": 3.36, "grad_norm": 3.6178576946258545, "learning_rate": 8.953220338983052e-06, "loss": 1.1286, "step": 16800 }, { "epoch": 3.38, "grad_norm": 3.9732513427734375, "learning_rate": 8.885423728813561e-06, "loss": 1.1324, "step": 16900 }, { "epoch": 3.4, "grad_norm": 2.9241085052490234, "learning_rate": 8.817627118644068e-06, "loss": 1.1403, "step": 17000 }, { "epoch": 3.4, "eval_loss": 1.1982412338256836, "eval_runtime": 32.0231, "eval_samples_per_second": 31.227, "eval_steps_per_second": 7.807, "step": 17000 }, { "epoch": 3.42, "grad_norm": 3.081540822982788, "learning_rate": 8.749830508474577e-06, "loss": 1.1308, "step": 17100 }, { "epoch": 3.44, "grad_norm": 2.958272695541382, "learning_rate": 8.682033898305086e-06, "loss": 1.1305, "step": 17200 }, { "epoch": 3.46, "grad_norm": 3.6438255310058594, "learning_rate": 8.614237288135593e-06, "loss": 1.1224, "step": 17300 }, { "epoch": 3.48, "grad_norm": 3.043936014175415, "learning_rate": 8.546440677966102e-06, "loss": 1.1257, "step": 17400 }, { "epoch": 3.5, "grad_norm": 3.0439295768737793, "learning_rate": 8.479322033898306e-06, "loss": 1.1277, "step": 17500 }, { "epoch": 3.5, "eval_loss": 1.2347404956817627, "eval_runtime": 32.0794, "eval_samples_per_second": 31.173, "eval_steps_per_second": 7.793, "step": 17500 }, { "epoch": 3.52, "grad_norm": 3.439828872680664, "learning_rate": 8.411525423728815e-06, "loss": 1.1214, "step": 17600 }, { "epoch": 3.54, "grad_norm": 3.504438638687134, "learning_rate": 8.343728813559323e-06, "loss": 1.1288, "step": 17700 }, { "epoch": 3.56, "grad_norm": 3.479522943496704, "learning_rate": 8.275932203389832e-06, "loss": 1.1066, "step": 17800 }, { "epoch": 3.58, "grad_norm": 4.671799182891846, "learning_rate": 8.20813559322034e-06, "loss": 1.113, "step": 17900 }, { "epoch": 3.6, "grad_norm": 3.815126895904541, "learning_rate": 8.140338983050848e-06, "loss": 1.1067, "step": 18000 }, { "epoch": 3.6, "eval_loss": 1.14591646194458, "eval_runtime": 32.0262, "eval_samples_per_second": 31.224, "eval_steps_per_second": 7.806, "step": 18000 }, { "epoch": 3.62, "grad_norm": 3.5889601707458496, "learning_rate": 8.072542372881357e-06, "loss": 1.1344, "step": 18100 }, { "epoch": 3.64, "grad_norm": 3.246824264526367, "learning_rate": 8.004745762711866e-06, "loss": 1.1067, "step": 18200 }, { "epoch": 3.66, "grad_norm": 3.164020538330078, "learning_rate": 7.936949152542374e-06, "loss": 1.1187, "step": 18300 }, { "epoch": 3.68, "grad_norm": 3.3035717010498047, "learning_rate": 7.869152542372882e-06, "loss": 1.0953, "step": 18400 }, { "epoch": 3.7, "grad_norm": 4.341372013092041, "learning_rate": 7.80135593220339e-06, "loss": 1.111, "step": 18500 }, { "epoch": 3.7, "eval_loss": 1.1732285022735596, "eval_runtime": 32.0886, "eval_samples_per_second": 31.164, "eval_steps_per_second": 7.791, "step": 18500 }, { "epoch": 3.72, "grad_norm": 3.3150527477264404, "learning_rate": 7.7335593220339e-06, "loss": 1.1056, "step": 18600 }, { "epoch": 3.74, "grad_norm": 3.855264663696289, "learning_rate": 7.665762711864407e-06, "loss": 1.1424, "step": 18700 }, { "epoch": 3.76, "grad_norm": 3.5808959007263184, "learning_rate": 7.597966101694916e-06, "loss": 1.0958, "step": 18800 }, { "epoch": 3.78, "grad_norm": 4.313356876373291, "learning_rate": 7.530169491525425e-06, "loss": 1.1009, "step": 18900 }, { "epoch": 3.8, "grad_norm": 3.271247625350952, "learning_rate": 7.462372881355933e-06, "loss": 1.121, "step": 19000 }, { "epoch": 3.8, "eval_loss": 1.1872472763061523, "eval_runtime": 32.0758, "eval_samples_per_second": 31.176, "eval_steps_per_second": 7.794, "step": 19000 } ], "logging_steps": 100, "max_steps": 30000, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "total_flos": 1.793641609691136e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }