{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.581755593803787,
  "eval_steps": 500,
  "global_step": 12000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 1.281562089920044,
      "learning_rate": 4e-05,
      "loss": 0.5044,
      "step": 100
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.3552560806274414,
      "learning_rate": 8e-05,
      "loss": 0.4867,
      "step": 200
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.8318453431129456,
      "learning_rate": 0.00012,
      "loss": 0.4359,
      "step": 300
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.3478361368179321,
      "learning_rate": 0.00016,
      "loss": 0.4475,
      "step": 400
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.1437219381332397,
      "learning_rate": 0.0002,
      "loss": 0.4524,
      "step": 500
    },
    {
      "epoch": 0.11,
      "eval_loss": 0.3725915253162384,
      "eval_runtime": 90.4733,
      "eval_samples_per_second": 11.053,
      "eval_steps_per_second": 2.763,
      "step": 500
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.8374431133270264,
      "learning_rate": 0.00019897435897435898,
      "loss": 0.4569,
      "step": 600
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.6386727094650269,
      "learning_rate": 0.00019794871794871796,
      "loss": 0.4584,
      "step": 700
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.4089267253875732,
      "learning_rate": 0.00019692307692307696,
      "loss": 0.4625,
      "step": 800
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.269509792327881,
      "learning_rate": 0.0001958974358974359,
      "loss": 0.464,
      "step": 900
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.0896354913711548,
      "learning_rate": 0.00019487179487179487,
      "loss": 0.4807,
      "step": 1000
    },
    {
      "epoch": 0.22,
      "eval_loss": 0.3314511179924011,
      "eval_runtime": 90.3867,
      "eval_samples_per_second": 11.064,
      "eval_steps_per_second": 2.766,
      "step": 1000
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.0933148860931396,
      "learning_rate": 0.00019384615384615385,
      "loss": 0.4032,
      "step": 1100
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.555145740509033,
      "learning_rate": 0.00019282051282051282,
      "loss": 0.4353,
      "step": 1200
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.245523452758789,
      "learning_rate": 0.00019179487179487182,
      "loss": 0.409,
      "step": 1300
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.084887981414795,
      "learning_rate": 0.0001907692307692308,
      "loss": 0.4214,
      "step": 1400
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.2279912233352661,
      "learning_rate": 0.00018974358974358974,
      "loss": 0.4507,
      "step": 1500
    },
    {
      "epoch": 0.32,
      "eval_loss": 0.312484472990036,
      "eval_runtime": 90.4149,
      "eval_samples_per_second": 11.06,
      "eval_steps_per_second": 2.765,
      "step": 1500
    },
    {
      "epoch": 0.34,
      "grad_norm": 4.321805477142334,
      "learning_rate": 0.0001887179487179487,
      "loss": 0.4523,
      "step": 1600
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.376891016960144,
      "learning_rate": 0.0001876923076923077,
      "loss": 0.441,
      "step": 1700
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.077620267868042,
      "learning_rate": 0.0001866666666666667,
      "loss": 0.4392,
      "step": 1800
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.546964168548584,
      "learning_rate": 0.00018564102564102566,
      "loss": 0.4749,
      "step": 1900
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.035524845123291,
      "learning_rate": 0.00018461538461538463,
      "loss": 0.4116,
      "step": 2000
    },
    {
      "epoch": 0.43,
      "eval_loss": 0.2960582375526428,
      "eval_runtime": 90.8091,
      "eval_samples_per_second": 11.012,
      "eval_steps_per_second": 2.753,
      "step": 2000
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.3167320489883423,
      "learning_rate": 0.00018358974358974358,
      "loss": 0.4135,
      "step": 2100
    },
    {
      "epoch": 0.47,
      "grad_norm": 2.908482074737549,
      "learning_rate": 0.00018256410256410258,
      "loss": 0.4092,
      "step": 2200
    },
    {
      "epoch": 0.49,
      "grad_norm": 2.8161234855651855,
      "learning_rate": 0.00018153846153846155,
      "loss": 0.4364,
      "step": 2300
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.0435675382614136,
      "learning_rate": 0.00018051282051282052,
      "loss": 0.4258,
      "step": 2400
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.7152374982833862,
      "learning_rate": 0.0001794871794871795,
      "loss": 0.4208,
      "step": 2500
    },
    {
      "epoch": 0.54,
      "eval_loss": 0.3006269633769989,
      "eval_runtime": 90.5246,
      "eval_samples_per_second": 11.047,
      "eval_steps_per_second": 2.762,
      "step": 2500
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.79283607006073,
      "learning_rate": 0.00017846153846153847,
      "loss": 0.4209,
      "step": 2600
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.5817480087280273,
      "learning_rate": 0.00017743589743589744,
      "loss": 0.415,
      "step": 2700
    },
    {
      "epoch": 0.6,
      "grad_norm": 2.306807279586792,
      "learning_rate": 0.00017641025641025642,
      "loss": 0.4231,
      "step": 2800
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.0432254076004028,
      "learning_rate": 0.0001753846153846154,
      "loss": 0.38,
      "step": 2900
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.0641353130340576,
      "learning_rate": 0.00017435897435897436,
      "loss": 0.4671,
      "step": 3000
    },
    {
      "epoch": 0.65,
      "eval_loss": 0.3091895282268524,
      "eval_runtime": 90.8377,
      "eval_samples_per_second": 11.009,
      "eval_steps_per_second": 2.752,
      "step": 3000
    },
    {
      "epoch": 0.67,
      "grad_norm": 2.6519484519958496,
      "learning_rate": 0.00017333333333333334,
      "loss": 0.4148,
      "step": 3100
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.851606547832489,
      "learning_rate": 0.00017230769230769234,
      "loss": 0.37,
      "step": 3200
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.1799293756484985,
      "learning_rate": 0.0001712923076923077,
      "loss": 0.3687,
      "step": 3300
    },
    {
      "epoch": 0.73,
      "grad_norm": 1.2787513732910156,
      "learning_rate": 0.0001702666666666667,
      "loss": 0.3997,
      "step": 3400
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.2793489694595337,
      "learning_rate": 0.00016924102564102564,
      "loss": 0.3936,
      "step": 3500
    },
    {
      "epoch": 0.75,
      "eval_loss": 0.2866547703742981,
      "eval_runtime": 90.8477,
      "eval_samples_per_second": 11.007,
      "eval_steps_per_second": 2.752,
      "step": 3500
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.7189435362815857,
      "learning_rate": 0.0001682153846153846,
      "loss": 0.4191,
      "step": 3600
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.9801158308982849,
      "learning_rate": 0.0001671897435897436,
      "loss": 0.3978,
      "step": 3700
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.41176438331604,
      "learning_rate": 0.0001661641025641026,
      "loss": 0.3762,
      "step": 3800
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.9064908027648926,
      "learning_rate": 0.00016513846153846156,
      "loss": 0.3746,
      "step": 3900
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.3439726829528809,
      "learning_rate": 0.00016411282051282053,
      "loss": 0.3823,
      "step": 4000
    },
    {
      "epoch": 0.86,
      "eval_loss": 0.2893534004688263,
      "eval_runtime": 90.8714,
      "eval_samples_per_second": 11.005,
      "eval_steps_per_second": 2.751,
      "step": 4000
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.6646331548690796,
      "learning_rate": 0.00016308717948717948,
      "loss": 0.4029,
      "step": 4100
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.9874048829078674,
      "learning_rate": 0.00016206153846153845,
      "loss": 0.3933,
      "step": 4200
    },
    {
      "epoch": 0.93,
      "grad_norm": 1.0010713338851929,
      "learning_rate": 0.00016103589743589745,
      "loss": 0.41,
      "step": 4300
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.9607357978820801,
      "learning_rate": 0.00016001025641025642,
      "loss": 0.3659,
      "step": 4400
    },
    {
      "epoch": 0.97,
      "grad_norm": 1.3652217388153076,
      "learning_rate": 0.0001589846153846154,
      "loss": 0.3794,
      "step": 4500
    },
    {
      "epoch": 0.97,
      "eval_loss": 0.2667659819126129,
      "eval_runtime": 90.9201,
      "eval_samples_per_second": 10.999,
      "eval_steps_per_second": 2.75,
      "step": 4500
    },
    {
      "epoch": 0.99,
      "grad_norm": 1.056810736656189,
      "learning_rate": 0.00015795897435897437,
      "loss": 0.3878,
      "step": 4600
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.7765600085258484,
      "learning_rate": 0.00015693333333333334,
      "loss": 0.3396,
      "step": 4700
    },
    {
      "epoch": 1.03,
      "grad_norm": 1.0664465427398682,
      "learning_rate": 0.00015590769230769232,
      "loss": 0.2813,
      "step": 4800
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.6342141628265381,
      "learning_rate": 0.0001548820512820513,
      "loss": 0.309,
      "step": 4900
    },
    {
      "epoch": 1.08,
      "grad_norm": 1.1380507946014404,
      "learning_rate": 0.00015385641025641026,
      "loss": 0.2888,
      "step": 5000
    },
    {
      "epoch": 1.08,
      "eval_loss": 0.27630186080932617,
      "eval_runtime": 90.9762,
      "eval_samples_per_second": 10.992,
      "eval_steps_per_second": 2.748,
      "step": 5000
    },
    {
      "epoch": 1.1,
      "grad_norm": 1.6043438911437988,
      "learning_rate": 0.00015283076923076924,
      "loss": 0.3836,
      "step": 5100
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.2961331307888031,
      "learning_rate": 0.0001518051282051282,
      "loss": 0.2964,
      "step": 5200
    },
    {
      "epoch": 1.14,
      "grad_norm": 1.0106085538864136,
      "learning_rate": 0.00015077948717948718,
      "loss": 0.2837,
      "step": 5300
    },
    {
      "epoch": 1.16,
      "grad_norm": 1.2187350988388062,
      "learning_rate": 0.00014975384615384616,
      "loss": 0.3093,
      "step": 5400
    },
    {
      "epoch": 1.18,
      "grad_norm": 1.4018324613571167,
      "learning_rate": 0.00014872820512820513,
      "loss": 0.3029,
      "step": 5500
    },
    {
      "epoch": 1.18,
      "eval_loss": 0.2620677351951599,
      "eval_runtime": 90.8592,
      "eval_samples_per_second": 11.006,
      "eval_steps_per_second": 2.752,
      "step": 5500
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.8634820580482483,
      "learning_rate": 0.00014771282051282051,
      "loss": 0.3046,
      "step": 5600
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.6309552788734436,
      "learning_rate": 0.0001466871794871795,
      "loss": 0.3041,
      "step": 5700
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.2985124588012695,
      "learning_rate": 0.00014566153846153846,
      "loss": 0.312,
      "step": 5800
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.9580160975456238,
      "learning_rate": 0.00014463589743589746,
      "loss": 0.3306,
      "step": 5900
    },
    {
      "epoch": 1.29,
      "grad_norm": 1.0316152572631836,
      "learning_rate": 0.00014361025641025643,
      "loss": 0.2979,
      "step": 6000
    },
    {
      "epoch": 1.29,
      "eval_loss": 0.26443469524383545,
      "eval_runtime": 90.9026,
      "eval_samples_per_second": 11.001,
      "eval_steps_per_second": 2.75,
      "step": 6000
    },
    {
      "epoch": 1.31,
      "grad_norm": 1.9922102689743042,
      "learning_rate": 0.00014258461538461538,
      "loss": 0.3105,
      "step": 6100
    },
    {
      "epoch": 1.33,
      "grad_norm": 1.274574875831604,
      "learning_rate": 0.00014155897435897435,
      "loss": 0.3174,
      "step": 6200
    },
    {
      "epoch": 1.36,
      "grad_norm": 1.1152174472808838,
      "learning_rate": 0.00014053333333333335,
      "loss": 0.3094,
      "step": 6300
    },
    {
      "epoch": 1.38,
      "grad_norm": 1.4428844451904297,
      "learning_rate": 0.00013950769230769233,
      "loss": 0.3136,
      "step": 6400
    },
    {
      "epoch": 1.4,
      "grad_norm": 1.418609380722046,
      "learning_rate": 0.0001384820512820513,
      "loss": 0.3425,
      "step": 6500
    },
    {
      "epoch": 1.4,
      "eval_loss": 0.299164742231369,
      "eval_runtime": 90.9837,
      "eval_samples_per_second": 10.991,
      "eval_steps_per_second": 2.748,
      "step": 6500
    },
    {
      "epoch": 1.42,
      "grad_norm": 1.309278130531311,
      "learning_rate": 0.00013745641025641027,
      "loss": 0.3318,
      "step": 6600
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.0984652042388916,
      "learning_rate": 0.00013643076923076922,
      "loss": 0.3221,
      "step": 6700
    },
    {
      "epoch": 1.46,
      "grad_norm": 1.1776598691940308,
      "learning_rate": 0.00013540512820512822,
      "loss": 0.3151,
      "step": 6800
    },
    {
      "epoch": 1.48,
      "grad_norm": 1.1536751985549927,
      "learning_rate": 0.0001343794871794872,
      "loss": 0.3471,
      "step": 6900
    },
    {
      "epoch": 1.51,
      "grad_norm": 1.5731265544891357,
      "learning_rate": 0.00013335384615384616,
      "loss": 0.3342,
      "step": 7000
    },
    {
      "epoch": 1.51,
      "eval_loss": 0.25467580556869507,
      "eval_runtime": 90.9083,
      "eval_samples_per_second": 11.0,
      "eval_steps_per_second": 2.75,
      "step": 7000
    },
    {
      "epoch": 1.53,
      "grad_norm": 1.5424981117248535,
      "learning_rate": 0.00013233846153846155,
      "loss": 0.3337,
      "step": 7100
    },
    {
      "epoch": 1.55,
      "grad_norm": 1.2443273067474365,
      "learning_rate": 0.00013131282051282052,
      "loss": 0.2955,
      "step": 7200
    },
    {
      "epoch": 1.57,
      "grad_norm": 0.7886701822280884,
      "learning_rate": 0.0001302871794871795,
      "loss": 0.3017,
      "step": 7300
    },
    {
      "epoch": 1.59,
      "grad_norm": 1.6013621091842651,
      "learning_rate": 0.00012926153846153847,
      "loss": 0.2976,
      "step": 7400
    },
    {
      "epoch": 1.61,
      "grad_norm": 1.495753288269043,
      "learning_rate": 0.00012823589743589744,
      "loss": 0.3086,
      "step": 7500
    },
    {
      "epoch": 1.61,
      "eval_loss": 0.25845155119895935,
      "eval_runtime": 90.8943,
      "eval_samples_per_second": 11.002,
      "eval_steps_per_second": 2.75,
      "step": 7500
    },
    {
      "epoch": 1.64,
      "grad_norm": 1.181015968322754,
      "learning_rate": 0.00012721025641025641,
      "loss": 0.2657,
      "step": 7600
    },
    {
      "epoch": 1.66,
      "grad_norm": 1.2074403762817383,
      "learning_rate": 0.0001261846153846154,
      "loss": 0.3149,
      "step": 7700
    },
    {
      "epoch": 1.68,
      "grad_norm": 1.216676950454712,
      "learning_rate": 0.00012515897435897436,
      "loss": 0.2999,
      "step": 7800
    },
    {
      "epoch": 1.7,
      "grad_norm": 1.8093730211257935,
      "learning_rate": 0.00012413333333333333,
      "loss": 0.3144,
      "step": 7900
    },
    {
      "epoch": 1.72,
      "grad_norm": 3.7325637340545654,
      "learning_rate": 0.00012310769230769233,
      "loss": 0.3326,
      "step": 8000
    },
    {
      "epoch": 1.72,
      "eval_loss": 0.23783066868782043,
      "eval_runtime": 90.8877,
      "eval_samples_per_second": 11.003,
      "eval_steps_per_second": 2.751,
      "step": 8000
    },
    {
      "epoch": 1.74,
      "grad_norm": 0.6969020366668701,
      "learning_rate": 0.00012208205128205128,
      "loss": 0.2935,
      "step": 8100
    },
    {
      "epoch": 1.76,
      "grad_norm": 2.1927125453948975,
      "learning_rate": 0.00012105641025641025,
      "loss": 0.3091,
      "step": 8200
    },
    {
      "epoch": 1.79,
      "grad_norm": 1.8521186113357544,
      "learning_rate": 0.00012003076923076924,
      "loss": 0.2517,
      "step": 8300
    },
    {
      "epoch": 1.81,
      "grad_norm": 1.5349504947662354,
      "learning_rate": 0.00011900512820512821,
      "loss": 0.2794,
      "step": 8400
    },
    {
      "epoch": 1.83,
      "grad_norm": 0.6325456500053406,
      "learning_rate": 0.00011797948717948718,
      "loss": 0.2912,
      "step": 8500
    },
    {
      "epoch": 1.83,
      "eval_loss": 0.23375801742076874,
      "eval_runtime": 90.8478,
      "eval_samples_per_second": 11.007,
      "eval_steps_per_second": 2.752,
      "step": 8500
    },
    {
      "epoch": 1.85,
      "grad_norm": 1.615515112876892,
      "learning_rate": 0.00011695384615384617,
      "loss": 0.2953,
      "step": 8600
    },
    {
      "epoch": 1.87,
      "grad_norm": 1.2424674034118652,
      "learning_rate": 0.00011592820512820513,
      "loss": 0.3047,
      "step": 8700
    },
    {
      "epoch": 1.89,
      "grad_norm": 1.2125675678253174,
      "learning_rate": 0.0001149025641025641,
      "loss": 0.3209,
      "step": 8800
    },
    {
      "epoch": 1.91,
      "grad_norm": 1.6464908123016357,
      "learning_rate": 0.00011387692307692308,
      "loss": 0.3214,
      "step": 8900
    },
    {
      "epoch": 1.94,
      "grad_norm": 1.300310730934143,
      "learning_rate": 0.00011285128205128206,
      "loss": 0.2965,
      "step": 9000
    },
    {
      "epoch": 1.94,
      "eval_loss": 0.2334287166595459,
      "eval_runtime": 90.9451,
      "eval_samples_per_second": 10.996,
      "eval_steps_per_second": 2.749,
      "step": 9000
    },
    {
      "epoch": 1.96,
      "grad_norm": 1.7211843729019165,
      "learning_rate": 0.00011182564102564104,
      "loss": 0.2813,
      "step": 9100
    },
    {
      "epoch": 1.98,
      "grad_norm": 1.5777404308319092,
      "learning_rate": 0.00011080000000000001,
      "loss": 0.2951,
      "step": 9200
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.3048077821731567,
      "learning_rate": 0.00010977435897435897,
      "loss": 0.301,
      "step": 9300
    },
    {
      "epoch": 2.02,
      "grad_norm": 1.1753878593444824,
      "learning_rate": 0.00010874871794871794,
      "loss": 0.1944,
      "step": 9400
    },
    {
      "epoch": 2.04,
      "grad_norm": 0.8378590941429138,
      "learning_rate": 0.00010772307692307693,
      "loss": 0.2041,
      "step": 9500
    },
    {
      "epoch": 2.04,
      "eval_loss": 0.2733120024204254,
      "eval_runtime": 90.9249,
      "eval_samples_per_second": 10.998,
      "eval_steps_per_second": 2.75,
      "step": 9500
    },
    {
      "epoch": 2.07,
      "grad_norm": 1.4123268127441406,
      "learning_rate": 0.0001066974358974359,
      "loss": 0.2206,
      "step": 9600
    },
    {
      "epoch": 2.09,
      "grad_norm": 1.5923714637756348,
      "learning_rate": 0.00010567179487179489,
      "loss": 0.2331,
      "step": 9700
    },
    {
      "epoch": 2.11,
      "grad_norm": 2.31000018119812,
      "learning_rate": 0.00010465641025641026,
      "loss": 0.2564,
      "step": 9800
    },
    {
      "epoch": 2.13,
      "grad_norm": 1.4770272970199585,
      "learning_rate": 0.00010363076923076925,
      "loss": 0.2009,
      "step": 9900
    },
    {
      "epoch": 2.15,
      "grad_norm": 1.5393586158752441,
      "learning_rate": 0.00010260512820512822,
      "loss": 0.2168,
      "step": 10000
    },
    {
      "epoch": 2.15,
      "eval_loss": 0.24773281812667847,
      "eval_runtime": 90.901,
      "eval_samples_per_second": 11.001,
      "eval_steps_per_second": 2.75,
      "step": 10000
    },
    {
      "epoch": 2.17,
      "grad_norm": 2.3399605751037598,
      "learning_rate": 0.00010157948717948718,
      "loss": 0.2275,
      "step": 10100
    },
    {
      "epoch": 2.19,
      "grad_norm": 1.417143702507019,
      "learning_rate": 0.00010055384615384615,
      "loss": 0.2093,
      "step": 10200
    },
    {
      "epoch": 2.22,
      "grad_norm": 1.7041810750961304,
      "learning_rate": 9.952820512820513e-05,
      "loss": 0.2105,
      "step": 10300
    },
    {
      "epoch": 2.24,
      "grad_norm": 3.7192060947418213,
      "learning_rate": 9.850256410256411e-05,
      "loss": 0.2078,
      "step": 10400
    },
    {
      "epoch": 2.26,
      "grad_norm": 0.7868184447288513,
      "learning_rate": 9.747692307692307e-05,
      "loss": 0.2058,
      "step": 10500
    },
    {
      "epoch": 2.26,
      "eval_loss": 0.22978341579437256,
      "eval_runtime": 90.9187,
      "eval_samples_per_second": 10.999,
      "eval_steps_per_second": 2.75,
      "step": 10500
    },
    {
      "epoch": 2.28,
      "grad_norm": 2.572187662124634,
      "learning_rate": 9.645128205128206e-05,
      "loss": 0.2304,
      "step": 10600
    },
    {
      "epoch": 2.3,
      "grad_norm": 1.3671247959136963,
      "learning_rate": 9.542564102564103e-05,
      "loss": 0.1962,
      "step": 10700
    },
    {
      "epoch": 2.32,
      "grad_norm": 2.6237735748291016,
      "learning_rate": 9.44e-05,
      "loss": 0.2201,
      "step": 10800
    },
    {
      "epoch": 2.35,
      "grad_norm": 1.1776219606399536,
      "learning_rate": 9.337435897435898e-05,
      "loss": 0.1972,
      "step": 10900
    },
    {
      "epoch": 2.37,
      "grad_norm": 1.236425757408142,
      "learning_rate": 9.234871794871795e-05,
      "loss": 0.2126,
      "step": 11000
    },
    {
      "epoch": 2.37,
      "eval_loss": 0.24023191630840302,
      "eval_runtime": 90.9406,
      "eval_samples_per_second": 10.996,
      "eval_steps_per_second": 2.749,
      "step": 11000
    },
    {
      "epoch": 2.39,
      "grad_norm": 1.0826618671417236,
      "learning_rate": 9.132307692307692e-05,
      "loss": 0.2168,
      "step": 11100
    },
    {
      "epoch": 2.41,
      "grad_norm": 0.8385189771652222,
      "learning_rate": 9.02974358974359e-05,
      "loss": 0.2029,
      "step": 11200
    },
    {
      "epoch": 2.43,
      "grad_norm": 0.7595863342285156,
      "learning_rate": 8.927179487179488e-05,
      "loss": 0.1902,
      "step": 11300
    },
    {
      "epoch": 2.45,
      "grad_norm": 2.0246148109436035,
      "learning_rate": 8.824615384615384e-05,
      "loss": 0.2276,
      "step": 11400
    },
    {
      "epoch": 2.47,
      "grad_norm": 1.2247196435928345,
      "learning_rate": 8.722051282051283e-05,
      "loss": 0.2,
      "step": 11500
    },
    {
      "epoch": 2.47,
      "eval_loss": 0.25648975372314453,
      "eval_runtime": 90.8579,
      "eval_samples_per_second": 11.006,
      "eval_steps_per_second": 2.752,
      "step": 11500
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.0143483877182007,
      "learning_rate": 8.61948717948718e-05,
      "loss": 0.2179,
      "step": 11600
    },
    {
      "epoch": 2.52,
      "grad_norm": 2.7020885944366455,
      "learning_rate": 8.516923076923076e-05,
      "loss": 0.2214,
      "step": 11700
    },
    {
      "epoch": 2.54,
      "grad_norm": 1.8533117771148682,
      "learning_rate": 8.414358974358975e-05,
      "loss": 0.2074,
      "step": 11800
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.7365753650665283,
      "learning_rate": 8.311794871794872e-05,
      "loss": 0.2396,
      "step": 11900
    },
    {
      "epoch": 2.58,
      "grad_norm": 0.8160982131958008,
      "learning_rate": 8.209230769230771e-05,
      "loss": 0.1786,
      "step": 12000
    },
    {
      "epoch": 2.58,
      "eval_loss": 0.2251870185136795,
      "eval_runtime": 90.8167,
      "eval_samples_per_second": 11.011,
      "eval_steps_per_second": 2.753,
      "step": 12000
    }
  ],
  "logging_steps": 100,
  "max_steps": 20000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "total_flos": 3.86418819603628e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}