| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9932104752667313, |
| "eval_steps": 1024, |
| "global_step": 21504, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011823934229365849, |
| "grad_norm": 1.9160995483398438, |
| "learning_rate": 0.000498046875, |
| "loss": 0.9917811751365662, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.023647868458731697, |
| "grad_norm": 4.747977256774902, |
| "learning_rate": 0.000998046875, |
| "loss": 0.7975242733955383, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.03547180268809755, |
| "grad_norm": 7.29379940032959, |
| "learning_rate": 0.000999640996023194, |
| "loss": 0.6437413692474365, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "grad_norm": 9.162477493286133, |
| "learning_rate": 0.0009985588674043958, |
| "loss": 0.5476456880569458, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.39061775943051735, |
| "eval_loss": 0.5246143476467698, |
| "eval_mse_loss": 0.5246143476467698, |
| "flow/cos_sim": 0.609382248904607, |
| "flow/improvement_ratio": 0.6071757009448526, |
| "flow/mag_ratio_mean": 0.6818114621182011, |
| "flow/mag_ratio_std": 0.25719259335706224, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.39061775943051735, |
| "eval_loss": 0.5246143476467698, |
| "eval_mse_loss": 0.5246143476467698, |
| "eval_runtime": 36.6551, |
| "eval_samples_per_second": 763.687, |
| "eval_steps_per_second": 11.949, |
| "flow/cos_sim": 0.609382248904607, |
| "flow/improvement_ratio": 0.6071757009448526, |
| "flow/mag_ratio_mean": 0.6818114621182011, |
| "flow/mag_ratio_std": 0.25719259335706224, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.05911967114682925, |
| "grad_norm": 13.371840476989746, |
| "learning_rate": 0.0009967551747861387, |
| "loss": 0.5056764483451843, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.0709436053761951, |
| "grad_norm": 7.342921733856201, |
| "learning_rate": 0.000994232528651847, |
| "loss": 0.4739161729812622, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.08276753960556095, |
| "grad_norm": 9.060990333557129, |
| "learning_rate": 0.0009909945800260092, |
| "loss": 0.4664166569709778, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "grad_norm": 7.4525556564331055, |
| "learning_rate": 0.0009870460151900522, |
| "loss": 0.44106417894363403, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.3169162204344523, |
| "eval_loss": 0.44313590847738255, |
| "eval_mse_loss": 0.44313590847738255, |
| "flow/cos_sim": 0.6830837719788835, |
| "flow/improvement_ratio": 0.5721429720452932, |
| "flow/mag_ratio_mean": 0.61852062170364, |
| "flow/mag_ratio_std": 0.28984700054882867, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.3169162204344523, |
| "eval_loss": 0.44313590847738255, |
| "eval_mse_loss": 0.44313590847738255, |
| "eval_runtime": 36.5832, |
| "eval_samples_per_second": 765.188, |
| "eval_steps_per_second": 11.973, |
| "flow/cos_sim": 0.6830837719788835, |
| "flow/improvement_ratio": 0.5721429720452932, |
| "flow/mag_ratio_mean": 0.61852062170364, |
| "flow/mag_ratio_std": 0.28984700054882867, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.10641540806429264, |
| "grad_norm": 9.94619083404541, |
| "learning_rate": 0.0009823925488998885, |
| "loss": 0.42647701501846313, |
| "step": 2304 |
| }, |
| { |
| "epoch": 0.1182393422936585, |
| "grad_norm": 3.24662446975708, |
| "learning_rate": 0.0009770409161149525, |
| "loss": 0.4398079514503479, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.13006327652302435, |
| "grad_norm": 0.8419686555862427, |
| "learning_rate": 0.0009709988622506973, |
| "loss": 0.4244914650917053, |
| "step": 2816 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "grad_norm": 6.962368965148926, |
| "learning_rate": 0.000964275131968659, |
| "loss": 0.4100937843322754, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.2142252650571196, |
| "eval_loss": 0.3256854459167071, |
| "eval_mse_loss": 0.3256854459167071, |
| "flow/cos_sim": 0.7857747346026712, |
| "flow/improvement_ratio": 0.5133735555219868, |
| "flow/mag_ratio_mean": 0.754958403437105, |
| "flow/mag_ratio_std": 0.28671529593124784, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.2142252650571196, |
| "eval_loss": 0.3256854459167071, |
| "eval_mse_loss": 0.3256854459167071, |
| "eval_runtime": 37.4143, |
| "eval_samples_per_second": 748.189, |
| "eval_steps_per_second": 11.707, |
| "flow/cos_sim": 0.7857747346026712, |
| "flow/improvement_ratio": 0.5133735555219868, |
| "flow/mag_ratio_mean": 0.754958403437105, |
| "flow/mag_ratio_std": 0.28671529593124784, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.15371114498175603, |
| "grad_norm": 13.790922164916992, |
| "learning_rate": 0.0009568794565203123, |
| "loss": 0.4049389958381653, |
| "step": 3328 |
| }, |
| { |
| "epoch": 0.1655350792111219, |
| "grad_norm": 2.3658220767974854, |
| "learning_rate": 0.0009488225396630347, |
| "loss": 0.3951943516731262, |
| "step": 3584 |
| }, |
| { |
| "epoch": 0.17735901344048774, |
| "grad_norm": 10.210602760314941, |
| "learning_rate": 0.0009401160421685646, |
| "loss": 0.3869022727012634, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "grad_norm": 8.834197998046875, |
| "learning_rate": 0.0009307725649463714, |
| "loss": 0.39980870485305786, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.19631782169007275, |
| "eval_loss": 0.305890281791012, |
| "eval_mse_loss": 0.305890281791012, |
| "flow/cos_sim": 0.8036821900981747, |
| "flow/improvement_ratio": 0.49340173255090847, |
| "flow/mag_ratio_mean": 0.7550306366458875, |
| "flow/mag_ratio_std": 0.2717885251526963, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.19631782169007275, |
| "eval_loss": 0.305890281791012, |
| "eval_mse_loss": 0.305890281791012, |
| "eval_runtime": 37.2111, |
| "eval_samples_per_second": 752.276, |
| "eval_steps_per_second": 11.771, |
| "flow/cos_sim": 0.8036821900981747, |
| "flow/improvement_ratio": 0.49340173255090847, |
| "flow/mag_ratio_mean": 0.7550306366458875, |
| "flow/mag_ratio_std": 0.2717885251526963, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.20100688189921945, |
| "grad_norm": 2.6557016372680664, |
| "learning_rate": 0.0009208056308063659, |
| "loss": 0.38342079520225525, |
| "step": 4352 |
| }, |
| { |
| "epoch": 0.2128308161285853, |
| "grad_norm": 6.585168838500977, |
| "learning_rate": 0.0009102296648873445, |
| "loss": 0.38296523690223694, |
| "step": 4608 |
| }, |
| { |
| "epoch": 0.22465475035795113, |
| "grad_norm": 11.361368179321289, |
| "learning_rate": 0.0008990599737794927, |
| "loss": 0.3765565752983093, |
| "step": 4864 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "grad_norm": 19.619009017944336, |
| "learning_rate": 0.0008873127233711644, |
| "loss": 0.3799913823604584, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.2768729927877313, |
| "eval_loss": 0.3794649503955014, |
| "eval_mse_loss": 0.3794649503955014, |
| "flow/cos_sim": 0.7231270148329538, |
| "flow/improvement_ratio": 0.3986093916305124, |
| "flow/mag_ratio_mean": 0.7333052162438223, |
| "flow/mag_ratio_std": 0.2822123572366423, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.2768729927877313, |
| "eval_loss": 0.3794649503955014, |
| "eval_mse_loss": 0.3794649503955014, |
| "eval_runtime": 36.7774, |
| "eval_samples_per_second": 761.148, |
| "eval_steps_per_second": 11.91, |
| "flow/cos_sim": 0.7231270148329538, |
| "flow/improvement_ratio": 0.3986093916305124, |
| "flow/mag_ratio_mean": 0.7333052162438223, |
| "flow/mag_ratio_std": 0.2822123572366423, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.24830261881668284, |
| "grad_norm": 8.770722389221191, |
| "learning_rate": 0.0008750049154520011, |
| "loss": 0.369044691324234, |
| "step": 5376 |
| }, |
| { |
| "epoch": 0.2601265530460487, |
| "grad_norm": 8.493452072143555, |
| "learning_rate": 0.0008621543631062487, |
| "loss": 0.3745132386684418, |
| "step": 5632 |
| }, |
| { |
| "epoch": 0.27195048727541454, |
| "grad_norm": 2.5794949531555176, |
| "learning_rate": 0.0008487796649318904, |
| "loss": 0.36342066526412964, |
| "step": 5888 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "grad_norm": 7.236816883087158, |
| "learning_rate": 0.0008349001781229053, |
| "loss": 0.35946616530418396, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.18904457571299654, |
| "eval_loss": 0.29877711084063197, |
| "eval_mse_loss": 0.29877711084063197, |
| "flow/cos_sim": 0.8109554353097802, |
| "flow/improvement_ratio": 0.4896438498747403, |
| "flow/mag_ratio_mean": 0.7580637161590192, |
| "flow/mag_ratio_std": 0.2673305695936016, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.18904457571299654, |
| "eval_loss": 0.29877711084063197, |
| "eval_mse_loss": 0.29877711084063197, |
| "eval_runtime": 36.4777, |
| "eval_samples_per_second": 767.399, |
| "eval_steps_per_second": 12.007, |
| "flow/cos_sim": 0.8109554353097802, |
| "flow/improvement_ratio": 0.4896438498747403, |
| "flow/mag_ratio_mean": 0.7580637161590192, |
| "flow/mag_ratio_std": 0.2673305695936016, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2955983557341462, |
| "grad_norm": 9.964309692382812, |
| "learning_rate": 0.0008205359904536107, |
| "loss": 0.36158525943756104, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.30742228996351206, |
| "grad_norm": 2.096620559692383, |
| "learning_rate": 0.0008057078912056363, |
| "loss": 0.3546257019042969, |
| "step": 6656 |
| }, |
| { |
| "epoch": 0.3192462241928779, |
| "grad_norm": 4.746336936950684, |
| "learning_rate": 0.0007904373410796086, |
| "loss": 0.359122633934021, |
| "step": 6912 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "grad_norm": 3.691810369491577, |
| "learning_rate": 0.0007747464411350876, |
| "loss": 0.3498728573322296, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.20516517899556247, |
| "eval_loss": 0.3059897372099363, |
| "eval_mse_loss": 0.3059897372099363, |
| "flow/cos_sim": 0.794834830717409, |
| "flow/improvement_ratio": 0.5194083058262524, |
| "flow/mag_ratio_mean": 0.7830329138666527, |
| "flow/mag_ratio_std": 0.2623120350380466, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.20516517899556247, |
| "eval_loss": 0.3059897372099363, |
| "eval_mse_loss": 0.3059897372099363, |
| "eval_runtime": 36.6184, |
| "eval_samples_per_second": 764.452, |
| "eval_steps_per_second": 11.961, |
| "flow/cos_sim": 0.794834830717409, |
| "flow/improvement_ratio": 0.5194083058262524, |
| "flow/mag_ratio_mean": 0.7830329138666527, |
| "flow/mag_ratio_std": 0.2623120350380466, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.34289409265160964, |
| "grad_norm": 8.637118339538574, |
| "learning_rate": 0.000758657900803716, |
| "loss": 0.35182619094848633, |
| "step": 7424 |
| }, |
| { |
| "epoch": 0.3547180268809755, |
| "grad_norm": 1.3778982162475586, |
| "learning_rate": 0.000742195005021869, |
| "loss": 0.3482975363731384, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.3665419611103413, |
| "grad_norm": 6.048179626464844, |
| "learning_rate": 0.0007253815805303786, |
| "loss": 0.3527331054210663, |
| "step": 7936 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "grad_norm": 22.744041442871094, |
| "learning_rate": 0.0007082419613901028, |
| "loss": 0.35066965222358704, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.2029232965722748, |
| "eval_loss": 0.3061500937427016, |
| "eval_mse_loss": 0.3061500937427016, |
| "flow/cos_sim": 0.7970767074251828, |
| "flow/improvement_ratio": 0.5205303045440483, |
| "flow/mag_ratio_mean": 0.7948372063299293, |
| "flow/mag_ratio_std": 0.24897354126793064, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.2029232965722748, |
| "eval_loss": 0.3061500937427016, |
| "eval_mse_loss": 0.3061500937427016, |
| "eval_runtime": 36.7266, |
| "eval_samples_per_second": 762.2, |
| "eval_steps_per_second": 11.926, |
| "flow/cos_sim": 0.7970767074251828, |
| "flow/improvement_ratio": 0.5205303045440483, |
| "flow/mag_ratio_mean": 0.7948372063299293, |
| "flow/mag_ratio_std": 0.24897354126793064, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.390189829569073, |
| "grad_norm": 5.6811418533325195, |
| "learning_rate": 0.0006908009537632514, |
| "loss": 0.3552817404270172, |
| "step": 8448 |
| }, |
| { |
| "epoch": 0.4020137637984389, |
| "grad_norm": 22.05255126953125, |
| "learning_rate": 0.0006730838000114403, |
| "loss": 0.3447907269001007, |
| "step": 8704 |
| }, |
| { |
| "epoch": 0.41383769802780473, |
| "grad_norm": 2.4954335689544678, |
| "learning_rate": 0.0006551161421624341, |
| "loss": 0.3462049663066864, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "grad_norm": 10.035924911499023, |
| "learning_rate": 0.0006369239847984517, |
| "loss": 0.3400329649448395, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.18391099199652672, |
| "eval_loss": 0.28414961062882044, |
| "eval_mse_loss": 0.28414961062882044, |
| "flow/cos_sim": 0.816089018839135, |
| "flow/improvement_ratio": 0.49869729156635667, |
| "flow/mag_ratio_mean": 0.8094866498964562, |
| "flow/mag_ratio_std": 0.256197469147373, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.18391099199652672, |
| "eval_loss": 0.28414961062882044, |
| "eval_mse_loss": 0.28414961062882044, |
| "eval_runtime": 36.6985, |
| "eval_samples_per_second": 762.783, |
| "eval_steps_per_second": 11.935, |
| "flow/cos_sim": 0.816089018839135, |
| "flow/improvement_ratio": 0.49869729156635667, |
| "flow/mag_ratio_mean": 0.8094866498964562, |
| "flow/mag_ratio_std": 0.256197469147373, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4374855664865364, |
| "grad_norm": 7.901397705078125, |
| "learning_rate": 0.0006185336574197479, |
| "loss": 0.3395940065383911, |
| "step": 9472 |
| }, |
| { |
| "epoch": 0.44930950071590225, |
| "grad_norm": 6.7792510986328125, |
| "learning_rate": 0.0005999717763379407, |
| "loss": 0.33901962637901306, |
| "step": 9728 |
| }, |
| { |
| "epoch": 0.4611334349452681, |
| "grad_norm": 2.6271603107452393, |
| "learning_rate": 0.0005812652061542363, |
| "loss": 0.34404292702674866, |
| "step": 9984 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "grad_norm": 4.694519996643066, |
| "learning_rate": 0.0005624410208783071, |
| "loss": 0.33998560905456543, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.19037291955321892, |
| "eval_loss": 0.29135872845508193, |
| "eval_mse_loss": 0.29135872845508193, |
| "flow/cos_sim": 0.809627095041754, |
| "flow/improvement_ratio": 0.5105767372536333, |
| "flow/mag_ratio_mean": 0.8087259782775896, |
| "flow/mag_ratio_std": 0.26619178907239816, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.19037291955321892, |
| "eval_loss": 0.29135872845508193, |
| "eval_mse_loss": 0.29135872845508193, |
| "eval_runtime": 36.2962, |
| "eval_samples_per_second": 771.237, |
| "eval_steps_per_second": 12.067, |
| "flow/cos_sim": 0.809627095041754, |
| "flow/improvement_ratio": 0.5105767372536333, |
| "flow/mag_ratio_mean": 0.8087259782775896, |
| "flow/mag_ratio_std": 0.26619178907239816, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.48478130340399983, |
| "grad_norm": 6.458459377288818, |
| "learning_rate": 0.0005435264647440881, |
| "loss": 0.33370938897132874, |
| "step": 10496 |
| }, |
| { |
| "epoch": 0.49660523763336567, |
| "grad_norm": 4.395213603973389, |
| "learning_rate": 0.000524548912779213, |
| "loss": 0.3374914824962616, |
| "step": 10752 |
| }, |
| { |
| "epoch": 0.5084291718627315, |
| "grad_norm": 14.4677152633667, |
| "learning_rate": 0.0005055358311851499, |
| "loss": 0.3334283232688904, |
| "step": 11008 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "grad_norm": 9.418996810913086, |
| "learning_rate": 0.0004865147375853812, |
| "loss": 0.33191806077957153, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.19875142282751052, |
| "eval_loss": 0.2990237715990032, |
| "eval_mse_loss": 0.2990237715990032, |
| "flow/cos_sim": 0.8012485821225327, |
| "flow/improvement_ratio": 0.50358519822223, |
| "flow/mag_ratio_mean": 0.7997565068066392, |
| "flow/mag_ratio_std": 0.2594736643373694, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.19875142282751052, |
| "eval_loss": 0.2990237715990032, |
| "eval_mse_loss": 0.2990237715990032, |
| "eval_runtime": 36.1011, |
| "eval_samples_per_second": 775.405, |
| "eval_steps_per_second": 12.133, |
| "flow/cos_sim": 0.8012485821225327, |
| "flow/improvement_ratio": 0.50358519822223, |
| "flow/mag_ratio_mean": 0.7997565068066392, |
| "flow/mag_ratio_std": 0.2594736643373694, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5320770403214632, |
| "grad_norm": 12.210289001464844, |
| "learning_rate": 0.0004675131611991607, |
| "loss": 0.33119097352027893, |
| "step": 11520 |
| }, |
| { |
| "epoch": 0.5439009745508291, |
| "grad_norm": 5.221109867095947, |
| "learning_rate": 0.0004485586029984899, |
| "loss": 0.3348545432090759, |
| "step": 11776 |
| }, |
| { |
| "epoch": 0.5557249087801949, |
| "grad_norm": 9.297412872314453, |
| "learning_rate": 0.00042967849590597266, |
| "loss": 0.3332735300064087, |
| "step": 12032 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "grad_norm": 11.681222915649414, |
| "learning_rate": 0.0004109001650911621, |
| "loss": 0.32956069707870483, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.19949678523831715, |
| "eval_loss": 0.2990707095083036, |
| "eval_mse_loss": 0.2990707095083036, |
| "flow/cos_sim": 0.800503221821023, |
| "flow/improvement_ratio": 0.45373746521396724, |
| "flow/mag_ratio_mean": 0.8073946686640178, |
| "flow/mag_ratio_std": 0.24824306055835393, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.19949678523831715, |
| "eval_loss": 0.2990707095083036, |
| "eval_mse_loss": 0.2990707095083036, |
| "eval_runtime": 36.2453, |
| "eval_samples_per_second": 772.321, |
| "eval_steps_per_second": 12.084, |
| "flow/cos_sim": 0.800503221821023, |
| "flow/improvement_ratio": 0.45373746521396724, |
| "flow/mag_ratio_mean": 0.8073946686640178, |
| "flow/mag_ratio_std": 0.24824306055835393, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5793727772389267, |
| "grad_norm": 11.020252227783203, |
| "learning_rate": 0.0003922507884228551, |
| "loss": 0.3290621340274811, |
| "step": 12544 |
| }, |
| { |
| "epoch": 0.5911967114682924, |
| "grad_norm": 12.451437950134277, |
| "learning_rate": 0.00037375735713457723, |
| "loss": 0.32443854212760925, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.6030206456976583, |
| "grad_norm": 1.0615566968917847, |
| "learning_rate": 0.00035544663676018276, |
| "loss": 0.32868245244026184, |
| "step": 13056 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "grad_norm": 12.848718643188477, |
| "learning_rate": 0.00033734512839611255, |
| "loss": 0.32849442958831787, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.22722740034194297, |
| "eval_loss": 0.3263611817332708, |
| "eval_mse_loss": 0.3263611817332708, |
| "flow/cos_sim": 0.7727726012060087, |
| "flow/improvement_ratio": 0.5387330570993902, |
| "flow/mag_ratio_mean": 0.8061602507552056, |
| "flow/mag_ratio_std": 0.24406742666886277, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.22722740034194297, |
| "eval_loss": 0.3263611817332708, |
| "eval_mse_loss": 0.3263611817332708, |
| "eval_runtime": 36.2088, |
| "eval_samples_per_second": 773.1, |
| "eval_steps_per_second": 12.097, |
| "flow/cos_sim": 0.7727726012060087, |
| "flow/improvement_ratio": 0.5387330570993902, |
| "flow/mag_ratio_mean": 0.8061602507552056, |
| "flow/mag_ratio_std": 0.24406742666886277, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.62666851415639, |
| "grad_norm": 3.315863609313965, |
| "learning_rate": 0.0003194790303463687, |
| "loss": 0.32513225078582764, |
| "step": 13568 |
| }, |
| { |
| "epoch": 0.6384924483857558, |
| "grad_norm": 0.8015891909599304, |
| "learning_rate": 0.00030187420020572406, |
| "loss": 0.32288485765457153, |
| "step": 13824 |
| }, |
| { |
| "epoch": 0.6503163826151217, |
| "grad_norm": 8.767995834350586, |
| "learning_rate": 0.00028455611743603626, |
| "loss": 0.31935569643974304, |
| "step": 14080 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "grad_norm": 2.7436001300811768, |
| "learning_rate": 0.0002675498464898373, |
| "loss": 0.3218556344509125, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.17437796600877423, |
| "eval_loss": 0.27404612850516896, |
| "eval_mse_loss": 0.27404612850516896, |
| "flow/cos_sim": 0.8256220499130145, |
| "flow/improvement_ratio": 0.4894021302461624, |
| "flow/mag_ratio_mean": 0.8008586715345514, |
| "flow/mag_ratio_std": 0.2502900305199841, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.17437796600877423, |
| "eval_loss": 0.27404612850516896, |
| "eval_mse_loss": 0.27404612850516896, |
| "eval_runtime": 36.4787, |
| "eval_samples_per_second": 767.38, |
| "eval_steps_per_second": 12.007, |
| "flow/cos_sim": 0.8256220499130145, |
| "flow/improvement_ratio": 0.4894021302461624, |
| "flow/mag_ratio_mean": 0.8008586715345514, |
| "flow/mag_ratio_std": 0.2502900305199841, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6739642510738534, |
| "grad_norm": 1.7940927743911743, |
| "learning_rate": 0.0002508800005345623, |
| "loss": 0.32087063789367676, |
| "step": 14592 |
| }, |
| { |
| "epoch": 0.6857881853032193, |
| "grad_norm": 3.976600170135498, |
| "learning_rate": 0.00023457070582992562, |
| "loss": 0.31643542647361755, |
| "step": 14848 |
| }, |
| { |
| "epoch": 0.6976121195325851, |
| "grad_norm": 14.614996910095215, |
| "learning_rate": 0.00021864556680999692, |
| "loss": 0.31989988684654236, |
| "step": 15104 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "grad_norm": 1.3015860319137573, |
| "learning_rate": 0.0002031276319205152, |
| "loss": 0.319489985704422, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.1751825540883628, |
| "eval_loss": 0.2740960814841262, |
| "eval_mse_loss": 0.2740960814841262, |
| "flow/cos_sim": 0.8248174480379444, |
| "flow/improvement_ratio": 0.5053689104917387, |
| "flow/mag_ratio_mean": 0.7900927232825048, |
| "flow/mag_ratio_std": 0.24186882635229798, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.1751825540883628, |
| "eval_loss": 0.2740960814841262, |
| "eval_mse_loss": 0.2740960814841262, |
| "eval_runtime": 36.3752, |
| "eval_samples_per_second": 769.563, |
| "eval_steps_per_second": 12.041, |
| "flow/cos_sim": 0.8248174480379444, |
| "flow/improvement_ratio": 0.5053689104917387, |
| "flow/mag_ratio_mean": 0.7900927232825048, |
| "flow/mag_ratio_std": 0.24186882635229798, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.7212599879913169, |
| "grad_norm": 0.6580540537834167, |
| "learning_rate": 0.00018803936026088542, |
| "loss": 0.31635990738868713, |
| "step": 15616 |
| }, |
| { |
| "epoch": 0.7330839222206826, |
| "grad_norm": 4.1114654541015625, |
| "learning_rate": 0.00017340258907913464, |
| "loss": 0.3187476694583893, |
| "step": 15872 |
| }, |
| { |
| "epoch": 0.7449078564500485, |
| "grad_norm": 12.402987480163574, |
| "learning_rate": 0.0001592385021668743, |
| "loss": 0.3155273497104645, |
| "step": 16128 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "grad_norm": 1.0991050004959106, |
| "learning_rate": 0.0001455675992000087, |
| "loss": 0.31325414776802063, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.17473208807306748, |
| "eval_loss": 0.2738769624263184, |
| "eval_mse_loss": 0.2738769624263184, |
| "flow/cos_sim": 0.8252679411679098, |
| "flow/improvement_ratio": 0.48874352702267093, |
| "flow/mag_ratio_mean": 0.7912447798197673, |
| "flow/mag_ratio_std": 0.25162723619643956, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.17473208807306748, |
| "eval_loss": 0.2738769624263184, |
| "eval_mse_loss": 0.2738769624263184, |
| "eval_runtime": 36.1853, |
| "eval_samples_per_second": 773.602, |
| "eval_steps_per_second": 12.104, |
| "flow/cos_sim": 0.8252679411679098, |
| "flow/improvement_ratio": 0.48874352702267093, |
| "flow/mag_ratio_mean": 0.7912447798197673, |
| "flow/mag_ratio_std": 0.25162723619643956, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7685557249087802, |
| "grad_norm": 1.9538664817810059, |
| "learning_rate": 0.000132409666069565, |
| "loss": 0.31205466389656067, |
| "step": 16640 |
| }, |
| { |
| "epoch": 0.780379659138146, |
| "grad_norm": 3.74383544921875, |
| "learning_rate": 0.0001197837462455823, |
| "loss": 0.3163967728614807, |
| "step": 16896 |
| }, |
| { |
| "epoch": 0.7922035933675119, |
| "grad_norm": 2.0866622924804688, |
| "learning_rate": 0.00010770811321550749, |
| "loss": 0.3133208453655243, |
| "step": 17152 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "grad_norm": 3.845749855041504, |
| "learning_rate": 9.620024403698591e-05, |
| "loss": 0.31291234493255615, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.17509721335210757, |
| "eval_loss": 0.273063686272325, |
| "eval_mse_loss": 0.273063686272325, |
| "flow/cos_sim": 0.824902795357247, |
| "flow/improvement_ratio": 0.5037120235020711, |
| "flow/mag_ratio_mean": 0.8056292350161566, |
| "flow/mag_ratio_std": 0.23707074216103444, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.17509721335210757, |
| "eval_loss": 0.273063686272325, |
| "eval_mse_loss": 0.273063686272325, |
| "eval_runtime": 36.2474, |
| "eval_samples_per_second": 772.277, |
| "eval_steps_per_second": 12.084, |
| "flow/cos_sim": 0.824902795357247, |
| "flow/improvement_ratio": 0.5037120235020711, |
| "flow/mag_ratio_mean": 0.8056292350161566, |
| "flow/mag_ratio_std": 0.23707074216103444, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8158514618262436, |
| "grad_norm": 4.972906112670898, |
| "learning_rate": 8.527679404332429e-05, |
| "loss": 0.3136768639087677, |
| "step": 17664 |
| }, |
| { |
| "epoch": 0.8276753960556095, |
| "grad_norm": 0.5566576719284058, |
| "learning_rate": 7.495357273823544e-05, |
| "loss": 0.30899107456207275, |
| "step": 17920 |
| }, |
| { |
| "epoch": 0.8394993302849753, |
| "grad_norm": 2.5373237133026123, |
| "learning_rate": 6.524552091475183e-05, |
| "loss": 0.30978360772132874, |
| "step": 18176 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "grad_norm": 8.832274436950684, |
| "learning_rate": 5.6166689031422024e-05, |
| "loss": 0.3114849030971527, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.179012101160609, |
| "eval_loss": 0.27659866115137866, |
| "eval_mse_loss": 0.27659866115137866, |
| "flow/cos_sim": 0.8209879062219297, |
| "flow/improvement_ratio": 0.5032780424236707, |
| "flow/mag_ratio_mean": 0.8021674478707248, |
| "flow/mag_ratio_std": 0.2417376903502364, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.179012101160609, |
| "eval_loss": 0.27659866115137866, |
| "eval_mse_loss": 0.27659866115137866, |
| "eval_runtime": 36.438, |
| "eval_samples_per_second": 768.236, |
| "eval_steps_per_second": 12.02, |
| "flow/cos_sim": 0.8209879062219297, |
| "flow/improvement_ratio": 0.5032780424236707, |
| "flow/mag_ratio_mean": 0.8021674478707248, |
| "flow/mag_ratio_std": 0.2417376903502364, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8631471987437069, |
| "grad_norm": 6.2862749099731445, |
| "learning_rate": 4.773021687709067e-05, |
| "loss": 0.30685460567474365, |
| "step": 18688 |
| }, |
| { |
| "epoch": 0.8749711329730728, |
| "grad_norm": 1.9315192699432373, |
| "learning_rate": 3.994831455368719e-05, |
| "loss": 0.31047487258911133, |
| "step": 18944 |
| }, |
| { |
| "epoch": 0.8867950672024387, |
| "grad_norm": 8.073254585266113, |
| "learning_rate": 3.283224480455282e-05, |
| "loss": 0.3102303445339203, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "grad_norm": 0.3247526288032532, |
| "learning_rate": 2.639230671387627e-05, |
| "loss": 0.3080306351184845, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.17426143144363682, |
| "eval_loss": 0.2714149275258796, |
| "eval_mse_loss": 0.2714149275258796, |
| "flow/cos_sim": 0.8257385824368969, |
| "flow/improvement_ratio": 0.5061814522362191, |
| "flow/mag_ratio_mean": 0.8065648428653474, |
| "flow/mag_ratio_std": 0.23905256662738922, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.17426143144363682, |
| "eval_loss": 0.2714149275258796, |
| "eval_mse_loss": 0.2714149275258796, |
| "eval_runtime": 36.6997, |
| "eval_samples_per_second": 762.759, |
| "eval_steps_per_second": 11.935, |
| "flow/cos_sim": 0.8257385824368969, |
| "flow/improvement_ratio": 0.5061814522362191, |
| "flow/mag_ratio_mean": 0.8065648428653474, |
| "flow/mag_ratio_std": 0.23905256662738922, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.9104429356611704, |
| "grad_norm": 7.262045383453369, |
| "learning_rate": 2.063782080083576e-05, |
| "loss": 0.30695223808288574, |
| "step": 19712 |
| }, |
| { |
| "epoch": 0.9222668698905362, |
| "grad_norm": 0.8199298977851868, |
| "learning_rate": 1.557711553001523e-05, |
| "loss": 0.3086055517196655, |
| "step": 19968 |
| }, |
| { |
| "epoch": 0.9340908041199021, |
| "grad_norm": 3.6796634197235107, |
| "learning_rate": 1.1217515257622269e-05, |
| "loss": 0.30838918685913086, |
| "step": 20224 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "grad_norm": 0.3800135850906372, |
| "learning_rate": 7.565329630950746e-06, |
| "loss": 0.3107503354549408, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.16941133098991495, |
| "eval_loss": 0.26742714126360473, |
| "eval_mse_loss": 0.26742714126360473, |
| "flow/cos_sim": 0.8305886884530386, |
| "flow/improvement_ratio": 0.49329238075371745, |
| "flow/mag_ratio_mean": 0.8075202496628783, |
| "flow/mag_ratio_std": 0.24095512161107913, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.16941133098991495, |
| "eval_loss": 0.26742714126360473, |
| "eval_mse_loss": 0.26742714126360473, |
| "eval_runtime": 37.4646, |
| "eval_samples_per_second": 747.185, |
| "eval_steps_per_second": 11.691, |
| "flow/cos_sim": 0.8305886884530386, |
| "flow/improvement_ratio": 0.49329238075371745, |
| "flow/mag_ratio_mean": 0.8075202496628783, |
| "flow/mag_ratio_std": 0.24095512161107913, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.9577386725786338, |
| "grad_norm": 5.383033752441406, |
| "learning_rate": 4.62584445643166e-06, |
| "loss": 0.30672797560691833, |
| "step": 20736 |
| }, |
| { |
| "epoch": 0.9695626068079997, |
| "grad_norm": 1.579878807067871, |
| "learning_rate": 2.40331404948807e-06, |
| "loss": 0.30701693892478943, |
| "step": 20992 |
| }, |
| { |
| "epoch": 0.9813865410373654, |
| "grad_norm": 4.654466152191162, |
| "learning_rate": 9.009550772663965e-07, |
| "loss": 0.30958274006843567, |
| "step": 21248 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "grad_norm": 0.809504508972168, |
| "learning_rate": 1.2094190315575791e-07, |
| "loss": 0.3090161085128784, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.16669115675910967, |
| "eval_loss": 0.2628253948402731, |
| "eval_mse_loss": 0.2628253948402731, |
| "flow/cos_sim": 0.8333088479357768, |
| "flow/improvement_ratio": 0.5023197360915136, |
| "flow/mag_ratio_mean": 0.8090449382453204, |
| "flow/mag_ratio_std": 0.23580366417289325, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.16669115675910967, |
| "eval_loss": 0.2628253948402731, |
| "eval_mse_loss": 0.2628253948402731, |
| "eval_runtime": 37.5609, |
| "eval_samples_per_second": 745.271, |
| "eval_steps_per_second": 11.661, |
| "flow/cos_sim": 0.8333088479357768, |
| "flow/improvement_ratio": 0.5023197360915136, |
| "flow/mag_ratio_mean": 0.8090449382453204, |
| "flow/mag_ratio_std": 0.23580366417289325, |
| "step": 21504 |
| } |
| ], |
| "logging_steps": 256, |
| "max_steps": 21651, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1024, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|