{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9932104752667313, "eval_steps": 1024, "global_step": 21504, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011823934229365849, "grad_norm": 1.9160995483398438, "learning_rate": 0.000498046875, "loss": 0.9917811751365662, "step": 256 }, { "epoch": 0.023647868458731697, "grad_norm": 4.747977256774902, "learning_rate": 0.000998046875, "loss": 0.7975242733955383, "step": 512 }, { "epoch": 0.03547180268809755, "grad_norm": 7.29379940032959, "learning_rate": 0.000999640996023194, "loss": 0.6437413692474365, "step": 768 }, { "epoch": 0.047295736917463395, "grad_norm": 9.162477493286133, "learning_rate": 0.0009985588674043958, "loss": 0.5476456880569458, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_cos_loss": 0.39061775943051735, "eval_loss": 0.5246143476467698, "eval_mse_loss": 0.5246143476467698, "flow/cos_sim": 0.609382248904607, "flow/improvement_ratio": 0.6071757009448526, "flow/mag_ratio_mean": 0.6818114621182011, "flow/mag_ratio_std": 0.25719259335706224, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_cos_loss": 0.39061775943051735, "eval_loss": 0.5246143476467698, "eval_mse_loss": 0.5246143476467698, "eval_runtime": 36.6551, "eval_samples_per_second": 763.687, "eval_steps_per_second": 11.949, "flow/cos_sim": 0.609382248904607, "flow/improvement_ratio": 0.6071757009448526, "flow/mag_ratio_mean": 0.6818114621182011, "flow/mag_ratio_std": 0.25719259335706224, "step": 1024 }, { "epoch": 0.05911967114682925, "grad_norm": 13.371840476989746, "learning_rate": 0.0009967551747861387, "loss": 0.5056764483451843, "step": 1280 }, { "epoch": 0.0709436053761951, "grad_norm": 7.342921733856201, "learning_rate": 0.000994232528651847, "loss": 0.4739161729812622, "step": 1536 }, { "epoch": 0.08276753960556095, "grad_norm": 9.060990333557129, "learning_rate": 0.0009909945800260092, "loss": 0.4664166569709778, "step": 1792 }, { "epoch": 0.09459147383492679, "grad_norm": 7.4525556564331055, "learning_rate": 0.0009870460151900522, "loss": 0.44106417894363403, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_cos_loss": 0.3169162204344523, "eval_loss": 0.44313590847738255, "eval_mse_loss": 0.44313590847738255, "flow/cos_sim": 0.6830837719788835, "flow/improvement_ratio": 0.5721429720452932, "flow/mag_ratio_mean": 0.61852062170364, "flow/mag_ratio_std": 0.28984700054882867, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_cos_loss": 0.3169162204344523, "eval_loss": 0.44313590847738255, "eval_mse_loss": 0.44313590847738255, "eval_runtime": 36.5832, "eval_samples_per_second": 765.188, "eval_steps_per_second": 11.973, "flow/cos_sim": 0.6830837719788835, "flow/improvement_ratio": 0.5721429720452932, "flow/mag_ratio_mean": 0.61852062170364, "flow/mag_ratio_std": 0.28984700054882867, "step": 2048 }, { "epoch": 0.10641540806429264, "grad_norm": 9.94619083404541, "learning_rate": 0.0009823925488998885, "loss": 0.42647701501846313, "step": 2304 }, { "epoch": 0.1182393422936585, "grad_norm": 3.24662446975708, "learning_rate": 0.0009770409161149525, "loss": 0.4398079514503479, "step": 2560 }, { "epoch": 0.13006327652302435, "grad_norm": 0.8419686555862427, "learning_rate": 0.0009709988622506973, "loss": 0.4244914650917053, "step": 2816 }, { "epoch": 0.1418872107523902, "grad_norm": 6.962368965148926, "learning_rate": 0.000964275131968659, "loss": 0.4100937843322754, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_cos_loss": 0.2142252650571196, "eval_loss": 0.3256854459167071, "eval_mse_loss": 0.3256854459167071, "flow/cos_sim": 0.7857747346026712, "flow/improvement_ratio": 0.5133735555219868, "flow/mag_ratio_mean": 0.754958403437105, "flow/mag_ratio_std": 0.28671529593124784, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_cos_loss": 0.2142252650571196, "eval_loss": 0.3256854459167071, "eval_mse_loss": 0.3256854459167071, "eval_runtime": 37.4143, "eval_samples_per_second": 748.189, "eval_steps_per_second": 11.707, "flow/cos_sim": 0.7857747346026712, "flow/improvement_ratio": 0.5133735555219868, "flow/mag_ratio_mean": 0.754958403437105, "flow/mag_ratio_std": 0.28671529593124784, "step": 3072 }, { "epoch": 0.15371114498175603, "grad_norm": 13.790922164916992, "learning_rate": 0.0009568794565203123, "loss": 0.4049389958381653, "step": 3328 }, { "epoch": 0.1655350792111219, "grad_norm": 2.3658220767974854, "learning_rate": 0.0009488225396630347, "loss": 0.3951943516731262, "step": 3584 }, { "epoch": 0.17735901344048774, "grad_norm": 10.210602760314941, "learning_rate": 0.0009401160421685646, "loss": 0.3869022727012634, "step": 3840 }, { "epoch": 0.18918294766985358, "grad_norm": 8.834197998046875, "learning_rate": 0.0009307725649463714, "loss": 0.39980870485305786, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_cos_loss": 0.19631782169007275, "eval_loss": 0.305890281791012, "eval_mse_loss": 0.305890281791012, "flow/cos_sim": 0.8036821900981747, "flow/improvement_ratio": 0.49340173255090847, "flow/mag_ratio_mean": 0.7550306366458875, "flow/mag_ratio_std": 0.2717885251526963, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_cos_loss": 0.19631782169007275, "eval_loss": 0.305890281791012, "eval_mse_loss": 0.305890281791012, "eval_runtime": 37.2111, "eval_samples_per_second": 752.276, "eval_steps_per_second": 11.771, "flow/cos_sim": 0.8036821900981747, "flow/improvement_ratio": 0.49340173255090847, "flow/mag_ratio_mean": 0.7550306366458875, "flow/mag_ratio_std": 0.2717885251526963, "step": 4096 }, { "epoch": 0.20100688189921945, "grad_norm": 2.6557016372680664, "learning_rate": 0.0009208056308063659, "loss": 0.38342079520225525, "step": 4352 }, { "epoch": 0.2128308161285853, "grad_norm": 6.585168838500977, "learning_rate": 0.0009102296648873445, "loss": 0.38296523690223694, "step": 4608 }, { "epoch": 0.22465475035795113, "grad_norm": 11.361368179321289, "learning_rate": 0.0008990599737794927, "loss": 0.3765565752983093, "step": 4864 }, { "epoch": 0.236478684587317, "grad_norm": 19.619009017944336, "learning_rate": 0.0008873127233711644, "loss": 0.3799913823604584, "step": 5120 }, { "epoch": 0.236478684587317, "eval_cos_loss": 0.2768729927877313, "eval_loss": 0.3794649503955014, "eval_mse_loss": 0.3794649503955014, "flow/cos_sim": 0.7231270148329538, "flow/improvement_ratio": 0.3986093916305124, "flow/mag_ratio_mean": 0.7333052162438223, "flow/mag_ratio_std": 0.2822123572366423, "step": 5120 }, { "epoch": 0.236478684587317, "eval_cos_loss": 0.2768729927877313, "eval_loss": 0.3794649503955014, "eval_mse_loss": 0.3794649503955014, "eval_runtime": 36.7774, "eval_samples_per_second": 761.148, "eval_steps_per_second": 11.91, "flow/cos_sim": 0.7231270148329538, "flow/improvement_ratio": 0.3986093916305124, "flow/mag_ratio_mean": 0.7333052162438223, "flow/mag_ratio_std": 0.2822123572366423, "step": 5120 }, { "epoch": 0.24830261881668284, "grad_norm": 8.770722389221191, "learning_rate": 0.0008750049154520011, "loss": 0.369044691324234, "step": 5376 }, { "epoch": 0.2601265530460487, "grad_norm": 8.493452072143555, "learning_rate": 0.0008621543631062487, "loss": 0.3745132386684418, "step": 5632 }, { "epoch": 0.27195048727541454, "grad_norm": 2.5794949531555176, "learning_rate": 0.0008487796649318904, "loss": 0.36342066526412964, "step": 5888 }, { "epoch": 0.2837744215047804, "grad_norm": 7.236816883087158, "learning_rate": 0.0008349001781229053, "loss": 0.35946616530418396, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_cos_loss": 0.18904457571299654, "eval_loss": 0.29877711084063197, "eval_mse_loss": 0.29877711084063197, "flow/cos_sim": 0.8109554353097802, "flow/improvement_ratio": 0.4896438498747403, "flow/mag_ratio_mean": 0.7580637161590192, "flow/mag_ratio_std": 0.2673305695936016, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_cos_loss": 0.18904457571299654, "eval_loss": 0.29877711084063197, "eval_mse_loss": 0.29877711084063197, "eval_runtime": 36.4777, "eval_samples_per_second": 767.399, "eval_steps_per_second": 12.007, "flow/cos_sim": 0.8109554353097802, "flow/improvement_ratio": 0.4896438498747403, "flow/mag_ratio_mean": 0.7580637161590192, "flow/mag_ratio_std": 0.2673305695936016, "step": 6144 }, { "epoch": 0.2955983557341462, "grad_norm": 9.964309692382812, "learning_rate": 0.0008205359904536107, "loss": 0.36158525943756104, "step": 6400 }, { "epoch": 0.30742228996351206, "grad_norm": 2.096620559692383, "learning_rate": 0.0008057078912056363, "loss": 0.3546257019042969, "step": 6656 }, { "epoch": 0.3192462241928779, "grad_norm": 4.746336936950684, "learning_rate": 0.0007904373410796086, "loss": 0.359122633934021, "step": 6912 }, { "epoch": 0.3310701584222438, "grad_norm": 3.691810369491577, "learning_rate": 0.0007747464411350876, "loss": 0.3498728573322296, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_cos_loss": 0.20516517899556247, "eval_loss": 0.3059897372099363, "eval_mse_loss": 0.3059897372099363, "flow/cos_sim": 0.794834830717409, "flow/improvement_ratio": 0.5194083058262524, "flow/mag_ratio_mean": 0.7830329138666527, "flow/mag_ratio_std": 0.2623120350380466, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_cos_loss": 0.20516517899556247, "eval_loss": 0.3059897372099363, "eval_mse_loss": 0.3059897372099363, "eval_runtime": 36.6184, "eval_samples_per_second": 764.452, "eval_steps_per_second": 11.961, "flow/cos_sim": 0.794834830717409, "flow/improvement_ratio": 0.5194083058262524, "flow/mag_ratio_mean": 0.7830329138666527, "flow/mag_ratio_std": 0.2623120350380466, "step": 7168 }, { "epoch": 0.34289409265160964, "grad_norm": 8.637118339538574, "learning_rate": 0.000758657900803716, "loss": 0.35182619094848633, "step": 7424 }, { "epoch": 0.3547180268809755, "grad_norm": 1.3778982162475586, "learning_rate": 0.000742195005021869, "loss": 0.3482975363731384, "step": 7680 }, { "epoch": 0.3665419611103413, "grad_norm": 6.048179626464844, "learning_rate": 0.0007253815805303786, "loss": 0.3527331054210663, "step": 7936 }, { "epoch": 0.37836589533970716, "grad_norm": 22.744041442871094, "learning_rate": 0.0007082419613901028, "loss": 0.35066965222358704, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_cos_loss": 0.2029232965722748, "eval_loss": 0.3061500937427016, "eval_mse_loss": 0.3061500937427016, "flow/cos_sim": 0.7970767074251828, "flow/improvement_ratio": 0.5205303045440483, "flow/mag_ratio_mean": 0.7948372063299293, "flow/mag_ratio_std": 0.24897354126793064, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_cos_loss": 0.2029232965722748, "eval_loss": 0.3061500937427016, "eval_mse_loss": 0.3061500937427016, "eval_runtime": 36.7266, "eval_samples_per_second": 762.2, "eval_steps_per_second": 11.926, "flow/cos_sim": 0.7970767074251828, "flow/improvement_ratio": 0.5205303045440483, "flow/mag_ratio_mean": 0.7948372063299293, "flow/mag_ratio_std": 0.24897354126793064, "step": 8192 }, { "epoch": 0.390189829569073, "grad_norm": 5.6811418533325195, "learning_rate": 0.0006908009537632514, "loss": 0.3552817404270172, "step": 8448 }, { "epoch": 0.4020137637984389, "grad_norm": 22.05255126953125, "learning_rate": 0.0006730838000114403, "loss": 0.3447907269001007, "step": 8704 }, { "epoch": 0.41383769802780473, "grad_norm": 2.4954335689544678, "learning_rate": 0.0006551161421624341, "loss": 0.3462049663066864, "step": 8960 }, { "epoch": 0.4256616322571706, "grad_norm": 10.035924911499023, "learning_rate": 0.0006369239847984517, "loss": 0.3400329649448395, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_cos_loss": 0.18391099199652672, "eval_loss": 0.28414961062882044, "eval_mse_loss": 0.28414961062882044, "flow/cos_sim": 0.816089018839135, "flow/improvement_ratio": 0.49869729156635667, "flow/mag_ratio_mean": 0.8094866498964562, "flow/mag_ratio_std": 0.256197469147373, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_cos_loss": 0.18391099199652672, "eval_loss": 0.28414961062882044, "eval_mse_loss": 0.28414961062882044, "eval_runtime": 36.6985, "eval_samples_per_second": 762.783, "eval_steps_per_second": 11.935, "flow/cos_sim": 0.816089018839135, "flow/improvement_ratio": 0.49869729156635667, "flow/mag_ratio_mean": 0.8094866498964562, "flow/mag_ratio_std": 0.256197469147373, "step": 9216 }, { "epoch": 0.4374855664865364, "grad_norm": 7.901397705078125, "learning_rate": 0.0006185336574197479, "loss": 0.3395940065383911, "step": 9472 }, { "epoch": 0.44930950071590225, "grad_norm": 6.7792510986328125, "learning_rate": 0.0005999717763379407, "loss": 0.33901962637901306, "step": 9728 }, { "epoch": 0.4611334349452681, "grad_norm": 2.6271603107452393, "learning_rate": 0.0005812652061542363, "loss": 0.34404292702674866, "step": 9984 }, { "epoch": 0.472957369174634, "grad_norm": 4.694519996643066, "learning_rate": 0.0005624410208783071, "loss": 0.33998560905456543, "step": 10240 }, { "epoch": 0.472957369174634, "eval_cos_loss": 0.19037291955321892, "eval_loss": 0.29135872845508193, "eval_mse_loss": 0.29135872845508193, "flow/cos_sim": 0.809627095041754, "flow/improvement_ratio": 0.5105767372536333, "flow/mag_ratio_mean": 0.8087259782775896, "flow/mag_ratio_std": 0.26619178907239816, "step": 10240 }, { "epoch": 0.472957369174634, "eval_cos_loss": 0.19037291955321892, "eval_loss": 0.29135872845508193, "eval_mse_loss": 0.29135872845508193, "eval_runtime": 36.2962, "eval_samples_per_second": 771.237, "eval_steps_per_second": 12.067, "flow/cos_sim": 0.809627095041754, "flow/improvement_ratio": 0.5105767372536333, "flow/mag_ratio_mean": 0.8087259782775896, "flow/mag_ratio_std": 0.26619178907239816, "step": 10240 }, { "epoch": 0.48478130340399983, "grad_norm": 6.458459377288818, "learning_rate": 0.0005435264647440881, "loss": 0.33370938897132874, "step": 10496 }, { "epoch": 0.49660523763336567, "grad_norm": 4.395213603973389, "learning_rate": 0.000524548912779213, "loss": 0.3374914824962616, "step": 10752 }, { "epoch": 0.5084291718627315, "grad_norm": 14.4677152633667, "learning_rate": 0.0005055358311851499, "loss": 0.3334283232688904, "step": 11008 }, { "epoch": 0.5202531060920974, "grad_norm": 9.418996810913086, "learning_rate": 0.0004865147375853812, "loss": 0.33191806077957153, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_cos_loss": 0.19875142282751052, "eval_loss": 0.2990237715990032, "eval_mse_loss": 0.2990237715990032, "flow/cos_sim": 0.8012485821225327, "flow/improvement_ratio": 0.50358519822223, "flow/mag_ratio_mean": 0.7997565068066392, "flow/mag_ratio_std": 0.2594736643373694, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_cos_loss": 0.19875142282751052, "eval_loss": 0.2990237715990032, "eval_mse_loss": 0.2990237715990032, "eval_runtime": 36.1011, "eval_samples_per_second": 775.405, "eval_steps_per_second": 12.133, "flow/cos_sim": 0.8012485821225327, "flow/improvement_ratio": 0.50358519822223, "flow/mag_ratio_mean": 0.7997565068066392, "flow/mag_ratio_std": 0.2594736643373694, "step": 11264 }, { "epoch": 0.5320770403214632, "grad_norm": 12.210289001464844, "learning_rate": 0.0004675131611991607, "loss": 0.33119097352027893, "step": 11520 }, { "epoch": 0.5439009745508291, "grad_norm": 5.221109867095947, "learning_rate": 0.0004485586029984899, "loss": 0.3348545432090759, "step": 11776 }, { "epoch": 0.5557249087801949, "grad_norm": 9.297412872314453, "learning_rate": 0.00042967849590597266, "loss": 0.3332735300064087, "step": 12032 }, { "epoch": 0.5675488430095608, "grad_norm": 11.681222915649414, "learning_rate": 0.0004109001650911621, "loss": 0.32956069707870483, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_cos_loss": 0.19949678523831715, "eval_loss": 0.2990707095083036, "eval_mse_loss": 0.2990707095083036, "flow/cos_sim": 0.800503221821023, "flow/improvement_ratio": 0.45373746521396724, "flow/mag_ratio_mean": 0.8073946686640178, "flow/mag_ratio_std": 0.24824306055835393, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_cos_loss": 0.19949678523831715, "eval_loss": 0.2990707095083036, "eval_mse_loss": 0.2990707095083036, "eval_runtime": 36.2453, "eval_samples_per_second": 772.321, "eval_steps_per_second": 12.084, "flow/cos_sim": 0.800503221821023, "flow/improvement_ratio": 0.45373746521396724, "flow/mag_ratio_mean": 0.8073946686640178, "flow/mag_ratio_std": 0.24824306055835393, "step": 12288 }, { "epoch": 0.5793727772389267, "grad_norm": 11.020252227783203, "learning_rate": 0.0003922507884228551, "loss": 0.3290621340274811, "step": 12544 }, { "epoch": 0.5911967114682924, "grad_norm": 12.451437950134277, "learning_rate": 0.00037375735713457723, "loss": 0.32443854212760925, "step": 12800 }, { "epoch": 0.6030206456976583, "grad_norm": 1.0615566968917847, "learning_rate": 0.00035544663676018276, "loss": 0.32868245244026184, "step": 13056 }, { "epoch": 0.6148445799270241, "grad_norm": 12.848718643188477, "learning_rate": 0.00033734512839611255, "loss": 0.32849442958831787, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_cos_loss": 0.22722740034194297, "eval_loss": 0.3263611817332708, "eval_mse_loss": 0.3263611817332708, "flow/cos_sim": 0.7727726012060087, "flow/improvement_ratio": 0.5387330570993902, "flow/mag_ratio_mean": 0.8061602507552056, "flow/mag_ratio_std": 0.24406742666886277, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_cos_loss": 0.22722740034194297, "eval_loss": 0.3263611817332708, "eval_mse_loss": 0.3263611817332708, "eval_runtime": 36.2088, "eval_samples_per_second": 773.1, "eval_steps_per_second": 12.097, "flow/cos_sim": 0.7727726012060087, "flow/improvement_ratio": 0.5387330570993902, "flow/mag_ratio_mean": 0.8061602507552056, "flow/mag_ratio_std": 0.24406742666886277, "step": 13312 }, { "epoch": 0.62666851415639, "grad_norm": 3.315863609313965, "learning_rate": 0.0003194790303463687, "loss": 0.32513225078582764, "step": 13568 }, { "epoch": 0.6384924483857558, "grad_norm": 0.8015891909599304, "learning_rate": 0.00030187420020572406, "loss": 0.32288485765457153, "step": 13824 }, { "epoch": 0.6503163826151217, "grad_norm": 8.767995834350586, "learning_rate": 0.00028455611743603626, "loss": 0.31935569643974304, "step": 14080 }, { "epoch": 0.6621403168444876, "grad_norm": 2.7436001300811768, "learning_rate": 0.0002675498464898373, "loss": 0.3218556344509125, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_cos_loss": 0.17437796600877423, "eval_loss": 0.27404612850516896, "eval_mse_loss": 0.27404612850516896, "flow/cos_sim": 0.8256220499130145, "flow/improvement_ratio": 0.4894021302461624, "flow/mag_ratio_mean": 0.8008586715345514, "flow/mag_ratio_std": 0.2502900305199841, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_cos_loss": 0.17437796600877423, "eval_loss": 0.27404612850516896, "eval_mse_loss": 0.27404612850516896, "eval_runtime": 36.4787, "eval_samples_per_second": 767.38, "eval_steps_per_second": 12.007, "flow/cos_sim": 0.8256220499130145, "flow/improvement_ratio": 0.4894021302461624, "flow/mag_ratio_mean": 0.8008586715345514, "flow/mag_ratio_std": 0.2502900305199841, "step": 14336 }, { "epoch": 0.6739642510738534, "grad_norm": 1.7940927743911743, "learning_rate": 0.0002508800005345623, "loss": 0.32087063789367676, "step": 14592 }, { "epoch": 0.6857881853032193, "grad_norm": 3.976600170135498, "learning_rate": 0.00023457070582992562, "loss": 0.31643542647361755, "step": 14848 }, { "epoch": 0.6976121195325851, "grad_norm": 14.614996910095215, "learning_rate": 0.00021864556680999692, "loss": 0.31989988684654236, "step": 15104 }, { "epoch": 0.709436053761951, "grad_norm": 1.3015860319137573, "learning_rate": 0.0002031276319205152, "loss": 0.319489985704422, "step": 15360 }, { "epoch": 0.709436053761951, "eval_cos_loss": 0.1751825540883628, "eval_loss": 0.2740960814841262, "eval_mse_loss": 0.2740960814841262, "flow/cos_sim": 0.8248174480379444, "flow/improvement_ratio": 0.5053689104917387, "flow/mag_ratio_mean": 0.7900927232825048, "flow/mag_ratio_std": 0.24186882635229798, "step": 15360 }, { "epoch": 0.709436053761951, "eval_cos_loss": 0.1751825540883628, "eval_loss": 0.2740960814841262, "eval_mse_loss": 0.2740960814841262, "eval_runtime": 36.3752, "eval_samples_per_second": 769.563, "eval_steps_per_second": 12.041, "flow/cos_sim": 0.8248174480379444, "flow/improvement_ratio": 0.5053689104917387, "flow/mag_ratio_mean": 0.7900927232825048, "flow/mag_ratio_std": 0.24186882635229798, "step": 15360 }, { "epoch": 0.7212599879913169, "grad_norm": 0.6580540537834167, "learning_rate": 0.00018803936026088542, "loss": 0.31635990738868713, "step": 15616 }, { "epoch": 0.7330839222206826, "grad_norm": 4.1114654541015625, "learning_rate": 0.00017340258907913464, "loss": 0.3187476694583893, "step": 15872 }, { "epoch": 0.7449078564500485, "grad_norm": 12.402987480163574, "learning_rate": 0.0001592385021668743, "loss": 0.3155273497104645, "step": 16128 }, { "epoch": 0.7567317906794143, "grad_norm": 1.0991050004959106, "learning_rate": 0.0001455675992000087, "loss": 0.31325414776802063, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_cos_loss": 0.17473208807306748, "eval_loss": 0.2738769624263184, "eval_mse_loss": 0.2738769624263184, "flow/cos_sim": 0.8252679411679098, "flow/improvement_ratio": 0.48874352702267093, "flow/mag_ratio_mean": 0.7912447798197673, "flow/mag_ratio_std": 0.25162723619643956, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_cos_loss": 0.17473208807306748, "eval_loss": 0.2738769624263184, "eval_mse_loss": 0.2738769624263184, "eval_runtime": 36.1853, "eval_samples_per_second": 773.602, "eval_steps_per_second": 12.104, "flow/cos_sim": 0.8252679411679098, "flow/improvement_ratio": 0.48874352702267093, "flow/mag_ratio_mean": 0.7912447798197673, "flow/mag_ratio_std": 0.25162723619643956, "step": 16384 }, { "epoch": 0.7685557249087802, "grad_norm": 1.9538664817810059, "learning_rate": 0.000132409666069565, "loss": 0.31205466389656067, "step": 16640 }, { "epoch": 0.780379659138146, "grad_norm": 3.74383544921875, "learning_rate": 0.0001197837462455823, "loss": 0.3163967728614807, "step": 16896 }, { "epoch": 0.7922035933675119, "grad_norm": 2.0866622924804688, "learning_rate": 0.00010770811321550749, "loss": 0.3133208453655243, "step": 17152 }, { "epoch": 0.8040275275968778, "grad_norm": 3.845749855041504, "learning_rate": 9.620024403698591e-05, "loss": 0.31291234493255615, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_cos_loss": 0.17509721335210757, "eval_loss": 0.273063686272325, "eval_mse_loss": 0.273063686272325, "flow/cos_sim": 0.824902795357247, "flow/improvement_ratio": 0.5037120235020711, "flow/mag_ratio_mean": 0.8056292350161566, "flow/mag_ratio_std": 0.23707074216103444, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_cos_loss": 0.17509721335210757, "eval_loss": 0.273063686272325, "eval_mse_loss": 0.273063686272325, "eval_runtime": 36.2474, "eval_samples_per_second": 772.277, "eval_steps_per_second": 12.084, "flow/cos_sim": 0.824902795357247, "flow/improvement_ratio": 0.5037120235020711, "flow/mag_ratio_mean": 0.8056292350161566, "flow/mag_ratio_std": 0.23707074216103444, "step": 17408 }, { "epoch": 0.8158514618262436, "grad_norm": 4.972906112670898, "learning_rate": 8.527679404332429e-05, "loss": 0.3136768639087677, "step": 17664 }, { "epoch": 0.8276753960556095, "grad_norm": 0.5566576719284058, "learning_rate": 7.495357273823544e-05, "loss": 0.30899107456207275, "step": 17920 }, { "epoch": 0.8394993302849753, "grad_norm": 2.5373237133026123, "learning_rate": 6.524552091475183e-05, "loss": 0.30978360772132874, "step": 18176 }, { "epoch": 0.8513232645143411, "grad_norm": 8.832274436950684, "learning_rate": 5.6166689031422024e-05, "loss": 0.3114849030971527, "step": 18432 }, { "epoch": 0.8513232645143411, "eval_cos_loss": 0.179012101160609, "eval_loss": 0.27659866115137866, "eval_mse_loss": 0.27659866115137866, "flow/cos_sim": 0.8209879062219297, "flow/improvement_ratio": 0.5032780424236707, "flow/mag_ratio_mean": 0.8021674478707248, "flow/mag_ratio_std": 0.2417376903502364, "step": 18432 }, { "epoch": 0.8513232645143411, "eval_cos_loss": 0.179012101160609, "eval_loss": 0.27659866115137866, "eval_mse_loss": 0.27659866115137866, "eval_runtime": 36.438, "eval_samples_per_second": 768.236, "eval_steps_per_second": 12.02, "flow/cos_sim": 0.8209879062219297, "flow/improvement_ratio": 0.5032780424236707, "flow/mag_ratio_mean": 0.8021674478707248, "flow/mag_ratio_std": 0.2417376903502364, "step": 18432 }, { "epoch": 0.8631471987437069, "grad_norm": 6.2862749099731445, "learning_rate": 4.773021687709067e-05, "loss": 0.30685460567474365, "step": 18688 }, { "epoch": 0.8749711329730728, "grad_norm": 1.9315192699432373, "learning_rate": 3.994831455368719e-05, "loss": 0.31047487258911133, "step": 18944 }, { "epoch": 0.8867950672024387, "grad_norm": 8.073254585266113, "learning_rate": 3.283224480455282e-05, "loss": 0.3102303445339203, "step": 19200 }, { "epoch": 0.8986190014318045, "grad_norm": 0.3247526288032532, "learning_rate": 2.639230671387627e-05, "loss": 0.3080306351184845, "step": 19456 }, { "epoch": 0.8986190014318045, "eval_cos_loss": 0.17426143144363682, "eval_loss": 0.2714149275258796, "eval_mse_loss": 0.2714149275258796, "flow/cos_sim": 0.8257385824368969, "flow/improvement_ratio": 0.5061814522362191, "flow/mag_ratio_mean": 0.8065648428653474, "flow/mag_ratio_std": 0.23905256662738922, "step": 19456 }, { "epoch": 0.8986190014318045, "eval_cos_loss": 0.17426143144363682, "eval_loss": 0.2714149275258796, "eval_mse_loss": 0.2714149275258796, "eval_runtime": 36.6997, "eval_samples_per_second": 762.759, "eval_steps_per_second": 11.935, "flow/cos_sim": 0.8257385824368969, "flow/improvement_ratio": 0.5061814522362191, "flow/mag_ratio_mean": 0.8065648428653474, "flow/mag_ratio_std": 0.23905256662738922, "step": 19456 }, { "epoch": 0.9104429356611704, "grad_norm": 7.262045383453369, "learning_rate": 2.063782080083576e-05, "loss": 0.30695223808288574, "step": 19712 }, { "epoch": 0.9222668698905362, "grad_norm": 0.8199298977851868, "learning_rate": 1.557711553001523e-05, "loss": 0.3086055517196655, "step": 19968 }, { "epoch": 0.9340908041199021, "grad_norm": 3.6796634197235107, "learning_rate": 1.1217515257622269e-05, "loss": 0.30838918685913086, "step": 20224 }, { "epoch": 0.945914738349268, "grad_norm": 0.3800135850906372, "learning_rate": 7.565329630950746e-06, "loss": 0.3107503354549408, "step": 20480 }, { "epoch": 0.945914738349268, "eval_cos_loss": 0.16941133098991495, "eval_loss": 0.26742714126360473, "eval_mse_loss": 0.26742714126360473, "flow/cos_sim": 0.8305886884530386, "flow/improvement_ratio": 0.49329238075371745, "flow/mag_ratio_mean": 0.8075202496628783, "flow/mag_ratio_std": 0.24095512161107913, "step": 20480 }, { "epoch": 0.945914738349268, "eval_cos_loss": 0.16941133098991495, "eval_loss": 0.26742714126360473, "eval_mse_loss": 0.26742714126360473, "eval_runtime": 37.4646, "eval_samples_per_second": 747.185, "eval_steps_per_second": 11.691, "flow/cos_sim": 0.8305886884530386, "flow/improvement_ratio": 0.49329238075371745, "flow/mag_ratio_mean": 0.8075202496628783, "flow/mag_ratio_std": 0.24095512161107913, "step": 20480 }, { "epoch": 0.9577386725786338, "grad_norm": 5.383033752441406, "learning_rate": 4.62584445643166e-06, "loss": 0.30672797560691833, "step": 20736 }, { "epoch": 0.9695626068079997, "grad_norm": 1.579878807067871, "learning_rate": 2.40331404948807e-06, "loss": 0.30701693892478943, "step": 20992 }, { "epoch": 0.9813865410373654, "grad_norm": 4.654466152191162, "learning_rate": 9.009550772663965e-07, "loss": 0.30958274006843567, "step": 21248 }, { "epoch": 0.9932104752667313, "grad_norm": 0.809504508972168, "learning_rate": 1.2094190315575791e-07, "loss": 0.3090161085128784, "step": 21504 }, { "epoch": 0.9932104752667313, "eval_cos_loss": 0.16669115675910967, "eval_loss": 0.2628253948402731, "eval_mse_loss": 0.2628253948402731, "flow/cos_sim": 0.8333088479357768, "flow/improvement_ratio": 0.5023197360915136, "flow/mag_ratio_mean": 0.8090449382453204, "flow/mag_ratio_std": 0.23580366417289325, "step": 21504 }, { "epoch": 0.9932104752667313, "eval_cos_loss": 0.16669115675910967, "eval_loss": 0.2628253948402731, "eval_mse_loss": 0.2628253948402731, "eval_runtime": 37.5609, "eval_samples_per_second": 745.271, "eval_steps_per_second": 11.661, "flow/cos_sim": 0.8333088479357768, "flow/improvement_ratio": 0.5023197360915136, "flow/mag_ratio_mean": 0.8090449382453204, "flow/mag_ratio_std": 0.23580366417289325, "step": 21504 } ], "logging_steps": 256, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }