| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9932104752667313, |
| "eval_steps": 1024, |
| "global_step": 21504, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011823934229365849, |
| "grad_norm": 0.3170062303543091, |
| "learning_rate": 0.000498046875, |
| "loss": 2.42356538772583, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.023647868458731697, |
| "grad_norm": 0.20766793191432953, |
| "learning_rate": 0.000998046875, |
| "loss": 2.0067272186279297, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.03547180268809755, |
| "grad_norm": 0.47662925720214844, |
| "learning_rate": 0.000999640996023194, |
| "loss": 1.8047175407409668, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "grad_norm": 0.38411229848861694, |
| "learning_rate": 0.0009985588674043958, |
| "loss": 1.7324724197387695, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.4818032477136072, |
| "eval_loss": 1.6560364654075066, |
| "eval_mse_loss": 1.4151348444424807, |
| "flow/cos_sim": 0.5181967507894725, |
| "flow/improvement_ratio": 0.954772694197964, |
| "flow/mag_ratio_mean": 0.48862707193039323, |
| "flow/mag_ratio_std": 0.18404143276416005, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.4818032477136072, |
| "eval_loss": 1.6560364654075066, |
| "eval_mse_loss": 1.4151348444424807, |
| "eval_runtime": 41.6432, |
| "eval_samples_per_second": 672.211, |
| "eval_steps_per_second": 10.518, |
| "flow/cos_sim": 0.5181967507894725, |
| "flow/improvement_ratio": 0.954772694197964, |
| "flow/mag_ratio_mean": 0.48862707193039323, |
| "flow/mag_ratio_std": 0.18404143276416005, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.05911967114682925, |
| "grad_norm": 0.4044971168041229, |
| "learning_rate": 0.0009967551747861387, |
| "loss": 1.6944022178649902, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.0709436053761951, |
| "grad_norm": 0.23417921364307404, |
| "learning_rate": 0.000994232528651847, |
| "loss": 1.6682453155517578, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.08276753960556095, |
| "grad_norm": 0.11558878421783447, |
| "learning_rate": 0.0009909945800260092, |
| "loss": 1.6490408182144165, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "grad_norm": 0.1645389050245285, |
| "learning_rate": 0.0009870460151900522, |
| "loss": 1.6396121978759766, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.4538577257498214, |
| "eval_loss": 1.5759433782808312, |
| "eval_mse_loss": 1.3490145179234683, |
| "flow/cos_sim": 0.5461422990174054, |
| "flow/improvement_ratio": 0.9588973424206041, |
| "flow/mag_ratio_mean": 0.5239204951481188, |
| "flow/mag_ratio_std": 0.19641730230148524, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.4538577257498214, |
| "eval_loss": 1.5759433782808312, |
| "eval_mse_loss": 1.3490145179234683, |
| "eval_runtime": 38.1827, |
| "eval_samples_per_second": 733.134, |
| "eval_steps_per_second": 11.471, |
| "flow/cos_sim": 0.5461422990174054, |
| "flow/improvement_ratio": 0.9588973424206041, |
| "flow/mag_ratio_mean": 0.5239204951481188, |
| "flow/mag_ratio_std": 0.19641730230148524, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.10641540806429264, |
| "grad_norm": 0.1483554244041443, |
| "learning_rate": 0.0009823925488998885, |
| "loss": 1.629364252090454, |
| "step": 2304 |
| }, |
| { |
| "epoch": 0.1182393422936585, |
| "grad_norm": 0.1272267997264862, |
| "learning_rate": 0.0009770409161149525, |
| "loss": 1.6207596063613892, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.13006327652302435, |
| "grad_norm": 0.4405772387981415, |
| "learning_rate": 0.0009709988622506973, |
| "loss": 1.6130421161651611, |
| "step": 2816 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "grad_norm": 0.2765417993068695, |
| "learning_rate": 0.000964275131968659, |
| "loss": 1.6063523292541504, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.4437784091930955, |
| "eval_loss": 1.5468863191669935, |
| "eval_mse_loss": 1.3249971137199228, |
| "flow/cos_sim": 0.5562216001286354, |
| "flow/improvement_ratio": 0.9605493699306766, |
| "flow/mag_ratio_mean": 0.531293115863517, |
| "flow/mag_ratio_std": 0.20344179882306487, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.4437784091930955, |
| "eval_loss": 1.5468863191669935, |
| "eval_mse_loss": 1.3249971137199228, |
| "eval_runtime": 38.9587, |
| "eval_samples_per_second": 718.531, |
| "eval_steps_per_second": 11.243, |
| "flow/cos_sim": 0.5562216001286354, |
| "flow/improvement_ratio": 0.9605493699306766, |
| "flow/mag_ratio_mean": 0.531293115863517, |
| "flow/mag_ratio_std": 0.20344179882306487, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.15371114498175603, |
| "grad_norm": 0.14539985358715057, |
| "learning_rate": 0.0009568794565203123, |
| "loss": 1.6010686159133911, |
| "step": 3328 |
| }, |
| { |
| "epoch": 0.1655350792111219, |
| "grad_norm": 0.146505206823349, |
| "learning_rate": 0.0009488225396630347, |
| "loss": 1.593965768814087, |
| "step": 3584 |
| }, |
| { |
| "epoch": 0.17735901344048774, |
| "grad_norm": 0.17704616487026215, |
| "learning_rate": 0.0009401160421685646, |
| "loss": 1.591158390045166, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "grad_norm": 0.21900475025177002, |
| "learning_rate": 0.0009307725649463714, |
| "loss": 1.5863914489746094, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.4372148915240754, |
| "eval_loss": 1.5282805349728832, |
| "eval_mse_loss": 1.3096730913201424, |
| "flow/cos_sim": 0.5627851008552395, |
| "flow/improvement_ratio": 0.960628461483951, |
| "flow/mag_ratio_mean": 0.5383912115336553, |
| "flow/mag_ratio_std": 0.20601355281050346, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.4372148915240754, |
| "eval_loss": 1.5282805349728832, |
| "eval_mse_loss": 1.3096730913201424, |
| "eval_runtime": 37.5818, |
| "eval_samples_per_second": 744.854, |
| "eval_steps_per_second": 11.655, |
| "flow/cos_sim": 0.5627851008552395, |
| "flow/improvement_ratio": 0.960628461483951, |
| "flow/mag_ratio_mean": 0.5383912115336553, |
| "flow/mag_ratio_std": 0.20601355281050346, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.20100688189921945, |
| "grad_norm": 0.3215397000312805, |
| "learning_rate": 0.0009208056308063659, |
| "loss": 1.5831135511398315, |
| "step": 4352 |
| }, |
| { |
| "epoch": 0.2128308161285853, |
| "grad_norm": 0.39093902707099915, |
| "learning_rate": 0.0009102296648873445, |
| "loss": 1.5793178081512451, |
| "step": 4608 |
| }, |
| { |
| "epoch": 0.22465475035795113, |
| "grad_norm": 0.24414080381393433, |
| "learning_rate": 0.0008990599737794927, |
| "loss": 1.5755181312561035, |
| "step": 4864 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "grad_norm": 0.18973594903945923, |
| "learning_rate": 0.0008873127233711644, |
| "loss": 1.5741870403289795, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.43135033779220494, |
| "eval_loss": 1.5102224085973277, |
| "eval_mse_loss": 1.294547238578535, |
| "flow/cos_sim": 0.568649673570781, |
| "flow/improvement_ratio": 0.9623238818558384, |
| "flow/mag_ratio_mean": 0.5475505786399318, |
| "flow/mag_ratio_std": 0.2083817292089876, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.43135033779220494, |
| "eval_loss": 1.5102224085973277, |
| "eval_mse_loss": 1.294547238578535, |
| "eval_runtime": 38.3814, |
| "eval_samples_per_second": 729.337, |
| "eval_steps_per_second": 11.412, |
| "flow/cos_sim": 0.568649673570781, |
| "flow/improvement_ratio": 0.9623238818558384, |
| "flow/mag_ratio_mean": 0.5475505786399318, |
| "flow/mag_ratio_std": 0.2083817292089876, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.24830261881668284, |
| "grad_norm": 0.23730266094207764, |
| "learning_rate": 0.0008750049154520011, |
| "loss": 1.5687248706817627, |
| "step": 5376 |
| }, |
| { |
| "epoch": 0.2601265530460487, |
| "grad_norm": 0.23536692559719086, |
| "learning_rate": 0.0008621543631062487, |
| "loss": 1.5671372413635254, |
| "step": 5632 |
| }, |
| { |
| "epoch": 0.27195048727541454, |
| "grad_norm": 0.15165014564990997, |
| "learning_rate": 0.0008487796649318904, |
| "loss": 1.5671967267990112, |
| "step": 5888 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "grad_norm": 0.1857716292142868, |
| "learning_rate": 0.0008349001781229053, |
| "loss": 1.5656864643096924, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.4303430036458795, |
| "eval_loss": 1.507517477148744, |
| "eval_mse_loss": 1.292345978353666, |
| "flow/cos_sim": 0.5696570243193134, |
| "flow/improvement_ratio": 0.9619137474663182, |
| "flow/mag_ratio_mean": 0.5517072507507725, |
| "flow/mag_ratio_std": 0.2047896740711443, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.4303430036458795, |
| "eval_loss": 1.507517477148744, |
| "eval_mse_loss": 1.292345978353666, |
| "eval_runtime": 39.1056, |
| "eval_samples_per_second": 715.83, |
| "eval_steps_per_second": 11.2, |
| "flow/cos_sim": 0.5696570243193134, |
| "flow/improvement_ratio": 0.9619137474663182, |
| "flow/mag_ratio_mean": 0.5517072507507725, |
| "flow/mag_ratio_std": 0.2047896740711443, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2955983557341462, |
| "grad_norm": 0.16939695179462433, |
| "learning_rate": 0.0008205359904536107, |
| "loss": 1.5614818334579468, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.30742228996351206, |
| "grad_norm": 0.20279403030872345, |
| "learning_rate": 0.0008057078912056363, |
| "loss": 1.5598734617233276, |
| "step": 6656 |
| }, |
| { |
| "epoch": 0.3192462241928779, |
| "grad_norm": 0.17468483746051788, |
| "learning_rate": 0.0007904373410796086, |
| "loss": 1.5608937740325928, |
| "step": 6912 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "grad_norm": 0.13540241122245789, |
| "learning_rate": 0.0007747464411350876, |
| "loss": 1.5515446662902832, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.42760413388411206, |
| "eval_loss": 1.4991425577908346, |
| "eval_mse_loss": 1.285340489862172, |
| "flow/cos_sim": 0.5723958823778857, |
| "flow/improvement_ratio": 0.9629914771476292, |
| "flow/mag_ratio_mean": 0.551068566841622, |
| "flow/mag_ratio_std": 0.21019490124428108, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.42760413388411206, |
| "eval_loss": 1.4991425577908346, |
| "eval_mse_loss": 1.285340489862172, |
| "eval_runtime": 37.5914, |
| "eval_samples_per_second": 744.665, |
| "eval_steps_per_second": 11.652, |
| "flow/cos_sim": 0.5723958823778857, |
| "flow/improvement_ratio": 0.9629914771476292, |
| "flow/mag_ratio_mean": 0.551068566841622, |
| "flow/mag_ratio_std": 0.21019490124428108, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.34289409265160964, |
| "grad_norm": 0.16203606128692627, |
| "learning_rate": 0.000758657900803716, |
| "loss": 1.5541112422943115, |
| "step": 7424 |
| }, |
| { |
| "epoch": 0.3547180268809755, |
| "grad_norm": 0.15600702166557312, |
| "learning_rate": 0.000742195005021869, |
| "loss": 1.552086353302002, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.3665419611103413, |
| "grad_norm": 0.25274714827537537, |
| "learning_rate": 0.0007253815805303786, |
| "loss": 1.5514739751815796, |
| "step": 7936 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "grad_norm": 0.16391786932945251, |
| "learning_rate": 0.0007082419613901028, |
| "loss": 1.5471858978271484, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.42565103276679506, |
| "eval_loss": 1.4905888979837774, |
| "eval_mse_loss": 1.277763385206597, |
| "flow/cos_sim": 0.5743489830189099, |
| "flow/improvement_ratio": 0.9618241816895193, |
| "flow/mag_ratio_mean": 0.5533584926498535, |
| "flow/mag_ratio_std": 0.21572684449011878, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.42565103276679506, |
| "eval_loss": 1.4905888979837774, |
| "eval_mse_loss": 1.277763385206597, |
| "eval_runtime": 37.3487, |
| "eval_samples_per_second": 749.503, |
| "eval_steps_per_second": 11.727, |
| "flow/cos_sim": 0.5743489830189099, |
| "flow/improvement_ratio": 0.9618241816895193, |
| "flow/mag_ratio_mean": 0.5533584926498535, |
| "flow/mag_ratio_std": 0.21572684449011878, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.390189829569073, |
| "grad_norm": 0.13766784965991974, |
| "learning_rate": 0.0006908009537632514, |
| "loss": 1.5495105981826782, |
| "step": 8448 |
| }, |
| { |
| "epoch": 0.4020137637984389, |
| "grad_norm": 0.15862250328063965, |
| "learning_rate": 0.0006730838000114403, |
| "loss": 1.5457631349563599, |
| "step": 8704 |
| }, |
| { |
| "epoch": 0.41383769802780473, |
| "grad_norm": 0.1839229315519333, |
| "learning_rate": 0.0006551161421624341, |
| "loss": 1.5441044569015503, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "grad_norm": 0.13566677272319794, |
| "learning_rate": 0.0006369239847984517, |
| "loss": 1.5432497262954712, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.42430811161047793, |
| "eval_loss": 1.4869584389473205, |
| "eval_mse_loss": 1.2748043828358933, |
| "flow/cos_sim": 0.5756919168990496, |
| "flow/improvement_ratio": 0.96208056807518, |
| "flow/mag_ratio_mean": 0.5574794498752785, |
| "flow/mag_ratio_std": 0.21047259782139024, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.42430811161047793, |
| "eval_loss": 1.4869584389473205, |
| "eval_mse_loss": 1.2748043828358933, |
| "eval_runtime": 37.5205, |
| "eval_samples_per_second": 746.072, |
| "eval_steps_per_second": 11.674, |
| "flow/cos_sim": 0.5756919168990496, |
| "flow/improvement_ratio": 0.96208056807518, |
| "flow/mag_ratio_mean": 0.5574794498752785, |
| "flow/mag_ratio_std": 0.21047259782139024, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4374855664865364, |
| "grad_norm": 0.13037081062793732, |
| "learning_rate": 0.0006185336574197479, |
| "loss": 1.544568419456482, |
| "step": 9472 |
| }, |
| { |
| "epoch": 0.44930950071590225, |
| "grad_norm": 0.2653915286064148, |
| "learning_rate": 0.0005999717763379407, |
| "loss": 1.5420877933502197, |
| "step": 9728 |
| }, |
| { |
| "epoch": 0.4611334349452681, |
| "grad_norm": 0.12325401604175568, |
| "learning_rate": 0.0005812652061542363, |
| "loss": 1.5435826778411865, |
| "step": 9984 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "grad_norm": 0.2154727578163147, |
| "learning_rate": 0.0005624410208783071, |
| "loss": 1.5385282039642334, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.42216237005033447, |
| "eval_loss": 1.4820257316441297, |
| "eval_mse_loss": 1.2709445436251219, |
| "flow/cos_sim": 0.577837641925028, |
| "flow/improvement_ratio": 0.9637134710675506, |
| "flow/mag_ratio_mean": 0.5554149469284162, |
| "flow/mag_ratio_std": 0.2115786727432791, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.42216237005033447, |
| "eval_loss": 1.4820257316441297, |
| "eval_mse_loss": 1.2709445436251219, |
| "eval_runtime": 37.3144, |
| "eval_samples_per_second": 750.194, |
| "eval_steps_per_second": 11.738, |
| "flow/cos_sim": 0.577837641925028, |
| "flow/improvement_ratio": 0.9637134710675506, |
| "flow/mag_ratio_mean": 0.5554149469284162, |
| "flow/mag_ratio_std": 0.2115786727432791, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.48478130340399983, |
| "grad_norm": 0.167108952999115, |
| "learning_rate": 0.0005435264647440881, |
| "loss": 1.5393356084823608, |
| "step": 10496 |
| }, |
| { |
| "epoch": 0.49660523763336567, |
| "grad_norm": 0.11320369690656662, |
| "learning_rate": 0.000524548912779213, |
| "loss": 1.5383528470993042, |
| "step": 10752 |
| }, |
| { |
| "epoch": 0.5084291718627315, |
| "grad_norm": 0.279090940952301, |
| "learning_rate": 0.0005055358311851499, |
| "loss": 1.5419063568115234, |
| "step": 11008 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "grad_norm": 0.1590709239244461, |
| "learning_rate": 0.0004865147375853812, |
| "loss": 1.5351711511611938, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.42213277620811984, |
| "eval_loss": 1.4814584206228387, |
| "eval_mse_loss": 1.2703920295249382, |
| "flow/cos_sim": 0.577867228690892, |
| "flow/improvement_ratio": 0.962785639174997, |
| "flow/mag_ratio_mean": 0.5578808011529652, |
| "flow/mag_ratio_std": 0.21099132279010668, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.42213277620811984, |
| "eval_loss": 1.4814584206228387, |
| "eval_mse_loss": 1.2703920295249382, |
| "eval_runtime": 38.5165, |
| "eval_samples_per_second": 726.78, |
| "eval_steps_per_second": 11.372, |
| "flow/cos_sim": 0.577867228690892, |
| "flow/improvement_ratio": 0.962785639174997, |
| "flow/mag_ratio_mean": 0.5578808011529652, |
| "flow/mag_ratio_std": 0.21099132279010668, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5320770403214632, |
| "grad_norm": 0.14667300879955292, |
| "learning_rate": 0.0004675131611991607, |
| "loss": 1.5333118438720703, |
| "step": 11520 |
| }, |
| { |
| "epoch": 0.5439009745508291, |
| "grad_norm": 0.12886938452720642, |
| "learning_rate": 0.0004485586029984899, |
| "loss": 1.5335720777511597, |
| "step": 11776 |
| }, |
| { |
| "epoch": 0.5557249087801949, |
| "grad_norm": 0.12272178381681442, |
| "learning_rate": 0.00042967849590597266, |
| "loss": 1.536974310874939, |
| "step": 12032 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "grad_norm": 0.1320953667163849, |
| "learning_rate": 0.0004109001650911621, |
| "loss": 1.5350415706634521, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.4204618357224007, |
| "eval_loss": 1.4744470647480934, |
| "eval_mse_loss": 1.2642161459683283, |
| "flow/cos_sim": 0.5795381778179238, |
| "flow/improvement_ratio": 0.9622249373290093, |
| "flow/mag_ratio_mean": 0.5585955846799563, |
| "flow/mag_ratio_std": 0.21563580887366648, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.4204618357224007, |
| "eval_loss": 1.4744470647480934, |
| "eval_mse_loss": 1.2642161459683283, |
| "eval_runtime": 39.1238, |
| "eval_samples_per_second": 715.498, |
| "eval_steps_per_second": 11.195, |
| "flow/cos_sim": 0.5795381778179238, |
| "flow/improvement_ratio": 0.9622249373290093, |
| "flow/mag_ratio_mean": 0.5585955846799563, |
| "flow/mag_ratio_std": 0.21563580887366648, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5793727772389267, |
| "grad_norm": 0.16382183134555817, |
| "learning_rate": 0.0003922507884228551, |
| "loss": 1.5361268520355225, |
| "step": 12544 |
| }, |
| { |
| "epoch": 0.5911967114682924, |
| "grad_norm": 0.13758233189582825, |
| "learning_rate": 0.00037375735713457723, |
| "loss": 1.530663013458252, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.6030206456976583, |
| "grad_norm": 0.13651148974895477, |
| "learning_rate": 0.00035544663676018276, |
| "loss": 1.528375506401062, |
| "step": 13056 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "grad_norm": 0.19377295672893524, |
| "learning_rate": 0.00033734512839611255, |
| "loss": 1.531029462814331, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.4196748673643696, |
| "eval_loss": 1.4739454851302927, |
| "eval_mse_loss": 1.2641080489986019, |
| "flow/cos_sim": 0.5803251580832756, |
| "flow/improvement_ratio": 0.9619928854241219, |
| "flow/mag_ratio_mean": 0.5591708176756558, |
| "flow/mag_ratio_std": 0.2099404629302896, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.4196748673643696, |
| "eval_loss": 1.4739454851302927, |
| "eval_mse_loss": 1.2641080489986019, |
| "eval_runtime": 37.6325, |
| "eval_samples_per_second": 743.851, |
| "eval_steps_per_second": 11.639, |
| "flow/cos_sim": 0.5803251580832756, |
| "flow/improvement_ratio": 0.9619928854241219, |
| "flow/mag_ratio_mean": 0.5591708176756558, |
| "flow/mag_ratio_std": 0.2099404629302896, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.62666851415639, |
| "grad_norm": 0.11970210075378418, |
| "learning_rate": 0.0003194790303463687, |
| "loss": 1.526605248451233, |
| "step": 13568 |
| }, |
| { |
| "epoch": 0.6384924483857558, |
| "grad_norm": 0.16870331764221191, |
| "learning_rate": 0.00030187420020572406, |
| "loss": 1.5294699668884277, |
| "step": 13824 |
| }, |
| { |
| "epoch": 0.6503163826151217, |
| "grad_norm": 0.20158159732818604, |
| "learning_rate": 0.00028455611743603626, |
| "loss": 1.5296140909194946, |
| "step": 14080 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "grad_norm": 0.17316782474517822, |
| "learning_rate": 0.0002675498464898373, |
| "loss": 1.5282411575317383, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.4197337330748501, |
| "eval_loss": 1.4732592075927073, |
| "eval_mse_loss": 1.263392340374864, |
| "flow/cos_sim": 0.5802662803974326, |
| "flow/improvement_ratio": 0.9630070868964609, |
| "flow/mag_ratio_mean": 0.5561743245277231, |
| "flow/mag_ratio_std": 0.21411516738536696, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.4197337330748501, |
| "eval_loss": 1.4732592075927073, |
| "eval_mse_loss": 1.263392340374864, |
| "eval_runtime": 38.6362, |
| "eval_samples_per_second": 724.527, |
| "eval_steps_per_second": 11.337, |
| "flow/cos_sim": 0.5802662803974326, |
| "flow/improvement_ratio": 0.9630070868964609, |
| "flow/mag_ratio_mean": 0.5561743245277231, |
| "flow/mag_ratio_std": 0.21411516738536696, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6739642510738534, |
| "grad_norm": 0.1980622410774231, |
| "learning_rate": 0.0002508800005345623, |
| "loss": 1.527830719947815, |
| "step": 14592 |
| }, |
| { |
| "epoch": 0.6857881853032193, |
| "grad_norm": 0.13991132378578186, |
| "learning_rate": 0.00023457070582992562, |
| "loss": 1.5318052768707275, |
| "step": 14848 |
| }, |
| { |
| "epoch": 0.6976121195325851, |
| "grad_norm": 0.149434432387352, |
| "learning_rate": 0.00021864556680999692, |
| "loss": 1.5274654626846313, |
| "step": 15104 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "grad_norm": 0.13668516278266907, |
| "learning_rate": 0.0002031276319205152, |
| "loss": 1.5278632640838623, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.41798419257005054, |
| "eval_loss": 1.4683536525186338, |
| "eval_mse_loss": 1.259361555315044, |
| "flow/cos_sim": 0.5820158255971186, |
| "flow/improvement_ratio": 0.9633506106459386, |
| "flow/mag_ratio_mean": 0.5602124498042886, |
| "flow/mag_ratio_std": 0.21119761075739446, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.41798419257005054, |
| "eval_loss": 1.4683536525186338, |
| "eval_mse_loss": 1.259361555315044, |
| "eval_runtime": 37.7713, |
| "eval_samples_per_second": 741.119, |
| "eval_steps_per_second": 11.596, |
| "flow/cos_sim": 0.5820158255971186, |
| "flow/improvement_ratio": 0.9633506106459386, |
| "flow/mag_ratio_mean": 0.5602124498042886, |
| "flow/mag_ratio_std": 0.21119761075739446, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.7212599879913169, |
| "grad_norm": 0.10699856281280518, |
| "learning_rate": 0.00018803936026088542, |
| "loss": 1.5269209146499634, |
| "step": 15616 |
| }, |
| { |
| "epoch": 0.7330839222206826, |
| "grad_norm": 0.14415434002876282, |
| "learning_rate": 0.00017340258907913464, |
| "loss": 1.5266590118408203, |
| "step": 15872 |
| }, |
| { |
| "epoch": 0.7449078564500485, |
| "grad_norm": 0.10905779153108597, |
| "learning_rate": 0.0001592385021668743, |
| "loss": 1.524775505065918, |
| "step": 16128 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "grad_norm": 0.19332484900951385, |
| "learning_rate": 0.0001455675992000087, |
| "loss": 1.528196096420288, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.4177281832041806, |
| "eval_loss": 1.4672707036205623, |
| "eval_mse_loss": 1.258406613243225, |
| "flow/cos_sim": 0.5822718332619428, |
| "flow/improvement_ratio": 0.9624373912811279, |
| "flow/mag_ratio_mean": 0.5578821809052332, |
| "flow/mag_ratio_std": 0.21358931578321544, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.4177281832041806, |
| "eval_loss": 1.4672707036205623, |
| "eval_mse_loss": 1.258406613243225, |
| "eval_runtime": 38.8935, |
| "eval_samples_per_second": 719.735, |
| "eval_steps_per_second": 11.262, |
| "flow/cos_sim": 0.5822718332619428, |
| "flow/improvement_ratio": 0.9624373912811279, |
| "flow/mag_ratio_mean": 0.5578821809052332, |
| "flow/mag_ratio_std": 0.21358931578321544, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7685557249087802, |
| "grad_norm": 0.17042651772499084, |
| "learning_rate": 0.000132409666069565, |
| "loss": 1.524718999862671, |
| "step": 16640 |
| }, |
| { |
| "epoch": 0.780379659138146, |
| "grad_norm": 0.18739284574985504, |
| "learning_rate": 0.0001197837462455823, |
| "loss": 1.5236212015151978, |
| "step": 16896 |
| }, |
| { |
| "epoch": 0.7922035933675119, |
| "grad_norm": 0.18229515850543976, |
| "learning_rate": 0.00010770811321550749, |
| "loss": 1.5214073657989502, |
| "step": 17152 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "grad_norm": 0.1200081929564476, |
| "learning_rate": 9.620024403698591e-05, |
| "loss": 1.5241215229034424, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.41774995141922067, |
| "eval_loss": 1.4673981081405187, |
| "eval_mse_loss": 1.2585231351525816, |
| "flow/cos_sim": 0.5822500714428349, |
| "flow/improvement_ratio": 0.9632431905563563, |
| "flow/mag_ratio_mean": 0.5575322135942712, |
| "flow/mag_ratio_std": 0.2133726158506794, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.41774995141922067, |
| "eval_loss": 1.4673981081405187, |
| "eval_mse_loss": 1.2585231351525816, |
| "eval_runtime": 38.8056, |
| "eval_samples_per_second": 721.365, |
| "eval_steps_per_second": 11.287, |
| "flow/cos_sim": 0.5822500714428349, |
| "flow/improvement_ratio": 0.9632431905563563, |
| "flow/mag_ratio_mean": 0.5575322135942712, |
| "flow/mag_ratio_std": 0.2133726158506794, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8158514618262436, |
| "grad_norm": 0.20400674641132355, |
| "learning_rate": 8.527679404332429e-05, |
| "loss": 1.527288794517517, |
| "step": 17664 |
| }, |
| { |
| "epoch": 0.8276753960556095, |
| "grad_norm": 0.13825097680091858, |
| "learning_rate": 7.495357273823544e-05, |
| "loss": 1.5233813524246216, |
| "step": 17920 |
| }, |
| { |
| "epoch": 0.8394993302849753, |
| "grad_norm": 0.1886366605758667, |
| "learning_rate": 6.524552091475183e-05, |
| "loss": 1.5250685214996338, |
| "step": 18176 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "grad_norm": 0.12399227172136307, |
| "learning_rate": 5.6166689031422024e-05, |
| "loss": 1.5211102962493896, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.4172959377368291, |
| "eval_loss": 1.467090511975223, |
| "eval_mse_loss": 1.2584425412356581, |
| "flow/cos_sim": 0.5827040846489336, |
| "flow/improvement_ratio": 0.9641230167591408, |
| "flow/mag_ratio_mean": 0.559296594771076, |
| "flow/mag_ratio_std": 0.2129400044203349, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.4172959377368291, |
| "eval_loss": 1.467090511975223, |
| "eval_mse_loss": 1.2584425412356581, |
| "eval_runtime": 38.949, |
| "eval_samples_per_second": 718.708, |
| "eval_steps_per_second": 11.245, |
| "flow/cos_sim": 0.5827040846489336, |
| "flow/improvement_ratio": 0.9641230167591408, |
| "flow/mag_ratio_mean": 0.559296594771076, |
| "flow/mag_ratio_std": 0.2129400044203349, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8631471987437069, |
| "grad_norm": 0.15448303520679474, |
| "learning_rate": 4.773021687709067e-05, |
| "loss": 1.524503231048584, |
| "step": 18688 |
| }, |
| { |
| "epoch": 0.8749711329730728, |
| "grad_norm": 0.11015952378511429, |
| "learning_rate": 3.994831455368719e-05, |
| "loss": 1.5246198177337646, |
| "step": 18944 |
| }, |
| { |
| "epoch": 0.8867950672024387, |
| "grad_norm": 0.14956603944301605, |
| "learning_rate": 3.283224480455282e-05, |
| "loss": 1.524648666381836, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "grad_norm": 0.1354081630706787, |
| "learning_rate": 2.639230671387627e-05, |
| "loss": 1.5228582620620728, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.4183324677486942, |
| "eval_loss": 1.4691238653714254, |
| "eval_mse_loss": 1.2599576338785423, |
| "flow/cos_sim": 0.5816675356533973, |
| "flow/improvement_ratio": 0.9630278509501453, |
| "flow/mag_ratio_mean": 0.5622789722327228, |
| "flow/mag_ratio_std": 0.21166673544198955, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.4183324677486942, |
| "eval_loss": 1.4691238653714254, |
| "eval_mse_loss": 1.2599576338785423, |
| "eval_runtime": 38.8176, |
| "eval_samples_per_second": 721.141, |
| "eval_steps_per_second": 11.284, |
| "flow/cos_sim": 0.5816675356533973, |
| "flow/improvement_ratio": 0.9630278509501453, |
| "flow/mag_ratio_mean": 0.5622789722327228, |
| "flow/mag_ratio_std": 0.21166673544198955, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.9104429356611704, |
| "grad_norm": 0.11552918702363968, |
| "learning_rate": 2.063782080083576e-05, |
| "loss": 1.5217269659042358, |
| "step": 19712 |
| }, |
| { |
| "epoch": 0.9222668698905362, |
| "grad_norm": 0.14077740907669067, |
| "learning_rate": 1.557711553001523e-05, |
| "loss": 1.5255944728851318, |
| "step": 19968 |
| }, |
| { |
| "epoch": 0.9340908041199021, |
| "grad_norm": 0.11086593568325043, |
| "learning_rate": 1.1217515257622269e-05, |
| "loss": 1.5237233638763428, |
| "step": 20224 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "grad_norm": 0.1408848911523819, |
| "learning_rate": 7.565329630950746e-06, |
| "loss": 1.5244028568267822, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.41753342645625546, |
| "eval_loss": 1.4667748912284364, |
| "eval_mse_loss": 1.2580081817766302, |
| "flow/cos_sim": 0.5824665990594315, |
| "flow/improvement_ratio": 0.9622961689620257, |
| "flow/mag_ratio_mean": 0.5584562299730571, |
| "flow/mag_ratio_std": 0.2129541815483951, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.41753342645625546, |
| "eval_loss": 1.4667748912284364, |
| "eval_mse_loss": 1.2580081817766302, |
| "eval_runtime": 38.5143, |
| "eval_samples_per_second": 726.82, |
| "eval_steps_per_second": 11.372, |
| "flow/cos_sim": 0.5824665990594315, |
| "flow/improvement_ratio": 0.9622961689620257, |
| "flow/mag_ratio_mean": 0.5584562299730571, |
| "flow/mag_ratio_std": 0.2129541815483951, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.9577386725786338, |
| "grad_norm": 0.11133582890033722, |
| "learning_rate": 4.62584445643166e-06, |
| "loss": 1.5237617492675781, |
| "step": 20736 |
| }, |
| { |
| "epoch": 0.9695626068079997, |
| "grad_norm": 0.13919079303741455, |
| "learning_rate": 2.40331404948807e-06, |
| "loss": 1.5243273973464966, |
| "step": 20992 |
| }, |
| { |
| "epoch": 0.9813865410373654, |
| "grad_norm": 0.10569830238819122, |
| "learning_rate": 9.009550772663965e-07, |
| "loss": 1.524424433708191, |
| "step": 21248 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "grad_norm": 0.10687630623579025, |
| "learning_rate": 1.2094190315575791e-07, |
| "loss": 1.5235631465911865, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.4184109731896283, |
| "eval_loss": 1.4685546744904017, |
| "eval_mse_loss": 1.25934919020901, |
| "flow/cos_sim": 0.5815890253134514, |
| "flow/improvement_ratio": 0.9625635148455564, |
| "flow/mag_ratio_mean": 0.5583232171731453, |
| "flow/mag_ratio_std": 0.2131242363216126, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.4184109731896283, |
| "eval_loss": 1.4685546744904017, |
| "eval_mse_loss": 1.25934919020901, |
| "eval_runtime": 37.8094, |
| "eval_samples_per_second": 740.371, |
| "eval_steps_per_second": 11.584, |
| "flow/cos_sim": 0.5815890253134514, |
| "flow/improvement_ratio": 0.9625635148455564, |
| "flow/mag_ratio_mean": 0.5583232171731453, |
| "flow/mag_ratio_std": 0.2131242363216126, |
| "step": 21504 |
| } |
| ], |
| "logging_steps": 256, |
| "max_steps": 21651, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1024, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|