| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9932104752667313, |
| "eval_steps": 1024, |
| "global_step": 21504, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011823934229365849, |
| "grad_norm": 0.30032017827033997, |
| "learning_rate": 0.000498046875, |
| "loss": 1.8424009084701538, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.023647868458731697, |
| "grad_norm": 0.48476606607437134, |
| "learning_rate": 0.000998046875, |
| "loss": 1.3167253732681274, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.03547180268809755, |
| "grad_norm": 0.46512553095817566, |
| "learning_rate": 0.000999640996023194, |
| "loss": 1.0889508724212646, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "grad_norm": 0.3862663209438324, |
| "learning_rate": 0.0009985588674043958, |
| "loss": 0.9328931570053101, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.3909751661972368, |
| "eval_loss": 0.859363008171456, |
| "eval_mse_loss": 0.859363008171456, |
| "flow/cos_sim": 0.6090248374089803, |
| "flow/improvement_ratio": 0.96680837529435, |
| "flow/mag_ratio_mean": 0.6285397670312559, |
| "flow/mag_ratio_std": 0.2260889009865996, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.3909751661972368, |
| "eval_loss": 0.859363008171456, |
| "eval_mse_loss": 0.859363008171456, |
| "eval_runtime": 27.6707, |
| "eval_samples_per_second": 1011.649, |
| "eval_steps_per_second": 15.829, |
| "flow/cos_sim": 0.6090248374089803, |
| "flow/improvement_ratio": 0.96680837529435, |
| "flow/mag_ratio_mean": 0.6285397670312559, |
| "flow/mag_ratio_std": 0.2260889009865996, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.05911967114682925, |
| "grad_norm": 0.3437928557395935, |
| "learning_rate": 0.0009967551747861387, |
| "loss": 0.7950779795646667, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.0709436053761951, |
| "grad_norm": 0.4011635482311249, |
| "learning_rate": 0.000994232528651847, |
| "loss": 0.6695980429649353, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.08276753960556095, |
| "grad_norm": 0.49726057052612305, |
| "learning_rate": 0.0009909945800260092, |
| "loss": 0.5587890148162842, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "grad_norm": 0.29605454206466675, |
| "learning_rate": 0.0009870460151900522, |
| "loss": 0.4556920826435089, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.21293426390107908, |
| "eval_loss": 0.4076748799243474, |
| "eval_mse_loss": 0.4076748799243474, |
| "flow/cos_sim": 0.7870657346020006, |
| "flow/improvement_ratio": 0.9822775999160662, |
| "flow/mag_ratio_mean": 0.7956366545805648, |
| "flow/mag_ratio_std": 0.18839361525425627, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.21293426390107908, |
| "eval_loss": 0.4076748799243474, |
| "eval_mse_loss": 0.4076748799243474, |
| "eval_runtime": 27.9237, |
| "eval_samples_per_second": 1002.48, |
| "eval_steps_per_second": 15.686, |
| "flow/cos_sim": 0.7870657346020006, |
| "flow/improvement_ratio": 0.9822775999160662, |
| "flow/mag_ratio_mean": 0.7956366545805648, |
| "flow/mag_ratio_std": 0.18839361525425627, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.10641540806429264, |
| "grad_norm": 0.37945282459259033, |
| "learning_rate": 0.0009823925488998885, |
| "loss": 0.3644663393497467, |
| "step": 2304 |
| }, |
| { |
| "epoch": 0.1182393422936585, |
| "grad_norm": 0.2784593999385834, |
| "learning_rate": 0.0009770409161149525, |
| "loss": 0.2877335250377655, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.13006327652302435, |
| "grad_norm": 0.28011366724967957, |
| "learning_rate": 0.0009709988622506973, |
| "loss": 0.22570422291755676, |
| "step": 2816 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "grad_norm": 0.16544577479362488, |
| "learning_rate": 0.000964275131968659, |
| "loss": 0.18250930309295654, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.08555437008675919, |
| "eval_loss": 0.16474426010427953, |
| "eval_mse_loss": 0.16474426010427953, |
| "flow/cos_sim": 0.9144456284503414, |
| "flow/improvement_ratio": 0.9908438801221108, |
| "flow/mag_ratio_mean": 0.9158023052836117, |
| "flow/mag_ratio_std": 0.12712571353196553, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.08555437008675919, |
| "eval_loss": 0.16474426010427953, |
| "eval_mse_loss": 0.16474426010427953, |
| "eval_runtime": 27.5022, |
| "eval_samples_per_second": 1017.847, |
| "eval_steps_per_second": 15.926, |
| "flow/cos_sim": 0.9144456284503414, |
| "flow/improvement_ratio": 0.9908438801221108, |
| "flow/mag_ratio_mean": 0.9158023052836117, |
| "flow/mag_ratio_std": 0.12712571353196553, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.15371114498175603, |
| "grad_norm": 0.6755663752555847, |
| "learning_rate": 0.0009568794565203123, |
| "loss": 0.15251992642879486, |
| "step": 3328 |
| }, |
| { |
| "epoch": 0.1655350792111219, |
| "grad_norm": 1.6686415672302246, |
| "learning_rate": 0.0009488225396630347, |
| "loss": 0.13442441821098328, |
| "step": 3584 |
| }, |
| { |
| "epoch": 0.17735901344048774, |
| "grad_norm": 0.1821625530719757, |
| "learning_rate": 0.0009401160421685646, |
| "loss": 0.12373481690883636, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "grad_norm": 0.14941175282001495, |
| "learning_rate": 0.0009307725649463714, |
| "loss": 0.11532580107450485, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.05926439563263225, |
| "eval_loss": 0.11456974646816515, |
| "eval_mse_loss": 0.11456974646816515, |
| "flow/cos_sim": 0.9407356054815528, |
| "flow/improvement_ratio": 0.9921626927645784, |
| "flow/mag_ratio_mean": 0.942024191764936, |
| "flow/mag_ratio_std": 0.09949292650778, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.05926439563263225, |
| "eval_loss": 0.11456974646816515, |
| "eval_mse_loss": 0.11456974646816515, |
| "eval_runtime": 28.5456, |
| "eval_samples_per_second": 980.64, |
| "eval_steps_per_second": 15.344, |
| "flow/cos_sim": 0.9407356054815528, |
| "flow/improvement_ratio": 0.9921626927645784, |
| "flow/mag_ratio_mean": 0.942024191764936, |
| "flow/mag_ratio_std": 0.09949292650778, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.20100688189921945, |
| "grad_norm": 0.18909402191638947, |
| "learning_rate": 0.0009208056308063659, |
| "loss": 0.10999151319265366, |
| "step": 4352 |
| }, |
| { |
| "epoch": 0.2128308161285853, |
| "grad_norm": 0.40145087242126465, |
| "learning_rate": 0.0009102296648873445, |
| "loss": 0.10512682795524597, |
| "step": 4608 |
| }, |
| { |
| "epoch": 0.22465475035795113, |
| "grad_norm": 0.19676516950130463, |
| "learning_rate": 0.0008990599737794927, |
| "loss": 0.10141732543706894, |
| "step": 4864 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "grad_norm": 0.17175287008285522, |
| "learning_rate": 0.0008873127233711644, |
| "loss": 0.09812680631875992, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.050695853061191565, |
| "eval_loss": 0.09832691372325432, |
| "eval_mse_loss": 0.09832691372325432, |
| "flow/cos_sim": 0.9493041464455052, |
| "flow/improvement_ratio": 0.9926231454496515, |
| "flow/mag_ratio_mean": 0.9585911660161737, |
| "flow/mag_ratio_std": 0.08434220563331152, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.050695853061191565, |
| "eval_loss": 0.09832691372325432, |
| "eval_mse_loss": 0.09832691372325432, |
| "eval_runtime": 27.3575, |
| "eval_samples_per_second": 1023.228, |
| "eval_steps_per_second": 16.01, |
| "flow/cos_sim": 0.9493041464455052, |
| "flow/improvement_ratio": 0.9926231454496515, |
| "flow/mag_ratio_mean": 0.9585911660161737, |
| "flow/mag_ratio_std": 0.08434220563331152, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.24830261881668284, |
| "grad_norm": 0.13017332553863525, |
| "learning_rate": 0.0008750049154520011, |
| "loss": 0.09549019485712051, |
| "step": 5376 |
| }, |
| { |
| "epoch": 0.2601265530460487, |
| "grad_norm": 0.12280620634555817, |
| "learning_rate": 0.0008621543631062487, |
| "loss": 0.09369617700576782, |
| "step": 5632 |
| }, |
| { |
| "epoch": 0.27195048727541454, |
| "grad_norm": 0.1377657949924469, |
| "learning_rate": 0.0008487796649318904, |
| "loss": 0.09171723574399948, |
| "step": 5888 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "grad_norm": 0.13840267062187195, |
| "learning_rate": 0.0008349001781229053, |
| "loss": 0.0928829088807106, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.04612090527058736, |
| "eval_loss": 0.08980049764456814, |
| "eval_mse_loss": 0.08980049764456814, |
| "flow/cos_sim": 0.9538790959201447, |
| "flow/improvement_ratio": 0.9942120166674052, |
| "flow/mag_ratio_mean": 0.9548581241200503, |
| "flow/mag_ratio_std": 0.07417129391812843, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.04612090527058736, |
| "eval_loss": 0.08980049764456814, |
| "eval_mse_loss": 0.08980049764456814, |
| "eval_runtime": 27.8042, |
| "eval_samples_per_second": 1006.79, |
| "eval_steps_per_second": 15.753, |
| "flow/cos_sim": 0.9538790959201447, |
| "flow/improvement_ratio": 0.9942120166674052, |
| "flow/mag_ratio_mean": 0.9548581241200503, |
| "flow/mag_ratio_std": 0.07417129391812843, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2955983557341462, |
| "grad_norm": 0.10361608117818832, |
| "learning_rate": 0.0008205359904536107, |
| "loss": 0.08833561092615128, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.30742228996351206, |
| "grad_norm": 0.40116435289382935, |
| "learning_rate": 0.0008057078912056363, |
| "loss": 0.08737602084875107, |
| "step": 6656 |
| }, |
| { |
| "epoch": 0.3192462241928779, |
| "grad_norm": 0.12478947639465332, |
| "learning_rate": 0.0007904373410796086, |
| "loss": 0.08542584627866745, |
| "step": 6912 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "grad_norm": 0.4613255560398102, |
| "learning_rate": 0.0007747464411350876, |
| "loss": 0.08413137495517731, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.042676221596323734, |
| "eval_loss": 0.08318097989785073, |
| "eval_mse_loss": 0.08318097989785073, |
| "flow/cos_sim": 0.9573237789820318, |
| "flow/improvement_ratio": 0.9943706639553314, |
| "flow/mag_ratio_mean": 0.9544149404519224, |
| "flow/mag_ratio_std": 0.06704916739538652, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.042676221596323734, |
| "eval_loss": 0.08318097989785073, |
| "eval_mse_loss": 0.08318097989785073, |
| "eval_runtime": 28.2024, |
| "eval_samples_per_second": 992.576, |
| "eval_steps_per_second": 15.531, |
| "flow/cos_sim": 0.9573237789820318, |
| "flow/improvement_ratio": 0.9943706639553314, |
| "flow/mag_ratio_mean": 0.9544149404519224, |
| "flow/mag_ratio_std": 0.06704916739538652, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.34289409265160964, |
| "grad_norm": 0.3934769928455353, |
| "learning_rate": 0.000758657900803716, |
| "loss": 0.08310537040233612, |
| "step": 7424 |
| }, |
| { |
| "epoch": 0.3547180268809755, |
| "grad_norm": 0.36915773153305054, |
| "learning_rate": 0.000742195005021869, |
| "loss": 0.08330061286687851, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.3665419611103413, |
| "grad_norm": 0.11406463384628296, |
| "learning_rate": 0.0007253815805303786, |
| "loss": 0.08167832344770432, |
| "step": 7936 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "grad_norm": 0.10967294871807098, |
| "learning_rate": 0.0007082419613901028, |
| "loss": 0.08145447075366974, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.041529001344896885, |
| "eval_loss": 0.08086973952648302, |
| "eval_mse_loss": 0.08086973952648302, |
| "flow/cos_sim": 0.9584709986976293, |
| "flow/improvement_ratio": 0.9945931761232141, |
| "flow/mag_ratio_mean": 0.9566228298563936, |
| "flow/mag_ratio_std": 0.06226046868178942, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.041529001344896885, |
| "eval_loss": 0.08086973952648302, |
| "eval_mse_loss": 0.08086973952648302, |
| "eval_runtime": 27.7752, |
| "eval_samples_per_second": 1007.843, |
| "eval_steps_per_second": 15.769, |
| "flow/cos_sim": 0.9584709986976293, |
| "flow/improvement_ratio": 0.9945931761232141, |
| "flow/mag_ratio_mean": 0.9566228298563936, |
| "flow/mag_ratio_std": 0.06226046868178942, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.390189829569073, |
| "grad_norm": 0.25795963406562805, |
| "learning_rate": 0.0006908009537632514, |
| "loss": 0.0798477828502655, |
| "step": 8448 |
| }, |
| { |
| "epoch": 0.4020137637984389, |
| "grad_norm": 0.1671624779701233, |
| "learning_rate": 0.0006730838000114403, |
| "loss": 0.079444020986557, |
| "step": 8704 |
| }, |
| { |
| "epoch": 0.41383769802780473, |
| "grad_norm": 0.48908066749572754, |
| "learning_rate": 0.0006551161421624341, |
| "loss": 0.0788843110203743, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "grad_norm": 0.4118134677410126, |
| "learning_rate": 0.0006369239847984517, |
| "loss": 0.07910227030515671, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.04017602187075299, |
| "eval_loss": 0.07839778928111678, |
| "eval_mse_loss": 0.07839778928111678, |
| "flow/cos_sim": 0.9598239789542542, |
| "flow/improvement_ratio": 0.9939218965023076, |
| "flow/mag_ratio_mean": 0.9669629585797384, |
| "flow/mag_ratio_std": 0.05801496024511448, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.04017602187075299, |
| "eval_loss": 0.07839778928111678, |
| "eval_mse_loss": 0.07839778928111678, |
| "eval_runtime": 27.7311, |
| "eval_samples_per_second": 1009.444, |
| "eval_steps_per_second": 15.795, |
| "flow/cos_sim": 0.9598239789542542, |
| "flow/improvement_ratio": 0.9939218965023076, |
| "flow/mag_ratio_mean": 0.9669629585797384, |
| "flow/mag_ratio_std": 0.05801496024511448, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4374855664865364, |
| "grad_norm": 0.2151251882314682, |
| "learning_rate": 0.0006185336574197479, |
| "loss": 0.07854308187961578, |
| "step": 9472 |
| }, |
| { |
| "epoch": 0.44930950071590225, |
| "grad_norm": 0.06261777877807617, |
| "learning_rate": 0.0005999717763379407, |
| "loss": 0.07761576026678085, |
| "step": 9728 |
| }, |
| { |
| "epoch": 0.4611334349452681, |
| "grad_norm": 0.4810551404953003, |
| "learning_rate": 0.0005812652061542363, |
| "loss": 0.0762307196855545, |
| "step": 9984 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "grad_norm": 0.2286372035741806, |
| "learning_rate": 0.0005624410208783071, |
| "loss": 0.07773169875144958, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.039269575551492436, |
| "eval_loss": 0.07653046936885408, |
| "eval_mse_loss": 0.07653046936885408, |
| "flow/cos_sim": 0.9607304255711978, |
| "flow/improvement_ratio": 0.9943927899343238, |
| "flow/mag_ratio_mean": 0.9607061044266235, |
| "flow/mag_ratio_std": 0.055606617375391805, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.039269575551492436, |
| "eval_loss": 0.07653046936885408, |
| "eval_mse_loss": 0.07653046936885408, |
| "eval_runtime": 27.9635, |
| "eval_samples_per_second": 1001.054, |
| "eval_steps_per_second": 15.663, |
| "flow/cos_sim": 0.9607304255711978, |
| "flow/improvement_ratio": 0.9943927899343238, |
| "flow/mag_ratio_mean": 0.9607061044266235, |
| "flow/mag_ratio_std": 0.055606617375391805, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.48478130340399983, |
| "grad_norm": 0.11119114607572556, |
| "learning_rate": 0.0005435264647440881, |
| "loss": 0.07176047563552856, |
| "step": 10496 |
| }, |
| { |
| "epoch": 0.49660523763336567, |
| "grad_norm": 0.345055490732193, |
| "learning_rate": 0.000524548912779213, |
| "loss": 0.056456491351127625, |
| "step": 10752 |
| }, |
| { |
| "epoch": 0.5084291718627315, |
| "grad_norm": 0.21917253732681274, |
| "learning_rate": 0.0005055358311851499, |
| "loss": 0.053561244159936905, |
| "step": 11008 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "grad_norm": 0.21707729995250702, |
| "learning_rate": 0.0004865147375853812, |
| "loss": 0.05172141268849373, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.026124844331821623, |
| "eval_loss": 0.0521071719082251, |
| "eval_mse_loss": 0.0521071719082251, |
| "flow/cos_sim": 0.9738751555958839, |
| "flow/improvement_ratio": 0.9952182877281485, |
| "flow/mag_ratio_mean": 0.9838547212620304, |
| "flow/mag_ratio_std": 0.058393157618843256, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.026124844331821623, |
| "eval_loss": 0.0521071719082251, |
| "eval_mse_loss": 0.0521071719082251, |
| "eval_runtime": 27.9732, |
| "eval_samples_per_second": 1000.709, |
| "eval_steps_per_second": 15.658, |
| "flow/cos_sim": 0.9738751555958839, |
| "flow/improvement_ratio": 0.9952182877281485, |
| "flow/mag_ratio_mean": 0.9838547212620304, |
| "flow/mag_ratio_std": 0.058393157618843256, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5320770403214632, |
| "grad_norm": 0.12577901780605316, |
| "learning_rate": 0.0004675131611991607, |
| "loss": 0.05032345652580261, |
| "step": 11520 |
| }, |
| { |
| "epoch": 0.5439009745508291, |
| "grad_norm": 0.34826231002807617, |
| "learning_rate": 0.0004485586029984899, |
| "loss": 0.049038730561733246, |
| "step": 11776 |
| }, |
| { |
| "epoch": 0.5557249087801949, |
| "grad_norm": 0.44762158393859863, |
| "learning_rate": 0.00042967849590597266, |
| "loss": 0.048636652529239655, |
| "step": 12032 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "grad_norm": 0.27833688259124756, |
| "learning_rate": 0.0004109001650911621, |
| "loss": 0.04764322564005852, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.024070425779818126, |
| "eval_loss": 0.04753756304547939, |
| "eval_mse_loss": 0.04753756304547939, |
| "flow/cos_sim": 0.9759295767845084, |
| "flow/improvement_ratio": 0.9942291939639609, |
| "flow/mag_ratio_mean": 0.9780929656061408, |
| "flow/mag_ratio_std": 0.05394861750202636, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.024070425779818126, |
| "eval_loss": 0.04753756304547939, |
| "eval_mse_loss": 0.04753756304547939, |
| "eval_runtime": 27.6721, |
| "eval_samples_per_second": 1011.595, |
| "eval_steps_per_second": 15.828, |
| "flow/cos_sim": 0.9759295767845084, |
| "flow/improvement_ratio": 0.9942291939639609, |
| "flow/mag_ratio_mean": 0.9780929656061408, |
| "flow/mag_ratio_std": 0.05394861750202636, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5793727772389267, |
| "grad_norm": 0.17133653163909912, |
| "learning_rate": 0.0003922507884228551, |
| "loss": 0.04732891544699669, |
| "step": 12544 |
| }, |
| { |
| "epoch": 0.5911967114682924, |
| "grad_norm": 0.08853697776794434, |
| "learning_rate": 0.00037375735713457723, |
| "loss": 0.04574156925082207, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.6030206456976583, |
| "grad_norm": 0.14900043606758118, |
| "learning_rate": 0.00035544663676018276, |
| "loss": 0.0459522120654583, |
| "step": 13056 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "grad_norm": 0.10938149690628052, |
| "learning_rate": 0.00033734512839611255, |
| "loss": 0.04567023739218712, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.022640768034579276, |
| "eval_loss": 0.044894084334373474, |
| "eval_mse_loss": 0.044894084334373474, |
| "flow/cos_sim": 0.977359231884621, |
| "flow/improvement_ratio": 0.9948612654318004, |
| "flow/mag_ratio_mean": 0.9749562089846014, |
| "flow/mag_ratio_std": 0.049488119823543446, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.022640768034579276, |
| "eval_loss": 0.044894084334373474, |
| "eval_mse_loss": 0.044894084334373474, |
| "eval_runtime": 28.6172, |
| "eval_samples_per_second": 978.188, |
| "eval_steps_per_second": 15.305, |
| "flow/cos_sim": 0.977359231884621, |
| "flow/improvement_ratio": 0.9948612654318004, |
| "flow/mag_ratio_mean": 0.9749562089846014, |
| "flow/mag_ratio_std": 0.049488119823543446, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.62666851415639, |
| "grad_norm": 0.23983722925186157, |
| "learning_rate": 0.0003194790303463687, |
| "loss": 0.04453244432806969, |
| "step": 13568 |
| }, |
| { |
| "epoch": 0.6384924483857558, |
| "grad_norm": 0.08603578805923462, |
| "learning_rate": 0.00030187420020572406, |
| "loss": 0.04426493123173714, |
| "step": 13824 |
| }, |
| { |
| "epoch": 0.6503163826151217, |
| "grad_norm": 0.39926600456237793, |
| "learning_rate": 0.00028455611743603626, |
| "loss": 0.04409552738070488, |
| "step": 14080 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "grad_norm": 0.6597372889518738, |
| "learning_rate": 0.0002675498464898373, |
| "loss": 0.04343748837709427, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.022019475217925633, |
| "eval_loss": 0.04345123091346869, |
| "eval_mse_loss": 0.04345123091346869, |
| "flow/cos_sim": 0.9779805275675368, |
| "flow/improvement_ratio": 0.9940928038396791, |
| "flow/mag_ratio_mean": 0.9784686472862278, |
| "flow/mag_ratio_std": 0.04926606593440929, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.022019475217925633, |
| "eval_loss": 0.04345123091346869, |
| "eval_mse_loss": 0.04345123091346869, |
| "eval_runtime": 29.1857, |
| "eval_samples_per_second": 959.135, |
| "eval_steps_per_second": 15.007, |
| "flow/cos_sim": 0.9779805275675368, |
| "flow/improvement_ratio": 0.9940928038396791, |
| "flow/mag_ratio_mean": 0.9784686472862278, |
| "flow/mag_ratio_std": 0.04926606593440929, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6739642510738534, |
| "grad_norm": 0.5350319743156433, |
| "learning_rate": 0.0002508800005345623, |
| "loss": 0.042832110077142715, |
| "step": 14592 |
| }, |
| { |
| "epoch": 0.6857881853032193, |
| "grad_norm": 0.18352548778057098, |
| "learning_rate": 0.00023457070582992562, |
| "loss": 0.042873408645391464, |
| "step": 14848 |
| }, |
| { |
| "epoch": 0.6976121195325851, |
| "grad_norm": 0.06461436301469803, |
| "learning_rate": 0.00021864556680999692, |
| "loss": 0.0426117368042469, |
| "step": 15104 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "grad_norm": 0.07462433725595474, |
| "learning_rate": 0.0002031276319205152, |
| "loss": 0.04218180850148201, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.021368508580136517, |
| "eval_loss": 0.04225278059743583, |
| "eval_mse_loss": 0.04225278059743583, |
| "flow/cos_sim": 0.9786314918025988, |
| "flow/improvement_ratio": 0.9940550222516604, |
| "flow/mag_ratio_mean": 0.9777582721895279, |
| "flow/mag_ratio_std": 0.0456332682916835, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.021368508580136517, |
| "eval_loss": 0.04225278059743583, |
| "eval_mse_loss": 0.04225278059743583, |
| "eval_runtime": 28.5705, |
| "eval_samples_per_second": 979.785, |
| "eval_steps_per_second": 15.33, |
| "flow/cos_sim": 0.9786314918025988, |
| "flow/improvement_ratio": 0.9940550222516604, |
| "flow/mag_ratio_mean": 0.9777582721895279, |
| "flow/mag_ratio_std": 0.0456332682916835, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.7212599879913169, |
| "grad_norm": 0.04815741628408432, |
| "learning_rate": 0.00018803936026088542, |
| "loss": 0.04143794998526573, |
| "step": 15616 |
| }, |
| { |
| "epoch": 0.7330839222206826, |
| "grad_norm": 0.07569579780101776, |
| "learning_rate": 0.00017340258907913464, |
| "loss": 0.04184219613671303, |
| "step": 15872 |
| }, |
| { |
| "epoch": 0.7449078564500485, |
| "grad_norm": 0.19264720380306244, |
| "learning_rate": 0.0001592385021668743, |
| "loss": 0.041615359485149384, |
| "step": 16128 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "grad_norm": 0.09954190999269485, |
| "learning_rate": 0.0001455675992000087, |
| "loss": 0.04146020486950874, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.020705404857250108, |
| "eval_loss": 0.04112391300567481, |
| "eval_mse_loss": 0.04112391300567481, |
| "flow/cos_sim": 0.979294594292227, |
| "flow/improvement_ratio": 0.9948036799964295, |
| "flow/mag_ratio_mean": 0.9830839078720301, |
| "flow/mag_ratio_std": 0.04493439051250345, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.020705404857250108, |
| "eval_loss": 0.04112391300567481, |
| "eval_mse_loss": 0.04112391300567481, |
| "eval_runtime": 27.7293, |
| "eval_samples_per_second": 1009.511, |
| "eval_steps_per_second": 15.796, |
| "flow/cos_sim": 0.979294594292227, |
| "flow/improvement_ratio": 0.9948036799964295, |
| "flow/mag_ratio_mean": 0.9830839078720301, |
| "flow/mag_ratio_std": 0.04493439051250345, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7685557249087802, |
| "grad_norm": 0.08436273038387299, |
| "learning_rate": 0.000132409666069565, |
| "loss": 0.041200295090675354, |
| "step": 16640 |
| }, |
| { |
| "epoch": 0.780379659138146, |
| "grad_norm": 0.15963269770145416, |
| "learning_rate": 0.0001197837462455823, |
| "loss": 0.04084719717502594, |
| "step": 16896 |
| }, |
| { |
| "epoch": 0.7922035933675119, |
| "grad_norm": 0.11329685896635056, |
| "learning_rate": 0.00010770811321550749, |
| "loss": 0.04111889377236366, |
| "step": 17152 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "grad_norm": 0.06041397154331207, |
| "learning_rate": 9.620024403698591e-05, |
| "loss": 0.04088206961750984, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.02060152174369113, |
| "eval_loss": 0.040920868566999696, |
| "eval_mse_loss": 0.040920868566999696, |
| "flow/cos_sim": 0.9793984800168912, |
| "flow/improvement_ratio": 0.9943572009262973, |
| "flow/mag_ratio_mean": 0.9805345156965735, |
| "flow/mag_ratio_std": 0.04472058159235406, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.02060152174369113, |
| "eval_loss": 0.040920868566999696, |
| "eval_mse_loss": 0.040920868566999696, |
| "eval_runtime": 28.2966, |
| "eval_samples_per_second": 989.272, |
| "eval_steps_per_second": 15.479, |
| "flow/cos_sim": 0.9793984800168912, |
| "flow/improvement_ratio": 0.9943572009262973, |
| "flow/mag_ratio_mean": 0.9805345156965735, |
| "flow/mag_ratio_std": 0.04472058159235406, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8158514618262436, |
| "grad_norm": 0.412060409784317, |
| "learning_rate": 8.527679404332429e-05, |
| "loss": 0.04045189172029495, |
| "step": 17664 |
| }, |
| { |
| "epoch": 0.8276753960556095, |
| "grad_norm": 0.0535939484834671, |
| "learning_rate": 7.495357273823544e-05, |
| "loss": 0.040322255343198776, |
| "step": 17920 |
| }, |
| { |
| "epoch": 0.8394993302849753, |
| "grad_norm": 0.05492197349667549, |
| "learning_rate": 6.524552091475183e-05, |
| "loss": 0.04015512391924858, |
| "step": 18176 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "grad_norm": 1.343531847000122, |
| "learning_rate": 5.6166689031422024e-05, |
| "loss": 0.04034010320901871, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.02017251738096209, |
| "eval_loss": 0.040136819041959225, |
| "eval_mse_loss": 0.040136819041959225, |
| "flow/cos_sim": 0.9798274840394111, |
| "flow/improvement_ratio": 0.9949594466653588, |
| "flow/mag_ratio_mean": 0.9830197368038299, |
| "flow/mag_ratio_std": 0.04276561933106075, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.02017251738096209, |
| "eval_loss": 0.040136819041959225, |
| "eval_mse_loss": 0.040136819041959225, |
| "eval_runtime": 28.9254, |
| "eval_samples_per_second": 967.764, |
| "eval_steps_per_second": 15.142, |
| "flow/cos_sim": 0.9798274840394111, |
| "flow/improvement_ratio": 0.9949594466653588, |
| "flow/mag_ratio_mean": 0.9830197368038299, |
| "flow/mag_ratio_std": 0.04276561933106075, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8631471987437069, |
| "grad_norm": 0.07045256346464157, |
| "learning_rate": 4.773021687709067e-05, |
| "loss": 0.04022738337516785, |
| "step": 18688 |
| }, |
| { |
| "epoch": 0.8749711329730728, |
| "grad_norm": 0.04534813016653061, |
| "learning_rate": 3.994831455368719e-05, |
| "loss": 0.040090736001729965, |
| "step": 18944 |
| }, |
| { |
| "epoch": 0.8867950672024387, |
| "grad_norm": 0.12301895767450333, |
| "learning_rate": 3.283224480455282e-05, |
| "loss": 0.03993465378880501, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "grad_norm": 0.030255919322371483, |
| "learning_rate": 2.639230671387627e-05, |
| "loss": 0.03986474499106407, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.01997290319894955, |
| "eval_loss": 0.03975143301466556, |
| "eval_mse_loss": 0.03975143301466556, |
| "flow/cos_sim": 0.9800270978174254, |
| "flow/improvement_ratio": 0.9948155708508949, |
| "flow/mag_ratio_mean": 0.9813638247855722, |
| "flow/mag_ratio_std": 0.0424358811586687, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.01997290319894955, |
| "eval_loss": 0.03975143301466556, |
| "eval_mse_loss": 0.03975143301466556, |
| "eval_runtime": 29.2292, |
| "eval_samples_per_second": 957.708, |
| "eval_steps_per_second": 14.985, |
| "flow/cos_sim": 0.9800270978174254, |
| "flow/improvement_ratio": 0.9948155708508949, |
| "flow/mag_ratio_mean": 0.9813638247855722, |
| "flow/mag_ratio_std": 0.0424358811586687, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.9104429356611704, |
| "grad_norm": 2.355804681777954, |
| "learning_rate": 2.063782080083576e-05, |
| "loss": 0.03972519189119339, |
| "step": 19712 |
| }, |
| { |
| "epoch": 0.9222668698905362, |
| "grad_norm": 0.06069854274392128, |
| "learning_rate": 1.557711553001523e-05, |
| "loss": 0.039879463613033295, |
| "step": 19968 |
| }, |
| { |
| "epoch": 0.9340908041199021, |
| "grad_norm": 0.029576338827610016, |
| "learning_rate": 1.1217515257622269e-05, |
| "loss": 0.039681050926446915, |
| "step": 20224 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "grad_norm": 0.05208945646882057, |
| "learning_rate": 7.565329630950746e-06, |
| "loss": 0.03949534893035889, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.019900511831181234, |
| "eval_loss": 0.03963183302041058, |
| "eval_mse_loss": 0.03963183302041058, |
| "flow/cos_sim": 0.9800994916049313, |
| "flow/improvement_ratio": 0.9955979625111846, |
| "flow/mag_ratio_mean": 0.9811528873498037, |
| "flow/mag_ratio_std": 0.04229976768397033, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.019900511831181234, |
| "eval_loss": 0.03963183302041058, |
| "eval_mse_loss": 0.03963183302041058, |
| "eval_runtime": 29.2534, |
| "eval_samples_per_second": 956.914, |
| "eval_steps_per_second": 14.973, |
| "flow/cos_sim": 0.9800994916049313, |
| "flow/improvement_ratio": 0.9955979625111846, |
| "flow/mag_ratio_mean": 0.9811528873498037, |
| "flow/mag_ratio_std": 0.04229976768397033, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.9577386725786338, |
| "grad_norm": 0.08807196468114853, |
| "learning_rate": 4.62584445643166e-06, |
| "loss": 0.039531927555799484, |
| "step": 20736 |
| }, |
| { |
| "epoch": 0.9695626068079997, |
| "grad_norm": 0.026826005429029465, |
| "learning_rate": 2.40331404948807e-06, |
| "loss": 0.03951640799641609, |
| "step": 20992 |
| }, |
| { |
| "epoch": 0.9813865410373654, |
| "grad_norm": 0.06786994636058807, |
| "learning_rate": 9.009550772663965e-07, |
| "loss": 0.03962159901857376, |
| "step": 21248 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "grad_norm": 0.05299762263894081, |
| "learning_rate": 1.2094190315575791e-07, |
| "loss": 0.03955233097076416, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.019904947206037773, |
| "eval_loss": 0.03966651372101209, |
| "eval_mse_loss": 0.03966651372101209, |
| "flow/cos_sim": 0.9800950555496564, |
| "flow/improvement_ratio": 0.9943993097026599, |
| "flow/mag_ratio_mean": 0.9814930751443454, |
| "flow/mag_ratio_std": 0.042273162626002204, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.019904947206037773, |
| "eval_loss": 0.03966651372101209, |
| "eval_mse_loss": 0.03966651372101209, |
| "eval_runtime": 29.0571, |
| "eval_samples_per_second": 963.378, |
| "eval_steps_per_second": 15.074, |
| "flow/cos_sim": 0.9800950555496564, |
| "flow/improvement_ratio": 0.9943993097026599, |
| "flow/mag_ratio_mean": 0.9814930751443454, |
| "flow/mag_ratio_std": 0.042273162626002204, |
| "step": 21504 |
| } |
| ], |
| "logging_steps": 256, |
| "max_steps": 21651, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1024, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|