| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9932104752667313, |
| "eval_steps": 1024, |
| "global_step": 21504, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011823934229365849, |
| "grad_norm": 0.13531190156936646, |
| "learning_rate": 0.0002490234375, |
| "loss": 1.4445146322250366, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.023647868458731697, |
| "grad_norm": 0.1410810798406601, |
| "learning_rate": 0.0004990234375, |
| "loss": 1.3425910472869873, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.03547180268809755, |
| "grad_norm": 0.11418232321739197, |
| "learning_rate": 0.000499820498011597, |
| "loss": 1.3074675798416138, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "grad_norm": 0.1378929316997528, |
| "learning_rate": 0.0004992794337021979, |
| "loss": 1.2896003723144531, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.44286489105660076, |
| "eval_loss": 1.221002193074248, |
| "eval_mse_loss": 1.221002193074248, |
| "flow/cos_sim": 0.5571351273146938, |
| "flow/improvement_ratio": 0.9647975193855425, |
| "flow/mag_ratio_mean": 0.5350822540314775, |
| "flow/mag_ratio_std": 0.24492420766451586, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.44286489105660076, |
| "eval_loss": 1.221002193074248, |
| "eval_mse_loss": 1.221002193074248, |
| "eval_runtime": 37.6256, |
| "eval_samples_per_second": 743.989, |
| "eval_steps_per_second": 11.641, |
| "flow/cos_sim": 0.5571351273146938, |
| "flow/improvement_ratio": 0.9647975193855425, |
| "flow/mag_ratio_mean": 0.5350822540314775, |
| "flow/mag_ratio_std": 0.24492420766451586, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.05911967114682925, |
| "grad_norm": 0.16314451396465302, |
| "learning_rate": 0.0004983775873930693, |
| "loss": 1.2742286920547485, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.0709436053761951, |
| "grad_norm": 0.1408475935459137, |
| "learning_rate": 0.0004971162643259235, |
| "loss": 1.2636311054229736, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.08276753960556095, |
| "grad_norm": 0.4067126512527466, |
| "learning_rate": 0.0004954972900130046, |
| "loss": 1.2552720308303833, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "grad_norm": 0.11263003200292587, |
| "learning_rate": 0.0004935230075950261, |
| "loss": 1.250608205795288, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.4280974593472807, |
| "eval_loss": 1.1850101215654312, |
| "eval_mse_loss": 1.1850101215654312, |
| "flow/cos_sim": 0.5719025665766573, |
| "flow/improvement_ratio": 0.9701844343583877, |
| "flow/mag_ratio_mean": 0.5505562974437731, |
| "flow/mag_ratio_std": 0.2407817519078516, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.4280974593472807, |
| "eval_loss": 1.1850101215654312, |
| "eval_mse_loss": 1.1850101215654312, |
| "eval_runtime": 36.5824, |
| "eval_samples_per_second": 765.204, |
| "eval_steps_per_second": 11.973, |
| "flow/cos_sim": 0.5719025665766573, |
| "flow/improvement_ratio": 0.9701844343583877, |
| "flow/mag_ratio_mean": 0.5505562974437731, |
| "flow/mag_ratio_std": 0.2407817519078516, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.10641540806429264, |
| "grad_norm": 0.23430709540843964, |
| "learning_rate": 0.0004911962744499443, |
| "loss": 1.2465660572052002, |
| "step": 2304 |
| }, |
| { |
| "epoch": 0.1182393422936585, |
| "grad_norm": 0.12819728255271912, |
| "learning_rate": 0.0004885204580574763, |
| "loss": 1.2422897815704346, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.13006327652302435, |
| "grad_norm": 0.1120893657207489, |
| "learning_rate": 0.00048549943112534866, |
| "loss": 1.2349907159805298, |
| "step": 2816 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "grad_norm": 0.12881016731262207, |
| "learning_rate": 0.0004821375659843295, |
| "loss": 1.2320677042007446, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.4224365182799291, |
| "eval_loss": 1.1717089387379824, |
| "eval_mse_loss": 1.1717089387379824, |
| "flow/cos_sim": 0.5775635075079252, |
| "flow/improvement_ratio": 0.9726638531303842, |
| "flow/mag_ratio_mean": 0.5545849762006437, |
| "flow/mag_ratio_std": 0.2390297385927749, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.4224365182799291, |
| "eval_loss": 1.1717089387379824, |
| "eval_mse_loss": 1.1717089387379824, |
| "eval_runtime": 36.5841, |
| "eval_samples_per_second": 765.167, |
| "eval_steps_per_second": 11.972, |
| "flow/cos_sim": 0.5775635075079252, |
| "flow/improvement_ratio": 0.9726638531303842, |
| "flow/mag_ratio_mean": 0.5545849762006437, |
| "flow/mag_ratio_std": 0.2390297385927749, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.15371114498175603, |
| "grad_norm": 0.14192286133766174, |
| "learning_rate": 0.00047843972826015616, |
| "loss": 1.229474663734436, |
| "step": 3328 |
| }, |
| { |
| "epoch": 0.1655350792111219, |
| "grad_norm": 0.1288514882326126, |
| "learning_rate": 0.00047441126983151737, |
| "loss": 1.2259553670883179, |
| "step": 3584 |
| }, |
| { |
| "epoch": 0.17735901344048774, |
| "grad_norm": 0.1571502685546875, |
| "learning_rate": 0.0004700580210842823, |
| "loss": 1.2254003286361694, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "grad_norm": 0.18948955833911896, |
| "learning_rate": 0.0004653862824731857, |
| "loss": 1.2214388847351074, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.4189364165748091, |
| "eval_loss": 1.162348562179635, |
| "eval_mse_loss": 1.162348562179635, |
| "flow/cos_sim": 0.5810636061511628, |
| "flow/improvement_ratio": 0.971539740420912, |
| "flow/mag_ratio_mean": 0.5564619683783892, |
| "flow/mag_ratio_std": 0.23926625594700854, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.4189364165748091, |
| "eval_loss": 1.162348562179635, |
| "eval_mse_loss": 1.162348562179635, |
| "eval_runtime": 36.6574, |
| "eval_samples_per_second": 763.638, |
| "eval_steps_per_second": 11.948, |
| "flow/cos_sim": 0.5810636061511628, |
| "flow/improvement_ratio": 0.971539740420912, |
| "flow/mag_ratio_mean": 0.5564619683783892, |
| "flow/mag_ratio_std": 0.23926625594700854, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.20100688189921945, |
| "grad_norm": 0.11497154086828232, |
| "learning_rate": 0.00046040281540318296, |
| "loss": 1.2192964553833008, |
| "step": 4352 |
| }, |
| { |
| "epoch": 0.2128308161285853, |
| "grad_norm": 0.17087024450302124, |
| "learning_rate": 0.00045511483244367227, |
| "loss": 1.2178804874420166, |
| "step": 4608 |
| }, |
| { |
| "epoch": 0.22465475035795113, |
| "grad_norm": 0.15084773302078247, |
| "learning_rate": 0.00044952998688974635, |
| "loss": 1.2137683629989624, |
| "step": 4864 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "grad_norm": 0.17674025893211365, |
| "learning_rate": 0.0004436563616855822, |
| "loss": 1.214182734489441, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.41552880569680095, |
| "eval_loss": 1.1498510279611909, |
| "eval_mse_loss": 1.1498510279611909, |
| "flow/cos_sim": 0.5844712080476491, |
| "flow/improvement_ratio": 0.9718176997415552, |
| "flow/mag_ratio_mean": 0.5639459366939928, |
| "flow/mag_ratio_std": 0.24295896026369643, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.41552880569680095, |
| "eval_loss": 1.1498510279611909, |
| "eval_mse_loss": 1.1498510279611909, |
| "eval_runtime": 36.5045, |
| "eval_samples_per_second": 766.837, |
| "eval_steps_per_second": 11.999, |
| "flow/cos_sim": 0.5844712080476491, |
| "flow/improvement_ratio": 0.9718176997415552, |
| "flow/mag_ratio_mean": 0.5639459366939928, |
| "flow/mag_ratio_std": 0.24295896026369643, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.24830261881668284, |
| "grad_norm": 0.19734610617160797, |
| "learning_rate": 0.00043750245772600053, |
| "loss": 1.2093557119369507, |
| "step": 5376 |
| }, |
| { |
| "epoch": 0.2601265530460487, |
| "grad_norm": 0.11800088733434677, |
| "learning_rate": 0.00043107718155312435, |
| "loss": 1.2076833248138428, |
| "step": 5632 |
| }, |
| { |
| "epoch": 0.27195048727541454, |
| "grad_norm": 0.11836104094982147, |
| "learning_rate": 0.0004243898324659452, |
| "loss": 1.2095065116882324, |
| "step": 5888 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "grad_norm": 0.14233353734016418, |
| "learning_rate": 0.00041745008906145264, |
| "loss": 1.2082980871200562, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.41333945823586693, |
| "eval_loss": 1.1455941828962874, |
| "eval_mse_loss": 1.1455941828962874, |
| "flow/cos_sim": 0.5866605730633757, |
| "flow/improvement_ratio": 0.9727034600085864, |
| "flow/mag_ratio_mean": 0.5605599706031416, |
| "flow/mag_ratio_std": 0.24318457544530364, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.41333945823586693, |
| "eval_loss": 1.1455941828962874, |
| "eval_mse_loss": 1.1455941828962874, |
| "eval_runtime": 36.3926, |
| "eval_samples_per_second": 769.196, |
| "eval_steps_per_second": 12.035, |
| "flow/cos_sim": 0.5866605730633757, |
| "flow/improvement_ratio": 0.9727034600085864, |
| "flow/mag_ratio_mean": 0.5605599706031416, |
| "flow/mag_ratio_std": 0.24318457544530364, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2955983557341462, |
| "grad_norm": 0.1180572658777237, |
| "learning_rate": 0.00041026799522680534, |
| "loss": 1.2049767971038818, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.30742228996351206, |
| "grad_norm": 0.14954492449760437, |
| "learning_rate": 0.00040285394560281816, |
| "loss": 1.203136682510376, |
| "step": 6656 |
| }, |
| { |
| "epoch": 0.3192462241928779, |
| "grad_norm": 0.27384424209594727, |
| "learning_rate": 0.0003952186705398043, |
| "loss": 1.2036312818527222, |
| "step": 6912 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "grad_norm": 0.15835484862327576, |
| "learning_rate": 0.0003873732205675438, |
| "loss": 1.1990163326263428, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.41148509575079567, |
| "eval_loss": 1.1415519687138735, |
| "eval_mse_loss": 1.1415519687138735, |
| "flow/cos_sim": 0.5885149201029511, |
| "flow/improvement_ratio": 0.9741524475622395, |
| "flow/mag_ratio_mean": 0.5640481464122529, |
| "flow/mag_ratio_std": 0.24272260563150388, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.41148509575079567, |
| "eval_loss": 1.1415519687138735, |
| "eval_mse_loss": 1.1415519687138735, |
| "eval_runtime": 36.5211, |
| "eval_samples_per_second": 766.489, |
| "eval_steps_per_second": 11.993, |
| "flow/cos_sim": 0.5885149201029511, |
| "flow/improvement_ratio": 0.9741524475622395, |
| "flow/mag_ratio_mean": 0.5640481464122529, |
| "flow/mag_ratio_std": 0.24272260563150388, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.34289409265160964, |
| "grad_norm": 0.13862715661525726, |
| "learning_rate": 0.000379328950401858, |
| "loss": 1.1999462842941284, |
| "step": 7424 |
| }, |
| { |
| "epoch": 0.3547180268809755, |
| "grad_norm": 0.1664004623889923, |
| "learning_rate": 0.0003710975025109345, |
| "loss": 1.1982899904251099, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.3665419611103413, |
| "grad_norm": 0.13604609668254852, |
| "learning_rate": 0.0003626907902651893, |
| "loss": 1.1958998441696167, |
| "step": 7936 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "grad_norm": 0.1457708328962326, |
| "learning_rate": 0.0003541209806950514, |
| "loss": 1.1945908069610596, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.4107164820851801, |
| "eval_loss": 1.1382312957010312, |
| "eval_mse_loss": 1.1382312957010312, |
| "flow/cos_sim": 0.589283528393262, |
| "flow/improvement_ratio": 0.9722015831840637, |
| "flow/mag_ratio_mean": 0.5640345150477266, |
| "flow/mag_ratio_std": 0.24401044291040125, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.4107164820851801, |
| "eval_loss": 1.1382312957010312, |
| "eval_mse_loss": 1.1382312957010312, |
| "eval_runtime": 36.7013, |
| "eval_samples_per_second": 762.726, |
| "eval_steps_per_second": 11.934, |
| "flow/cos_sim": 0.589283528393262, |
| "flow/improvement_ratio": 0.9722015831840637, |
| "flow/mag_ratio_mean": 0.5640345150477266, |
| "flow/mag_ratio_std": 0.24401044291040125, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.390189829569073, |
| "grad_norm": 0.16633214056491852, |
| "learning_rate": 0.0003454004768816257, |
| "loss": 1.1974270343780518, |
| "step": 8448 |
| }, |
| { |
| "epoch": 0.4020137637984389, |
| "grad_norm": 0.13664524257183075, |
| "learning_rate": 0.00033654190000572017, |
| "loss": 1.1954563856124878, |
| "step": 8704 |
| }, |
| { |
| "epoch": 0.41383769802780473, |
| "grad_norm": 0.17472244799137115, |
| "learning_rate": 0.00032755807108121707, |
| "loss": 1.1933891773223877, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "grad_norm": 0.13021035492420197, |
| "learning_rate": 0.00031846199239922587, |
| "loss": 1.1937364339828491, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.40818845524907654, |
| "eval_loss": 1.1306110892665986, |
| "eval_mse_loss": 1.1306110892665986, |
| "flow/cos_sim": 0.5918115706748615, |
| "flow/improvement_ratio": 0.9736669311240383, |
| "flow/mag_ratio_mean": 0.5680791953382971, |
| "flow/mag_ratio_std": 0.24117264903435423, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.40818845524907654, |
| "eval_loss": 1.1306110892665986, |
| "eval_mse_loss": 1.1306110892665986, |
| "eval_runtime": 37.6562, |
| "eval_samples_per_second": 743.384, |
| "eval_steps_per_second": 11.632, |
| "flow/cos_sim": 0.5918115706748615, |
| "flow/improvement_ratio": 0.9736669311240383, |
| "flow/mag_ratio_mean": 0.5680791953382971, |
| "flow/mag_ratio_std": 0.24117264903435423, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4374855664865364, |
| "grad_norm": 0.1436239778995514, |
| "learning_rate": 0.00030926682870987393, |
| "loss": 1.1944515705108643, |
| "step": 9472 |
| }, |
| { |
| "epoch": 0.44930950071590225, |
| "grad_norm": 0.1279948502779007, |
| "learning_rate": 0.00029998588816897034, |
| "loss": 1.192986011505127, |
| "step": 9728 |
| }, |
| { |
| "epoch": 0.4611334349452681, |
| "grad_norm": 0.16707991063594818, |
| "learning_rate": 0.00029063260307711817, |
| "loss": 1.1940622329711914, |
| "step": 9984 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "grad_norm": 0.12476647645235062, |
| "learning_rate": 0.00028122051043915356, |
| "loss": 1.190167784690857, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.4076138965479315, |
| "eval_loss": 1.1312634806110435, |
| "eval_mse_loss": 1.1312634806110435, |
| "flow/cos_sim": 0.5923861386296956, |
| "flow/improvement_ratio": 0.9748937231764946, |
| "flow/mag_ratio_mean": 0.5703548358999975, |
| "flow/mag_ratio_std": 0.24083318081620622, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.4076138965479315, |
| "eval_loss": 1.1312634806110435, |
| "eval_mse_loss": 1.1312634806110435, |
| "eval_runtime": 37.3647, |
| "eval_samples_per_second": 749.184, |
| "eval_steps_per_second": 11.722, |
| "flow/cos_sim": 0.5923861386296956, |
| "flow/improvement_ratio": 0.9748937231764946, |
| "flow/mag_ratio_mean": 0.5703548358999975, |
| "flow/mag_ratio_std": 0.24083318081620622, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.48478130340399983, |
| "grad_norm": 0.16375412046909332, |
| "learning_rate": 0.00027176323237204403, |
| "loss": 1.1899298429489136, |
| "step": 10496 |
| }, |
| { |
| "epoch": 0.49660523763336567, |
| "grad_norm": 0.13618549704551697, |
| "learning_rate": 0.0002622744563896065, |
| "loss": 1.1911197900772095, |
| "step": 10752 |
| }, |
| { |
| "epoch": 0.5084291718627315, |
| "grad_norm": 0.14637431502342224, |
| "learning_rate": 0.00025276791559257494, |
| "loss": 1.1925891637802124, |
| "step": 11008 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "grad_norm": 0.13531003892421722, |
| "learning_rate": 0.0002432573687926906, |
| "loss": 1.186372995376587, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.40703500609963994, |
| "eval_loss": 1.1297938347407128, |
| "eval_mse_loss": 1.1297938347407128, |
| "flow/cos_sim": 0.5929650133603239, |
| "flow/improvement_ratio": 0.9741698461308327, |
| "flow/mag_ratio_mean": 0.5677392150713428, |
| "flow/mag_ratio_std": 0.24396490948657468, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.40703500609963994, |
| "eval_loss": 1.1297938347407128, |
| "eval_mse_loss": 1.1297938347407128, |
| "eval_runtime": 38.0112, |
| "eval_samples_per_second": 736.441, |
| "eval_steps_per_second": 11.523, |
| "flow/cos_sim": 0.5929650133603239, |
| "flow/improvement_ratio": 0.9741698461308327, |
| "flow/mag_ratio_mean": 0.5677392150713428, |
| "flow/mag_ratio_std": 0.24396490948657468, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5320770403214632, |
| "grad_norm": 0.13568931818008423, |
| "learning_rate": 0.00023375658059958035, |
| "loss": 1.1851963996887207, |
| "step": 11520 |
| }, |
| { |
| "epoch": 0.5439009745508291, |
| "grad_norm": 0.158619686961174, |
| "learning_rate": 0.00022427930149924495, |
| "loss": 1.18683660030365, |
| "step": 11776 |
| }, |
| { |
| "epoch": 0.5557249087801949, |
| "grad_norm": 0.12816773355007172, |
| "learning_rate": 0.00021483924795298633, |
| "loss": 1.1887577772140503, |
| "step": 12032 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "grad_norm": 0.1509481817483902, |
| "learning_rate": 0.00020545008254558105, |
| "loss": 1.1869347095489502, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.4058912583546007, |
| "eval_loss": 1.1245840085695868, |
| "eval_mse_loss": 1.1245840085695868, |
| "flow/cos_sim": 0.5941087536888036, |
| "flow/improvement_ratio": 0.9735238416009842, |
| "flow/mag_ratio_mean": 0.5702283619201347, |
| "flow/mag_ratio_std": 0.2440737300131419, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.4058912583546007, |
| "eval_loss": 1.1245840085695868, |
| "eval_mse_loss": 1.1245840085695868, |
| "eval_runtime": 37.443, |
| "eval_samples_per_second": 747.617, |
| "eval_steps_per_second": 11.698, |
| "flow/cos_sim": 0.5941087536888036, |
| "flow/improvement_ratio": 0.9735238416009842, |
| "flow/mag_ratio_mean": 0.5702283619201347, |
| "flow/mag_ratio_std": 0.2440737300131419, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5793727772389267, |
| "grad_norm": 0.12331920862197876, |
| "learning_rate": 0.00019612539421142756, |
| "loss": 1.1875509023666382, |
| "step": 12544 |
| }, |
| { |
| "epoch": 0.5911967114682924, |
| "grad_norm": 0.14404380321502686, |
| "learning_rate": 0.00018687867856728862, |
| "loss": 1.186294436454773, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.6030206456976583, |
| "grad_norm": 0.16519756615161896, |
| "learning_rate": 0.00017772331838009138, |
| "loss": 1.1816554069519043, |
| "step": 13056 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "grad_norm": 0.1261790543794632, |
| "learning_rate": 0.00016867256419805628, |
| "loss": 1.184910774230957, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.4058965126129046, |
| "eval_loss": 1.124463065301991, |
| "eval_mse_loss": 1.124463065301991, |
| "flow/cos_sim": 0.5941034868427607, |
| "flow/improvement_ratio": 0.9736478256580492, |
| "flow/mag_ratio_mean": 0.5693308626680069, |
| "flow/mag_ratio_std": 0.24294855394592024, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.4058965126129046, |
| "eval_loss": 1.124463065301991, |
| "eval_mse_loss": 1.124463065301991, |
| "eval_runtime": 37.6078, |
| "eval_samples_per_second": 744.34, |
| "eval_steps_per_second": 11.647, |
| "flow/cos_sim": 0.5941034868427607, |
| "flow/improvement_ratio": 0.9736478256580492, |
| "flow/mag_ratio_mean": 0.5693308626680069, |
| "flow/mag_ratio_std": 0.24294855394592024, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.62666851415639, |
| "grad_norm": 0.10855361819267273, |
| "learning_rate": 0.00015973951517318435, |
| "loss": 1.1825791597366333, |
| "step": 13568 |
| }, |
| { |
| "epoch": 0.6384924483857558, |
| "grad_norm": 0.12764589488506317, |
| "learning_rate": 0.00015093710010286203, |
| "loss": 1.183720588684082, |
| "step": 13824 |
| }, |
| { |
| "epoch": 0.6503163826151217, |
| "grad_norm": 0.15194828808307648, |
| "learning_rate": 0.00014227805871801813, |
| "loss": 1.1843342781066895, |
| "step": 14080 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "grad_norm": 0.1313941776752472, |
| "learning_rate": 0.00013377492324491864, |
| "loss": 1.1845036745071411, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.4055524829043645, |
| "eval_loss": 1.1236549692066837, |
| "eval_mse_loss": 1.1236549692066837, |
| "flow/cos_sim": 0.5944475390051054, |
| "flow/improvement_ratio": 0.9741561509430681, |
| "flow/mag_ratio_mean": 0.5722755660477294, |
| "flow/mag_ratio_std": 0.24292387908588262, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.4055524829043645, |
| "eval_loss": 1.1236549692066837, |
| "eval_mse_loss": 1.1236549692066837, |
| "eval_runtime": 37.766, |
| "eval_samples_per_second": 741.222, |
| "eval_steps_per_second": 11.598, |
| "flow/cos_sim": 0.5944475390051054, |
| "flow/improvement_ratio": 0.9741561509430681, |
| "flow/mag_ratio_mean": 0.5722755660477294, |
| "flow/mag_ratio_std": 0.24292387908588262, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6739642510738534, |
| "grad_norm": 0.1419185996055603, |
| "learning_rate": 0.00012544000026728114, |
| "loss": 1.1815910339355469, |
| "step": 14592 |
| }, |
| { |
| "epoch": 0.6857881853032193, |
| "grad_norm": 0.11921229213476181, |
| "learning_rate": 0.00011728535291496281, |
| "loss": 1.1859562397003174, |
| "step": 14848 |
| }, |
| { |
| "epoch": 0.6976121195325851, |
| "grad_norm": 0.13644331693649292, |
| "learning_rate": 0.00010932278340499846, |
| "loss": 1.1809968948364258, |
| "step": 15104 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "grad_norm": 0.12353217601776123, |
| "learning_rate": 0.0001015638159602576, |
| "loss": 1.1835533380508423, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.404491749596378, |
| "eval_loss": 1.1212830510857987, |
| "eval_mse_loss": 1.1212830510857987, |
| "flow/cos_sim": 0.5955082661212852, |
| "flow/improvement_ratio": 0.9744905962791617, |
| "flow/mag_ratio_mean": 0.5719217330081278, |
| "flow/mag_ratio_std": 0.24169711207282052, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.404491749596378, |
| "eval_loss": 1.1212830510857987, |
| "eval_mse_loss": 1.1212830510857987, |
| "eval_runtime": 37.817, |
| "eval_samples_per_second": 740.223, |
| "eval_steps_per_second": 11.582, |
| "flow/cos_sim": 0.5955082661212852, |
| "flow/improvement_ratio": 0.9744905962791617, |
| "flow/mag_ratio_mean": 0.5719217330081278, |
| "flow/mag_ratio_std": 0.24169711207282052, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.7212599879913169, |
| "grad_norm": 0.11799845844507217, |
| "learning_rate": 9.401968013044271e-05, |
| "loss": 1.1806869506835938, |
| "step": 15616 |
| }, |
| { |
| "epoch": 0.7330839222206826, |
| "grad_norm": 0.12838932871818542, |
| "learning_rate": 8.670129453956732e-05, |
| "loss": 1.182599663734436, |
| "step": 15872 |
| }, |
| { |
| "epoch": 0.7449078564500485, |
| "grad_norm": 0.11126351356506348, |
| "learning_rate": 7.961925108343715e-05, |
| "loss": 1.1809351444244385, |
| "step": 16128 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "grad_norm": 0.13469421863555908, |
| "learning_rate": 7.278379960000436e-05, |
| "loss": 1.1837177276611328, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.40498853165265086, |
| "eval_loss": 1.1224167082952037, |
| "eval_mse_loss": 1.1224167082952037, |
| "flow/cos_sim": 0.5950114819557155, |
| "flow/improvement_ratio": 0.9733862189669588, |
| "flow/mag_ratio_mean": 0.5712394598684355, |
| "flow/mag_ratio_std": 0.24294140764839572, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.40498853165265086, |
| "eval_loss": 1.1224167082952037, |
| "eval_mse_loss": 1.1224167082952037, |
| "eval_runtime": 38.0074, |
| "eval_samples_per_second": 736.514, |
| "eval_steps_per_second": 11.524, |
| "flow/cos_sim": 0.5950114819557155, |
| "flow/improvement_ratio": 0.9733862189669588, |
| "flow/mag_ratio_mean": 0.5712394598684355, |
| "flow/mag_ratio_std": 0.24294140764839572, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7685557249087802, |
| "grad_norm": 0.13311493396759033, |
| "learning_rate": 6.62048330347825e-05, |
| "loss": 1.1809964179992676, |
| "step": 16640 |
| }, |
| { |
| "epoch": 0.780379659138146, |
| "grad_norm": 0.10799466073513031, |
| "learning_rate": 5.989187312279115e-05, |
| "loss": 1.180505633354187, |
| "step": 16896 |
| }, |
| { |
| "epoch": 0.7922035933675119, |
| "grad_norm": 0.1486814022064209, |
| "learning_rate": 5.3854056607753746e-05, |
| "loss": 1.1795099973678589, |
| "step": 17152 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "grad_norm": 0.11168931424617767, |
| "learning_rate": 4.8100122018492956e-05, |
| "loss": 1.1790964603424072, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.40403676060236754, |
| "eval_loss": 1.1197874216728558, |
| "eval_mse_loss": 1.1197874216728558, |
| "flow/cos_sim": 0.5959632730102975, |
| "flow/improvement_ratio": 0.974381895370135, |
| "flow/mag_ratio_mean": 0.5709017501574128, |
| "flow/mag_ratio_std": 0.24267656927647657, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.40403676060236754, |
| "eval_loss": 1.1197874216728558, |
| "eval_mse_loss": 1.1197874216728558, |
| "eval_runtime": 37.0697, |
| "eval_samples_per_second": 755.145, |
| "eval_steps_per_second": 11.816, |
| "flow/cos_sim": 0.5959632730102975, |
| "flow/improvement_ratio": 0.974381895370135, |
| "flow/mag_ratio_mean": 0.5709017501574128, |
| "flow/mag_ratio_std": 0.24267656927647657, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8158514618262436, |
| "grad_norm": 0.120413638651371, |
| "learning_rate": 4.263839702166214e-05, |
| "loss": 1.1833932399749756, |
| "step": 17664 |
| }, |
| { |
| "epoch": 0.8276753960556095, |
| "grad_norm": 0.13419309258460999, |
| "learning_rate": 3.747678636911772e-05, |
| "loss": 1.1790565252304077, |
| "step": 17920 |
| }, |
| { |
| "epoch": 0.8394993302849753, |
| "grad_norm": 0.13175411522388458, |
| "learning_rate": 3.262276045737592e-05, |
| "loss": 1.182090401649475, |
| "step": 18176 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "grad_norm": 0.09947279095649719, |
| "learning_rate": 2.8083344515711012e-05, |
| "loss": 1.177965760231018, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.40370703621269904, |
| "eval_loss": 1.1215725149193856, |
| "eval_mse_loss": 1.1215725149193856, |
| "flow/cos_sim": 0.5962929766472072, |
| "flow/improvement_ratio": 0.9753795086520992, |
| "flow/mag_ratio_mean": 0.5721857149034875, |
| "flow/mag_ratio_std": 0.24184994011709135, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.40370703621269904, |
| "eval_loss": 1.1215725149193856, |
| "eval_mse_loss": 1.1215725149193856, |
| "eval_runtime": 37.003, |
| "eval_samples_per_second": 756.506, |
| "eval_steps_per_second": 11.837, |
| "flow/cos_sim": 0.5962929766472072, |
| "flow/improvement_ratio": 0.9753795086520992, |
| "flow/mag_ratio_mean": 0.5721857149034875, |
| "flow/mag_ratio_std": 0.24184994011709135, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8631471987437069, |
| "grad_norm": 0.11105956137180328, |
| "learning_rate": 2.3865108438545337e-05, |
| "loss": 1.181681752204895, |
| "step": 18688 |
| }, |
| { |
| "epoch": 0.8749711329730728, |
| "grad_norm": 0.10993946343660355, |
| "learning_rate": 1.9974157276843596e-05, |
| "loss": 1.1794625520706177, |
| "step": 18944 |
| }, |
| { |
| "epoch": 0.8867950672024387, |
| "grad_norm": 0.10621558874845505, |
| "learning_rate": 1.641612240227641e-05, |
| "loss": 1.1792101860046387, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "grad_norm": 0.09712148457765579, |
| "learning_rate": 1.3196153356938134e-05, |
| "loss": 1.1793529987335205, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.40458166646902965, |
| "eval_loss": 1.1218491670203536, |
| "eval_mse_loss": 1.1218491670203536, |
| "flow/cos_sim": 0.5954183448939563, |
| "flow/improvement_ratio": 0.9741832541820665, |
| "flow/mag_ratio_mean": 0.5714860147537162, |
| "flow/mag_ratio_std": 0.2427709357787485, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.40458166646902965, |
| "eval_loss": 1.1218491670203536, |
| "eval_mse_loss": 1.1218491670203536, |
| "eval_runtime": 37.3338, |
| "eval_samples_per_second": 749.804, |
| "eval_steps_per_second": 11.732, |
| "flow/cos_sim": 0.5954183448939563, |
| "flow/improvement_ratio": 0.9741832541820665, |
| "flow/mag_ratio_mean": 0.5714860147537162, |
| "flow/mag_ratio_std": 0.2427709357787485, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.9104429356611704, |
| "grad_norm": 0.12275788933038712, |
| "learning_rate": 1.031891040041788e-05, |
| "loss": 1.1787521839141846, |
| "step": 19712 |
| }, |
| { |
| "epoch": 0.9222668698905362, |
| "grad_norm": 0.1388554871082306, |
| "learning_rate": 7.788557765007614e-06, |
| "loss": 1.1809200048446655, |
| "step": 19968 |
| }, |
| { |
| "epoch": 0.9340908041199021, |
| "grad_norm": 0.11414396017789841, |
| "learning_rate": 5.6087576288111344e-06, |
| "loss": 1.1808236837387085, |
| "step": 20224 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "grad_norm": 0.11537494510412216, |
| "learning_rate": 3.782664815475373e-06, |
| "loss": 1.1786174774169922, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.4043068891246569, |
| "eval_loss": 1.120300586909464, |
| "eval_mse_loss": 1.120300586909464, |
| "flow/cos_sim": 0.595693129110554, |
| "flow/improvement_ratio": 0.9729666603754644, |
| "flow/mag_ratio_mean": 0.5716023456016087, |
| "flow/mag_ratio_std": 0.24340959124641331, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.4043068891246569, |
| "eval_loss": 1.120300586909464, |
| "eval_mse_loss": 1.120300586909464, |
| "eval_runtime": 37.0085, |
| "eval_samples_per_second": 756.394, |
| "eval_steps_per_second": 11.835, |
| "flow/cos_sim": 0.595693129110554, |
| "flow/improvement_ratio": 0.9729666603754644, |
| "flow/mag_ratio_mean": 0.5716023456016087, |
| "flow/mag_ratio_std": 0.24340959124641331, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.9577386725786338, |
| "grad_norm": 0.12133973091840744, |
| "learning_rate": 2.31292222821583e-06, |
| "loss": 1.1803685426712036, |
| "step": 20736 |
| }, |
| { |
| "epoch": 0.9695626068079997, |
| "grad_norm": 0.10521073639392853, |
| "learning_rate": 1.201657024744035e-06, |
| "loss": 1.1818820238113403, |
| "step": 20992 |
| }, |
| { |
| "epoch": 0.9813865410373654, |
| "grad_norm": 0.09989389777183533, |
| "learning_rate": 4.5047753863319827e-07, |
| "loss": 1.1803994178771973, |
| "step": 21248 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "grad_norm": 0.09399425238370895, |
| "learning_rate": 6.047095157787896e-08, |
| "loss": 1.1804349422454834, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.40440767044073916, |
| "eval_loss": 1.1205239834850782, |
| "eval_mse_loss": 1.1205239834850782, |
| "flow/cos_sim": 0.5955923152024343, |
| "flow/improvement_ratio": 0.9742439836944075, |
| "flow/mag_ratio_mean": 0.5713475014521107, |
| "flow/mag_ratio_std": 0.24330289396521163, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.40440767044073916, |
| "eval_loss": 1.1205239834850782, |
| "eval_mse_loss": 1.1205239834850782, |
| "eval_runtime": 37.1012, |
| "eval_samples_per_second": 754.504, |
| "eval_steps_per_second": 11.806, |
| "flow/cos_sim": 0.5955923152024343, |
| "flow/improvement_ratio": 0.9742439836944075, |
| "flow/mag_ratio_mean": 0.5713475014521107, |
| "flow/mag_ratio_std": 0.24330289396521163, |
| "step": 21504 |
| } |
| ], |
| "logging_steps": 256, |
| "max_steps": 21651, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1024, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|