{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9932104752667313, "eval_steps": 1024, "global_step": 21504, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011823934229365849, "grad_norm": 0.39305946230888367, "learning_rate": 0.000498046875, "loss": 1.780936598777771, "step": 256 }, { "epoch": 0.023647868458731697, "grad_norm": 0.4956166744232178, "learning_rate": 0.000998046875, "loss": 1.21787691116333, "step": 512 }, { "epoch": 0.03547180268809755, "grad_norm": 0.46151286363601685, "learning_rate": 0.000999640996023194, "loss": 1.1046085357666016, "step": 768 }, { "epoch": 0.047295736917463395, "grad_norm": 0.4608488082885742, "learning_rate": 0.0009985588674043958, "loss": 1.0648458003997803, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_cos_loss": 0.3708630034096165, "eval_loss": 1.0313464179975258, "eval_mse_loss": 1.0313464179975258, "flow/cos_sim": 0.6291370091100806, "flow/improvement_ratio": 0.45305174915757895, "flow/mag_ratio_mean": 0.636674555197154, "flow/mag_ratio_std": 0.21184622356880745, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_cos_loss": 0.3708630034096165, "eval_loss": 1.0313464179975258, "eval_mse_loss": 1.0313464179975258, "eval_runtime": 51.0215, "eval_samples_per_second": 548.652, "eval_steps_per_second": 8.585, "flow/cos_sim": 0.6291370091100806, "flow/improvement_ratio": 0.45305174915757895, "flow/mag_ratio_mean": 0.636674555197154, "flow/mag_ratio_std": 0.21184622356880745, "step": 1024 }, { "epoch": 0.05911967114682925, "grad_norm": 0.42826929688453674, "learning_rate": 0.0009967551747861387, "loss": 1.0448938608169556, "step": 1280 }, { "epoch": 0.0709436053761951, "grad_norm": 0.44176557660102844, "learning_rate": 0.000994232528651847, "loss": 1.0278342962265015, "step": 1536 }, { "epoch": 0.08276753960556095, "grad_norm": 0.4006131887435913, "learning_rate": 0.0009909945800260092, "loss": 1.0153039693832397, "step": 1792 }, { "epoch": 0.09459147383492679, "grad_norm": 0.5178146362304688, "learning_rate": 0.0009870460151900522, "loss": 1.0096927881240845, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_cos_loss": 0.35073428046485605, "eval_loss": 0.9796474619543172, "eval_mse_loss": 0.9796474619543172, "flow/cos_sim": 0.6492657340280542, "flow/improvement_ratio": 0.45769495602067745, "flow/mag_ratio_mean": 0.6580993892395333, "flow/mag_ratio_std": 0.21975676535063138, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_cos_loss": 0.35073428046485605, "eval_loss": 0.9796474619543172, "eval_mse_loss": 0.9796474619543172, "eval_runtime": 50.9087, "eval_samples_per_second": 549.867, "eval_steps_per_second": 8.604, "flow/cos_sim": 0.6492657340280542, "flow/improvement_ratio": 0.45769495602067745, "flow/mag_ratio_mean": 0.6580993892395333, "flow/mag_ratio_std": 0.21975676535063138, "step": 2048 }, { "epoch": 0.10641540806429264, "grad_norm": 0.4543623626232147, "learning_rate": 0.0009823925488998885, "loss": 1.0023009777069092, "step": 2304 }, { "epoch": 0.1182393422936585, "grad_norm": 0.41527411341667175, "learning_rate": 0.0009770409161149525, "loss": 0.9987744092941284, "step": 2560 }, { "epoch": 0.13006327652302435, "grad_norm": 0.37379226088523865, "learning_rate": 0.0009709988622506973, "loss": 0.9899244904518127, "step": 2816 }, { "epoch": 0.1418872107523902, "grad_norm": 0.4600990414619446, "learning_rate": 0.000964275131968659, "loss": 0.9836398959159851, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_cos_loss": 0.33966573984383447, "eval_loss": 0.9576488193039481, "eval_mse_loss": 0.9576488193039481, "flow/cos_sim": 0.6603342864883545, "flow/improvement_ratio": 0.4705073194552774, "flow/mag_ratio_mean": 0.6616723446269014, "flow/mag_ratio_std": 0.2178211697976883, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_cos_loss": 0.33966573984383447, "eval_loss": 0.9576488193039481, "eval_mse_loss": 0.9576488193039481, "eval_runtime": 50.7658, "eval_samples_per_second": 551.415, "eval_steps_per_second": 8.628, "flow/cos_sim": 0.6603342864883545, "flow/improvement_ratio": 0.4705073194552774, "flow/mag_ratio_mean": 0.6616723446269014, "flow/mag_ratio_std": 0.2178211697976883, "step": 3072 }, { "epoch": 0.15371114498175603, "grad_norm": 0.4591522812843323, "learning_rate": 0.0009568794565203123, "loss": 0.9778329133987427, "step": 3328 }, { "epoch": 0.1655350792111219, "grad_norm": 0.506796658039093, "learning_rate": 0.0009488225396630347, "loss": 0.9751444458961487, "step": 3584 }, { "epoch": 0.17735901344048774, "grad_norm": 0.6946936845779419, "learning_rate": 0.0009401160421685646, "loss": 0.9650535583496094, "step": 3840 }, { "epoch": 0.18918294766985358, "grad_norm": 0.5334352254867554, "learning_rate": 0.0009307725649463714, "loss": 0.968015193939209, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_cos_loss": 0.3295682058350681, "eval_loss": 0.9358410420210939, "eval_mse_loss": 0.9358410420210939, "flow/cos_sim": 0.6704318162785273, "flow/improvement_ratio": 0.4610280344883601, "flow/mag_ratio_mean": 0.6802759532514773, "flow/mag_ratio_std": 0.21443206633198753, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_cos_loss": 0.3295682058350681, "eval_loss": 0.9358410420210939, "eval_mse_loss": 0.9358410420210939, "eval_runtime": 50.6775, "eval_samples_per_second": 552.375, "eval_steps_per_second": 8.643, "flow/cos_sim": 0.6704318162785273, "flow/improvement_ratio": 0.4610280344883601, "flow/mag_ratio_mean": 0.6802759532514773, "flow/mag_ratio_std": 0.21443206633198753, "step": 4096 }, { "epoch": 0.20100688189921945, "grad_norm": 0.23779992759227753, "learning_rate": 0.0009208056308063659, "loss": 0.9618788957595825, "step": 4352 }, { "epoch": 0.2128308161285853, "grad_norm": 0.3806602358818054, "learning_rate": 0.0009102296648873445, "loss": 0.9570742249488831, "step": 4608 }, { "epoch": 0.22465475035795113, "grad_norm": 0.41401633620262146, "learning_rate": 0.0008990599737794927, "loss": 0.9550731778144836, "step": 4864 }, { "epoch": 0.236478684587317, "grad_norm": 0.4475092589855194, "learning_rate": 0.0008873127233711644, "loss": 0.9473839998245239, "step": 5120 }, { "epoch": 0.236478684587317, "eval_cos_loss": 0.31993362566107486, "eval_loss": 0.9154947232710172, "eval_mse_loss": 0.9154947232710172, "flow/cos_sim": 0.6800663863142876, "flow/improvement_ratio": 0.4631300759778175, "flow/mag_ratio_mean": 0.6912006213240427, "flow/mag_ratio_std": 0.2171582194155754, "step": 5120 }, { "epoch": 0.236478684587317, "eval_cos_loss": 0.31993362566107486, "eval_loss": 0.9154947232710172, "eval_mse_loss": 0.9154947232710172, "eval_runtime": 50.7141, "eval_samples_per_second": 551.977, "eval_steps_per_second": 8.637, "flow/cos_sim": 0.6800663863142876, "flow/improvement_ratio": 0.4631300759778175, "flow/mag_ratio_mean": 0.6912006213240427, "flow/mag_ratio_std": 0.2171582194155754, "step": 5120 }, { "epoch": 0.24830261881668284, "grad_norm": 0.39829879999160767, "learning_rate": 0.0008750049154520011, "loss": 0.9461303949356079, "step": 5376 }, { "epoch": 0.2601265530460487, "grad_norm": 0.570689857006073, "learning_rate": 0.0008621543631062487, "loss": 0.9459335803985596, "step": 5632 }, { "epoch": 0.27195048727541454, "grad_norm": 0.49673837423324585, "learning_rate": 0.0008487796649318904, "loss": 0.9424968957901001, "step": 5888 }, { "epoch": 0.2837744215047804, "grad_norm": 0.267270028591156, "learning_rate": 0.0008349001781229053, "loss": 0.9410290122032166, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_cos_loss": 0.31771982879671334, "eval_loss": 0.9094011069160618, "eval_mse_loss": 0.9094011069160618, "flow/cos_sim": 0.682280182430189, "flow/improvement_ratio": 0.46761273847867363, "flow/mag_ratio_mean": 0.6880964620472634, "flow/mag_ratio_std": 0.21481055140359218, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_cos_loss": 0.31771982879671334, "eval_loss": 0.9094011069160618, "eval_mse_loss": 0.9094011069160618, "eval_runtime": 51.0025, "eval_samples_per_second": 548.856, "eval_steps_per_second": 8.588, "flow/cos_sim": 0.682280182430189, "flow/improvement_ratio": 0.46761273847867363, "flow/mag_ratio_mean": 0.6880964620472634, "flow/mag_ratio_std": 0.21481055140359218, "step": 6144 }, { "epoch": 0.2955983557341462, "grad_norm": 0.43064209818840027, "learning_rate": 0.0008205359904536107, "loss": 0.9332870841026306, "step": 6400 }, { "epoch": 0.30742228996351206, "grad_norm": 0.45424163341522217, "learning_rate": 0.0008057078912056363, "loss": 0.9331082105636597, "step": 6656 }, { "epoch": 0.3192462241928779, "grad_norm": 0.6565593481063843, "learning_rate": 0.0007904373410796086, "loss": 0.9313054084777832, "step": 6912 }, { "epoch": 0.3310701584222438, "grad_norm": 0.5485164523124695, "learning_rate": 0.0007747464411350876, "loss": 0.9296150207519531, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_cos_loss": 0.3111404624839896, "eval_loss": 0.8942771547189042, "eval_mse_loss": 0.8942771547189042, "flow/cos_sim": 0.688859552281088, "flow/improvement_ratio": 0.46996809215578317, "flow/mag_ratio_mean": 0.6797864257472835, "flow/mag_ratio_std": 0.21222736455125896, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_cos_loss": 0.3111404624839896, "eval_loss": 0.8942771547189042, "eval_mse_loss": 0.8942771547189042, "eval_runtime": 51.2111, "eval_samples_per_second": 546.619, "eval_steps_per_second": 8.553, "flow/cos_sim": 0.688859552281088, "flow/improvement_ratio": 0.46996809215578317, "flow/mag_ratio_mean": 0.6797864257472835, "flow/mag_ratio_std": 0.21222736455125896, "step": 7168 }, { "epoch": 0.34289409265160964, "grad_norm": 0.3187243938446045, "learning_rate": 0.000758657900803716, "loss": 0.928747832775116, "step": 7424 }, { "epoch": 0.3547180268809755, "grad_norm": 0.3337271213531494, "learning_rate": 0.000742195005021869, "loss": 0.925912082195282, "step": 7680 }, { "epoch": 0.3665419611103413, "grad_norm": 0.5179343223571777, "learning_rate": 0.0007253815805303786, "loss": 0.923928439617157, "step": 7936 }, { "epoch": 0.37836589533970716, "grad_norm": 0.3594360053539276, "learning_rate": 0.000708309515673374, "loss": 0.9226717948913574, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_cos_loss": 0.3082785013059503, "eval_loss": 0.8866745081122063, "eval_mse_loss": 0.8866745081122063, "flow/cos_sim": 0.6917215108054958, "flow/improvement_ratio": 0.46622252083260174, "flow/mag_ratio_mean": 0.6977443612057325, "flow/mag_ratio_std": 0.21176587402412336, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_cos_loss": 0.3082785013059503, "eval_loss": 0.8866745081122063, "eval_mse_loss": 0.8866745081122063, "eval_runtime": 50.453, "eval_samples_per_second": 554.833, "eval_steps_per_second": 8.681, "flow/cos_sim": 0.6917215108054958, "flow/improvement_ratio": 0.46622252083260174, "flow/mag_ratio_mean": 0.6977443612057325, "flow/mag_ratio_std": 0.21176587402412336, "step": 8192 }, { "epoch": 0.390189829569073, "grad_norm": 0.2854546010494232, "learning_rate": 0.0006908696365085842, "loss": 0.9181273579597473, "step": 8448 }, { "epoch": 0.4020137637984389, "grad_norm": 0.6743373870849609, "learning_rate": 0.0006731535118143318, "loss": 0.9195088148117065, "step": 8704 }, { "epoch": 0.41383769802780473, "grad_norm": 0.31884175539016724, "learning_rate": 0.0006551867821290267, "loss": 0.9166384935379028, "step": 8960 }, { "epoch": 0.4256616322571706, "grad_norm": 0.377502977848053, "learning_rate": 0.0006369954506915572, "loss": 0.9148516654968262, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_cos_loss": 0.3045589508803468, "eval_loss": 0.8784834154664654, "eval_mse_loss": 0.8784834154664654, "flow/cos_sim": 0.6954410576929241, "flow/improvement_ratio": 0.4660896160150772, "flow/mag_ratio_mean": 0.6896047796288581, "flow/mag_ratio_std": 0.20802477359363478, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_cos_loss": 0.3045589508803468, "eval_loss": 0.8784834154664654, "eval_mse_loss": 0.8784834154664654, "eval_runtime": 50.5191, "eval_samples_per_second": 554.108, "eval_steps_per_second": 8.67, "flow/cos_sim": 0.6954410576929241, "flow/improvement_ratio": 0.4660896160150772, "flow/mag_ratio_mean": 0.6896047796288581, "flow/mag_ratio_std": 0.20802477359363478, "step": 9216 }, { "epoch": 0.4374855664865364, "grad_norm": 0.2935723066329956, "learning_rate": 0.0006186058458068149, "loss": 0.9158189296722412, "step": 9472 }, { "epoch": 0.44930950071590225, "grad_norm": 0.6002705097198486, "learning_rate": 0.0006000445827407526, "loss": 0.9137871265411377, "step": 9728 }, { "epoch": 0.4611334349452681, "grad_norm": 0.3263113498687744, "learning_rate": 0.0005813385252001232, "loss": 0.9110487103462219, "step": 9984 }, { "epoch": 0.472957369174634, "grad_norm": 0.3971521854400635, "learning_rate": 0.0005625884706462481, "loss": 0.9095574021339417, "step": 10240 }, { "epoch": 0.472957369174634, "eval_cos_loss": 0.30255618235564125, "eval_loss": 0.8742554381013461, "eval_mse_loss": 0.8742554381013461, "flow/cos_sim": 0.6974438190732373, "flow/improvement_ratio": 0.46927081490760525, "flow/mag_ratio_mean": 0.6958274860360306, "flow/mag_ratio_std": 0.21019614664795191, "step": 10240 }, { "epoch": 0.472957369174634, "eval_cos_loss": 0.30255618235564125, "eval_loss": 0.8742554381013461, "eval_mse_loss": 0.8742554381013461, "eval_runtime": 51.3022, "eval_samples_per_second": 545.649, "eval_steps_per_second": 8.538, "flow/cos_sim": 0.6974438190732373, "flow/improvement_ratio": 0.46927081490760525, "flow/mag_ratio_mean": 0.6958274860360306, "flow/mag_ratio_std": 0.21019614664795191, "step": 10240 }, { "epoch": 0.48478130340399983, "grad_norm": 0.3551290035247803, "learning_rate": 0.0005436745145807214, "loss": 0.9060850143432617, "step": 10496 }, { "epoch": 0.49660523763336567, "grad_norm": 0.4094627797603607, "learning_rate": 0.0005246973484120841, "loss": 0.9049650430679321, "step": 10752 }, { "epoch": 0.5084291718627315, "grad_norm": 0.4399157762527466, "learning_rate": 0.0005056844377834413, "loss": 0.9042121767997742, "step": 11008 }, { "epoch": 0.5202531060920974, "grad_norm": 0.5329852104187012, "learning_rate": 0.0004866633000708374, "loss": 0.9038464426994324, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_cos_loss": 0.3004189523183592, "eval_loss": 0.8695836310789465, "eval_mse_loss": 0.8695836310789465, "flow/cos_sim": 0.699581062548781, "flow/improvement_ratio": 0.47203471392529195, "flow/mag_ratio_mean": 0.6982838295093955, "flow/mag_ratio_std": 0.20910314640634137, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_cos_loss": 0.3004189523183592, "eval_loss": 0.8695836310789465, "eval_mse_loss": 0.8695836310789465, "eval_runtime": 51.0932, "eval_samples_per_second": 547.881, "eval_steps_per_second": 8.573, "flow/cos_sim": 0.699581062548781, "flow/improvement_ratio": 0.47203471392529195, "flow/mag_ratio_mean": 0.6982838295093955, "flow/mag_ratio_std": 0.20910314640634137, "step": 11264 }, { "epoch": 0.5320770403214632, "grad_norm": 0.3345041275024414, "learning_rate": 0.00046766146455737116, "loss": 0.9016320109367371, "step": 11520 }, { "epoch": 0.5439009745508291, "grad_norm": 0.47475534677505493, "learning_rate": 0.00044870643259007823, "loss": 0.9061211347579956, "step": 11776 }, { "epoch": 0.5557249087801949, "grad_norm": 0.6051743030548096, "learning_rate": 0.000429825637777245, "loss": 0.903529167175293, "step": 12032 }, { "epoch": 0.5675488430095608, "grad_norm": 0.39291107654571533, "learning_rate": 0.0004111195298279063, "loss": 0.8996114730834961, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_cos_loss": 0.29944087231540245, "eval_loss": 0.8666894671307307, "eval_mse_loss": 0.8666894671307307, "flow/cos_sim": 0.700559140476462, "flow/improvement_ratio": 0.4712014450058001, "flow/mag_ratio_mean": 0.6962402411247497, "flow/mag_ratio_std": 0.20598308850913288, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_cos_loss": 0.29944087231540245, "eval_loss": 0.8666894671307307, "eval_mse_loss": 0.8666894671307307, "eval_runtime": 50.9516, "eval_samples_per_second": 549.403, "eval_steps_per_second": 8.596, "flow/cos_sim": 0.700559140476462, "flow/improvement_ratio": 0.4712014450058001, "flow/mag_ratio_mean": 0.6962402411247497, "flow/mag_ratio_std": 0.20598308850913288, "step": 12288 }, { "epoch": 0.5793727772389267, "grad_norm": 0.38517266511917114, "learning_rate": 0.0003924684852772684, "loss": 0.9009869694709778, "step": 12544 }, { "epoch": 0.5911967114682924, "grad_norm": 0.3089086413383484, "learning_rate": 0.00037404498123644504, "loss": 0.9021767377853394, "step": 12800 }, { "epoch": 0.6030206456976583, "grad_norm": 0.5309678912162781, "learning_rate": 0.0003557312014609784, "loss": 0.9018325209617615, "step": 13056 }, { "epoch": 0.6148445799270241, "grad_norm": 0.36101001501083374, "learning_rate": 0.00033762622184548686, "loss": 0.8969551920890808, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_cos_loss": 0.30058121572346447, "eval_loss": 0.8693417448159222, "eval_mse_loss": 0.8693417448159222, "flow/cos_sim": 0.6994188143510253, "flow/improvement_ratio": 0.4627683243658989, "flow/mag_ratio_mean": 0.7132113126859273, "flow/mag_ratio_std": 0.21025482180728217, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_cos_loss": 0.30058121572346447, "eval_loss": 0.8693417448159222, "eval_mse_loss": 0.8693417448159222, "eval_runtime": 50.5293, "eval_samples_per_second": 553.996, "eval_steps_per_second": 8.668, "flow/cos_sim": 0.6994188143510253, "flow/improvement_ratio": 0.4627683243658989, "flow/mag_ratio_mean": 0.7132113126859273, "flow/mag_ratio_std": 0.21025482180728217, "step": 13312 }, { "epoch": 0.62666851415639, "grad_norm": 0.4100685119628906, "learning_rate": 0.00031975624571791317, "loss": 0.8950372934341431, "step": 13568 }, { "epoch": 0.6384924483857558, "grad_norm": 0.2832315266132355, "learning_rate": 0.00030214713628576984, "loss": 0.8969645500183105, "step": 13824 }, { "epoch": 0.6503163826151217, "grad_norm": 0.39231061935424805, "learning_rate": 0.00028482437920433144, "loss": 0.8969537019729614, "step": 14080 }, { "epoch": 0.6621403168444876, "grad_norm": 0.7917575240135193, "learning_rate": 0.00026781304569125866, "loss": 0.8955793976783752, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_cos_loss": 0.29705161533263175, "eval_loss": 0.8606252675731433, "eval_mse_loss": 0.8606252675731433, "flow/cos_sim": 0.7029484053180642, "flow/improvement_ratio": 0.46662241846459096, "flow/mag_ratio_mean": 0.7076406654429762, "flow/mag_ratio_std": 0.20891345561095023, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_cos_loss": 0.29705161533263175, "eval_loss": 0.8606252675731433, "eval_mse_loss": 0.8606252675731433, "eval_runtime": 50.7312, "eval_samples_per_second": 551.791, "eval_steps_per_second": 8.634, "flow/cos_sim": 0.7029484053180642, "flow/improvement_ratio": 0.46662241846459096, "flow/mag_ratio_mean": 0.7076406654429762, "flow/mag_ratio_std": 0.20891345561095023, "step": 14336 }, { "epoch": 0.6739642510738534, "grad_norm": 0.2073170691728592, "learning_rate": 0.0002511377562410373, "loss": 0.8957792520523071, "step": 14592 }, { "epoch": 0.6857881853032193, "grad_norm": 0.6534483432769775, "learning_rate": 0.00023482264499174915, "loss": 0.8970192074775696, "step": 14848 }, { "epoch": 0.6976121195325851, "grad_norm": 0.24640800058841705, "learning_rate": 0.00021889132479574586, "loss": 0.8956112861633301, "step": 15104 }, { "epoch": 0.709436053761951, "grad_norm": 0.35241010785102844, "learning_rate": 0.0002033668530447801, "loss": 0.8931179642677307, "step": 15360 }, { "epoch": 0.709436053761951, "eval_cos_loss": 0.3005856423889665, "eval_loss": 0.8699537216256198, "eval_mse_loss": 0.8699537216256198, "flow/cos_sim": 0.6994143777514157, "flow/improvement_ratio": 0.47257714858066, "flow/mag_ratio_mean": 0.7026257012804894, "flow/mag_ratio_std": 0.20547213071965736, "step": 15360 }, { "epoch": 0.709436053761951, "eval_cos_loss": 0.3005856423889665, "eval_loss": 0.8699537216256198, "eval_mse_loss": 0.8699537216256198, "eval_runtime": 50.6626, "eval_samples_per_second": 552.538, "eval_steps_per_second": 8.645, "flow/cos_sim": 0.6994143777514157, "flow/improvement_ratio": 0.47257714858066, "flow/mag_ratio_mean": 0.7026257012804894, "flow/mag_ratio_std": 0.20547213071965736, "step": 15360 }, { "epoch": 0.7212599879913169, "grad_norm": 0.4916954040527344, "learning_rate": 0.0001882716982990524, "loss": 0.8969737887382507, "step": 15616 }, { "epoch": 0.7330839222206826, "grad_norm": 0.5466639399528503, "learning_rate": 0.00017362770776847765, "loss": 0.8959704637527466, "step": 15872 }, { "epoch": 0.7449078564500485, "grad_norm": 0.45104148983955383, "learning_rate": 0.00015951048788154866, "loss": 0.8920841813087463, "step": 16128 }, { "epoch": 0.7567317906794143, "grad_norm": 0.23488114774227142, "learning_rate": 0.0001458297605985633, "loss": 0.8922313451766968, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_cos_loss": 0.2997721447928311, "eval_loss": 0.8672782906658574, "eval_mse_loss": 0.8672782906658574, "flow/cos_sim": 0.7002278570442984, "flow/improvement_ratio": 0.4662386941719273, "flow/mag_ratio_mean": 0.7100178600718442, "flow/mag_ratio_std": 0.20784400301436856, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_cos_loss": 0.2997721447928311, "eval_loss": 0.8672782906658574, "eval_mse_loss": 0.8672782906658574, "eval_runtime": 50.9632, "eval_samples_per_second": 549.278, "eval_steps_per_second": 8.594, "flow/cos_sim": 0.7002278570442984, "flow/improvement_ratio": 0.4662386941719273, "flow/mag_ratio_mean": 0.7100178600718442, "flow/mag_ratio_std": 0.20784400301436856, "step": 16384 }, { "epoch": 0.7685557249087802, "grad_norm": 0.25124895572662354, "learning_rate": 0.00013266162372593872, "loss": 0.8903353810310364, "step": 16640 }, { "epoch": 0.780379659138146, "grad_norm": 0.2108362913131714, "learning_rate": 0.00012002513550158511, "loss": 0.8916131854057312, "step": 16896 }, { "epoch": 0.7922035933675119, "grad_norm": 0.22269435226917267, "learning_rate": 0.00010793858470858986, "loss": 0.8932616114616394, "step": 17152 }, { "epoch": 0.8040275275968778, "grad_norm": 0.2587853968143463, "learning_rate": 9.641946420587128e-05, "loss": 0.8924716114997864, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_cos_loss": 0.29718101851335943, "eval_loss": 0.8611624605579463, "eval_mse_loss": 0.8611624605579463, "flow/cos_sim": 0.7028190088054361, "flow/improvement_ratio": 0.465975399158861, "flow/mag_ratio_mean": 0.7077710173990084, "flow/mag_ratio_std": 0.20818141616506664, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_cos_loss": 0.29718101851335943, "eval_loss": 0.8611624605579463, "eval_mse_loss": 0.8611624605579463, "eval_runtime": 50.5279, "eval_samples_per_second": 554.011, "eval_steps_per_second": 8.668, "flow/cos_sim": 0.7028190088054361, "flow/improvement_ratio": 0.465975399158861, "flow/mag_ratio_mean": 0.7077710173990084, "flow/mag_ratio_std": 0.20818141616506664, "step": 17408 }, { "epoch": 0.8158514618262436, "grad_norm": 0.22211408615112305, "learning_rate": 8.548444561077174e-05, "loss": 0.8929438591003418, "step": 17664 }, { "epoch": 0.8276753960556095, "grad_norm": 0.16869360208511353, "learning_rate": 7.514935517023558e-05, "loss": 0.8896693587303162, "step": 17920 }, { "epoch": 0.8394993302849753, "grad_norm": 0.2596553862094879, "learning_rate": 6.542915085548828e-05, "loss": 0.8902697563171387, "step": 18176 }, { "epoch": 0.8513232645143411, "grad_norm": 0.23781056702136993, "learning_rate": 5.6337900713373745e-05, "loss": 0.8896790742874146, "step": 18432 }, { "epoch": 0.8513232645143411, "eval_cos_loss": 0.29792979842723777, "eval_loss": 0.8626222945239446, "eval_mse_loss": 0.8626222945239446, "flow/cos_sim": 0.7020702216451027, "flow/improvement_ratio": 0.4633185115034722, "flow/mag_ratio_mean": 0.7091497124330094, "flow/mag_ratio_std": 0.20735928929967967, "step": 18432 }, { "epoch": 0.8513232645143411, "eval_cos_loss": 0.29792979842723777, "eval_loss": 0.8626222945239446, "eval_mse_loss": 0.8626222945239446, "eval_runtime": 50.6008, "eval_samples_per_second": 553.212, "eval_steps_per_second": 8.656, "flow/cos_sim": 0.7020702216451027, "flow/improvement_ratio": 0.4633185115034722, "flow/mag_ratio_mean": 0.7091497124330094, "flow/mag_ratio_std": 0.20735928929967967, "step": 18432 }, { "epoch": 0.8631471987437069, "grad_norm": 0.24920311570167542, "learning_rate": 4.78887625056757e-05, "loss": 0.8902055621147156, "step": 18688 }, { "epoch": 0.8749711329730728, "grad_norm": 0.29849973320961, "learning_rate": 4.009396466589682e-05, "loss": 0.8859462141990662, "step": 18944 }, { "epoch": 0.8867950672024387, "grad_norm": 0.2714053690433502, "learning_rate": 3.296478860105229e-05, "loss": 0.890446662902832, "step": 19200 }, { "epoch": 0.8986190014318045, "grad_norm": 0.3177722096443176, "learning_rate": 2.6511552364095358e-05, "loss": 0.8924580216407776, "step": 19456 }, { "epoch": 0.8986190014318045, "eval_cos_loss": 0.2985236220504051, "eval_loss": 0.8639824096470663, "eval_mse_loss": 0.8639824096470663, "flow/cos_sim": 0.7014764100993605, "flow/improvement_ratio": 0.46989300261893774, "flow/mag_ratio_mean": 0.7079281760677355, "flow/mag_ratio_std": 0.20886976322899126, "step": 19456 }, { "epoch": 0.8986190014318045, "eval_cos_loss": 0.2985236220504051, "eval_loss": 0.8639824096470663, "eval_mse_loss": 0.8639824096470663, "eval_runtime": 51.0434, "eval_samples_per_second": 548.416, "eval_steps_per_second": 8.581, "flow/cos_sim": 0.7014764100993605, "flow/improvement_ratio": 0.46989300261893774, "flow/mag_ratio_mean": 0.7079281760677355, "flow/mag_ratio_std": 0.20886976322899126, "step": 19456 }, { "epoch": 0.9104429356611704, "grad_norm": 0.2206149697303772, "learning_rate": 2.074359572060447e-05, "loss": 0.888733446598053, "step": 19712 }, { "epoch": 0.9222668698905362, "grad_norm": 0.1809428334236145, "learning_rate": 1.5669266631345104e-05, "loss": 0.8881487250328064, "step": 19968 }, { "epoch": 0.9340908041199021, "grad_norm": 0.16622242331504822, "learning_rate": 1.1311620335770879e-05, "loss": 0.8930599689483643, "step": 20224 }, { "epoch": 0.945914738349268, "grad_norm": 0.25432291626930237, "learning_rate": 7.642790173984836e-06, "loss": 0.8947161436080933, "step": 20480 }, { "epoch": 0.945914738349268, "eval_cos_loss": 0.2954880066862389, "eval_loss": 0.8570421781714104, "eval_mse_loss": 0.8570421781714104, "flow/cos_sim": 0.7045120175026324, "flow/improvement_ratio": 0.4633371506107452, "flow/mag_ratio_mean": 0.7112098344384807, "flow/mag_ratio_std": 0.20906852869546577, "step": 20480 }, { "epoch": 0.945914738349268, "eval_cos_loss": 0.2954880066862389, "eval_loss": 0.8570421781714104, "eval_mse_loss": 0.8570421781714104, "eval_runtime": 50.6225, "eval_samples_per_second": 552.976, "eval_steps_per_second": 8.652, "flow/cos_sim": 0.7045120175026324, "flow/improvement_ratio": 0.4633371506107452, "flow/mag_ratio_mean": 0.7112098344384807, "flow/mag_ratio_std": 0.20906852869546577, "step": 20480 }, { "epoch": 0.9577386725786338, "grad_norm": 0.21038714051246643, "learning_rate": 4.686548355746001e-06, "loss": 0.8879244923591614, "step": 20736 }, { "epoch": 0.9695626068079997, "grad_norm": 0.24928613007068634, "learning_rate": 2.447173448359541e-06, "loss": 0.889561653137207, "step": 20992 }, { "epoch": 0.9813865410373654, "grad_norm": 0.21087926626205444, "learning_rate": 9.279064980089457e-07, "loss": 0.8902660608291626, "step": 21248 }, { "epoch": 0.9932104752667313, "grad_norm": 0.18456599116325378, "learning_rate": 1.3094633899163322e-07, "loss": 0.8902599811553955, "step": 21504 }, { "epoch": 0.9932104752667313, "eval_cos_loss": 0.29621158380350565, "eval_loss": 0.858656403544831, "eval_mse_loss": 0.858656403544831, "flow/cos_sim": 0.7037884391606126, "flow/improvement_ratio": 0.47204427734085413, "flow/mag_ratio_mean": 0.7089065290205011, "flow/mag_ratio_std": 0.20877777332584607, "step": 21504 }, { "epoch": 0.9932104752667313, "eval_cos_loss": 0.29621158380350565, "eval_loss": 0.858656403544831, "eval_mse_loss": 0.858656403544831, "eval_runtime": 50.8696, "eval_samples_per_second": 550.289, "eval_steps_per_second": 8.61, "flow/cos_sim": 0.7037884391606126, "flow/improvement_ratio": 0.47204427734085413, "flow/mag_ratio_mean": 0.7089065290205011, "flow/mag_ratio_std": 0.20877777332584607, "step": 21504 } ], "logging_steps": 256, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }