| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9932104752667313, |
| "eval_steps": 1024, |
| "global_step": 21504, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011823934229365849, |
| "grad_norm": 0.39305946230888367, |
| "learning_rate": 0.000498046875, |
| "loss": 1.780936598777771, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.023647868458731697, |
| "grad_norm": 0.4956166744232178, |
| "learning_rate": 0.000998046875, |
| "loss": 1.21787691116333, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.03547180268809755, |
| "grad_norm": 0.46151286363601685, |
| "learning_rate": 0.000999640996023194, |
| "loss": 1.1046085357666016, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "grad_norm": 0.4608488082885742, |
| "learning_rate": 0.0009985588674043958, |
| "loss": 1.0648458003997803, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.3708630034096165, |
| "eval_loss": 1.0313464179975258, |
| "eval_mse_loss": 1.0313464179975258, |
| "flow/cos_sim": 0.6291370091100806, |
| "flow/improvement_ratio": 0.45305174915757895, |
| "flow/mag_ratio_mean": 0.636674555197154, |
| "flow/mag_ratio_std": 0.21184622356880745, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.3708630034096165, |
| "eval_loss": 1.0313464179975258, |
| "eval_mse_loss": 1.0313464179975258, |
| "eval_runtime": 51.0215, |
| "eval_samples_per_second": 548.652, |
| "eval_steps_per_second": 8.585, |
| "flow/cos_sim": 0.6291370091100806, |
| "flow/improvement_ratio": 0.45305174915757895, |
| "flow/mag_ratio_mean": 0.636674555197154, |
| "flow/mag_ratio_std": 0.21184622356880745, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.05911967114682925, |
| "grad_norm": 0.42826929688453674, |
| "learning_rate": 0.0009967551747861387, |
| "loss": 1.0448938608169556, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.0709436053761951, |
| "grad_norm": 0.44176557660102844, |
| "learning_rate": 0.000994232528651847, |
| "loss": 1.0278342962265015, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.08276753960556095, |
| "grad_norm": 0.4006131887435913, |
| "learning_rate": 0.0009909945800260092, |
| "loss": 1.0153039693832397, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "grad_norm": 0.5178146362304688, |
| "learning_rate": 0.0009870460151900522, |
| "loss": 1.0096927881240845, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.35073428046485605, |
| "eval_loss": 0.9796474619543172, |
| "eval_mse_loss": 0.9796474619543172, |
| "flow/cos_sim": 0.6492657340280542, |
| "flow/improvement_ratio": 0.45769495602067745, |
| "flow/mag_ratio_mean": 0.6580993892395333, |
| "flow/mag_ratio_std": 0.21975676535063138, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.35073428046485605, |
| "eval_loss": 0.9796474619543172, |
| "eval_mse_loss": 0.9796474619543172, |
| "eval_runtime": 50.9087, |
| "eval_samples_per_second": 549.867, |
| "eval_steps_per_second": 8.604, |
| "flow/cos_sim": 0.6492657340280542, |
| "flow/improvement_ratio": 0.45769495602067745, |
| "flow/mag_ratio_mean": 0.6580993892395333, |
| "flow/mag_ratio_std": 0.21975676535063138, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.10641540806429264, |
| "grad_norm": 0.4543623626232147, |
| "learning_rate": 0.0009823925488998885, |
| "loss": 1.0023009777069092, |
| "step": 2304 |
| }, |
| { |
| "epoch": 0.1182393422936585, |
| "grad_norm": 0.41527411341667175, |
| "learning_rate": 0.0009770409161149525, |
| "loss": 0.9987744092941284, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.13006327652302435, |
| "grad_norm": 0.37379226088523865, |
| "learning_rate": 0.0009709988622506973, |
| "loss": 0.9899244904518127, |
| "step": 2816 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "grad_norm": 0.4600990414619446, |
| "learning_rate": 0.000964275131968659, |
| "loss": 0.9836398959159851, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.33966573984383447, |
| "eval_loss": 0.9576488193039481, |
| "eval_mse_loss": 0.9576488193039481, |
| "flow/cos_sim": 0.6603342864883545, |
| "flow/improvement_ratio": 0.4705073194552774, |
| "flow/mag_ratio_mean": 0.6616723446269014, |
| "flow/mag_ratio_std": 0.2178211697976883, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.33966573984383447, |
| "eval_loss": 0.9576488193039481, |
| "eval_mse_loss": 0.9576488193039481, |
| "eval_runtime": 50.7658, |
| "eval_samples_per_second": 551.415, |
| "eval_steps_per_second": 8.628, |
| "flow/cos_sim": 0.6603342864883545, |
| "flow/improvement_ratio": 0.4705073194552774, |
| "flow/mag_ratio_mean": 0.6616723446269014, |
| "flow/mag_ratio_std": 0.2178211697976883, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.15371114498175603, |
| "grad_norm": 0.4591522812843323, |
| "learning_rate": 0.0009568794565203123, |
| "loss": 0.9778329133987427, |
| "step": 3328 |
| }, |
| { |
| "epoch": 0.1655350792111219, |
| "grad_norm": 0.506796658039093, |
| "learning_rate": 0.0009488225396630347, |
| "loss": 0.9751444458961487, |
| "step": 3584 |
| }, |
| { |
| "epoch": 0.17735901344048774, |
| "grad_norm": 0.6946936845779419, |
| "learning_rate": 0.0009401160421685646, |
| "loss": 0.9650535583496094, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "grad_norm": 0.5334352254867554, |
| "learning_rate": 0.0009307725649463714, |
| "loss": 0.968015193939209, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.3295682058350681, |
| "eval_loss": 0.9358410420210939, |
| "eval_mse_loss": 0.9358410420210939, |
| "flow/cos_sim": 0.6704318162785273, |
| "flow/improvement_ratio": 0.4610280344883601, |
| "flow/mag_ratio_mean": 0.6802759532514773, |
| "flow/mag_ratio_std": 0.21443206633198753, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.3295682058350681, |
| "eval_loss": 0.9358410420210939, |
| "eval_mse_loss": 0.9358410420210939, |
| "eval_runtime": 50.6775, |
| "eval_samples_per_second": 552.375, |
| "eval_steps_per_second": 8.643, |
| "flow/cos_sim": 0.6704318162785273, |
| "flow/improvement_ratio": 0.4610280344883601, |
| "flow/mag_ratio_mean": 0.6802759532514773, |
| "flow/mag_ratio_std": 0.21443206633198753, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.20100688189921945, |
| "grad_norm": 0.23779992759227753, |
| "learning_rate": 0.0009208056308063659, |
| "loss": 0.9618788957595825, |
| "step": 4352 |
| }, |
| { |
| "epoch": 0.2128308161285853, |
| "grad_norm": 0.3806602358818054, |
| "learning_rate": 0.0009102296648873445, |
| "loss": 0.9570742249488831, |
| "step": 4608 |
| }, |
| { |
| "epoch": 0.22465475035795113, |
| "grad_norm": 0.41401633620262146, |
| "learning_rate": 0.0008990599737794927, |
| "loss": 0.9550731778144836, |
| "step": 4864 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "grad_norm": 0.4475092589855194, |
| "learning_rate": 0.0008873127233711644, |
| "loss": 0.9473839998245239, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.31993362566107486, |
| "eval_loss": 0.9154947232710172, |
| "eval_mse_loss": 0.9154947232710172, |
| "flow/cos_sim": 0.6800663863142876, |
| "flow/improvement_ratio": 0.4631300759778175, |
| "flow/mag_ratio_mean": 0.6912006213240427, |
| "flow/mag_ratio_std": 0.2171582194155754, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.31993362566107486, |
| "eval_loss": 0.9154947232710172, |
| "eval_mse_loss": 0.9154947232710172, |
| "eval_runtime": 50.7141, |
| "eval_samples_per_second": 551.977, |
| "eval_steps_per_second": 8.637, |
| "flow/cos_sim": 0.6800663863142876, |
| "flow/improvement_ratio": 0.4631300759778175, |
| "flow/mag_ratio_mean": 0.6912006213240427, |
| "flow/mag_ratio_std": 0.2171582194155754, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.24830261881668284, |
| "grad_norm": 0.39829879999160767, |
| "learning_rate": 0.0008750049154520011, |
| "loss": 0.9461303949356079, |
| "step": 5376 |
| }, |
| { |
| "epoch": 0.2601265530460487, |
| "grad_norm": 0.570689857006073, |
| "learning_rate": 0.0008621543631062487, |
| "loss": 0.9459335803985596, |
| "step": 5632 |
| }, |
| { |
| "epoch": 0.27195048727541454, |
| "grad_norm": 0.49673837423324585, |
| "learning_rate": 0.0008487796649318904, |
| "loss": 0.9424968957901001, |
| "step": 5888 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "grad_norm": 0.267270028591156, |
| "learning_rate": 0.0008349001781229053, |
| "loss": 0.9410290122032166, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.31771982879671334, |
| "eval_loss": 0.9094011069160618, |
| "eval_mse_loss": 0.9094011069160618, |
| "flow/cos_sim": 0.682280182430189, |
| "flow/improvement_ratio": 0.46761273847867363, |
| "flow/mag_ratio_mean": 0.6880964620472634, |
| "flow/mag_ratio_std": 0.21481055140359218, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.31771982879671334, |
| "eval_loss": 0.9094011069160618, |
| "eval_mse_loss": 0.9094011069160618, |
| "eval_runtime": 51.0025, |
| "eval_samples_per_second": 548.856, |
| "eval_steps_per_second": 8.588, |
| "flow/cos_sim": 0.682280182430189, |
| "flow/improvement_ratio": 0.46761273847867363, |
| "flow/mag_ratio_mean": 0.6880964620472634, |
| "flow/mag_ratio_std": 0.21481055140359218, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2955983557341462, |
| "grad_norm": 0.43064209818840027, |
| "learning_rate": 0.0008205359904536107, |
| "loss": 0.9332870841026306, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.30742228996351206, |
| "grad_norm": 0.45424163341522217, |
| "learning_rate": 0.0008057078912056363, |
| "loss": 0.9331082105636597, |
| "step": 6656 |
| }, |
| { |
| "epoch": 0.3192462241928779, |
| "grad_norm": 0.6565593481063843, |
| "learning_rate": 0.0007904373410796086, |
| "loss": 0.9313054084777832, |
| "step": 6912 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "grad_norm": 0.5485164523124695, |
| "learning_rate": 0.0007747464411350876, |
| "loss": 0.9296150207519531, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.3111404624839896, |
| "eval_loss": 0.8942771547189042, |
| "eval_mse_loss": 0.8942771547189042, |
| "flow/cos_sim": 0.688859552281088, |
| "flow/improvement_ratio": 0.46996809215578317, |
| "flow/mag_ratio_mean": 0.6797864257472835, |
| "flow/mag_ratio_std": 0.21222736455125896, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.3111404624839896, |
| "eval_loss": 0.8942771547189042, |
| "eval_mse_loss": 0.8942771547189042, |
| "eval_runtime": 51.2111, |
| "eval_samples_per_second": 546.619, |
| "eval_steps_per_second": 8.553, |
| "flow/cos_sim": 0.688859552281088, |
| "flow/improvement_ratio": 0.46996809215578317, |
| "flow/mag_ratio_mean": 0.6797864257472835, |
| "flow/mag_ratio_std": 0.21222736455125896, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.34289409265160964, |
| "grad_norm": 0.3187243938446045, |
| "learning_rate": 0.000758657900803716, |
| "loss": 0.928747832775116, |
| "step": 7424 |
| }, |
| { |
| "epoch": 0.3547180268809755, |
| "grad_norm": 0.3337271213531494, |
| "learning_rate": 0.000742195005021869, |
| "loss": 0.925912082195282, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.3665419611103413, |
| "grad_norm": 0.5179343223571777, |
| "learning_rate": 0.0007253815805303786, |
| "loss": 0.923928439617157, |
| "step": 7936 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "grad_norm": 0.3594360053539276, |
| "learning_rate": 0.000708309515673374, |
| "loss": 0.9226717948913574, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.3082785013059503, |
| "eval_loss": 0.8866745081122063, |
| "eval_mse_loss": 0.8866745081122063, |
| "flow/cos_sim": 0.6917215108054958, |
| "flow/improvement_ratio": 0.46622252083260174, |
| "flow/mag_ratio_mean": 0.6977443612057325, |
| "flow/mag_ratio_std": 0.21176587402412336, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.3082785013059503, |
| "eval_loss": 0.8866745081122063, |
| "eval_mse_loss": 0.8866745081122063, |
| "eval_runtime": 50.453, |
| "eval_samples_per_second": 554.833, |
| "eval_steps_per_second": 8.681, |
| "flow/cos_sim": 0.6917215108054958, |
| "flow/improvement_ratio": 0.46622252083260174, |
| "flow/mag_ratio_mean": 0.6977443612057325, |
| "flow/mag_ratio_std": 0.21176587402412336, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.390189829569073, |
| "grad_norm": 0.2854546010494232, |
| "learning_rate": 0.0006908696365085842, |
| "loss": 0.9181273579597473, |
| "step": 8448 |
| }, |
| { |
| "epoch": 0.4020137637984389, |
| "grad_norm": 0.6743373870849609, |
| "learning_rate": 0.0006731535118143318, |
| "loss": 0.9195088148117065, |
| "step": 8704 |
| }, |
| { |
| "epoch": 0.41383769802780473, |
| "grad_norm": 0.31884175539016724, |
| "learning_rate": 0.0006551867821290267, |
| "loss": 0.9166384935379028, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "grad_norm": 0.377502977848053, |
| "learning_rate": 0.0006369954506915572, |
| "loss": 0.9148516654968262, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.3045589508803468, |
| "eval_loss": 0.8784834154664654, |
| "eval_mse_loss": 0.8784834154664654, |
| "flow/cos_sim": 0.6954410576929241, |
| "flow/improvement_ratio": 0.4660896160150772, |
| "flow/mag_ratio_mean": 0.6896047796288581, |
| "flow/mag_ratio_std": 0.20802477359363478, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.3045589508803468, |
| "eval_loss": 0.8784834154664654, |
| "eval_mse_loss": 0.8784834154664654, |
| "eval_runtime": 50.5191, |
| "eval_samples_per_second": 554.108, |
| "eval_steps_per_second": 8.67, |
| "flow/cos_sim": 0.6954410576929241, |
| "flow/improvement_ratio": 0.4660896160150772, |
| "flow/mag_ratio_mean": 0.6896047796288581, |
| "flow/mag_ratio_std": 0.20802477359363478, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4374855664865364, |
| "grad_norm": 0.2935723066329956, |
| "learning_rate": 0.0006186058458068149, |
| "loss": 0.9158189296722412, |
| "step": 9472 |
| }, |
| { |
| "epoch": 0.44930950071590225, |
| "grad_norm": 0.6002705097198486, |
| "learning_rate": 0.0006000445827407526, |
| "loss": 0.9137871265411377, |
| "step": 9728 |
| }, |
| { |
| "epoch": 0.4611334349452681, |
| "grad_norm": 0.3263113498687744, |
| "learning_rate": 0.0005813385252001232, |
| "loss": 0.9110487103462219, |
| "step": 9984 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "grad_norm": 0.3971521854400635, |
| "learning_rate": 0.0005625884706462481, |
| "loss": 0.9095574021339417, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.30255618235564125, |
| "eval_loss": 0.8742554381013461, |
| "eval_mse_loss": 0.8742554381013461, |
| "flow/cos_sim": 0.6974438190732373, |
| "flow/improvement_ratio": 0.46927081490760525, |
| "flow/mag_ratio_mean": 0.6958274860360306, |
| "flow/mag_ratio_std": 0.21019614664795191, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.30255618235564125, |
| "eval_loss": 0.8742554381013461, |
| "eval_mse_loss": 0.8742554381013461, |
| "eval_runtime": 51.3022, |
| "eval_samples_per_second": 545.649, |
| "eval_steps_per_second": 8.538, |
| "flow/cos_sim": 0.6974438190732373, |
| "flow/improvement_ratio": 0.46927081490760525, |
| "flow/mag_ratio_mean": 0.6958274860360306, |
| "flow/mag_ratio_std": 0.21019614664795191, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.48478130340399983, |
| "grad_norm": 0.3551290035247803, |
| "learning_rate": 0.0005436745145807214, |
| "loss": 0.9060850143432617, |
| "step": 10496 |
| }, |
| { |
| "epoch": 0.49660523763336567, |
| "grad_norm": 0.4094627797603607, |
| "learning_rate": 0.0005246973484120841, |
| "loss": 0.9049650430679321, |
| "step": 10752 |
| }, |
| { |
| "epoch": 0.5084291718627315, |
| "grad_norm": 0.4399157762527466, |
| "learning_rate": 0.0005056844377834413, |
| "loss": 0.9042121767997742, |
| "step": 11008 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "grad_norm": 0.5329852104187012, |
| "learning_rate": 0.0004866633000708374, |
| "loss": 0.9038464426994324, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.3004189523183592, |
| "eval_loss": 0.8695836310789465, |
| "eval_mse_loss": 0.8695836310789465, |
| "flow/cos_sim": 0.699581062548781, |
| "flow/improvement_ratio": 0.47203471392529195, |
| "flow/mag_ratio_mean": 0.6982838295093955, |
| "flow/mag_ratio_std": 0.20910314640634137, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.3004189523183592, |
| "eval_loss": 0.8695836310789465, |
| "eval_mse_loss": 0.8695836310789465, |
| "eval_runtime": 51.0932, |
| "eval_samples_per_second": 547.881, |
| "eval_steps_per_second": 8.573, |
| "flow/cos_sim": 0.699581062548781, |
| "flow/improvement_ratio": 0.47203471392529195, |
| "flow/mag_ratio_mean": 0.6982838295093955, |
| "flow/mag_ratio_std": 0.20910314640634137, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5320770403214632, |
| "grad_norm": 0.3345041275024414, |
| "learning_rate": 0.00046766146455737116, |
| "loss": 0.9016320109367371, |
| "step": 11520 |
| }, |
| { |
| "epoch": 0.5439009745508291, |
| "grad_norm": 0.47475534677505493, |
| "learning_rate": 0.00044870643259007823, |
| "loss": 0.9061211347579956, |
| "step": 11776 |
| }, |
| { |
| "epoch": 0.5557249087801949, |
| "grad_norm": 0.6051743030548096, |
| "learning_rate": 0.000429825637777245, |
| "loss": 0.903529167175293, |
| "step": 12032 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "grad_norm": 0.39291107654571533, |
| "learning_rate": 0.0004111195298279063, |
| "loss": 0.8996114730834961, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.29944087231540245, |
| "eval_loss": 0.8666894671307307, |
| "eval_mse_loss": 0.8666894671307307, |
| "flow/cos_sim": 0.700559140476462, |
| "flow/improvement_ratio": 0.4712014450058001, |
| "flow/mag_ratio_mean": 0.6962402411247497, |
| "flow/mag_ratio_std": 0.20598308850913288, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.29944087231540245, |
| "eval_loss": 0.8666894671307307, |
| "eval_mse_loss": 0.8666894671307307, |
| "eval_runtime": 50.9516, |
| "eval_samples_per_second": 549.403, |
| "eval_steps_per_second": 8.596, |
| "flow/cos_sim": 0.700559140476462, |
| "flow/improvement_ratio": 0.4712014450058001, |
| "flow/mag_ratio_mean": 0.6962402411247497, |
| "flow/mag_ratio_std": 0.20598308850913288, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5793727772389267, |
| "grad_norm": 0.38517266511917114, |
| "learning_rate": 0.0003924684852772684, |
| "loss": 0.9009869694709778, |
| "step": 12544 |
| }, |
| { |
| "epoch": 0.5911967114682924, |
| "grad_norm": 0.3089086413383484, |
| "learning_rate": 0.00037404498123644504, |
| "loss": 0.9021767377853394, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.6030206456976583, |
| "grad_norm": 0.5309678912162781, |
| "learning_rate": 0.0003557312014609784, |
| "loss": 0.9018325209617615, |
| "step": 13056 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "grad_norm": 0.36101001501083374, |
| "learning_rate": 0.00033762622184548686, |
| "loss": 0.8969551920890808, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.30058121572346447, |
| "eval_loss": 0.8693417448159222, |
| "eval_mse_loss": 0.8693417448159222, |
| "flow/cos_sim": 0.6994188143510253, |
| "flow/improvement_ratio": 0.4627683243658989, |
| "flow/mag_ratio_mean": 0.7132113126859273, |
| "flow/mag_ratio_std": 0.21025482180728217, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.30058121572346447, |
| "eval_loss": 0.8693417448159222, |
| "eval_mse_loss": 0.8693417448159222, |
| "eval_runtime": 50.5293, |
| "eval_samples_per_second": 553.996, |
| "eval_steps_per_second": 8.668, |
| "flow/cos_sim": 0.6994188143510253, |
| "flow/improvement_ratio": 0.4627683243658989, |
| "flow/mag_ratio_mean": 0.7132113126859273, |
| "flow/mag_ratio_std": 0.21025482180728217, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.62666851415639, |
| "grad_norm": 0.4100685119628906, |
| "learning_rate": 0.00031975624571791317, |
| "loss": 0.8950372934341431, |
| "step": 13568 |
| }, |
| { |
| "epoch": 0.6384924483857558, |
| "grad_norm": 0.2832315266132355, |
| "learning_rate": 0.00030214713628576984, |
| "loss": 0.8969645500183105, |
| "step": 13824 |
| }, |
| { |
| "epoch": 0.6503163826151217, |
| "grad_norm": 0.39231061935424805, |
| "learning_rate": 0.00028482437920433144, |
| "loss": 0.8969537019729614, |
| "step": 14080 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "grad_norm": 0.7917575240135193, |
| "learning_rate": 0.00026781304569125866, |
| "loss": 0.8955793976783752, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.29705161533263175, |
| "eval_loss": 0.8606252675731433, |
| "eval_mse_loss": 0.8606252675731433, |
| "flow/cos_sim": 0.7029484053180642, |
| "flow/improvement_ratio": 0.46662241846459096, |
| "flow/mag_ratio_mean": 0.7076406654429762, |
| "flow/mag_ratio_std": 0.20891345561095023, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.29705161533263175, |
| "eval_loss": 0.8606252675731433, |
| "eval_mse_loss": 0.8606252675731433, |
| "eval_runtime": 50.7312, |
| "eval_samples_per_second": 551.791, |
| "eval_steps_per_second": 8.634, |
| "flow/cos_sim": 0.7029484053180642, |
| "flow/improvement_ratio": 0.46662241846459096, |
| "flow/mag_ratio_mean": 0.7076406654429762, |
| "flow/mag_ratio_std": 0.20891345561095023, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6739642510738534, |
| "grad_norm": 0.2073170691728592, |
| "learning_rate": 0.0002511377562410373, |
| "loss": 0.8957792520523071, |
| "step": 14592 |
| }, |
| { |
| "epoch": 0.6857881853032193, |
| "grad_norm": 0.6534483432769775, |
| "learning_rate": 0.00023482264499174915, |
| "loss": 0.8970192074775696, |
| "step": 14848 |
| }, |
| { |
| "epoch": 0.6976121195325851, |
| "grad_norm": 0.24640800058841705, |
| "learning_rate": 0.00021889132479574586, |
| "loss": 0.8956112861633301, |
| "step": 15104 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "grad_norm": 0.35241010785102844, |
| "learning_rate": 0.0002033668530447801, |
| "loss": 0.8931179642677307, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.3005856423889665, |
| "eval_loss": 0.8699537216256198, |
| "eval_mse_loss": 0.8699537216256198, |
| "flow/cos_sim": 0.6994143777514157, |
| "flow/improvement_ratio": 0.47257714858066, |
| "flow/mag_ratio_mean": 0.7026257012804894, |
| "flow/mag_ratio_std": 0.20547213071965736, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.3005856423889665, |
| "eval_loss": 0.8699537216256198, |
| "eval_mse_loss": 0.8699537216256198, |
| "eval_runtime": 50.6626, |
| "eval_samples_per_second": 552.538, |
| "eval_steps_per_second": 8.645, |
| "flow/cos_sim": 0.6994143777514157, |
| "flow/improvement_ratio": 0.47257714858066, |
| "flow/mag_ratio_mean": 0.7026257012804894, |
| "flow/mag_ratio_std": 0.20547213071965736, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.7212599879913169, |
| "grad_norm": 0.4916954040527344, |
| "learning_rate": 0.0001882716982990524, |
| "loss": 0.8969737887382507, |
| "step": 15616 |
| }, |
| { |
| "epoch": 0.7330839222206826, |
| "grad_norm": 0.5466639399528503, |
| "learning_rate": 0.00017362770776847765, |
| "loss": 0.8959704637527466, |
| "step": 15872 |
| }, |
| { |
| "epoch": 0.7449078564500485, |
| "grad_norm": 0.45104148983955383, |
| "learning_rate": 0.00015951048788154866, |
| "loss": 0.8920841813087463, |
| "step": 16128 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "grad_norm": 0.23488114774227142, |
| "learning_rate": 0.0001458297605985633, |
| "loss": 0.8922313451766968, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.2997721447928311, |
| "eval_loss": 0.8672782906658574, |
| "eval_mse_loss": 0.8672782906658574, |
| "flow/cos_sim": 0.7002278570442984, |
| "flow/improvement_ratio": 0.4662386941719273, |
| "flow/mag_ratio_mean": 0.7100178600718442, |
| "flow/mag_ratio_std": 0.20784400301436856, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.2997721447928311, |
| "eval_loss": 0.8672782906658574, |
| "eval_mse_loss": 0.8672782906658574, |
| "eval_runtime": 50.9632, |
| "eval_samples_per_second": 549.278, |
| "eval_steps_per_second": 8.594, |
| "flow/cos_sim": 0.7002278570442984, |
| "flow/improvement_ratio": 0.4662386941719273, |
| "flow/mag_ratio_mean": 0.7100178600718442, |
| "flow/mag_ratio_std": 0.20784400301436856, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7685557249087802, |
| "grad_norm": 0.25124895572662354, |
| "learning_rate": 0.00013266162372593872, |
| "loss": 0.8903353810310364, |
| "step": 16640 |
| }, |
| { |
| "epoch": 0.780379659138146, |
| "grad_norm": 0.2108362913131714, |
| "learning_rate": 0.00012002513550158511, |
| "loss": 0.8916131854057312, |
| "step": 16896 |
| }, |
| { |
| "epoch": 0.7922035933675119, |
| "grad_norm": 0.22269435226917267, |
| "learning_rate": 0.00010793858470858986, |
| "loss": 0.8932616114616394, |
| "step": 17152 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "grad_norm": 0.2587853968143463, |
| "learning_rate": 9.641946420587128e-05, |
| "loss": 0.8924716114997864, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.29718101851335943, |
| "eval_loss": 0.8611624605579463, |
| "eval_mse_loss": 0.8611624605579463, |
| "flow/cos_sim": 0.7028190088054361, |
| "flow/improvement_ratio": 0.465975399158861, |
| "flow/mag_ratio_mean": 0.7077710173990084, |
| "flow/mag_ratio_std": 0.20818141616506664, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.29718101851335943, |
| "eval_loss": 0.8611624605579463, |
| "eval_mse_loss": 0.8611624605579463, |
| "eval_runtime": 50.5279, |
| "eval_samples_per_second": 554.011, |
| "eval_steps_per_second": 8.668, |
| "flow/cos_sim": 0.7028190088054361, |
| "flow/improvement_ratio": 0.465975399158861, |
| "flow/mag_ratio_mean": 0.7077710173990084, |
| "flow/mag_ratio_std": 0.20818141616506664, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8158514618262436, |
| "grad_norm": 0.22211408615112305, |
| "learning_rate": 8.548444561077174e-05, |
| "loss": 0.8929438591003418, |
| "step": 17664 |
| }, |
| { |
| "epoch": 0.8276753960556095, |
| "grad_norm": 0.16869360208511353, |
| "learning_rate": 7.514935517023558e-05, |
| "loss": 0.8896693587303162, |
| "step": 17920 |
| }, |
| { |
| "epoch": 0.8394993302849753, |
| "grad_norm": 0.2596553862094879, |
| "learning_rate": 6.542915085548828e-05, |
| "loss": 0.8902697563171387, |
| "step": 18176 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "grad_norm": 0.23781056702136993, |
| "learning_rate": 5.6337900713373745e-05, |
| "loss": 0.8896790742874146, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.29792979842723777, |
| "eval_loss": 0.8626222945239446, |
| "eval_mse_loss": 0.8626222945239446, |
| "flow/cos_sim": 0.7020702216451027, |
| "flow/improvement_ratio": 0.4633185115034722, |
| "flow/mag_ratio_mean": 0.7091497124330094, |
| "flow/mag_ratio_std": 0.20735928929967967, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.29792979842723777, |
| "eval_loss": 0.8626222945239446, |
| "eval_mse_loss": 0.8626222945239446, |
| "eval_runtime": 50.6008, |
| "eval_samples_per_second": 553.212, |
| "eval_steps_per_second": 8.656, |
| "flow/cos_sim": 0.7020702216451027, |
| "flow/improvement_ratio": 0.4633185115034722, |
| "flow/mag_ratio_mean": 0.7091497124330094, |
| "flow/mag_ratio_std": 0.20735928929967967, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8631471987437069, |
| "grad_norm": 0.24920311570167542, |
| "learning_rate": 4.78887625056757e-05, |
| "loss": 0.8902055621147156, |
| "step": 18688 |
| }, |
| { |
| "epoch": 0.8749711329730728, |
| "grad_norm": 0.29849973320961, |
| "learning_rate": 4.009396466589682e-05, |
| "loss": 0.8859462141990662, |
| "step": 18944 |
| }, |
| { |
| "epoch": 0.8867950672024387, |
| "grad_norm": 0.2714053690433502, |
| "learning_rate": 3.296478860105229e-05, |
| "loss": 0.890446662902832, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "grad_norm": 0.3177722096443176, |
| "learning_rate": 2.6511552364095358e-05, |
| "loss": 0.8924580216407776, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.2985236220504051, |
| "eval_loss": 0.8639824096470663, |
| "eval_mse_loss": 0.8639824096470663, |
| "flow/cos_sim": 0.7014764100993605, |
| "flow/improvement_ratio": 0.46989300261893774, |
| "flow/mag_ratio_mean": 0.7079281760677355, |
| "flow/mag_ratio_std": 0.20886976322899126, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.2985236220504051, |
| "eval_loss": 0.8639824096470663, |
| "eval_mse_loss": 0.8639824096470663, |
| "eval_runtime": 51.0434, |
| "eval_samples_per_second": 548.416, |
| "eval_steps_per_second": 8.581, |
| "flow/cos_sim": 0.7014764100993605, |
| "flow/improvement_ratio": 0.46989300261893774, |
| "flow/mag_ratio_mean": 0.7079281760677355, |
| "flow/mag_ratio_std": 0.20886976322899126, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.9104429356611704, |
| "grad_norm": 0.2206149697303772, |
| "learning_rate": 2.074359572060447e-05, |
| "loss": 0.888733446598053, |
| "step": 19712 |
| }, |
| { |
| "epoch": 0.9222668698905362, |
| "grad_norm": 0.1809428334236145, |
| "learning_rate": 1.5669266631345104e-05, |
| "loss": 0.8881487250328064, |
| "step": 19968 |
| }, |
| { |
| "epoch": 0.9340908041199021, |
| "grad_norm": 0.16622242331504822, |
| "learning_rate": 1.1311620335770879e-05, |
| "loss": 0.8930599689483643, |
| "step": 20224 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "grad_norm": 0.25432291626930237, |
| "learning_rate": 7.642790173984836e-06, |
| "loss": 0.8947161436080933, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.2954880066862389, |
| "eval_loss": 0.8570421781714104, |
| "eval_mse_loss": 0.8570421781714104, |
| "flow/cos_sim": 0.7045120175026324, |
| "flow/improvement_ratio": 0.4633371506107452, |
| "flow/mag_ratio_mean": 0.7112098344384807, |
| "flow/mag_ratio_std": 0.20906852869546577, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.2954880066862389, |
| "eval_loss": 0.8570421781714104, |
| "eval_mse_loss": 0.8570421781714104, |
| "eval_runtime": 50.6225, |
| "eval_samples_per_second": 552.976, |
| "eval_steps_per_second": 8.652, |
| "flow/cos_sim": 0.7045120175026324, |
| "flow/improvement_ratio": 0.4633371506107452, |
| "flow/mag_ratio_mean": 0.7112098344384807, |
| "flow/mag_ratio_std": 0.20906852869546577, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.9577386725786338, |
| "grad_norm": 0.21038714051246643, |
| "learning_rate": 4.686548355746001e-06, |
| "loss": 0.8879244923591614, |
| "step": 20736 |
| }, |
| { |
| "epoch": 0.9695626068079997, |
| "grad_norm": 0.24928613007068634, |
| "learning_rate": 2.447173448359541e-06, |
| "loss": 0.889561653137207, |
| "step": 20992 |
| }, |
| { |
| "epoch": 0.9813865410373654, |
| "grad_norm": 0.21087926626205444, |
| "learning_rate": 9.279064980089457e-07, |
| "loss": 0.8902660608291626, |
| "step": 21248 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "grad_norm": 0.18456599116325378, |
| "learning_rate": 1.3094633899163322e-07, |
| "loss": 0.8902599811553955, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.29621158380350565, |
| "eval_loss": 0.858656403544831, |
| "eval_mse_loss": 0.858656403544831, |
| "flow/cos_sim": 0.7037884391606126, |
| "flow/improvement_ratio": 0.47204427734085413, |
| "flow/mag_ratio_mean": 0.7089065290205011, |
| "flow/mag_ratio_std": 0.20877777332584607, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.29621158380350565, |
| "eval_loss": 0.858656403544831, |
| "eval_mse_loss": 0.858656403544831, |
| "eval_runtime": 50.8696, |
| "eval_samples_per_second": 550.289, |
| "eval_steps_per_second": 8.61, |
| "flow/cos_sim": 0.7037884391606126, |
| "flow/improvement_ratio": 0.47204427734085413, |
| "flow/mag_ratio_mean": 0.7089065290205011, |
| "flow/mag_ratio_std": 0.20877777332584607, |
| "step": 21504 |
| } |
| ], |
| "logging_steps": 256, |
| "max_steps": 21651, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1024, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|