{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1024, "global_step": 21651, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011823934229365849, "grad_norm": 0.2415127456188202, "learning_rate": 0.000498046875, "loss": 1.734155297279358, "step": 256 }, { "epoch": 0.023647868458731697, "grad_norm": 0.200724259018898, "learning_rate": 0.000998046875, "loss": 1.1768943071365356, "step": 512 }, { "epoch": 0.03547180268809755, "grad_norm": 0.10385265946388245, "learning_rate": 0.000999640996023194, "loss": 1.0633362531661987, "step": 768 }, { "epoch": 0.047295736917463395, "grad_norm": 0.24572038650512695, "learning_rate": 0.0009985588674043958, "loss": 1.0212352275848389, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_cos_loss": 0.2939360714666375, "eval_loss": 0.9792764313416938, "eval_mse_loss": 0.9792764313416938, "flow/cos_sim": 0.706063949762414, "flow/improvement_ratio": 0.47937897269584273, "flow/mag_ratio_mean": 0.7043995116943638, "flow/mag_ratio_std": 0.14764773857579927, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_cos_loss": 0.2939360714666375, "eval_loss": 0.9792764313416938, "eval_mse_loss": 0.9792764313416938, "eval_runtime": 37.309, "eval_samples_per_second": 750.302, "eval_steps_per_second": 11.74, "flow/cos_sim": 0.706063949762414, "flow/improvement_ratio": 0.47937897269584273, "flow/mag_ratio_mean": 0.7043995116943638, "flow/mag_ratio_std": 0.14764773857579927, "step": 1024 }, { "epoch": 0.05911967114682925, "grad_norm": 0.14034895598888397, "learning_rate": 0.0009967551747861387, "loss": 0.9987254738807678, "step": 1280 }, { "epoch": 0.0709436053761951, "grad_norm": 0.32785671949386597, "learning_rate": 0.000994232528651847, "loss": 0.9810371398925781, "step": 1536 }, { "epoch": 0.08276753960556095, "grad_norm": 0.4202195107936859, "learning_rate": 0.0009909945800260092, "loss": 0.9669110774993896, "step": 1792 }, { "epoch": 0.09459147383492679, "grad_norm": 0.11297852545976639, "learning_rate": 0.0009870460151900522, "loss": 0.9570462107658386, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_cos_loss": 0.27135700569169163, "eval_loss": 0.9140212976769225, "eval_mse_loss": 0.9140212976769225, "flow/cos_sim": 0.7286430077465702, "flow/improvement_ratio": 0.4797877063081689, "flow/mag_ratio_mean": 0.7240678813631676, "flow/mag_ratio_std": 0.15831342356526143, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_cos_loss": 0.27135700569169163, "eval_loss": 0.9140212976769225, "eval_mse_loss": 0.9140212976769225, "eval_runtime": 37.3614, "eval_samples_per_second": 749.249, "eval_steps_per_second": 11.723, "flow/cos_sim": 0.7286430077465702, "flow/improvement_ratio": 0.4797877063081689, "flow/mag_ratio_mean": 0.7240678813631676, "flow/mag_ratio_std": 0.15831342356526143, "step": 2048 }, { "epoch": 0.10641540806429264, "grad_norm": 0.10684677958488464, "learning_rate": 0.0009823925488998885, "loss": 0.9458644986152649, "step": 2304 }, { "epoch": 0.1182393422936585, "grad_norm": 0.35598301887512207, "learning_rate": 0.0009770409161149525, "loss": 0.9369097948074341, "step": 2560 }, { "epoch": 0.13006327652302435, "grad_norm": 0.24151749908924103, "learning_rate": 0.0009709988622506973, "loss": 0.925268292427063, "step": 2816 }, { "epoch": 0.1418872107523902, "grad_norm": 0.17448143661022186, "learning_rate": 0.000964275131968659, "loss": 0.9168843030929565, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_cos_loss": 0.258329911795381, "eval_loss": 0.872469331303688, "eval_mse_loss": 0.872469331303688, "flow/cos_sim": 0.7416701187554016, "flow/improvement_ratio": 0.4866235294978913, "flow/mag_ratio_mean": 0.7309322730046973, "flow/mag_ratio_std": 0.16244563712104815, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_cos_loss": 0.258329911795381, "eval_loss": 0.872469331303688, "eval_mse_loss": 0.872469331303688, "eval_runtime": 37.367, "eval_samples_per_second": 749.136, "eval_steps_per_second": 11.722, "flow/cos_sim": 0.7416701187554016, "flow/improvement_ratio": 0.4866235294978913, "flow/mag_ratio_mean": 0.7309322730046973, "flow/mag_ratio_std": 0.16244563712104815, "step": 3072 }, { "epoch": 0.15371114498175603, "grad_norm": 0.16108064353466034, "learning_rate": 0.0009568794565203123, "loss": 0.91062992811203, "step": 3328 }, { "epoch": 0.1655350792111219, "grad_norm": 0.21124346554279327, "learning_rate": 0.0009488225396630347, "loss": 0.9045400023460388, "step": 3584 }, { "epoch": 0.17735901344048774, "grad_norm": 0.49332571029663086, "learning_rate": 0.0009401160421685646, "loss": 0.8949952721595764, "step": 3840 }, { "epoch": 0.18918294766985358, "grad_norm": 0.25588458776474, "learning_rate": 0.0009307725649463714, "loss": 0.8952076435089111, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_cos_loss": 0.24964532671181577, "eval_loss": 0.8440415833366516, "eval_mse_loss": 0.8440415833366516, "flow/cos_sim": 0.7503547062884727, "flow/improvement_ratio": 0.4770904905311593, "flow/mag_ratio_mean": 0.7401841025646418, "flow/mag_ratio_std": 0.1675400482533185, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_cos_loss": 0.24964532671181577, "eval_loss": 0.8440415833366516, "eval_mse_loss": 0.8440415833366516, "eval_runtime": 37.4493, "eval_samples_per_second": 747.491, "eval_steps_per_second": 11.696, "flow/cos_sim": 0.7503547062884727, "flow/improvement_ratio": 0.4770904905311593, "flow/mag_ratio_mean": 0.7401841025646418, "flow/mag_ratio_std": 0.1675400482533185, "step": 4096 }, { "epoch": 0.20100688189921945, "grad_norm": 0.37869253754615784, "learning_rate": 0.0009208056308063659, "loss": 0.8890768885612488, "step": 4352 }, { "epoch": 0.2128308161285853, "grad_norm": 0.9834415316581726, "learning_rate": 0.0009102296648873445, "loss": 0.8840116262435913, "step": 4608 }, { "epoch": 0.22465475035795113, "grad_norm": 1.289456844329834, "learning_rate": 0.0008990599737794927, "loss": 0.8811625838279724, "step": 4864 }, { "epoch": 0.236478684587317, "grad_norm": 0.24145644903182983, "learning_rate": 0.0008873127233711644, "loss": 0.8737959861755371, "step": 5120 }, { "epoch": 0.236478684587317, "eval_cos_loss": 0.24506433716375534, "eval_loss": 0.8307346953376787, "eval_mse_loss": 0.8307346953376787, "flow/cos_sim": 0.7549356581413582, "flow/improvement_ratio": 0.47620510081994477, "flow/mag_ratio_mean": 0.7473646888177689, "flow/mag_ratio_std": 0.16763514821251776, "step": 5120 }, { "epoch": 0.236478684587317, "eval_cos_loss": 0.24506433716375534, "eval_loss": 0.8307346953376787, "eval_mse_loss": 0.8307346953376787, "eval_runtime": 37.5063, "eval_samples_per_second": 746.354, "eval_steps_per_second": 11.678, "flow/cos_sim": 0.7549356581413582, "flow/improvement_ratio": 0.47620510081994477, "flow/mag_ratio_mean": 0.7473646888177689, "flow/mag_ratio_std": 0.16763514821251776, "step": 5120 }, { "epoch": 0.24830261881668284, "grad_norm": 1.039290428161621, "learning_rate": 0.0008750049154520011, "loss": 0.8717202544212341, "step": 5376 }, { "epoch": 0.2601265530460487, "grad_norm": 0.22408978641033173, "learning_rate": 0.0008621543631062487, "loss": 0.8717328906059265, "step": 5632 }, { "epoch": 0.27195048727541454, "grad_norm": 0.7838807106018066, "learning_rate": 0.0008487796649318904, "loss": 0.8674213886260986, "step": 5888 }, { "epoch": 0.2837744215047804, "grad_norm": 0.15397988259792328, "learning_rate": 0.0008349553511611836, "loss": 0.8678247332572937, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_cos_loss": 0.2434517246879399, "eval_loss": 0.8235682821981439, "eval_mse_loss": 0.8235682821981439, "flow/cos_sim": 0.7565482973235927, "flow/improvement_ratio": 0.48134120863322255, "flow/mag_ratio_mean": 0.752200963970733, "flow/mag_ratio_std": 0.17564187855481012, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_cos_loss": 0.2434517246879399, "eval_loss": 0.8235682821981439, "eval_mse_loss": 0.8235682821981439, "eval_runtime": 37.4608, "eval_samples_per_second": 747.262, "eval_steps_per_second": 11.692, "flow/cos_sim": 0.7565482973235927, "flow/improvement_ratio": 0.48134120863322255, "flow/mag_ratio_mean": 0.752200963970733, "flow/mag_ratio_std": 0.17564187855481012, "step": 6144 }, { "epoch": 0.2955983557341462, "grad_norm": 0.15729770064353943, "learning_rate": 0.0008205930168562264, "loss": 0.8611059188842773, "step": 6400 }, { "epoch": 0.30742228996351206, "grad_norm": 0.5960604548454285, "learning_rate": 0.0008057666884383055, "loss": 0.8611810803413391, "step": 6656 }, { "epoch": 0.3192462241928779, "grad_norm": 0.12568458914756775, "learning_rate": 0.0007905583005945037, "loss": 0.8600746393203735, "step": 6912 }, { "epoch": 0.3310701584222438, "grad_norm": 0.15424804389476776, "learning_rate": 0.000774870597388272, "loss": 0.8601675629615784, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_cos_loss": 0.24115517810327278, "eval_loss": 0.8168311545021458, "eval_mse_loss": 0.8168311545021458, "flow/cos_sim": 0.758844825230777, "flow/improvement_ratio": 0.48181676517610683, "flow/mag_ratio_mean": 0.7439628689800768, "flow/mag_ratio_std": 0.17355786634770703, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_cos_loss": 0.24115517810327278, "eval_loss": 0.8168311545021458, "eval_mse_loss": 0.8168311545021458, "eval_runtime": 37.4871, "eval_samples_per_second": 746.736, "eval_steps_per_second": 11.684, "flow/cos_sim": 0.758844825230777, "flow/improvement_ratio": 0.48181676517610683, "flow/mag_ratio_mean": 0.7439628689800768, "flow/mag_ratio_std": 0.17355786634770703, "step": 7168 }, { "epoch": 0.34289409265160964, "grad_norm": 0.1787535548210144, "learning_rate": 0.0007587850741039065, "loss": 0.8584595322608948, "step": 7424 }, { "epoch": 0.3547180268809755, "grad_norm": 0.8946473002433777, "learning_rate": 0.000742325011311212, "loss": 0.8562237620353699, "step": 7680 }, { "epoch": 0.3665419611103413, "grad_norm": 0.1438024491071701, "learning_rate": 0.0007255142316508366, "loss": 0.8549248576164246, "step": 7936 }, { "epoch": 0.37836589533970716, "grad_norm": 0.15269657969474792, "learning_rate": 0.0007083770653557752, "loss": 0.8540560603141785, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_cos_loss": 0.238492783584279, "eval_loss": 0.8075671801556191, "eval_mse_loss": 0.8075671801556191, "flow/cos_sim": 0.7615072517634527, "flow/improvement_ratio": 0.48001350197073533, "flow/mag_ratio_mean": 0.7569256454842276, "flow/mag_ratio_std": 0.17385539324044091, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_cos_loss": 0.238492783584279, "eval_loss": 0.8075671801556191, "eval_mse_loss": 0.8075671801556191, "eval_runtime": 37.4059, "eval_samples_per_second": 748.358, "eval_steps_per_second": 11.709, "flow/cos_sim": 0.7615072517634527, "flow/improvement_ratio": 0.48001350197073533, "flow/mag_ratio_mean": 0.7569256454842276, "flow/mag_ratio_std": 0.17385539324044091, "step": 8192 }, { "epoch": 0.390189829569073, "grad_norm": 0.2120441496372223, "learning_rate": 0.0006909383150382365, "loss": 0.850504994392395, "step": 8448 }, { "epoch": 0.4020137637984389, "grad_norm": 1.0651849508285522, "learning_rate": 0.0006732232197928334, "loss": 0.8530774116516113, "step": 8704 }, { "epoch": 0.41383769802780473, "grad_norm": 0.18748371303081512, "learning_rate": 0.0006552574186680553, "loss": 0.849456250667572, "step": 8960 }, { "epoch": 0.4256616322571706, "grad_norm": 1.3815633058547974, "learning_rate": 0.0006370669135588852, "loss": 0.8472809791564941, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_cos_loss": 0.23755816625405665, "eval_loss": 0.8048887692357851, "eval_mse_loss": 0.8048887692357851, "flow/cos_sim": 0.7624418421150887, "flow/improvement_ratio": 0.47895704168979436, "flow/mag_ratio_mean": 0.7494950834746774, "flow/mag_ratio_std": 0.1740222738621986, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_cos_loss": 0.23755816625405665, "eval_loss": 0.8048887692357851, "eval_mse_loss": 0.8048887692357851, "eval_runtime": 37.4388, "eval_samples_per_second": 747.7, "eval_steps_per_second": 11.699, "flow/cos_sim": 0.7624418421150887, "flow/improvement_ratio": 0.47895704168979436, "flow/mag_ratio_mean": 0.7494950834746774, "flow/mag_ratio_std": 0.1740222738621986, "step": 9216 }, { "epoch": 0.4374855664865364, "grad_norm": 0.30135899782180786, "learning_rate": 0.0006187502147205189, "loss": 0.8494647741317749, "step": 9472 }, { "epoch": 0.44930950071590225, "grad_norm": 0.1467859447002411, "learning_rate": 0.000600190188915804, "loss": 0.8477605581283569, "step": 9728 }, { "epoch": 0.4611334349452681, "grad_norm": 0.5628754496574402, "learning_rate": 0.0005814851579007778, "loss": 0.845370352268219, "step": 9984 }, { "epoch": 0.472957369174634, "grad_norm": 0.8975974321365356, "learning_rate": 0.0005626621934574736, "loss": 0.8435695171356201, "step": 10240 }, { "epoch": 0.472957369174634, "eval_cos_loss": 0.23847832721254053, "eval_loss": 0.8075589748550224, "eval_mse_loss": 0.8075589748550224, "flow/cos_sim": 0.76152167540707, "flow/improvement_ratio": 0.48205344698744823, "flow/mag_ratio_mean": 0.7498847978300156, "flow/mag_ratio_std": 0.17484679011857673, "step": 10240 }, { "epoch": 0.472957369174634, "eval_cos_loss": 0.23847832721254053, "eval_loss": 0.8075589748550224, "eval_mse_loss": 0.8075589748550224, "eval_runtime": 37.5927, "eval_samples_per_second": 744.639, "eval_steps_per_second": 11.651, "flow/cos_sim": 0.76152167540707, "flow/improvement_ratio": 0.48205344698744823, "flow/mag_ratio_mean": 0.7498847978300156, "flow/mag_ratio_std": 0.17484679011857673, "step": 10240 }, { "epoch": 0.48478130340399983, "grad_norm": 0.7524752020835876, "learning_rate": 0.0005437485380529165, "loss": 0.8425331115722656, "step": 10496 }, { "epoch": 0.49660523763336567, "grad_norm": 0.6698228120803833, "learning_rate": 0.0005247715654111146, "loss": 0.8414271473884583, "step": 10752 }, { "epoch": 0.5084291718627315, "grad_norm": 0.5097092390060425, "learning_rate": 0.0005057587408950816, "loss": 0.8411047458648682, "step": 11008 }, { "epoch": 0.5202531060920974, "grad_norm": 0.7546167373657227, "learning_rate": 0.00048673758175623157, "loss": 0.841041088104248, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_cos_loss": 0.23565144779067063, "eval_loss": 0.7988868664660954, "eval_mse_loss": 0.7988868664660954, "flow/cos_sim": 0.7643485723837325, "flow/improvement_ratio": 0.48455546578588005, "flow/mag_ratio_mean": 0.7568667526930979, "flow/mag_ratio_std": 0.17350949476298677, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_cos_loss": 0.23565144779067063, "eval_loss": 0.7988868664660954, "eval_mse_loss": 0.7988868664660954, "eval_runtime": 37.5643, "eval_samples_per_second": 745.203, "eval_steps_per_second": 11.66, "flow/cos_sim": 0.7643485723837325, "flow/improvement_ratio": 0.48455546578588005, "flow/mag_ratio_mean": 0.7568667526930979, "flow/mag_ratio_std": 0.17350949476298677, "step": 11264 }, { "epoch": 0.5320770403214632, "grad_norm": 0.24848276376724243, "learning_rate": 0.00046773561730867265, "loss": 0.8385207056999207, "step": 11520 }, { "epoch": 0.5439009745508291, "grad_norm": 0.2328663319349289, "learning_rate": 0.0004487803490860472, "loss": 0.8425102829933167, "step": 11776 }, { "epoch": 0.5557249087801949, "grad_norm": 1.160973072052002, "learning_rate": 0.00042989921103857385, "loss": 0.8413041830062866, "step": 12032 }, { "epoch": 0.5675488430095608, "grad_norm": 0.48494067788124084, "learning_rate": 0.00041119265533512737, "loss": 0.8376575708389282, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_cos_loss": 0.23514270255146505, "eval_loss": 0.7956640910884561, "eval_mse_loss": 0.7956640910884561, "flow/cos_sim": 0.7648573198002767, "flow/improvement_ratio": 0.4841282178550006, "flow/mag_ratio_mean": 0.754011933798115, "flow/mag_ratio_std": 0.17356456295676428, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_cos_loss": 0.23514270255146505, "eval_loss": 0.7956640910884561, "eval_mse_loss": 0.7956640910884561, "eval_runtime": 37.4222, "eval_samples_per_second": 748.032, "eval_steps_per_second": 11.704, "flow/cos_sim": 0.7648573198002767, "flow/improvement_ratio": 0.4841282178550006, "flow/mag_ratio_mean": 0.754011933798115, "flow/mag_ratio_std": 0.17356456295676428, "step": 12288 }, { "epoch": 0.5793727772389267, "grad_norm": 0.2547326982021332, "learning_rate": 0.0003925410556475763, "loss": 0.8384107947349548, "step": 12544 }, { "epoch": 0.5911967114682924, "grad_norm": 0.2702837586402893, "learning_rate": 0.00037404498123644504, "loss": 0.8394683003425598, "step": 12800 }, { "epoch": 0.6030206456976583, "grad_norm": 0.4496554732322693, "learning_rate": 0.0003557312014609784, "loss": 0.8402306437492371, "step": 13056 }, { "epoch": 0.6148445799270241, "grad_norm": 0.40902939438819885, "learning_rate": 0.00033762622184548686, "loss": 0.8355939388275146, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_cos_loss": 0.23389767066120556, "eval_loss": 0.7927584718895829, "eval_mse_loss": 0.7927584718895829, "flow/cos_sim": 0.7661023556369625, "flow/improvement_ratio": 0.4783722004389654, "flow/mag_ratio_mean": 0.7541745113183375, "flow/mag_ratio_std": 0.17828954082645782, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_cos_loss": 0.23389767066120556, "eval_loss": 0.7927584718895829, "eval_mse_loss": 0.7927584718895829, "eval_runtime": 37.4329, "eval_samples_per_second": 747.818, "eval_steps_per_second": 11.701, "flow/cos_sim": 0.7661023556369625, "flow/improvement_ratio": 0.4783722004389654, "flow/mag_ratio_mean": 0.7541745113183375, "flow/mag_ratio_std": 0.17828954082645782, "step": 13312 }, { "epoch": 0.62666851415639, "grad_norm": 1.1801968812942505, "learning_rate": 0.00031975624571791317, "loss": 0.8333742022514343, "step": 13568 }, { "epoch": 0.6384924483857558, "grad_norm": 0.1962728500366211, "learning_rate": 0.00030214713628576984, "loss": 0.8351457715034485, "step": 13824 }, { "epoch": 0.6503163826151217, "grad_norm": 0.5504610538482666, "learning_rate": 0.00028482437920433144, "loss": 0.8345349431037903, "step": 14080 }, { "epoch": 0.6621403168444876, "grad_norm": 0.2537405788898468, "learning_rate": 0.00026781304569125866, "loss": 0.8331661820411682, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_cos_loss": 0.23358930164276193, "eval_loss": 0.790659844331001, "eval_mse_loss": 0.790659844331001, "flow/cos_sim": 0.7664107191780386, "flow/improvement_ratio": 0.48042010873147883, "flow/mag_ratio_mean": 0.7608423984214051, "flow/mag_ratio_std": 0.17466495716816757, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_cos_loss": 0.23358930164276193, "eval_loss": 0.790659844331001, "eval_mse_loss": 0.790659844331001, "eval_runtime": 37.4882, "eval_samples_per_second": 746.715, "eval_steps_per_second": 11.684, "flow/cos_sim": 0.7664107191780386, "flow/improvement_ratio": 0.48042010873147883, "flow/mag_ratio_mean": 0.7608423984214051, "flow/mag_ratio_std": 0.17466495716816757, "step": 14336 }, { "epoch": 0.6739642510738534, "grad_norm": 0.44085371494293213, "learning_rate": 0.0002512022089125798, "loss": 0.8341683149337769, "step": 14592 }, { "epoch": 0.6857881853032193, "grad_norm": 1.0609331130981445, "learning_rate": 0.00023488564442791606, "loss": 0.835027813911438, "step": 14848 }, { "epoch": 0.6976121195325851, "grad_norm": 0.22936664521694183, "learning_rate": 0.0002189527798174848, "loss": 0.8334974050521851, "step": 15104 }, { "epoch": 0.709436053761951, "grad_norm": 0.28989291191101074, "learning_rate": 0.00020342667470826854, "loss": 0.8318911790847778, "step": 15360 }, { "epoch": 0.709436053761951, "eval_cos_loss": 0.2342552522614122, "eval_loss": 0.7931138123279293, "eval_mse_loss": 0.7931138123279293, "flow/cos_sim": 0.7657447608366404, "flow/improvement_ratio": 0.4886273239573387, "flow/mag_ratio_mean": 0.7547847685748583, "flow/mag_ratio_std": 0.1757596176985192, "step": 15360 }, { "epoch": 0.709436053761951, "eval_cos_loss": 0.2342552522614122, "eval_loss": 0.7931138123279293, "eval_mse_loss": 0.7931138123279293, "eval_runtime": 37.5138, "eval_samples_per_second": 746.206, "eval_steps_per_second": 11.676, "flow/cos_sim": 0.7657447608366404, "flow/improvement_ratio": 0.4886273239573387, "flow/mag_ratio_mean": 0.7547847685748583, "flow/mag_ratio_std": 0.1757596176985192, "step": 15360 }, { "epoch": 0.7212599879913169, "grad_norm": 0.16880613565444946, "learning_rate": 0.00018832980002442701, "loss": 0.8349105715751648, "step": 15616 }, { "epoch": 0.7330839222206826, "grad_norm": 0.21710531413555145, "learning_rate": 0.00017368400546514002, "loss": 0.8341897130012512, "step": 15872 }, { "epoch": 0.7449078564500485, "grad_norm": 0.4028049111366272, "learning_rate": 0.00015951048788154866, "loss": 0.8300965428352356, "step": 16128 }, { "epoch": 0.7567317906794143, "grad_norm": 0.4703959822654724, "learning_rate": 0.0001458297605985633, "loss": 0.830625057220459, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_cos_loss": 0.23394639184485833, "eval_loss": 0.7917164079674847, "eval_mse_loss": 0.7917164079674847, "flow/cos_sim": 0.7660536425843086, "flow/improvement_ratio": 0.4823139966486796, "flow/mag_ratio_mean": 0.7634640123746167, "flow/mag_ratio_std": 0.17673859605778297, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_cos_loss": 0.23394639184485833, "eval_loss": 0.7917164079674847, "eval_mse_loss": 0.7917164079674847, "eval_runtime": 37.6008, "eval_samples_per_second": 744.478, "eval_steps_per_second": 11.649, "flow/cos_sim": 0.7660536425843086, "flow/improvement_ratio": 0.4823139966486796, "flow/mag_ratio_mean": 0.7634640123746167, "flow/mag_ratio_std": 0.17673859605778297, "step": 16384 }, { "epoch": 0.7685557249087802, "grad_norm": 0.2865261733531952, "learning_rate": 0.00013266162372593872, "loss": 0.8289151787757874, "step": 16640 }, { "epoch": 0.780379659138146, "grad_norm": 0.32574427127838135, "learning_rate": 0.00012002513550158511, "loss": 0.8299338817596436, "step": 16896 }, { "epoch": 0.7922035933675119, "grad_norm": 0.19136400520801544, "learning_rate": 0.00010793858470858986, "loss": 0.831389307975769, "step": 17152 }, { "epoch": 0.8040275275968778, "grad_norm": 0.4910804331302643, "learning_rate": 9.641946420587128e-05, "loss": 0.8310226798057556, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_cos_loss": 0.23344004797200635, "eval_loss": 0.7905207032754541, "eval_mse_loss": 0.7905207032754541, "flow/cos_sim": 0.7665599775096598, "flow/improvement_ratio": 0.48091149343747525, "flow/mag_ratio_mean": 0.7604631242686755, "flow/mag_ratio_std": 0.1777392937440306, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_cos_loss": 0.23344004797200635, "eval_loss": 0.7905207032754541, "eval_mse_loss": 0.7905207032754541, "eval_runtime": 37.5059, "eval_samples_per_second": 746.363, "eval_steps_per_second": 11.678, "flow/cos_sim": 0.7665599775096598, "flow/improvement_ratio": 0.48091149343747525, "flow/mag_ratio_mean": 0.7604631242686755, "flow/mag_ratio_std": 0.1777392937440306, "step": 17408 }, { "epoch": 0.8158514618262436, "grad_norm": 0.41079503297805786, "learning_rate": 8.548444561077174e-05, "loss": 0.831270694732666, "step": 17664 }, { "epoch": 0.8276753960556095, "grad_norm": 0.20108859241008759, "learning_rate": 7.514935517023558e-05, "loss": 0.8281430006027222, "step": 17920 }, { "epoch": 0.8394993302849753, "grad_norm": 0.22395165264606476, "learning_rate": 6.542915085548828e-05, "loss": 0.8286972045898438, "step": 18176 }, { "epoch": 0.8513232645143411, "grad_norm": 0.12410369515419006, "learning_rate": 5.6337900713373745e-05, "loss": 0.828626275062561, "step": 18432 }, { "epoch": 0.8513232645143411, "eval_cos_loss": 0.23364517943227672, "eval_loss": 0.7902474757198874, "eval_mse_loss": 0.7902474757198874, "flow/cos_sim": 0.7663548270316973, "flow/improvement_ratio": 0.4790440064655047, "flow/mag_ratio_mean": 0.7584420379982691, "flow/mag_ratio_std": 0.17626052373620474, "step": 18432 }, { "epoch": 0.8513232645143411, "eval_cos_loss": 0.23364517943227672, "eval_loss": 0.7902474757198874, "eval_mse_loss": 0.7902474757198874, "eval_runtime": 37.4969, "eval_samples_per_second": 746.541, "eval_steps_per_second": 11.681, "flow/cos_sim": 0.7663548270316973, "flow/improvement_ratio": 0.4790440064655047, "flow/mag_ratio_mean": 0.7584420379982691, "flow/mag_ratio_std": 0.17626052373620474, "step": 18432 }, { "epoch": 0.8631471987437069, "grad_norm": 0.15060371160507202, "learning_rate": 4.78887625056757e-05, "loss": 0.8286436796188354, "step": 18688 }, { "epoch": 0.8749711329730728, "grad_norm": 0.1537846028804779, "learning_rate": 4.012312516433581e-05, "loss": 0.8254930377006531, "step": 18944 }, { "epoch": 0.8867950672024387, "grad_norm": 0.4253118336200714, "learning_rate": 3.2991328308497545e-05, "loss": 0.8294442296028137, "step": 19200 }, { "epoch": 0.8986190014318045, "grad_norm": 0.17603912949562073, "learning_rate": 2.653543286964183e-05, "loss": 0.8307968378067017, "step": 19456 }, { "epoch": 0.8986190014318045, "eval_cos_loss": 0.2343874951679957, "eval_loss": 0.7920145923144197, "eval_mse_loss": 0.7920145923144197, "flow/cos_sim": 0.7656125166372622, "flow/improvement_ratio": 0.4853253698648383, "flow/mag_ratio_mean": 0.7600582418920787, "flow/mag_ratio_std": 0.17680819529920952, "step": 19456 }, { "epoch": 0.8986190014318045, "eval_cos_loss": 0.2343874951679957, "eval_loss": 0.7920145923144197, "eval_mse_loss": 0.7920145923144197, "eval_runtime": 37.6789, "eval_samples_per_second": 742.935, "eval_steps_per_second": 11.625, "flow/cos_sim": 0.7656125166372622, "flow/improvement_ratio": 0.4853253698648383, "flow/mag_ratio_mean": 0.7600582418920787, "flow/mag_ratio_std": 0.17680819529920952, "step": 19456 }, { "epoch": 0.9104429356611704, "grad_norm": 0.13290032744407654, "learning_rate": 2.076478246200819e-05, "loss": 0.8278121948242188, "step": 19712 }, { "epoch": 0.9222668698905362, "grad_norm": 0.20168907940387726, "learning_rate": 1.5687728945045944e-05, "loss": 0.8273869156837463, "step": 19968 }, { "epoch": 0.9340908041199021, "grad_norm": 0.20389237999916077, "learning_rate": 1.1311620335770879e-05, "loss": 0.831144392490387, "step": 20224 }, { "epoch": 0.945914738349268, "grad_norm": 0.26355621218681335, "learning_rate": 7.642790173984836e-06, "loss": 0.8322795629501343, "step": 20480 }, { "epoch": 0.945914738349268, "eval_cos_loss": 0.23226006676072944, "eval_loss": 0.7871269678960652, "eval_mse_loss": 0.7871269678960652, "flow/cos_sim": 0.7677399510662305, "flow/improvement_ratio": 0.4782098607249456, "flow/mag_ratio_mean": 0.76152302032192, "flow/mag_ratio_std": 0.1765111445632155, "step": 20480 }, { "epoch": 0.945914738349268, "eval_cos_loss": 0.23226006676072944, "eval_loss": 0.7871269678960652, "eval_mse_loss": 0.7871269678960652, "eval_runtime": 37.4829, "eval_samples_per_second": 746.822, "eval_steps_per_second": 11.685, "flow/cos_sim": 0.7677399510662305, "flow/improvement_ratio": 0.4782098607249456, "flow/mag_ratio_mean": 0.76152302032192, "flow/mag_ratio_std": 0.1765111445632155, "step": 20480 }, { "epoch": 0.9577386725786338, "grad_norm": 0.35688987374305725, "learning_rate": 4.686548355746001e-06, "loss": 0.8279744386672974, "step": 20736 }, { "epoch": 0.9695626068079997, "grad_norm": 0.15006007254123688, "learning_rate": 2.447173448359541e-06, "loss": 0.8288099765777588, "step": 20992 }, { "epoch": 0.9813865410373654, "grad_norm": 0.13515928387641907, "learning_rate": 9.324369820445933e-07, "loss": 0.8292235136032104, "step": 21248 }, { "epoch": 0.9932104752667313, "grad_norm": 0.19579511880874634, "learning_rate": 1.3265238678672464e-07, "loss": 0.8291770219802856, "step": 21504 }, { "epoch": 0.9932104752667313, "eval_cos_loss": 0.23236965187336212, "eval_loss": 0.7870777566411179, "eval_mse_loss": 0.7870777566411179, "flow/cos_sim": 0.7676303687433129, "flow/improvement_ratio": 0.48667068133071134, "flow/mag_ratio_mean": 0.7602007927959913, "flow/mag_ratio_std": 0.17566703702216824, "step": 21504 }, { "epoch": 0.9932104752667313, "eval_cos_loss": 0.23236965187336212, "eval_loss": 0.7870777566411179, "eval_mse_loss": 0.7870777566411179, "eval_runtime": 37.3306, "eval_samples_per_second": 749.867, "eval_steps_per_second": 11.733, "flow/cos_sim": 0.7676303687433129, "flow/improvement_ratio": 0.48667068133071134, "flow/mag_ratio_mean": 0.7602007927959913, "flow/mag_ratio_std": 0.17566703702216824, "step": 21504 } ], "logging_steps": 256, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }