| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 1024, |
| "global_step": 21651, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011823934229365849, |
| "grad_norm": 0.2415127456188202, |
| "learning_rate": 0.000498046875, |
| "loss": 1.734155297279358, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.023647868458731697, |
| "grad_norm": 0.200724259018898, |
| "learning_rate": 0.000998046875, |
| "loss": 1.1768943071365356, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.03547180268809755, |
| "grad_norm": 0.10385265946388245, |
| "learning_rate": 0.000999640996023194, |
| "loss": 1.0633362531661987, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "grad_norm": 0.24572038650512695, |
| "learning_rate": 0.0009985588674043958, |
| "loss": 1.0212352275848389, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.2939360714666375, |
| "eval_loss": 0.9792764313416938, |
| "eval_mse_loss": 0.9792764313416938, |
| "flow/cos_sim": 0.706063949762414, |
| "flow/improvement_ratio": 0.47937897269584273, |
| "flow/mag_ratio_mean": 0.7043995116943638, |
| "flow/mag_ratio_std": 0.14764773857579927, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.2939360714666375, |
| "eval_loss": 0.9792764313416938, |
| "eval_mse_loss": 0.9792764313416938, |
| "eval_runtime": 37.309, |
| "eval_samples_per_second": 750.302, |
| "eval_steps_per_second": 11.74, |
| "flow/cos_sim": 0.706063949762414, |
| "flow/improvement_ratio": 0.47937897269584273, |
| "flow/mag_ratio_mean": 0.7043995116943638, |
| "flow/mag_ratio_std": 0.14764773857579927, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.05911967114682925, |
| "grad_norm": 0.14034895598888397, |
| "learning_rate": 0.0009967551747861387, |
| "loss": 0.9987254738807678, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.0709436053761951, |
| "grad_norm": 0.32785671949386597, |
| "learning_rate": 0.000994232528651847, |
| "loss": 0.9810371398925781, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.08276753960556095, |
| "grad_norm": 0.4202195107936859, |
| "learning_rate": 0.0009909945800260092, |
| "loss": 0.9669110774993896, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "grad_norm": 0.11297852545976639, |
| "learning_rate": 0.0009870460151900522, |
| "loss": 0.9570462107658386, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.27135700569169163, |
| "eval_loss": 0.9140212976769225, |
| "eval_mse_loss": 0.9140212976769225, |
| "flow/cos_sim": 0.7286430077465702, |
| "flow/improvement_ratio": 0.4797877063081689, |
| "flow/mag_ratio_mean": 0.7240678813631676, |
| "flow/mag_ratio_std": 0.15831342356526143, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.27135700569169163, |
| "eval_loss": 0.9140212976769225, |
| "eval_mse_loss": 0.9140212976769225, |
| "eval_runtime": 37.3614, |
| "eval_samples_per_second": 749.249, |
| "eval_steps_per_second": 11.723, |
| "flow/cos_sim": 0.7286430077465702, |
| "flow/improvement_ratio": 0.4797877063081689, |
| "flow/mag_ratio_mean": 0.7240678813631676, |
| "flow/mag_ratio_std": 0.15831342356526143, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.10641540806429264, |
| "grad_norm": 0.10684677958488464, |
| "learning_rate": 0.0009823925488998885, |
| "loss": 0.9458644986152649, |
| "step": 2304 |
| }, |
| { |
| "epoch": 0.1182393422936585, |
| "grad_norm": 0.35598301887512207, |
| "learning_rate": 0.0009770409161149525, |
| "loss": 0.9369097948074341, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.13006327652302435, |
| "grad_norm": 0.24151749908924103, |
| "learning_rate": 0.0009709988622506973, |
| "loss": 0.925268292427063, |
| "step": 2816 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "grad_norm": 0.17448143661022186, |
| "learning_rate": 0.000964275131968659, |
| "loss": 0.9168843030929565, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.258329911795381, |
| "eval_loss": 0.872469331303688, |
| "eval_mse_loss": 0.872469331303688, |
| "flow/cos_sim": 0.7416701187554016, |
| "flow/improvement_ratio": 0.4866235294978913, |
| "flow/mag_ratio_mean": 0.7309322730046973, |
| "flow/mag_ratio_std": 0.16244563712104815, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.258329911795381, |
| "eval_loss": 0.872469331303688, |
| "eval_mse_loss": 0.872469331303688, |
| "eval_runtime": 37.367, |
| "eval_samples_per_second": 749.136, |
| "eval_steps_per_second": 11.722, |
| "flow/cos_sim": 0.7416701187554016, |
| "flow/improvement_ratio": 0.4866235294978913, |
| "flow/mag_ratio_mean": 0.7309322730046973, |
| "flow/mag_ratio_std": 0.16244563712104815, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.15371114498175603, |
| "grad_norm": 0.16108064353466034, |
| "learning_rate": 0.0009568794565203123, |
| "loss": 0.91062992811203, |
| "step": 3328 |
| }, |
| { |
| "epoch": 0.1655350792111219, |
| "grad_norm": 0.21124346554279327, |
| "learning_rate": 0.0009488225396630347, |
| "loss": 0.9045400023460388, |
| "step": 3584 |
| }, |
| { |
| "epoch": 0.17735901344048774, |
| "grad_norm": 0.49332571029663086, |
| "learning_rate": 0.0009401160421685646, |
| "loss": 0.8949952721595764, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "grad_norm": 0.25588458776474, |
| "learning_rate": 0.0009307725649463714, |
| "loss": 0.8952076435089111, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.24964532671181577, |
| "eval_loss": 0.8440415833366516, |
| "eval_mse_loss": 0.8440415833366516, |
| "flow/cos_sim": 0.7503547062884727, |
| "flow/improvement_ratio": 0.4770904905311593, |
| "flow/mag_ratio_mean": 0.7401841025646418, |
| "flow/mag_ratio_std": 0.1675400482533185, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.24964532671181577, |
| "eval_loss": 0.8440415833366516, |
| "eval_mse_loss": 0.8440415833366516, |
| "eval_runtime": 37.4493, |
| "eval_samples_per_second": 747.491, |
| "eval_steps_per_second": 11.696, |
| "flow/cos_sim": 0.7503547062884727, |
| "flow/improvement_ratio": 0.4770904905311593, |
| "flow/mag_ratio_mean": 0.7401841025646418, |
| "flow/mag_ratio_std": 0.1675400482533185, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.20100688189921945, |
| "grad_norm": 0.37869253754615784, |
| "learning_rate": 0.0009208056308063659, |
| "loss": 0.8890768885612488, |
| "step": 4352 |
| }, |
| { |
| "epoch": 0.2128308161285853, |
| "grad_norm": 0.9834415316581726, |
| "learning_rate": 0.0009102296648873445, |
| "loss": 0.8840116262435913, |
| "step": 4608 |
| }, |
| { |
| "epoch": 0.22465475035795113, |
| "grad_norm": 1.289456844329834, |
| "learning_rate": 0.0008990599737794927, |
| "loss": 0.8811625838279724, |
| "step": 4864 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "grad_norm": 0.24145644903182983, |
| "learning_rate": 0.0008873127233711644, |
| "loss": 0.8737959861755371, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.24506433716375534, |
| "eval_loss": 0.8307346953376787, |
| "eval_mse_loss": 0.8307346953376787, |
| "flow/cos_sim": 0.7549356581413582, |
| "flow/improvement_ratio": 0.47620510081994477, |
| "flow/mag_ratio_mean": 0.7473646888177689, |
| "flow/mag_ratio_std": 0.16763514821251776, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.24506433716375534, |
| "eval_loss": 0.8307346953376787, |
| "eval_mse_loss": 0.8307346953376787, |
| "eval_runtime": 37.5063, |
| "eval_samples_per_second": 746.354, |
| "eval_steps_per_second": 11.678, |
| "flow/cos_sim": 0.7549356581413582, |
| "flow/improvement_ratio": 0.47620510081994477, |
| "flow/mag_ratio_mean": 0.7473646888177689, |
| "flow/mag_ratio_std": 0.16763514821251776, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.24830261881668284, |
| "grad_norm": 1.039290428161621, |
| "learning_rate": 0.0008750049154520011, |
| "loss": 0.8717202544212341, |
| "step": 5376 |
| }, |
| { |
| "epoch": 0.2601265530460487, |
| "grad_norm": 0.22408978641033173, |
| "learning_rate": 0.0008621543631062487, |
| "loss": 0.8717328906059265, |
| "step": 5632 |
| }, |
| { |
| "epoch": 0.27195048727541454, |
| "grad_norm": 0.7838807106018066, |
| "learning_rate": 0.0008487796649318904, |
| "loss": 0.8674213886260986, |
| "step": 5888 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "grad_norm": 0.15397988259792328, |
| "learning_rate": 0.0008349553511611836, |
| "loss": 0.8678247332572937, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.2434517246879399, |
| "eval_loss": 0.8235682821981439, |
| "eval_mse_loss": 0.8235682821981439, |
| "flow/cos_sim": 0.7565482973235927, |
| "flow/improvement_ratio": 0.48134120863322255, |
| "flow/mag_ratio_mean": 0.752200963970733, |
| "flow/mag_ratio_std": 0.17564187855481012, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.2434517246879399, |
| "eval_loss": 0.8235682821981439, |
| "eval_mse_loss": 0.8235682821981439, |
| "eval_runtime": 37.4608, |
| "eval_samples_per_second": 747.262, |
| "eval_steps_per_second": 11.692, |
| "flow/cos_sim": 0.7565482973235927, |
| "flow/improvement_ratio": 0.48134120863322255, |
| "flow/mag_ratio_mean": 0.752200963970733, |
| "flow/mag_ratio_std": 0.17564187855481012, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2955983557341462, |
| "grad_norm": 0.15729770064353943, |
| "learning_rate": 0.0008205930168562264, |
| "loss": 0.8611059188842773, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.30742228996351206, |
| "grad_norm": 0.5960604548454285, |
| "learning_rate": 0.0008057666884383055, |
| "loss": 0.8611810803413391, |
| "step": 6656 |
| }, |
| { |
| "epoch": 0.3192462241928779, |
| "grad_norm": 0.12568458914756775, |
| "learning_rate": 0.0007905583005945037, |
| "loss": 0.8600746393203735, |
| "step": 6912 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "grad_norm": 0.15424804389476776, |
| "learning_rate": 0.000774870597388272, |
| "loss": 0.8601675629615784, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.24115517810327278, |
| "eval_loss": 0.8168311545021458, |
| "eval_mse_loss": 0.8168311545021458, |
| "flow/cos_sim": 0.758844825230777, |
| "flow/improvement_ratio": 0.48181676517610683, |
| "flow/mag_ratio_mean": 0.7439628689800768, |
| "flow/mag_ratio_std": 0.17355786634770703, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.24115517810327278, |
| "eval_loss": 0.8168311545021458, |
| "eval_mse_loss": 0.8168311545021458, |
| "eval_runtime": 37.4871, |
| "eval_samples_per_second": 746.736, |
| "eval_steps_per_second": 11.684, |
| "flow/cos_sim": 0.758844825230777, |
| "flow/improvement_ratio": 0.48181676517610683, |
| "flow/mag_ratio_mean": 0.7439628689800768, |
| "flow/mag_ratio_std": 0.17355786634770703, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.34289409265160964, |
| "grad_norm": 0.1787535548210144, |
| "learning_rate": 0.0007587850741039065, |
| "loss": 0.8584595322608948, |
| "step": 7424 |
| }, |
| { |
| "epoch": 0.3547180268809755, |
| "grad_norm": 0.8946473002433777, |
| "learning_rate": 0.000742325011311212, |
| "loss": 0.8562237620353699, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.3665419611103413, |
| "grad_norm": 0.1438024491071701, |
| "learning_rate": 0.0007255142316508366, |
| "loss": 0.8549248576164246, |
| "step": 7936 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "grad_norm": 0.15269657969474792, |
| "learning_rate": 0.0007083770653557752, |
| "loss": 0.8540560603141785, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.238492783584279, |
| "eval_loss": 0.8075671801556191, |
| "eval_mse_loss": 0.8075671801556191, |
| "flow/cos_sim": 0.7615072517634527, |
| "flow/improvement_ratio": 0.48001350197073533, |
| "flow/mag_ratio_mean": 0.7569256454842276, |
| "flow/mag_ratio_std": 0.17385539324044091, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.238492783584279, |
| "eval_loss": 0.8075671801556191, |
| "eval_mse_loss": 0.8075671801556191, |
| "eval_runtime": 37.4059, |
| "eval_samples_per_second": 748.358, |
| "eval_steps_per_second": 11.709, |
| "flow/cos_sim": 0.7615072517634527, |
| "flow/improvement_ratio": 0.48001350197073533, |
| "flow/mag_ratio_mean": 0.7569256454842276, |
| "flow/mag_ratio_std": 0.17385539324044091, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.390189829569073, |
| "grad_norm": 0.2120441496372223, |
| "learning_rate": 0.0006909383150382365, |
| "loss": 0.850504994392395, |
| "step": 8448 |
| }, |
| { |
| "epoch": 0.4020137637984389, |
| "grad_norm": 1.0651849508285522, |
| "learning_rate": 0.0006732232197928334, |
| "loss": 0.8530774116516113, |
| "step": 8704 |
| }, |
| { |
| "epoch": 0.41383769802780473, |
| "grad_norm": 0.18748371303081512, |
| "learning_rate": 0.0006552574186680553, |
| "loss": 0.849456250667572, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "grad_norm": 1.3815633058547974, |
| "learning_rate": 0.0006370669135588852, |
| "loss": 0.8472809791564941, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.23755816625405665, |
| "eval_loss": 0.8048887692357851, |
| "eval_mse_loss": 0.8048887692357851, |
| "flow/cos_sim": 0.7624418421150887, |
| "flow/improvement_ratio": 0.47895704168979436, |
| "flow/mag_ratio_mean": 0.7494950834746774, |
| "flow/mag_ratio_std": 0.1740222738621986, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.23755816625405665, |
| "eval_loss": 0.8048887692357851, |
| "eval_mse_loss": 0.8048887692357851, |
| "eval_runtime": 37.4388, |
| "eval_samples_per_second": 747.7, |
| "eval_steps_per_second": 11.699, |
| "flow/cos_sim": 0.7624418421150887, |
| "flow/improvement_ratio": 0.47895704168979436, |
| "flow/mag_ratio_mean": 0.7494950834746774, |
| "flow/mag_ratio_std": 0.1740222738621986, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4374855664865364, |
| "grad_norm": 0.30135899782180786, |
| "learning_rate": 0.0006187502147205189, |
| "loss": 0.8494647741317749, |
| "step": 9472 |
| }, |
| { |
| "epoch": 0.44930950071590225, |
| "grad_norm": 0.1467859447002411, |
| "learning_rate": 0.000600190188915804, |
| "loss": 0.8477605581283569, |
| "step": 9728 |
| }, |
| { |
| "epoch": 0.4611334349452681, |
| "grad_norm": 0.5628754496574402, |
| "learning_rate": 0.0005814851579007778, |
| "loss": 0.845370352268219, |
| "step": 9984 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "grad_norm": 0.8975974321365356, |
| "learning_rate": 0.0005626621934574736, |
| "loss": 0.8435695171356201, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.23847832721254053, |
| "eval_loss": 0.8075589748550224, |
| "eval_mse_loss": 0.8075589748550224, |
| "flow/cos_sim": 0.76152167540707, |
| "flow/improvement_ratio": 0.48205344698744823, |
| "flow/mag_ratio_mean": 0.7498847978300156, |
| "flow/mag_ratio_std": 0.17484679011857673, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.23847832721254053, |
| "eval_loss": 0.8075589748550224, |
| "eval_mse_loss": 0.8075589748550224, |
| "eval_runtime": 37.5927, |
| "eval_samples_per_second": 744.639, |
| "eval_steps_per_second": 11.651, |
| "flow/cos_sim": 0.76152167540707, |
| "flow/improvement_ratio": 0.48205344698744823, |
| "flow/mag_ratio_mean": 0.7498847978300156, |
| "flow/mag_ratio_std": 0.17484679011857673, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.48478130340399983, |
| "grad_norm": 0.7524752020835876, |
| "learning_rate": 0.0005437485380529165, |
| "loss": 0.8425331115722656, |
| "step": 10496 |
| }, |
| { |
| "epoch": 0.49660523763336567, |
| "grad_norm": 0.6698228120803833, |
| "learning_rate": 0.0005247715654111146, |
| "loss": 0.8414271473884583, |
| "step": 10752 |
| }, |
| { |
| "epoch": 0.5084291718627315, |
| "grad_norm": 0.5097092390060425, |
| "learning_rate": 0.0005057587408950816, |
| "loss": 0.8411047458648682, |
| "step": 11008 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "grad_norm": 0.7546167373657227, |
| "learning_rate": 0.00048673758175623157, |
| "loss": 0.841041088104248, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.23565144779067063, |
| "eval_loss": 0.7988868664660954, |
| "eval_mse_loss": 0.7988868664660954, |
| "flow/cos_sim": 0.7643485723837325, |
| "flow/improvement_ratio": 0.48455546578588005, |
| "flow/mag_ratio_mean": 0.7568667526930979, |
| "flow/mag_ratio_std": 0.17350949476298677, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.23565144779067063, |
| "eval_loss": 0.7988868664660954, |
| "eval_mse_loss": 0.7988868664660954, |
| "eval_runtime": 37.5643, |
| "eval_samples_per_second": 745.203, |
| "eval_steps_per_second": 11.66, |
| "flow/cos_sim": 0.7643485723837325, |
| "flow/improvement_ratio": 0.48455546578588005, |
| "flow/mag_ratio_mean": 0.7568667526930979, |
| "flow/mag_ratio_std": 0.17350949476298677, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5320770403214632, |
| "grad_norm": 0.24848276376724243, |
| "learning_rate": 0.00046773561730867265, |
| "loss": 0.8385207056999207, |
| "step": 11520 |
| }, |
| { |
| "epoch": 0.5439009745508291, |
| "grad_norm": 0.2328663319349289, |
| "learning_rate": 0.0004487803490860472, |
| "loss": 0.8425102829933167, |
| "step": 11776 |
| }, |
| { |
| "epoch": 0.5557249087801949, |
| "grad_norm": 1.160973072052002, |
| "learning_rate": 0.00042989921103857385, |
| "loss": 0.8413041830062866, |
| "step": 12032 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "grad_norm": 0.48494067788124084, |
| "learning_rate": 0.00041119265533512737, |
| "loss": 0.8376575708389282, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.23514270255146505, |
| "eval_loss": 0.7956640910884561, |
| "eval_mse_loss": 0.7956640910884561, |
| "flow/cos_sim": 0.7648573198002767, |
| "flow/improvement_ratio": 0.4841282178550006, |
| "flow/mag_ratio_mean": 0.754011933798115, |
| "flow/mag_ratio_std": 0.17356456295676428, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.23514270255146505, |
| "eval_loss": 0.7956640910884561, |
| "eval_mse_loss": 0.7956640910884561, |
| "eval_runtime": 37.4222, |
| "eval_samples_per_second": 748.032, |
| "eval_steps_per_second": 11.704, |
| "flow/cos_sim": 0.7648573198002767, |
| "flow/improvement_ratio": 0.4841282178550006, |
| "flow/mag_ratio_mean": 0.754011933798115, |
| "flow/mag_ratio_std": 0.17356456295676428, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5793727772389267, |
| "grad_norm": 0.2547326982021332, |
| "learning_rate": 0.0003925410556475763, |
| "loss": 0.8384107947349548, |
| "step": 12544 |
| }, |
| { |
| "epoch": 0.5911967114682924, |
| "grad_norm": 0.2702837586402893, |
| "learning_rate": 0.00037404498123644504, |
| "loss": 0.8394683003425598, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.6030206456976583, |
| "grad_norm": 0.4496554732322693, |
| "learning_rate": 0.0003557312014609784, |
| "loss": 0.8402306437492371, |
| "step": 13056 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "grad_norm": 0.40902939438819885, |
| "learning_rate": 0.00033762622184548686, |
| "loss": 0.8355939388275146, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.23389767066120556, |
| "eval_loss": 0.7927584718895829, |
| "eval_mse_loss": 0.7927584718895829, |
| "flow/cos_sim": 0.7661023556369625, |
| "flow/improvement_ratio": 0.4783722004389654, |
| "flow/mag_ratio_mean": 0.7541745113183375, |
| "flow/mag_ratio_std": 0.17828954082645782, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.23389767066120556, |
| "eval_loss": 0.7927584718895829, |
| "eval_mse_loss": 0.7927584718895829, |
| "eval_runtime": 37.4329, |
| "eval_samples_per_second": 747.818, |
| "eval_steps_per_second": 11.701, |
| "flow/cos_sim": 0.7661023556369625, |
| "flow/improvement_ratio": 0.4783722004389654, |
| "flow/mag_ratio_mean": 0.7541745113183375, |
| "flow/mag_ratio_std": 0.17828954082645782, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.62666851415639, |
| "grad_norm": 1.1801968812942505, |
| "learning_rate": 0.00031975624571791317, |
| "loss": 0.8333742022514343, |
| "step": 13568 |
| }, |
| { |
| "epoch": 0.6384924483857558, |
| "grad_norm": 0.1962728500366211, |
| "learning_rate": 0.00030214713628576984, |
| "loss": 0.8351457715034485, |
| "step": 13824 |
| }, |
| { |
| "epoch": 0.6503163826151217, |
| "grad_norm": 0.5504610538482666, |
| "learning_rate": 0.00028482437920433144, |
| "loss": 0.8345349431037903, |
| "step": 14080 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "grad_norm": 0.2537405788898468, |
| "learning_rate": 0.00026781304569125866, |
| "loss": 0.8331661820411682, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.23358930164276193, |
| "eval_loss": 0.790659844331001, |
| "eval_mse_loss": 0.790659844331001, |
| "flow/cos_sim": 0.7664107191780386, |
| "flow/improvement_ratio": 0.48042010873147883, |
| "flow/mag_ratio_mean": 0.7608423984214051, |
| "flow/mag_ratio_std": 0.17466495716816757, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.23358930164276193, |
| "eval_loss": 0.790659844331001, |
| "eval_mse_loss": 0.790659844331001, |
| "eval_runtime": 37.4882, |
| "eval_samples_per_second": 746.715, |
| "eval_steps_per_second": 11.684, |
| "flow/cos_sim": 0.7664107191780386, |
| "flow/improvement_ratio": 0.48042010873147883, |
| "flow/mag_ratio_mean": 0.7608423984214051, |
| "flow/mag_ratio_std": 0.17466495716816757, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6739642510738534, |
| "grad_norm": 0.44085371494293213, |
| "learning_rate": 0.0002512022089125798, |
| "loss": 0.8341683149337769, |
| "step": 14592 |
| }, |
| { |
| "epoch": 0.6857881853032193, |
| "grad_norm": 1.0609331130981445, |
| "learning_rate": 0.00023488564442791606, |
| "loss": 0.835027813911438, |
| "step": 14848 |
| }, |
| { |
| "epoch": 0.6976121195325851, |
| "grad_norm": 0.22936664521694183, |
| "learning_rate": 0.0002189527798174848, |
| "loss": 0.8334974050521851, |
| "step": 15104 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "grad_norm": 0.28989291191101074, |
| "learning_rate": 0.00020342667470826854, |
| "loss": 0.8318911790847778, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.2342552522614122, |
| "eval_loss": 0.7931138123279293, |
| "eval_mse_loss": 0.7931138123279293, |
| "flow/cos_sim": 0.7657447608366404, |
| "flow/improvement_ratio": 0.4886273239573387, |
| "flow/mag_ratio_mean": 0.7547847685748583, |
| "flow/mag_ratio_std": 0.1757596176985192, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.2342552522614122, |
| "eval_loss": 0.7931138123279293, |
| "eval_mse_loss": 0.7931138123279293, |
| "eval_runtime": 37.5138, |
| "eval_samples_per_second": 746.206, |
| "eval_steps_per_second": 11.676, |
| "flow/cos_sim": 0.7657447608366404, |
| "flow/improvement_ratio": 0.4886273239573387, |
| "flow/mag_ratio_mean": 0.7547847685748583, |
| "flow/mag_ratio_std": 0.1757596176985192, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.7212599879913169, |
| "grad_norm": 0.16880613565444946, |
| "learning_rate": 0.00018832980002442701, |
| "loss": 0.8349105715751648, |
| "step": 15616 |
| }, |
| { |
| "epoch": 0.7330839222206826, |
| "grad_norm": 0.21710531413555145, |
| "learning_rate": 0.00017368400546514002, |
| "loss": 0.8341897130012512, |
| "step": 15872 |
| }, |
| { |
| "epoch": 0.7449078564500485, |
| "grad_norm": 0.4028049111366272, |
| "learning_rate": 0.00015951048788154866, |
| "loss": 0.8300965428352356, |
| "step": 16128 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "grad_norm": 0.4703959822654724, |
| "learning_rate": 0.0001458297605985633, |
| "loss": 0.830625057220459, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.23394639184485833, |
| "eval_loss": 0.7917164079674847, |
| "eval_mse_loss": 0.7917164079674847, |
| "flow/cos_sim": 0.7660536425843086, |
| "flow/improvement_ratio": 0.4823139966486796, |
| "flow/mag_ratio_mean": 0.7634640123746167, |
| "flow/mag_ratio_std": 0.17673859605778297, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.23394639184485833, |
| "eval_loss": 0.7917164079674847, |
| "eval_mse_loss": 0.7917164079674847, |
| "eval_runtime": 37.6008, |
| "eval_samples_per_second": 744.478, |
| "eval_steps_per_second": 11.649, |
| "flow/cos_sim": 0.7660536425843086, |
| "flow/improvement_ratio": 0.4823139966486796, |
| "flow/mag_ratio_mean": 0.7634640123746167, |
| "flow/mag_ratio_std": 0.17673859605778297, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7685557249087802, |
| "grad_norm": 0.2865261733531952, |
| "learning_rate": 0.00013266162372593872, |
| "loss": 0.8289151787757874, |
| "step": 16640 |
| }, |
| { |
| "epoch": 0.780379659138146, |
| "grad_norm": 0.32574427127838135, |
| "learning_rate": 0.00012002513550158511, |
| "loss": 0.8299338817596436, |
| "step": 16896 |
| }, |
| { |
| "epoch": 0.7922035933675119, |
| "grad_norm": 0.19136400520801544, |
| "learning_rate": 0.00010793858470858986, |
| "loss": 0.831389307975769, |
| "step": 17152 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "grad_norm": 0.4910804331302643, |
| "learning_rate": 9.641946420587128e-05, |
| "loss": 0.8310226798057556, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.23344004797200635, |
| "eval_loss": 0.7905207032754541, |
| "eval_mse_loss": 0.7905207032754541, |
| "flow/cos_sim": 0.7665599775096598, |
| "flow/improvement_ratio": 0.48091149343747525, |
| "flow/mag_ratio_mean": 0.7604631242686755, |
| "flow/mag_ratio_std": 0.1777392937440306, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.23344004797200635, |
| "eval_loss": 0.7905207032754541, |
| "eval_mse_loss": 0.7905207032754541, |
| "eval_runtime": 37.5059, |
| "eval_samples_per_second": 746.363, |
| "eval_steps_per_second": 11.678, |
| "flow/cos_sim": 0.7665599775096598, |
| "flow/improvement_ratio": 0.48091149343747525, |
| "flow/mag_ratio_mean": 0.7604631242686755, |
| "flow/mag_ratio_std": 0.1777392937440306, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8158514618262436, |
| "grad_norm": 0.41079503297805786, |
| "learning_rate": 8.548444561077174e-05, |
| "loss": 0.831270694732666, |
| "step": 17664 |
| }, |
| { |
| "epoch": 0.8276753960556095, |
| "grad_norm": 0.20108859241008759, |
| "learning_rate": 7.514935517023558e-05, |
| "loss": 0.8281430006027222, |
| "step": 17920 |
| }, |
| { |
| "epoch": 0.8394993302849753, |
| "grad_norm": 0.22395165264606476, |
| "learning_rate": 6.542915085548828e-05, |
| "loss": 0.8286972045898438, |
| "step": 18176 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "grad_norm": 0.12410369515419006, |
| "learning_rate": 5.6337900713373745e-05, |
| "loss": 0.828626275062561, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.23364517943227672, |
| "eval_loss": 0.7902474757198874, |
| "eval_mse_loss": 0.7902474757198874, |
| "flow/cos_sim": 0.7663548270316973, |
| "flow/improvement_ratio": 0.4790440064655047, |
| "flow/mag_ratio_mean": 0.7584420379982691, |
| "flow/mag_ratio_std": 0.17626052373620474, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.23364517943227672, |
| "eval_loss": 0.7902474757198874, |
| "eval_mse_loss": 0.7902474757198874, |
| "eval_runtime": 37.4969, |
| "eval_samples_per_second": 746.541, |
| "eval_steps_per_second": 11.681, |
| "flow/cos_sim": 0.7663548270316973, |
| "flow/improvement_ratio": 0.4790440064655047, |
| "flow/mag_ratio_mean": 0.7584420379982691, |
| "flow/mag_ratio_std": 0.17626052373620474, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8631471987437069, |
| "grad_norm": 0.15060371160507202, |
| "learning_rate": 4.78887625056757e-05, |
| "loss": 0.8286436796188354, |
| "step": 18688 |
| }, |
| { |
| "epoch": 0.8749711329730728, |
| "grad_norm": 0.1537846028804779, |
| "learning_rate": 4.012312516433581e-05, |
| "loss": 0.8254930377006531, |
| "step": 18944 |
| }, |
| { |
| "epoch": 0.8867950672024387, |
| "grad_norm": 0.4253118336200714, |
| "learning_rate": 3.2991328308497545e-05, |
| "loss": 0.8294442296028137, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "grad_norm": 0.17603912949562073, |
| "learning_rate": 2.653543286964183e-05, |
| "loss": 0.8307968378067017, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.2343874951679957, |
| "eval_loss": 0.7920145923144197, |
| "eval_mse_loss": 0.7920145923144197, |
| "flow/cos_sim": 0.7656125166372622, |
| "flow/improvement_ratio": 0.4853253698648383, |
| "flow/mag_ratio_mean": 0.7600582418920787, |
| "flow/mag_ratio_std": 0.17680819529920952, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.2343874951679957, |
| "eval_loss": 0.7920145923144197, |
| "eval_mse_loss": 0.7920145923144197, |
| "eval_runtime": 37.6789, |
| "eval_samples_per_second": 742.935, |
| "eval_steps_per_second": 11.625, |
| "flow/cos_sim": 0.7656125166372622, |
| "flow/improvement_ratio": 0.4853253698648383, |
| "flow/mag_ratio_mean": 0.7600582418920787, |
| "flow/mag_ratio_std": 0.17680819529920952, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.9104429356611704, |
| "grad_norm": 0.13290032744407654, |
| "learning_rate": 2.076478246200819e-05, |
| "loss": 0.8278121948242188, |
| "step": 19712 |
| }, |
| { |
| "epoch": 0.9222668698905362, |
| "grad_norm": 0.20168907940387726, |
| "learning_rate": 1.5687728945045944e-05, |
| "loss": 0.8273869156837463, |
| "step": 19968 |
| }, |
| { |
| "epoch": 0.9340908041199021, |
| "grad_norm": 0.20389237999916077, |
| "learning_rate": 1.1311620335770879e-05, |
| "loss": 0.831144392490387, |
| "step": 20224 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "grad_norm": 0.26355621218681335, |
| "learning_rate": 7.642790173984836e-06, |
| "loss": 0.8322795629501343, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.23226006676072944, |
| "eval_loss": 0.7871269678960652, |
| "eval_mse_loss": 0.7871269678960652, |
| "flow/cos_sim": 0.7677399510662305, |
| "flow/improvement_ratio": 0.4782098607249456, |
| "flow/mag_ratio_mean": 0.76152302032192, |
| "flow/mag_ratio_std": 0.1765111445632155, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.23226006676072944, |
| "eval_loss": 0.7871269678960652, |
| "eval_mse_loss": 0.7871269678960652, |
| "eval_runtime": 37.4829, |
| "eval_samples_per_second": 746.822, |
| "eval_steps_per_second": 11.685, |
| "flow/cos_sim": 0.7677399510662305, |
| "flow/improvement_ratio": 0.4782098607249456, |
| "flow/mag_ratio_mean": 0.76152302032192, |
| "flow/mag_ratio_std": 0.1765111445632155, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.9577386725786338, |
| "grad_norm": 0.35688987374305725, |
| "learning_rate": 4.686548355746001e-06, |
| "loss": 0.8279744386672974, |
| "step": 20736 |
| }, |
| { |
| "epoch": 0.9695626068079997, |
| "grad_norm": 0.15006007254123688, |
| "learning_rate": 2.447173448359541e-06, |
| "loss": 0.8288099765777588, |
| "step": 20992 |
| }, |
| { |
| "epoch": 0.9813865410373654, |
| "grad_norm": 0.13515928387641907, |
| "learning_rate": 9.324369820445933e-07, |
| "loss": 0.8292235136032104, |
| "step": 21248 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "grad_norm": 0.19579511880874634, |
| "learning_rate": 1.3265238678672464e-07, |
| "loss": 0.8291770219802856, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.23236965187336212, |
| "eval_loss": 0.7870777566411179, |
| "eval_mse_loss": 0.7870777566411179, |
| "flow/cos_sim": 0.7676303687433129, |
| "flow/improvement_ratio": 0.48667068133071134, |
| "flow/mag_ratio_mean": 0.7602007927959913, |
| "flow/mag_ratio_std": 0.17566703702216824, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.23236965187336212, |
| "eval_loss": 0.7870777566411179, |
| "eval_mse_loss": 0.7870777566411179, |
| "eval_runtime": 37.3306, |
| "eval_samples_per_second": 749.867, |
| "eval_steps_per_second": 11.733, |
| "flow/cos_sim": 0.7676303687433129, |
| "flow/improvement_ratio": 0.48667068133071134, |
| "flow/mag_ratio_mean": 0.7602007927959913, |
| "flow/mag_ratio_std": 0.17566703702216824, |
| "step": 21504 |
| } |
| ], |
| "logging_steps": 256, |
| "max_steps": 21651, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1024, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|