Attila1011's picture
Upload folder using huggingface_hub
a341351 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 1024,
"global_step": 21651,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011823934229365849,
"grad_norm": 0.2415127456188202,
"learning_rate": 0.000498046875,
"loss": 1.734155297279358,
"step": 256
},
{
"epoch": 0.023647868458731697,
"grad_norm": 0.200724259018898,
"learning_rate": 0.000998046875,
"loss": 1.1768943071365356,
"step": 512
},
{
"epoch": 0.03547180268809755,
"grad_norm": 0.10385265946388245,
"learning_rate": 0.000999640996023194,
"loss": 1.0633362531661987,
"step": 768
},
{
"epoch": 0.047295736917463395,
"grad_norm": 0.24572038650512695,
"learning_rate": 0.0009985588674043958,
"loss": 1.0212352275848389,
"step": 1024
},
{
"epoch": 0.047295736917463395,
"eval_cos_loss": 0.2939360714666375,
"eval_loss": 0.9792764313416938,
"eval_mse_loss": 0.9792764313416938,
"flow/cos_sim": 0.706063949762414,
"flow/improvement_ratio": 0.47937897269584273,
"flow/mag_ratio_mean": 0.7043995116943638,
"flow/mag_ratio_std": 0.14764773857579927,
"step": 1024
},
{
"epoch": 0.047295736917463395,
"eval_cos_loss": 0.2939360714666375,
"eval_loss": 0.9792764313416938,
"eval_mse_loss": 0.9792764313416938,
"eval_runtime": 37.309,
"eval_samples_per_second": 750.302,
"eval_steps_per_second": 11.74,
"flow/cos_sim": 0.706063949762414,
"flow/improvement_ratio": 0.47937897269584273,
"flow/mag_ratio_mean": 0.7043995116943638,
"flow/mag_ratio_std": 0.14764773857579927,
"step": 1024
},
{
"epoch": 0.05911967114682925,
"grad_norm": 0.14034895598888397,
"learning_rate": 0.0009967551747861387,
"loss": 0.9987254738807678,
"step": 1280
},
{
"epoch": 0.0709436053761951,
"grad_norm": 0.32785671949386597,
"learning_rate": 0.000994232528651847,
"loss": 0.9810371398925781,
"step": 1536
},
{
"epoch": 0.08276753960556095,
"grad_norm": 0.4202195107936859,
"learning_rate": 0.0009909945800260092,
"loss": 0.9669110774993896,
"step": 1792
},
{
"epoch": 0.09459147383492679,
"grad_norm": 0.11297852545976639,
"learning_rate": 0.0009870460151900522,
"loss": 0.9570462107658386,
"step": 2048
},
{
"epoch": 0.09459147383492679,
"eval_cos_loss": 0.27135700569169163,
"eval_loss": 0.9140212976769225,
"eval_mse_loss": 0.9140212976769225,
"flow/cos_sim": 0.7286430077465702,
"flow/improvement_ratio": 0.4797877063081689,
"flow/mag_ratio_mean": 0.7240678813631676,
"flow/mag_ratio_std": 0.15831342356526143,
"step": 2048
},
{
"epoch": 0.09459147383492679,
"eval_cos_loss": 0.27135700569169163,
"eval_loss": 0.9140212976769225,
"eval_mse_loss": 0.9140212976769225,
"eval_runtime": 37.3614,
"eval_samples_per_second": 749.249,
"eval_steps_per_second": 11.723,
"flow/cos_sim": 0.7286430077465702,
"flow/improvement_ratio": 0.4797877063081689,
"flow/mag_ratio_mean": 0.7240678813631676,
"flow/mag_ratio_std": 0.15831342356526143,
"step": 2048
},
{
"epoch": 0.10641540806429264,
"grad_norm": 0.10684677958488464,
"learning_rate": 0.0009823925488998885,
"loss": 0.9458644986152649,
"step": 2304
},
{
"epoch": 0.1182393422936585,
"grad_norm": 0.35598301887512207,
"learning_rate": 0.0009770409161149525,
"loss": 0.9369097948074341,
"step": 2560
},
{
"epoch": 0.13006327652302435,
"grad_norm": 0.24151749908924103,
"learning_rate": 0.0009709988622506973,
"loss": 0.925268292427063,
"step": 2816
},
{
"epoch": 0.1418872107523902,
"grad_norm": 0.17448143661022186,
"learning_rate": 0.000964275131968659,
"loss": 0.9168843030929565,
"step": 3072
},
{
"epoch": 0.1418872107523902,
"eval_cos_loss": 0.258329911795381,
"eval_loss": 0.872469331303688,
"eval_mse_loss": 0.872469331303688,
"flow/cos_sim": 0.7416701187554016,
"flow/improvement_ratio": 0.4866235294978913,
"flow/mag_ratio_mean": 0.7309322730046973,
"flow/mag_ratio_std": 0.16244563712104815,
"step": 3072
},
{
"epoch": 0.1418872107523902,
"eval_cos_loss": 0.258329911795381,
"eval_loss": 0.872469331303688,
"eval_mse_loss": 0.872469331303688,
"eval_runtime": 37.367,
"eval_samples_per_second": 749.136,
"eval_steps_per_second": 11.722,
"flow/cos_sim": 0.7416701187554016,
"flow/improvement_ratio": 0.4866235294978913,
"flow/mag_ratio_mean": 0.7309322730046973,
"flow/mag_ratio_std": 0.16244563712104815,
"step": 3072
},
{
"epoch": 0.15371114498175603,
"grad_norm": 0.16108064353466034,
"learning_rate": 0.0009568794565203123,
"loss": 0.91062992811203,
"step": 3328
},
{
"epoch": 0.1655350792111219,
"grad_norm": 0.21124346554279327,
"learning_rate": 0.0009488225396630347,
"loss": 0.9045400023460388,
"step": 3584
},
{
"epoch": 0.17735901344048774,
"grad_norm": 0.49332571029663086,
"learning_rate": 0.0009401160421685646,
"loss": 0.8949952721595764,
"step": 3840
},
{
"epoch": 0.18918294766985358,
"grad_norm": 0.25588458776474,
"learning_rate": 0.0009307725649463714,
"loss": 0.8952076435089111,
"step": 4096
},
{
"epoch": 0.18918294766985358,
"eval_cos_loss": 0.24964532671181577,
"eval_loss": 0.8440415833366516,
"eval_mse_loss": 0.8440415833366516,
"flow/cos_sim": 0.7503547062884727,
"flow/improvement_ratio": 0.4770904905311593,
"flow/mag_ratio_mean": 0.7401841025646418,
"flow/mag_ratio_std": 0.1675400482533185,
"step": 4096
},
{
"epoch": 0.18918294766985358,
"eval_cos_loss": 0.24964532671181577,
"eval_loss": 0.8440415833366516,
"eval_mse_loss": 0.8440415833366516,
"eval_runtime": 37.4493,
"eval_samples_per_second": 747.491,
"eval_steps_per_second": 11.696,
"flow/cos_sim": 0.7503547062884727,
"flow/improvement_ratio": 0.4770904905311593,
"flow/mag_ratio_mean": 0.7401841025646418,
"flow/mag_ratio_std": 0.1675400482533185,
"step": 4096
},
{
"epoch": 0.20100688189921945,
"grad_norm": 0.37869253754615784,
"learning_rate": 0.0009208056308063659,
"loss": 0.8890768885612488,
"step": 4352
},
{
"epoch": 0.2128308161285853,
"grad_norm": 0.9834415316581726,
"learning_rate": 0.0009102296648873445,
"loss": 0.8840116262435913,
"step": 4608
},
{
"epoch": 0.22465475035795113,
"grad_norm": 1.289456844329834,
"learning_rate": 0.0008990599737794927,
"loss": 0.8811625838279724,
"step": 4864
},
{
"epoch": 0.236478684587317,
"grad_norm": 0.24145644903182983,
"learning_rate": 0.0008873127233711644,
"loss": 0.8737959861755371,
"step": 5120
},
{
"epoch": 0.236478684587317,
"eval_cos_loss": 0.24506433716375534,
"eval_loss": 0.8307346953376787,
"eval_mse_loss": 0.8307346953376787,
"flow/cos_sim": 0.7549356581413582,
"flow/improvement_ratio": 0.47620510081994477,
"flow/mag_ratio_mean": 0.7473646888177689,
"flow/mag_ratio_std": 0.16763514821251776,
"step": 5120
},
{
"epoch": 0.236478684587317,
"eval_cos_loss": 0.24506433716375534,
"eval_loss": 0.8307346953376787,
"eval_mse_loss": 0.8307346953376787,
"eval_runtime": 37.5063,
"eval_samples_per_second": 746.354,
"eval_steps_per_second": 11.678,
"flow/cos_sim": 0.7549356581413582,
"flow/improvement_ratio": 0.47620510081994477,
"flow/mag_ratio_mean": 0.7473646888177689,
"flow/mag_ratio_std": 0.16763514821251776,
"step": 5120
},
{
"epoch": 0.24830261881668284,
"grad_norm": 1.039290428161621,
"learning_rate": 0.0008750049154520011,
"loss": 0.8717202544212341,
"step": 5376
},
{
"epoch": 0.2601265530460487,
"grad_norm": 0.22408978641033173,
"learning_rate": 0.0008621543631062487,
"loss": 0.8717328906059265,
"step": 5632
},
{
"epoch": 0.27195048727541454,
"grad_norm": 0.7838807106018066,
"learning_rate": 0.0008487796649318904,
"loss": 0.8674213886260986,
"step": 5888
},
{
"epoch": 0.2837744215047804,
"grad_norm": 0.15397988259792328,
"learning_rate": 0.0008349553511611836,
"loss": 0.8678247332572937,
"step": 6144
},
{
"epoch": 0.2837744215047804,
"eval_cos_loss": 0.2434517246879399,
"eval_loss": 0.8235682821981439,
"eval_mse_loss": 0.8235682821981439,
"flow/cos_sim": 0.7565482973235927,
"flow/improvement_ratio": 0.48134120863322255,
"flow/mag_ratio_mean": 0.752200963970733,
"flow/mag_ratio_std": 0.17564187855481012,
"step": 6144
},
{
"epoch": 0.2837744215047804,
"eval_cos_loss": 0.2434517246879399,
"eval_loss": 0.8235682821981439,
"eval_mse_loss": 0.8235682821981439,
"eval_runtime": 37.4608,
"eval_samples_per_second": 747.262,
"eval_steps_per_second": 11.692,
"flow/cos_sim": 0.7565482973235927,
"flow/improvement_ratio": 0.48134120863322255,
"flow/mag_ratio_mean": 0.752200963970733,
"flow/mag_ratio_std": 0.17564187855481012,
"step": 6144
},
{
"epoch": 0.2955983557341462,
"grad_norm": 0.15729770064353943,
"learning_rate": 0.0008205930168562264,
"loss": 0.8611059188842773,
"step": 6400
},
{
"epoch": 0.30742228996351206,
"grad_norm": 0.5960604548454285,
"learning_rate": 0.0008057666884383055,
"loss": 0.8611810803413391,
"step": 6656
},
{
"epoch": 0.3192462241928779,
"grad_norm": 0.12568458914756775,
"learning_rate": 0.0007905583005945037,
"loss": 0.8600746393203735,
"step": 6912
},
{
"epoch": 0.3310701584222438,
"grad_norm": 0.15424804389476776,
"learning_rate": 0.000774870597388272,
"loss": 0.8601675629615784,
"step": 7168
},
{
"epoch": 0.3310701584222438,
"eval_cos_loss": 0.24115517810327278,
"eval_loss": 0.8168311545021458,
"eval_mse_loss": 0.8168311545021458,
"flow/cos_sim": 0.758844825230777,
"flow/improvement_ratio": 0.48181676517610683,
"flow/mag_ratio_mean": 0.7439628689800768,
"flow/mag_ratio_std": 0.17355786634770703,
"step": 7168
},
{
"epoch": 0.3310701584222438,
"eval_cos_loss": 0.24115517810327278,
"eval_loss": 0.8168311545021458,
"eval_mse_loss": 0.8168311545021458,
"eval_runtime": 37.4871,
"eval_samples_per_second": 746.736,
"eval_steps_per_second": 11.684,
"flow/cos_sim": 0.758844825230777,
"flow/improvement_ratio": 0.48181676517610683,
"flow/mag_ratio_mean": 0.7439628689800768,
"flow/mag_ratio_std": 0.17355786634770703,
"step": 7168
},
{
"epoch": 0.34289409265160964,
"grad_norm": 0.1787535548210144,
"learning_rate": 0.0007587850741039065,
"loss": 0.8584595322608948,
"step": 7424
},
{
"epoch": 0.3547180268809755,
"grad_norm": 0.8946473002433777,
"learning_rate": 0.000742325011311212,
"loss": 0.8562237620353699,
"step": 7680
},
{
"epoch": 0.3665419611103413,
"grad_norm": 0.1438024491071701,
"learning_rate": 0.0007255142316508366,
"loss": 0.8549248576164246,
"step": 7936
},
{
"epoch": 0.37836589533970716,
"grad_norm": 0.15269657969474792,
"learning_rate": 0.0007083770653557752,
"loss": 0.8540560603141785,
"step": 8192
},
{
"epoch": 0.37836589533970716,
"eval_cos_loss": 0.238492783584279,
"eval_loss": 0.8075671801556191,
"eval_mse_loss": 0.8075671801556191,
"flow/cos_sim": 0.7615072517634527,
"flow/improvement_ratio": 0.48001350197073533,
"flow/mag_ratio_mean": 0.7569256454842276,
"flow/mag_ratio_std": 0.17385539324044091,
"step": 8192
},
{
"epoch": 0.37836589533970716,
"eval_cos_loss": 0.238492783584279,
"eval_loss": 0.8075671801556191,
"eval_mse_loss": 0.8075671801556191,
"eval_runtime": 37.4059,
"eval_samples_per_second": 748.358,
"eval_steps_per_second": 11.709,
"flow/cos_sim": 0.7615072517634527,
"flow/improvement_ratio": 0.48001350197073533,
"flow/mag_ratio_mean": 0.7569256454842276,
"flow/mag_ratio_std": 0.17385539324044091,
"step": 8192
},
{
"epoch": 0.390189829569073,
"grad_norm": 0.2120441496372223,
"learning_rate": 0.0006909383150382365,
"loss": 0.850504994392395,
"step": 8448
},
{
"epoch": 0.4020137637984389,
"grad_norm": 1.0651849508285522,
"learning_rate": 0.0006732232197928334,
"loss": 0.8530774116516113,
"step": 8704
},
{
"epoch": 0.41383769802780473,
"grad_norm": 0.18748371303081512,
"learning_rate": 0.0006552574186680553,
"loss": 0.849456250667572,
"step": 8960
},
{
"epoch": 0.4256616322571706,
"grad_norm": 1.3815633058547974,
"learning_rate": 0.0006370669135588852,
"loss": 0.8472809791564941,
"step": 9216
},
{
"epoch": 0.4256616322571706,
"eval_cos_loss": 0.23755816625405665,
"eval_loss": 0.8048887692357851,
"eval_mse_loss": 0.8048887692357851,
"flow/cos_sim": 0.7624418421150887,
"flow/improvement_ratio": 0.47895704168979436,
"flow/mag_ratio_mean": 0.7494950834746774,
"flow/mag_ratio_std": 0.1740222738621986,
"step": 9216
},
{
"epoch": 0.4256616322571706,
"eval_cos_loss": 0.23755816625405665,
"eval_loss": 0.8048887692357851,
"eval_mse_loss": 0.8048887692357851,
"eval_runtime": 37.4388,
"eval_samples_per_second": 747.7,
"eval_steps_per_second": 11.699,
"flow/cos_sim": 0.7624418421150887,
"flow/improvement_ratio": 0.47895704168979436,
"flow/mag_ratio_mean": 0.7494950834746774,
"flow/mag_ratio_std": 0.1740222738621986,
"step": 9216
},
{
"epoch": 0.4374855664865364,
"grad_norm": 0.30135899782180786,
"learning_rate": 0.0006187502147205189,
"loss": 0.8494647741317749,
"step": 9472
},
{
"epoch": 0.44930950071590225,
"grad_norm": 0.1467859447002411,
"learning_rate": 0.000600190188915804,
"loss": 0.8477605581283569,
"step": 9728
},
{
"epoch": 0.4611334349452681,
"grad_norm": 0.5628754496574402,
"learning_rate": 0.0005814851579007778,
"loss": 0.845370352268219,
"step": 9984
},
{
"epoch": 0.472957369174634,
"grad_norm": 0.8975974321365356,
"learning_rate": 0.0005626621934574736,
"loss": 0.8435695171356201,
"step": 10240
},
{
"epoch": 0.472957369174634,
"eval_cos_loss": 0.23847832721254053,
"eval_loss": 0.8075589748550224,
"eval_mse_loss": 0.8075589748550224,
"flow/cos_sim": 0.76152167540707,
"flow/improvement_ratio": 0.48205344698744823,
"flow/mag_ratio_mean": 0.7498847978300156,
"flow/mag_ratio_std": 0.17484679011857673,
"step": 10240
},
{
"epoch": 0.472957369174634,
"eval_cos_loss": 0.23847832721254053,
"eval_loss": 0.8075589748550224,
"eval_mse_loss": 0.8075589748550224,
"eval_runtime": 37.5927,
"eval_samples_per_second": 744.639,
"eval_steps_per_second": 11.651,
"flow/cos_sim": 0.76152167540707,
"flow/improvement_ratio": 0.48205344698744823,
"flow/mag_ratio_mean": 0.7498847978300156,
"flow/mag_ratio_std": 0.17484679011857673,
"step": 10240
},
{
"epoch": 0.48478130340399983,
"grad_norm": 0.7524752020835876,
"learning_rate": 0.0005437485380529165,
"loss": 0.8425331115722656,
"step": 10496
},
{
"epoch": 0.49660523763336567,
"grad_norm": 0.6698228120803833,
"learning_rate": 0.0005247715654111146,
"loss": 0.8414271473884583,
"step": 10752
},
{
"epoch": 0.5084291718627315,
"grad_norm": 0.5097092390060425,
"learning_rate": 0.0005057587408950816,
"loss": 0.8411047458648682,
"step": 11008
},
{
"epoch": 0.5202531060920974,
"grad_norm": 0.7546167373657227,
"learning_rate": 0.00048673758175623157,
"loss": 0.841041088104248,
"step": 11264
},
{
"epoch": 0.5202531060920974,
"eval_cos_loss": 0.23565144779067063,
"eval_loss": 0.7988868664660954,
"eval_mse_loss": 0.7988868664660954,
"flow/cos_sim": 0.7643485723837325,
"flow/improvement_ratio": 0.48455546578588005,
"flow/mag_ratio_mean": 0.7568667526930979,
"flow/mag_ratio_std": 0.17350949476298677,
"step": 11264
},
{
"epoch": 0.5202531060920974,
"eval_cos_loss": 0.23565144779067063,
"eval_loss": 0.7988868664660954,
"eval_mse_loss": 0.7988868664660954,
"eval_runtime": 37.5643,
"eval_samples_per_second": 745.203,
"eval_steps_per_second": 11.66,
"flow/cos_sim": 0.7643485723837325,
"flow/improvement_ratio": 0.48455546578588005,
"flow/mag_ratio_mean": 0.7568667526930979,
"flow/mag_ratio_std": 0.17350949476298677,
"step": 11264
},
{
"epoch": 0.5320770403214632,
"grad_norm": 0.24848276376724243,
"learning_rate": 0.00046773561730867265,
"loss": 0.8385207056999207,
"step": 11520
},
{
"epoch": 0.5439009745508291,
"grad_norm": 0.2328663319349289,
"learning_rate": 0.0004487803490860472,
"loss": 0.8425102829933167,
"step": 11776
},
{
"epoch": 0.5557249087801949,
"grad_norm": 1.160973072052002,
"learning_rate": 0.00042989921103857385,
"loss": 0.8413041830062866,
"step": 12032
},
{
"epoch": 0.5675488430095608,
"grad_norm": 0.48494067788124084,
"learning_rate": 0.00041119265533512737,
"loss": 0.8376575708389282,
"step": 12288
},
{
"epoch": 0.5675488430095608,
"eval_cos_loss": 0.23514270255146505,
"eval_loss": 0.7956640910884561,
"eval_mse_loss": 0.7956640910884561,
"flow/cos_sim": 0.7648573198002767,
"flow/improvement_ratio": 0.4841282178550006,
"flow/mag_ratio_mean": 0.754011933798115,
"flow/mag_ratio_std": 0.17356456295676428,
"step": 12288
},
{
"epoch": 0.5675488430095608,
"eval_cos_loss": 0.23514270255146505,
"eval_loss": 0.7956640910884561,
"eval_mse_loss": 0.7956640910884561,
"eval_runtime": 37.4222,
"eval_samples_per_second": 748.032,
"eval_steps_per_second": 11.704,
"flow/cos_sim": 0.7648573198002767,
"flow/improvement_ratio": 0.4841282178550006,
"flow/mag_ratio_mean": 0.754011933798115,
"flow/mag_ratio_std": 0.17356456295676428,
"step": 12288
},
{
"epoch": 0.5793727772389267,
"grad_norm": 0.2547326982021332,
"learning_rate": 0.0003925410556475763,
"loss": 0.8384107947349548,
"step": 12544
},
{
"epoch": 0.5911967114682924,
"grad_norm": 0.2702837586402893,
"learning_rate": 0.00037404498123644504,
"loss": 0.8394683003425598,
"step": 12800
},
{
"epoch": 0.6030206456976583,
"grad_norm": 0.4496554732322693,
"learning_rate": 0.0003557312014609784,
"loss": 0.8402306437492371,
"step": 13056
},
{
"epoch": 0.6148445799270241,
"grad_norm": 0.40902939438819885,
"learning_rate": 0.00033762622184548686,
"loss": 0.8355939388275146,
"step": 13312
},
{
"epoch": 0.6148445799270241,
"eval_cos_loss": 0.23389767066120556,
"eval_loss": 0.7927584718895829,
"eval_mse_loss": 0.7927584718895829,
"flow/cos_sim": 0.7661023556369625,
"flow/improvement_ratio": 0.4783722004389654,
"flow/mag_ratio_mean": 0.7541745113183375,
"flow/mag_ratio_std": 0.17828954082645782,
"step": 13312
},
{
"epoch": 0.6148445799270241,
"eval_cos_loss": 0.23389767066120556,
"eval_loss": 0.7927584718895829,
"eval_mse_loss": 0.7927584718895829,
"eval_runtime": 37.4329,
"eval_samples_per_second": 747.818,
"eval_steps_per_second": 11.701,
"flow/cos_sim": 0.7661023556369625,
"flow/improvement_ratio": 0.4783722004389654,
"flow/mag_ratio_mean": 0.7541745113183375,
"flow/mag_ratio_std": 0.17828954082645782,
"step": 13312
},
{
"epoch": 0.62666851415639,
"grad_norm": 1.1801968812942505,
"learning_rate": 0.00031975624571791317,
"loss": 0.8333742022514343,
"step": 13568
},
{
"epoch": 0.6384924483857558,
"grad_norm": 0.1962728500366211,
"learning_rate": 0.00030214713628576984,
"loss": 0.8351457715034485,
"step": 13824
},
{
"epoch": 0.6503163826151217,
"grad_norm": 0.5504610538482666,
"learning_rate": 0.00028482437920433144,
"loss": 0.8345349431037903,
"step": 14080
},
{
"epoch": 0.6621403168444876,
"grad_norm": 0.2537405788898468,
"learning_rate": 0.00026781304569125866,
"loss": 0.8331661820411682,
"step": 14336
},
{
"epoch": 0.6621403168444876,
"eval_cos_loss": 0.23358930164276193,
"eval_loss": 0.790659844331001,
"eval_mse_loss": 0.790659844331001,
"flow/cos_sim": 0.7664107191780386,
"flow/improvement_ratio": 0.48042010873147883,
"flow/mag_ratio_mean": 0.7608423984214051,
"flow/mag_ratio_std": 0.17466495716816757,
"step": 14336
},
{
"epoch": 0.6621403168444876,
"eval_cos_loss": 0.23358930164276193,
"eval_loss": 0.790659844331001,
"eval_mse_loss": 0.790659844331001,
"eval_runtime": 37.4882,
"eval_samples_per_second": 746.715,
"eval_steps_per_second": 11.684,
"flow/cos_sim": 0.7664107191780386,
"flow/improvement_ratio": 0.48042010873147883,
"flow/mag_ratio_mean": 0.7608423984214051,
"flow/mag_ratio_std": 0.17466495716816757,
"step": 14336
},
{
"epoch": 0.6739642510738534,
"grad_norm": 0.44085371494293213,
"learning_rate": 0.0002512022089125798,
"loss": 0.8341683149337769,
"step": 14592
},
{
"epoch": 0.6857881853032193,
"grad_norm": 1.0609331130981445,
"learning_rate": 0.00023488564442791606,
"loss": 0.835027813911438,
"step": 14848
},
{
"epoch": 0.6976121195325851,
"grad_norm": 0.22936664521694183,
"learning_rate": 0.0002189527798174848,
"loss": 0.8334974050521851,
"step": 15104
},
{
"epoch": 0.709436053761951,
"grad_norm": 0.28989291191101074,
"learning_rate": 0.00020342667470826854,
"loss": 0.8318911790847778,
"step": 15360
},
{
"epoch": 0.709436053761951,
"eval_cos_loss": 0.2342552522614122,
"eval_loss": 0.7931138123279293,
"eval_mse_loss": 0.7931138123279293,
"flow/cos_sim": 0.7657447608366404,
"flow/improvement_ratio": 0.4886273239573387,
"flow/mag_ratio_mean": 0.7547847685748583,
"flow/mag_ratio_std": 0.1757596176985192,
"step": 15360
},
{
"epoch": 0.709436053761951,
"eval_cos_loss": 0.2342552522614122,
"eval_loss": 0.7931138123279293,
"eval_mse_loss": 0.7931138123279293,
"eval_runtime": 37.5138,
"eval_samples_per_second": 746.206,
"eval_steps_per_second": 11.676,
"flow/cos_sim": 0.7657447608366404,
"flow/improvement_ratio": 0.4886273239573387,
"flow/mag_ratio_mean": 0.7547847685748583,
"flow/mag_ratio_std": 0.1757596176985192,
"step": 15360
},
{
"epoch": 0.7212599879913169,
"grad_norm": 0.16880613565444946,
"learning_rate": 0.00018832980002442701,
"loss": 0.8349105715751648,
"step": 15616
},
{
"epoch": 0.7330839222206826,
"grad_norm": 0.21710531413555145,
"learning_rate": 0.00017368400546514002,
"loss": 0.8341897130012512,
"step": 15872
},
{
"epoch": 0.7449078564500485,
"grad_norm": 0.4028049111366272,
"learning_rate": 0.00015951048788154866,
"loss": 0.8300965428352356,
"step": 16128
},
{
"epoch": 0.7567317906794143,
"grad_norm": 0.4703959822654724,
"learning_rate": 0.0001458297605985633,
"loss": 0.830625057220459,
"step": 16384
},
{
"epoch": 0.7567317906794143,
"eval_cos_loss": 0.23394639184485833,
"eval_loss": 0.7917164079674847,
"eval_mse_loss": 0.7917164079674847,
"flow/cos_sim": 0.7660536425843086,
"flow/improvement_ratio": 0.4823139966486796,
"flow/mag_ratio_mean": 0.7634640123746167,
"flow/mag_ratio_std": 0.17673859605778297,
"step": 16384
},
{
"epoch": 0.7567317906794143,
"eval_cos_loss": 0.23394639184485833,
"eval_loss": 0.7917164079674847,
"eval_mse_loss": 0.7917164079674847,
"eval_runtime": 37.6008,
"eval_samples_per_second": 744.478,
"eval_steps_per_second": 11.649,
"flow/cos_sim": 0.7660536425843086,
"flow/improvement_ratio": 0.4823139966486796,
"flow/mag_ratio_mean": 0.7634640123746167,
"flow/mag_ratio_std": 0.17673859605778297,
"step": 16384
},
{
"epoch": 0.7685557249087802,
"grad_norm": 0.2865261733531952,
"learning_rate": 0.00013266162372593872,
"loss": 0.8289151787757874,
"step": 16640
},
{
"epoch": 0.780379659138146,
"grad_norm": 0.32574427127838135,
"learning_rate": 0.00012002513550158511,
"loss": 0.8299338817596436,
"step": 16896
},
{
"epoch": 0.7922035933675119,
"grad_norm": 0.19136400520801544,
"learning_rate": 0.00010793858470858986,
"loss": 0.831389307975769,
"step": 17152
},
{
"epoch": 0.8040275275968778,
"grad_norm": 0.4910804331302643,
"learning_rate": 9.641946420587128e-05,
"loss": 0.8310226798057556,
"step": 17408
},
{
"epoch": 0.8040275275968778,
"eval_cos_loss": 0.23344004797200635,
"eval_loss": 0.7905207032754541,
"eval_mse_loss": 0.7905207032754541,
"flow/cos_sim": 0.7665599775096598,
"flow/improvement_ratio": 0.48091149343747525,
"flow/mag_ratio_mean": 0.7604631242686755,
"flow/mag_ratio_std": 0.1777392937440306,
"step": 17408
},
{
"epoch": 0.8040275275968778,
"eval_cos_loss": 0.23344004797200635,
"eval_loss": 0.7905207032754541,
"eval_mse_loss": 0.7905207032754541,
"eval_runtime": 37.5059,
"eval_samples_per_second": 746.363,
"eval_steps_per_second": 11.678,
"flow/cos_sim": 0.7665599775096598,
"flow/improvement_ratio": 0.48091149343747525,
"flow/mag_ratio_mean": 0.7604631242686755,
"flow/mag_ratio_std": 0.1777392937440306,
"step": 17408
},
{
"epoch": 0.8158514618262436,
"grad_norm": 0.41079503297805786,
"learning_rate": 8.548444561077174e-05,
"loss": 0.831270694732666,
"step": 17664
},
{
"epoch": 0.8276753960556095,
"grad_norm": 0.20108859241008759,
"learning_rate": 7.514935517023558e-05,
"loss": 0.8281430006027222,
"step": 17920
},
{
"epoch": 0.8394993302849753,
"grad_norm": 0.22395165264606476,
"learning_rate": 6.542915085548828e-05,
"loss": 0.8286972045898438,
"step": 18176
},
{
"epoch": 0.8513232645143411,
"grad_norm": 0.12410369515419006,
"learning_rate": 5.6337900713373745e-05,
"loss": 0.828626275062561,
"step": 18432
},
{
"epoch": 0.8513232645143411,
"eval_cos_loss": 0.23364517943227672,
"eval_loss": 0.7902474757198874,
"eval_mse_loss": 0.7902474757198874,
"flow/cos_sim": 0.7663548270316973,
"flow/improvement_ratio": 0.4790440064655047,
"flow/mag_ratio_mean": 0.7584420379982691,
"flow/mag_ratio_std": 0.17626052373620474,
"step": 18432
},
{
"epoch": 0.8513232645143411,
"eval_cos_loss": 0.23364517943227672,
"eval_loss": 0.7902474757198874,
"eval_mse_loss": 0.7902474757198874,
"eval_runtime": 37.4969,
"eval_samples_per_second": 746.541,
"eval_steps_per_second": 11.681,
"flow/cos_sim": 0.7663548270316973,
"flow/improvement_ratio": 0.4790440064655047,
"flow/mag_ratio_mean": 0.7584420379982691,
"flow/mag_ratio_std": 0.17626052373620474,
"step": 18432
},
{
"epoch": 0.8631471987437069,
"grad_norm": 0.15060371160507202,
"learning_rate": 4.78887625056757e-05,
"loss": 0.8286436796188354,
"step": 18688
},
{
"epoch": 0.8749711329730728,
"grad_norm": 0.1537846028804779,
"learning_rate": 4.012312516433581e-05,
"loss": 0.8254930377006531,
"step": 18944
},
{
"epoch": 0.8867950672024387,
"grad_norm": 0.4253118336200714,
"learning_rate": 3.2991328308497545e-05,
"loss": 0.8294442296028137,
"step": 19200
},
{
"epoch": 0.8986190014318045,
"grad_norm": 0.17603912949562073,
"learning_rate": 2.653543286964183e-05,
"loss": 0.8307968378067017,
"step": 19456
},
{
"epoch": 0.8986190014318045,
"eval_cos_loss": 0.2343874951679957,
"eval_loss": 0.7920145923144197,
"eval_mse_loss": 0.7920145923144197,
"flow/cos_sim": 0.7656125166372622,
"flow/improvement_ratio": 0.4853253698648383,
"flow/mag_ratio_mean": 0.7600582418920787,
"flow/mag_ratio_std": 0.17680819529920952,
"step": 19456
},
{
"epoch": 0.8986190014318045,
"eval_cos_loss": 0.2343874951679957,
"eval_loss": 0.7920145923144197,
"eval_mse_loss": 0.7920145923144197,
"eval_runtime": 37.6789,
"eval_samples_per_second": 742.935,
"eval_steps_per_second": 11.625,
"flow/cos_sim": 0.7656125166372622,
"flow/improvement_ratio": 0.4853253698648383,
"flow/mag_ratio_mean": 0.7600582418920787,
"flow/mag_ratio_std": 0.17680819529920952,
"step": 19456
},
{
"epoch": 0.9104429356611704,
"grad_norm": 0.13290032744407654,
"learning_rate": 2.076478246200819e-05,
"loss": 0.8278121948242188,
"step": 19712
},
{
"epoch": 0.9222668698905362,
"grad_norm": 0.20168907940387726,
"learning_rate": 1.5687728945045944e-05,
"loss": 0.8273869156837463,
"step": 19968
},
{
"epoch": 0.9340908041199021,
"grad_norm": 0.20389237999916077,
"learning_rate": 1.1311620335770879e-05,
"loss": 0.831144392490387,
"step": 20224
},
{
"epoch": 0.945914738349268,
"grad_norm": 0.26355621218681335,
"learning_rate": 7.642790173984836e-06,
"loss": 0.8322795629501343,
"step": 20480
},
{
"epoch": 0.945914738349268,
"eval_cos_loss": 0.23226006676072944,
"eval_loss": 0.7871269678960652,
"eval_mse_loss": 0.7871269678960652,
"flow/cos_sim": 0.7677399510662305,
"flow/improvement_ratio": 0.4782098607249456,
"flow/mag_ratio_mean": 0.76152302032192,
"flow/mag_ratio_std": 0.1765111445632155,
"step": 20480
},
{
"epoch": 0.945914738349268,
"eval_cos_loss": 0.23226006676072944,
"eval_loss": 0.7871269678960652,
"eval_mse_loss": 0.7871269678960652,
"eval_runtime": 37.4829,
"eval_samples_per_second": 746.822,
"eval_steps_per_second": 11.685,
"flow/cos_sim": 0.7677399510662305,
"flow/improvement_ratio": 0.4782098607249456,
"flow/mag_ratio_mean": 0.76152302032192,
"flow/mag_ratio_std": 0.1765111445632155,
"step": 20480
},
{
"epoch": 0.9577386725786338,
"grad_norm": 0.35688987374305725,
"learning_rate": 4.686548355746001e-06,
"loss": 0.8279744386672974,
"step": 20736
},
{
"epoch": 0.9695626068079997,
"grad_norm": 0.15006007254123688,
"learning_rate": 2.447173448359541e-06,
"loss": 0.8288099765777588,
"step": 20992
},
{
"epoch": 0.9813865410373654,
"grad_norm": 0.13515928387641907,
"learning_rate": 9.324369820445933e-07,
"loss": 0.8292235136032104,
"step": 21248
},
{
"epoch": 0.9932104752667313,
"grad_norm": 0.19579511880874634,
"learning_rate": 1.3265238678672464e-07,
"loss": 0.8291770219802856,
"step": 21504
},
{
"epoch": 0.9932104752667313,
"eval_cos_loss": 0.23236965187336212,
"eval_loss": 0.7870777566411179,
"eval_mse_loss": 0.7870777566411179,
"flow/cos_sim": 0.7676303687433129,
"flow/improvement_ratio": 0.48667068133071134,
"flow/mag_ratio_mean": 0.7602007927959913,
"flow/mag_ratio_std": 0.17566703702216824,
"step": 21504
},
{
"epoch": 0.9932104752667313,
"eval_cos_loss": 0.23236965187336212,
"eval_loss": 0.7870777566411179,
"eval_mse_loss": 0.7870777566411179,
"eval_runtime": 37.3306,
"eval_samples_per_second": 749.867,
"eval_steps_per_second": 11.733,
"flow/cos_sim": 0.7676303687433129,
"flow/improvement_ratio": 0.48667068133071134,
"flow/mag_ratio_mean": 0.7602007927959913,
"flow/mag_ratio_std": 0.17566703702216824,
"step": 21504
}
],
"logging_steps": 256,
"max_steps": 21651,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1024,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}