{ "best_global_step": 59997, "best_metric": 0.8794049695261634, "best_model_checkpoint": "./doc_type_v1_primary_model_multilingual-e5-small/checkpoint-59997", "epoch": 3.0, "eval_steps": 500, "global_step": 59997, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025001250062503127, "grad_norm": 6.811492443084717, "learning_rate": 4.958414587396037e-05, "loss": 1.2379, "step": 500 }, { "epoch": 0.050002500125006254, "grad_norm": 27.201915740966797, "learning_rate": 4.916745837291865e-05, "loss": 0.8651, "step": 1000 }, { "epoch": 0.07500375018750938, "grad_norm": 38.990150451660156, "learning_rate": 4.875077087187693e-05, "loss": 0.7379, "step": 1500 }, { "epoch": 0.10000500025001251, "grad_norm": 7.001402378082275, "learning_rate": 4.833408337083521e-05, "loss": 0.7292, "step": 2000 }, { "epoch": 0.12500625031251564, "grad_norm": 13.325078010559082, "learning_rate": 4.791739586979349e-05, "loss": 0.696, "step": 2500 }, { "epoch": 0.15000750037501875, "grad_norm": 8.727783203125, "learning_rate": 4.7500708368751775e-05, "loss": 0.711, "step": 3000 }, { "epoch": 0.17500875043752187, "grad_norm": 16.32622528076172, "learning_rate": 4.708402086771005e-05, "loss": 0.6598, "step": 3500 }, { "epoch": 0.20001000050002501, "grad_norm": 7.856052875518799, "learning_rate": 4.6667333366668335e-05, "loss": 0.6057, "step": 4000 }, { "epoch": 0.22501125056252813, "grad_norm": 7.5791335105896, "learning_rate": 4.625064586562662e-05, "loss": 0.585, "step": 4500 }, { "epoch": 0.2500125006250313, "grad_norm": 10.478697776794434, "learning_rate": 4.58339583645849e-05, "loss": 0.5894, "step": 5000 }, { "epoch": 0.27501375068753436, "grad_norm": 12.097478866577148, "learning_rate": 4.541727086354318e-05, "loss": 0.5759, "step": 5500 }, { "epoch": 0.3000150007500375, "grad_norm": 9.861705780029297, "learning_rate": 4.5000583362501456e-05, "loss": 0.5605, "step": 6000 }, { "epoch": 0.32501625081254065, "grad_norm": 23.986101150512695, "learning_rate": 4.458389586145974e-05, "loss": 0.5548, "step": 6500 }, { "epoch": 0.35001750087504374, "grad_norm": 28.77222442626953, "learning_rate": 4.416720836041802e-05, "loss": 0.5508, "step": 7000 }, { "epoch": 0.3750187509375469, "grad_norm": 5.619863510131836, "learning_rate": 4.3750520859376306e-05, "loss": 0.5182, "step": 7500 }, { "epoch": 0.40002000100005003, "grad_norm": 5.042713642120361, "learning_rate": 4.333383335833458e-05, "loss": 0.5597, "step": 8000 }, { "epoch": 0.4250212510625531, "grad_norm": 8.755766868591309, "learning_rate": 4.291714585729287e-05, "loss": 0.5342, "step": 8500 }, { "epoch": 0.45002250112505626, "grad_norm": 9.902745246887207, "learning_rate": 4.250045835625115e-05, "loss": 0.5154, "step": 9000 }, { "epoch": 0.47502375118755935, "grad_norm": 18.608970642089844, "learning_rate": 4.2083770855209434e-05, "loss": 0.5101, "step": 9500 }, { "epoch": 0.5000250012500626, "grad_norm": 7.529025077819824, "learning_rate": 4.166708335416771e-05, "loss": 0.5153, "step": 10000 }, { "epoch": 0.5250262513125656, "grad_norm": 4.368379592895508, "learning_rate": 4.125039585312599e-05, "loss": 0.4962, "step": 10500 }, { "epoch": 0.5500275013750687, "grad_norm": 5.460580348968506, "learning_rate": 4.083370835208427e-05, "loss": 0.5055, "step": 11000 }, { "epoch": 0.5750287514375719, "grad_norm": 17.047475814819336, "learning_rate": 4.0417020851042555e-05, "loss": 0.5289, "step": 11500 }, { "epoch": 0.600030001500075, "grad_norm": 8.671713829040527, "learning_rate": 4.000033335000083e-05, "loss": 0.5024, "step": 12000 }, { "epoch": 0.6250312515625781, "grad_norm": 13.457786560058594, "learning_rate": 3.9583645848959115e-05, "loss": 0.481, "step": 12500 }, { "epoch": 0.6500325016250813, "grad_norm": 1.2953449487686157, "learning_rate": 3.91669583479174e-05, "loss": 0.4843, "step": 13000 }, { "epoch": 0.6750337516875844, "grad_norm": 0.7997756004333496, "learning_rate": 3.875027084687568e-05, "loss": 0.4519, "step": 13500 }, { "epoch": 0.7000350017500875, "grad_norm": 5.482394218444824, "learning_rate": 3.833358334583396e-05, "loss": 0.4829, "step": 14000 }, { "epoch": 0.7250362518125907, "grad_norm": 13.774917602539062, "learning_rate": 3.791689584479224e-05, "loss": 0.4746, "step": 14500 }, { "epoch": 0.7500375018750938, "grad_norm": 7.448230743408203, "learning_rate": 3.750020834375052e-05, "loss": 0.5123, "step": 15000 }, { "epoch": 0.7750387519375969, "grad_norm": 11.042137145996094, "learning_rate": 3.70835208427088e-05, "loss": 0.5058, "step": 15500 }, { "epoch": 0.8000400020001001, "grad_norm": 9.733834266662598, "learning_rate": 3.666683334166709e-05, "loss": 0.453, "step": 16000 }, { "epoch": 0.8250412520626031, "grad_norm": 9.845958709716797, "learning_rate": 3.6250145840625363e-05, "loss": 0.4604, "step": 16500 }, { "epoch": 0.8500425021251062, "grad_norm": 24.350200653076172, "learning_rate": 3.583345833958365e-05, "loss": 0.4689, "step": 17000 }, { "epoch": 0.8750437521876093, "grad_norm": 25.33015251159668, "learning_rate": 3.541677083854193e-05, "loss": 0.4689, "step": 17500 }, { "epoch": 0.9000450022501125, "grad_norm": 3.9758975505828857, "learning_rate": 3.5000083337500214e-05, "loss": 0.4704, "step": 18000 }, { "epoch": 0.9250462523126156, "grad_norm": 5.771573066711426, "learning_rate": 3.458339583645849e-05, "loss": 0.4367, "step": 18500 }, { "epoch": 0.9500475023751187, "grad_norm": 5.9610819816589355, "learning_rate": 3.4166708335416775e-05, "loss": 0.451, "step": 19000 }, { "epoch": 0.9750487524376219, "grad_norm": 13.63049030303955, "learning_rate": 3.375002083437505e-05, "loss": 0.4538, "step": 19500 }, { "epoch": 1.0, "eval_f1": 0.8655922269409259, "eval_loss": 0.4387092590332031, "eval_runtime": 10.5773, "eval_samples_per_second": 1891.79, "eval_steps_per_second": 236.545, "step": 19999 }, { "epoch": 1.000050002500125, "grad_norm": 5.536550045013428, "learning_rate": 3.3333333333333335e-05, "loss": 0.4367, "step": 20000 }, { "epoch": 1.025051252562628, "grad_norm": 16.51177406311035, "learning_rate": 3.291664583229161e-05, "loss": 0.3614, "step": 20500 }, { "epoch": 1.0500525026251313, "grad_norm": 20.4051570892334, "learning_rate": 3.2499958331249895e-05, "loss": 0.3757, "step": 21000 }, { "epoch": 1.0750537526876345, "grad_norm": 4.4692912101745605, "learning_rate": 3.208327083020818e-05, "loss": 0.3197, "step": 21500 }, { "epoch": 1.1000550027501375, "grad_norm": 9.222668647766113, "learning_rate": 3.166658332916646e-05, "loss": 0.3649, "step": 22000 }, { "epoch": 1.1250562528126407, "grad_norm": 20.703449249267578, "learning_rate": 3.124989582812474e-05, "loss": 0.3736, "step": 22500 }, { "epoch": 1.1500575028751436, "grad_norm": 10.590250015258789, "learning_rate": 3.083320832708302e-05, "loss": 0.3325, "step": 23000 }, { "epoch": 1.1750587529376468, "grad_norm": 39.34541320800781, "learning_rate": 3.0416520826041306e-05, "loss": 0.3472, "step": 23500 }, { "epoch": 1.20006000300015, "grad_norm": 9.01701545715332, "learning_rate": 2.9999833324999583e-05, "loss": 0.3513, "step": 24000 }, { "epoch": 1.2250612530626532, "grad_norm": 20.631200790405273, "learning_rate": 2.9583145823957863e-05, "loss": 0.3699, "step": 24500 }, { "epoch": 1.2500625031251562, "grad_norm": 0.09197826683521271, "learning_rate": 2.9166458322916147e-05, "loss": 0.3847, "step": 25000 }, { "epoch": 1.2750637531876594, "grad_norm": 5.867861747741699, "learning_rate": 2.8749770821874427e-05, "loss": 0.3252, "step": 25500 }, { "epoch": 1.3000650032501624, "grad_norm": 21.79376220703125, "learning_rate": 2.833308332083271e-05, "loss": 0.3573, "step": 26000 }, { "epoch": 1.3250662533126656, "grad_norm": 10.486989974975586, "learning_rate": 2.791639581979099e-05, "loss": 0.3704, "step": 26500 }, { "epoch": 1.3500675033751688, "grad_norm": 10.07336139678955, "learning_rate": 2.7499708318749275e-05, "loss": 0.3269, "step": 27000 }, { "epoch": 1.375068753437672, "grad_norm": 0.7917349338531494, "learning_rate": 2.7083020817707555e-05, "loss": 0.3637, "step": 27500 }, { "epoch": 1.400070003500175, "grad_norm": 4.454699993133545, "learning_rate": 2.666633331666584e-05, "loss": 0.3503, "step": 28000 }, { "epoch": 1.4250712535626782, "grad_norm": 4.157910346984863, "learning_rate": 2.6249645815624112e-05, "loss": 0.3503, "step": 28500 }, { "epoch": 1.4500725036251811, "grad_norm": 0.06325356662273407, "learning_rate": 2.5832958314582395e-05, "loss": 0.3246, "step": 29000 }, { "epoch": 1.4750737536876843, "grad_norm": 19.81574058532715, "learning_rate": 2.5416270813540676e-05, "loss": 0.3507, "step": 29500 }, { "epoch": 1.5000750037501875, "grad_norm": 0.12496486306190491, "learning_rate": 2.499958331249896e-05, "loss": 0.3274, "step": 30000 }, { "epoch": 1.5250762538126907, "grad_norm": 14.442770004272461, "learning_rate": 2.458289581145724e-05, "loss": 0.3926, "step": 30500 }, { "epoch": 1.5500775038751937, "grad_norm": 30.787338256835938, "learning_rate": 2.4166208310415523e-05, "loss": 0.3445, "step": 31000 }, { "epoch": 1.575078753937697, "grad_norm": 16.105411529541016, "learning_rate": 2.3749520809373803e-05, "loss": 0.3397, "step": 31500 }, { "epoch": 1.6000800040002, "grad_norm": 0.16195891797542572, "learning_rate": 2.3332833308332083e-05, "loss": 0.3337, "step": 32000 }, { "epoch": 1.625081254062703, "grad_norm": 5.337989807128906, "learning_rate": 2.2916145807290363e-05, "loss": 0.3398, "step": 32500 }, { "epoch": 1.6500825041252063, "grad_norm": 12.693595886230469, "learning_rate": 2.2499458306248647e-05, "loss": 0.3457, "step": 33000 }, { "epoch": 1.6750837541877095, "grad_norm": 0.6089347004890442, "learning_rate": 2.2082770805206927e-05, "loss": 0.3252, "step": 33500 }, { "epoch": 1.7000850042502125, "grad_norm": 21.343097686767578, "learning_rate": 2.166608330416521e-05, "loss": 0.3691, "step": 34000 }, { "epoch": 1.7250862543127157, "grad_norm": 7.946640968322754, "learning_rate": 2.124939580312349e-05, "loss": 0.3334, "step": 34500 }, { "epoch": 1.7500875043752186, "grad_norm": 7.454244613647461, "learning_rate": 2.083270830208177e-05, "loss": 0.3363, "step": 35000 }, { "epoch": 1.7750887544377218, "grad_norm": 0.1911257654428482, "learning_rate": 2.0416020801040055e-05, "loss": 0.3454, "step": 35500 }, { "epoch": 1.800090004500225, "grad_norm": 13.01964282989502, "learning_rate": 1.9999333299998335e-05, "loss": 0.3189, "step": 36000 }, { "epoch": 1.8250912545627282, "grad_norm": 0.19861529767513275, "learning_rate": 1.9582645798956615e-05, "loss": 0.3422, "step": 36500 }, { "epoch": 1.8500925046252312, "grad_norm": 19.57059097290039, "learning_rate": 1.9165958297914895e-05, "loss": 0.3355, "step": 37000 }, { "epoch": 1.8750937546877344, "grad_norm": 4.671505451202393, "learning_rate": 1.874927079687318e-05, "loss": 0.3195, "step": 37500 }, { "epoch": 1.9000950047502374, "grad_norm": 13.636198997497559, "learning_rate": 1.833258329583146e-05, "loss": 0.2937, "step": 38000 }, { "epoch": 1.9250962548127406, "grad_norm": 3.3196206092834473, "learning_rate": 1.7915895794789743e-05, "loss": 0.3382, "step": 38500 }, { "epoch": 1.9500975048752438, "grad_norm": 10.638530731201172, "learning_rate": 1.749920829374802e-05, "loss": 0.3509, "step": 39000 }, { "epoch": 1.975098754937747, "grad_norm": 25.718746185302734, "learning_rate": 1.7082520792706303e-05, "loss": 0.3244, "step": 39500 }, { "epoch": 2.0, "eval_f1": 0.8739136212588781, "eval_loss": 0.5150496959686279, "eval_runtime": 10.5221, "eval_samples_per_second": 1901.71, "eval_steps_per_second": 237.785, "step": 39998 }, { "epoch": 2.00010000500025, "grad_norm": 5.354879856109619, "learning_rate": 1.6665833291664583e-05, "loss": 0.3325, "step": 40000 }, { "epoch": 2.025101255062753, "grad_norm": 26.597732543945312, "learning_rate": 1.6249145790622867e-05, "loss": 0.2202, "step": 40500 }, { "epoch": 2.050102505125256, "grad_norm": 19.418195724487305, "learning_rate": 1.5832458289581144e-05, "loss": 0.2126, "step": 41000 }, { "epoch": 2.0751037551877594, "grad_norm": 30.896820068359375, "learning_rate": 1.5415770788539427e-05, "loss": 0.1978, "step": 41500 }, { "epoch": 2.1001050052502626, "grad_norm": 33.62570571899414, "learning_rate": 1.4999083287497709e-05, "loss": 0.2235, "step": 42000 }, { "epoch": 2.1251062553127658, "grad_norm": 34.06229782104492, "learning_rate": 1.4582395786455991e-05, "loss": 0.2285, "step": 42500 }, { "epoch": 2.150107505375269, "grad_norm": 7.419727325439453, "learning_rate": 1.4165708285414273e-05, "loss": 0.2114, "step": 43000 }, { "epoch": 2.1751087554377717, "grad_norm": 0.09190714359283447, "learning_rate": 1.3749020784372551e-05, "loss": 0.2401, "step": 43500 }, { "epoch": 2.200110005500275, "grad_norm": 3.367650032043457, "learning_rate": 1.3332333283330833e-05, "loss": 0.2316, "step": 44000 }, { "epoch": 2.225111255562778, "grad_norm": 1.1675509214401245, "learning_rate": 1.2915645782289115e-05, "loss": 0.2356, "step": 44500 }, { "epoch": 2.2501125056252813, "grad_norm": 9.24765682220459, "learning_rate": 1.2498958281247395e-05, "loss": 0.2265, "step": 45000 }, { "epoch": 2.2751137556877845, "grad_norm": 49.01567459106445, "learning_rate": 1.2082270780205677e-05, "loss": 0.2156, "step": 45500 }, { "epoch": 2.3001150057502873, "grad_norm": 1.4991744756698608, "learning_rate": 1.1665583279163959e-05, "loss": 0.1985, "step": 46000 }, { "epoch": 2.3251162558127905, "grad_norm": 23.972734451293945, "learning_rate": 1.124889577812224e-05, "loss": 0.2341, "step": 46500 }, { "epoch": 2.3501175058752937, "grad_norm": 0.17075876891613007, "learning_rate": 1.0832208277080521e-05, "loss": 0.2253, "step": 47000 }, { "epoch": 2.375118755937797, "grad_norm": 31.144304275512695, "learning_rate": 1.0415520776038801e-05, "loss": 0.2155, "step": 47500 }, { "epoch": 2.4001200060003, "grad_norm": 0.0451333224773407, "learning_rate": 9.998833274997083e-06, "loss": 0.1964, "step": 48000 }, { "epoch": 2.4251212560628033, "grad_norm": 0.07999496906995773, "learning_rate": 9.582145773955365e-06, "loss": 0.2406, "step": 48500 }, { "epoch": 2.4501225061253065, "grad_norm": 0.9701845049858093, "learning_rate": 9.165458272913647e-06, "loss": 0.2345, "step": 49000 }, { "epoch": 2.475123756187809, "grad_norm": 1.0465730428695679, "learning_rate": 8.748770771871927e-06, "loss": 0.2179, "step": 49500 }, { "epoch": 2.5001250062503124, "grad_norm": 0.01202826015651226, "learning_rate": 8.332083270830209e-06, "loss": 0.2076, "step": 50000 }, { "epoch": 2.5251262563128156, "grad_norm": 0.02468780241906643, "learning_rate": 7.915395769788491e-06, "loss": 0.2387, "step": 50500 }, { "epoch": 2.550127506375319, "grad_norm": 2.958998918533325, "learning_rate": 7.498708268746771e-06, "loss": 0.2114, "step": 51000 }, { "epoch": 2.575128756437822, "grad_norm": 0.1065281331539154, "learning_rate": 7.082020767705053e-06, "loss": 0.1916, "step": 51500 }, { "epoch": 2.6001300065003248, "grad_norm": 34.561431884765625, "learning_rate": 6.665333266663333e-06, "loss": 0.2074, "step": 52000 }, { "epoch": 2.625131256562828, "grad_norm": 0.007779615931212902, "learning_rate": 6.248645765621615e-06, "loss": 0.2133, "step": 52500 }, { "epoch": 2.650132506625331, "grad_norm": 0.13916213810443878, "learning_rate": 5.831958264579896e-06, "loss": 0.2301, "step": 53000 }, { "epoch": 2.6751337566878344, "grad_norm": 42.24327087402344, "learning_rate": 5.415270763538177e-06, "loss": 0.2216, "step": 53500 }, { "epoch": 2.7001350067503376, "grad_norm": 7.163196563720703, "learning_rate": 4.998583262496458e-06, "loss": 0.2313, "step": 54000 }, { "epoch": 2.7251362568128408, "grad_norm": 0.8102510571479797, "learning_rate": 4.581895761454739e-06, "loss": 0.1916, "step": 54500 }, { "epoch": 2.750137506875344, "grad_norm": 7.8397417068481445, "learning_rate": 4.16520826041302e-06, "loss": 0.2055, "step": 55000 }, { "epoch": 2.7751387569378467, "grad_norm": 1.0101325511932373, "learning_rate": 3.7485207593713018e-06, "loss": 0.2059, "step": 55500 }, { "epoch": 2.80014000700035, "grad_norm": 0.05691508203744888, "learning_rate": 3.3318332583295837e-06, "loss": 0.2021, "step": 56000 }, { "epoch": 2.825141257062853, "grad_norm": 0.08343327045440674, "learning_rate": 2.9151457572878643e-06, "loss": 0.2075, "step": 56500 }, { "epoch": 2.8501425071253563, "grad_norm": 0.06794146448373795, "learning_rate": 2.4984582562461457e-06, "loss": 0.1644, "step": 57000 }, { "epoch": 2.8751437571878595, "grad_norm": 0.023177076131105423, "learning_rate": 2.081770755204427e-06, "loss": 0.2023, "step": 57500 }, { "epoch": 2.9001450072503623, "grad_norm": 0.15367339551448822, "learning_rate": 1.6650832541627082e-06, "loss": 0.2175, "step": 58000 }, { "epoch": 2.9251462573128655, "grad_norm": 0.029408954083919525, "learning_rate": 1.2483957531209895e-06, "loss": 0.2073, "step": 58500 }, { "epoch": 2.9501475073753687, "grad_norm": 2.278526782989502, "learning_rate": 8.317082520792706e-07, "loss": 0.2154, "step": 59000 }, { "epoch": 2.975148757437872, "grad_norm": 15.807899475097656, "learning_rate": 4.150207510375519e-07, "loss": 0.2132, "step": 59500 }, { "epoch": 3.0, "eval_f1": 0.8794049695261634, "eval_loss": 0.6096033453941345, "eval_runtime": 10.6241, "eval_samples_per_second": 1883.457, "eval_steps_per_second": 235.503, "step": 59997 }, { "epoch": 3.0, "step": 59997, "total_flos": 7906263495201792.0, "train_loss": 0.3725671201518653, "train_runtime": 1624.1756, "train_samples_per_second": 295.512, "train_steps_per_second": 36.94 } ], "logging_steps": 500, "max_steps": 59997, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7906263495201792.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }