{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 100, "global_step": 5048, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07923930269413629, "grad_norm": 0.36957958340644836, "learning_rate": 9.801980198019804e-06, "loss": 1.0261, "step": 100 }, { "epoch": 0.07923930269413629, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.009768067859113216, "eval_runtime": 34.4825, "eval_samples_per_second": 146.364, "eval_steps_per_second": 4.582, "step": 100 }, { "epoch": 0.15847860538827258, "grad_norm": 0.15150195360183716, "learning_rate": 1.9702970297029703e-05, "loss": 0.0086, "step": 200 }, { "epoch": 0.15847860538827258, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.999405562877655, "eval_loss": 0.006445930805057287, "eval_runtime": 33.42, "eval_samples_per_second": 151.017, "eval_steps_per_second": 4.728, "step": 200 }, { "epoch": 0.23771790808240886, "grad_norm": 0.24285651743412018, "learning_rate": 2.9603960396039603e-05, "loss": 0.0043, "step": 300 }, { "epoch": 0.23771790808240886, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.00557651836425066, "eval_runtime": 32.8039, "eval_samples_per_second": 153.854, "eval_steps_per_second": 4.816, "step": 300 }, { "epoch": 0.31695721077654515, "grad_norm": 0.061441030353307724, "learning_rate": 3.950495049504951e-05, "loss": 0.0051, "step": 400 }, { "epoch": 0.31695721077654515, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.999405562877655, "eval_loss": 0.005679984577000141, "eval_runtime": 32.9416, "eval_samples_per_second": 153.21, "eval_steps_per_second": 4.796, "step": 400 }, { "epoch": 0.39619651347068147, "grad_norm": 0.09473396837711334, "learning_rate": 4.9405940594059405e-05, "loss": 0.0068, "step": 500 }, { "epoch": 0.39619651347068147, "eval_Embedding_Dataset_Dev_cosine_accuracy": 1.0, "eval_loss": 0.008005364798009396, "eval_runtime": 32.9704, "eval_samples_per_second": 153.077, "eval_steps_per_second": 4.792, "step": 500 }, { "epoch": 0.4754358161648177, "grad_norm": 2.07448148727417, "learning_rate": 4.8965441338322695e-05, "loss": 0.0078, "step": 600 }, { "epoch": 0.4754358161648177, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9998018741607666, "eval_loss": 0.007311029359698296, "eval_runtime": 33.4995, "eval_samples_per_second": 150.659, "eval_steps_per_second": 4.716, "step": 600 }, { "epoch": 0.554675118858954, "grad_norm": 0.44440600275993347, "learning_rate": 4.7864847017389395e-05, "loss": 0.0066, "step": 700 }, { "epoch": 0.554675118858954, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.999405562877655, "eval_loss": 0.00900042150169611, "eval_runtime": 33.8286, "eval_samples_per_second": 149.193, "eval_steps_per_second": 4.671, "step": 700 }, { "epoch": 0.6339144215530903, "grad_norm": 0.06913918256759644, "learning_rate": 4.676425269645609e-05, "loss": 0.0081, "step": 800 }, { "epoch": 0.6339144215530903, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.0071762725710868835, "eval_runtime": 33.3998, "eval_samples_per_second": 151.109, "eval_steps_per_second": 4.731, "step": 800 }, { "epoch": 0.7131537242472267, "grad_norm": 0.7136378884315491, "learning_rate": 4.566365837552278e-05, "loss": 0.0096, "step": 900 }, { "epoch": 0.7131537242472267, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.999405562877655, "eval_loss": 0.008276536129415035, "eval_runtime": 33.755, "eval_samples_per_second": 149.519, "eval_steps_per_second": 4.681, "step": 900 }, { "epoch": 0.7923930269413629, "grad_norm": 0.5450271964073181, "learning_rate": 4.456306405458948e-05, "loss": 0.0088, "step": 1000 }, { "epoch": 0.7923930269413629, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.999405562877655, "eval_loss": 0.006255414802581072, "eval_runtime": 33.084, "eval_samples_per_second": 152.551, "eval_steps_per_second": 4.776, "step": 1000 }, { "epoch": 0.8716323296354992, "grad_norm": 0.026704631745815277, "learning_rate": 4.346246973365617e-05, "loss": 0.0087, "step": 1100 }, { "epoch": 0.8716323296354992, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.999405562877655, "eval_loss": 0.006418165750801563, "eval_runtime": 32.6589, "eval_samples_per_second": 154.537, "eval_steps_per_second": 4.838, "step": 1100 }, { "epoch": 0.9508716323296355, "grad_norm": 2.8012053966522217, "learning_rate": 4.236187541272287e-05, "loss": 0.0096, "step": 1200 }, { "epoch": 0.9508716323296355, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.006508174352347851, "eval_runtime": 33.0205, "eval_samples_per_second": 152.844, "eval_steps_per_second": 4.785, "step": 1200 }, { "epoch": 1.0301109350237718, "grad_norm": 0.059912703931331635, "learning_rate": 4.1261281091789564e-05, "loss": 0.0085, "step": 1300 }, { "epoch": 1.0301109350237718, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9998018741607666, "eval_loss": 0.005652877036482096, "eval_runtime": 32.9331, "eval_samples_per_second": 153.25, "eval_steps_per_second": 4.798, "step": 1300 }, { "epoch": 1.109350237717908, "grad_norm": 0.039631109684705734, "learning_rate": 4.0160686770856264e-05, "loss": 0.0049, "step": 1400 }, { "epoch": 1.109350237717908, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9998018741607666, "eval_loss": 0.004980658181011677, "eval_runtime": 34.2128, "eval_samples_per_second": 147.518, "eval_steps_per_second": 4.618, "step": 1400 }, { "epoch": 1.1885895404120443, "grad_norm": 0.02145099826157093, "learning_rate": 3.9060092449922957e-05, "loss": 0.0048, "step": 1500 }, { "epoch": 1.1885895404120443, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.005376764573156834, "eval_runtime": 32.9215, "eval_samples_per_second": 153.304, "eval_steps_per_second": 4.799, "step": 1500 }, { "epoch": 1.2678288431061806, "grad_norm": 1.3420361280441284, "learning_rate": 3.7959498128989656e-05, "loss": 0.0032, "step": 1600 }, { "epoch": 1.2678288431061806, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9998018741607666, "eval_loss": 0.005436555948108435, "eval_runtime": 66.7901, "eval_samples_per_second": 75.565, "eval_steps_per_second": 2.366, "step": 1600 }, { "epoch": 1.3470681458003169, "grad_norm": 0.040875934064388275, "learning_rate": 3.685890380805635e-05, "loss": 0.0017, "step": 1700 }, { "epoch": 1.3470681458003169, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.999405562877655, "eval_loss": 0.006017347332090139, "eval_runtime": 67.8654, "eval_samples_per_second": 74.368, "eval_steps_per_second": 2.328, "step": 1700 }, { "epoch": 1.4263074484944531, "grad_norm": 0.030839553102850914, "learning_rate": 3.575830948712305e-05, "loss": 0.0032, "step": 1800 }, { "epoch": 1.4263074484944531, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.999405562877655, "eval_loss": 0.0059111895971000195, "eval_runtime": 67.895, "eval_samples_per_second": 74.335, "eval_steps_per_second": 2.327, "step": 1800 }, { "epoch": 1.5055467511885894, "grad_norm": 0.06008416414260864, "learning_rate": 3.465771516618974e-05, "loss": 0.0072, "step": 1900 }, { "epoch": 1.5055467511885894, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.006080140359699726, "eval_runtime": 67.931, "eval_samples_per_second": 74.296, "eval_steps_per_second": 2.326, "step": 1900 }, { "epoch": 1.5847860538827259, "grad_norm": 0.02880307100713253, "learning_rate": 3.355712084525644e-05, "loss": 0.0077, "step": 2000 }, { "epoch": 1.5847860538827259, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.999405562877655, "eval_loss": 0.007432046812027693, "eval_runtime": 67.8369, "eval_samples_per_second": 74.399, "eval_steps_per_second": 2.329, "step": 2000 }, { "epoch": 1.6640253565768621, "grad_norm": 0.1070467010140419, "learning_rate": 3.245652652432314e-05, "loss": 0.0068, "step": 2100 }, { "epoch": 1.6640253565768621, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9952446818351746, "eval_loss": 0.08792955428361893, "eval_runtime": 67.8539, "eval_samples_per_second": 74.38, "eval_steps_per_second": 2.329, "step": 2100 }, { "epoch": 1.7432646592709984, "grad_norm": 0.05642708018422127, "learning_rate": 3.135593220338983e-05, "loss": 0.0056, "step": 2200 }, { "epoch": 1.7432646592709984, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.006125820800662041, "eval_runtime": 67.8381, "eval_samples_per_second": 74.398, "eval_steps_per_second": 2.329, "step": 2200 }, { "epoch": 1.8225039619651349, "grad_norm": 0.04333237186074257, "learning_rate": 3.0255337882456532e-05, "loss": 0.0087, "step": 2300 }, { "epoch": 1.8225039619651349, "eval_Embedding_Dataset_Dev_cosine_accuracy": 1.0, "eval_loss": 0.005210440140217543, "eval_runtime": 68.0258, "eval_samples_per_second": 74.192, "eval_steps_per_second": 2.323, "step": 2300 }, { "epoch": 1.9017432646592711, "grad_norm": 0.10529103130102158, "learning_rate": 2.9154743561523225e-05, "loss": 0.0112, "step": 2400 }, { "epoch": 1.9017432646592711, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9998018741607666, "eval_loss": 0.004964636173099279, "eval_runtime": 67.9671, "eval_samples_per_second": 74.257, "eval_steps_per_second": 2.325, "step": 2400 }, { "epoch": 1.9809825673534074, "grad_norm": 0.030080392956733704, "learning_rate": 2.805414924058992e-05, "loss": 0.0036, "step": 2500 }, { "epoch": 1.9809825673534074, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.999405562877655, "eval_loss": 0.003934075124561787, "eval_runtime": 67.9768, "eval_samples_per_second": 74.246, "eval_steps_per_second": 2.324, "step": 2500 }, { "epoch": 2.0602218700475436, "grad_norm": 0.05913154035806656, "learning_rate": 2.6953554919656613e-05, "loss": 0.0047, "step": 2600 }, { "epoch": 2.0602218700475436, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.999405562877655, "eval_loss": 0.004718616604804993, "eval_runtime": 68.1157, "eval_samples_per_second": 74.095, "eval_steps_per_second": 2.32, "step": 2600 }, { "epoch": 2.13946117274168, "grad_norm": 0.02771185152232647, "learning_rate": 2.5852960598723313e-05, "loss": 0.0054, "step": 2700 }, { "epoch": 2.13946117274168, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9998018741607666, "eval_loss": 0.007168024778366089, "eval_runtime": 67.8508, "eval_samples_per_second": 74.384, "eval_steps_per_second": 2.329, "step": 2700 }, { "epoch": 2.218700475435816, "grad_norm": 0.023437298834323883, "learning_rate": 2.4752366277790006e-05, "loss": 0.0052, "step": 2800 }, { "epoch": 2.218700475435816, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9998018741607666, "eval_loss": 0.004668584559112787, "eval_runtime": 67.8782, "eval_samples_per_second": 74.354, "eval_steps_per_second": 2.328, "step": 2800 }, { "epoch": 2.2979397781299524, "grad_norm": 0.11916761100292206, "learning_rate": 2.36517719568567e-05, "loss": 0.0044, "step": 2900 }, { "epoch": 2.2979397781299524, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.0058599598705768585, "eval_runtime": 67.9158, "eval_samples_per_second": 74.313, "eval_steps_per_second": 2.326, "step": 2900 }, { "epoch": 2.3771790808240887, "grad_norm": 0.10791371762752533, "learning_rate": 2.2551177635923398e-05, "loss": 0.0051, "step": 3000 }, { "epoch": 2.3771790808240887, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.00463093351572752, "eval_runtime": 68.0279, "eval_samples_per_second": 74.19, "eval_steps_per_second": 2.323, "step": 3000 }, { "epoch": 2.456418383518225, "grad_norm": 6.349626064300537, "learning_rate": 2.1450583314990097e-05, "loss": 0.0068, "step": 3100 }, { "epoch": 2.456418383518225, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.008199479430913925, "eval_runtime": 68.0218, "eval_samples_per_second": 74.197, "eval_steps_per_second": 2.323, "step": 3100 }, { "epoch": 2.535657686212361, "grad_norm": 0.07518544048070908, "learning_rate": 2.0349988994056793e-05, "loss": 0.0051, "step": 3200 }, { "epoch": 2.535657686212361, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.004618997685611248, "eval_runtime": 67.9844, "eval_samples_per_second": 74.238, "eval_steps_per_second": 2.324, "step": 3200 }, { "epoch": 2.6148969889064975, "grad_norm": 0.6720037460327148, "learning_rate": 1.924939467312349e-05, "loss": 0.0025, "step": 3300 }, { "epoch": 2.6148969889064975, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9998018741607666, "eval_loss": 0.005006096325814724, "eval_runtime": 67.9382, "eval_samples_per_second": 74.288, "eval_steps_per_second": 2.326, "step": 3300 }, { "epoch": 2.6941362916006337, "grad_norm": 0.0643150731921196, "learning_rate": 1.8148800352190185e-05, "loss": 0.004, "step": 3400 }, { "epoch": 2.6941362916006337, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9992074370384216, "eval_loss": 0.005211703013628721, "eval_runtime": 67.9947, "eval_samples_per_second": 74.226, "eval_steps_per_second": 2.324, "step": 3400 }, { "epoch": 2.7733755942947704, "grad_norm": 0.07725568860769272, "learning_rate": 1.704820603125688e-05, "loss": 0.0019, "step": 3500 }, { "epoch": 2.7733755942947704, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.004846959374845028, "eval_runtime": 68.0179, "eval_samples_per_second": 74.201, "eval_steps_per_second": 2.323, "step": 3500 }, { "epoch": 2.8526148969889062, "grad_norm": 0.024744508787989616, "learning_rate": 1.5947611710323578e-05, "loss": 0.0039, "step": 3600 }, { "epoch": 2.8526148969889062, "eval_Embedding_Dataset_Dev_cosine_accuracy": 1.0, "eval_loss": 0.0042335595935583115, "eval_runtime": 68.2641, "eval_samples_per_second": 73.933, "eval_steps_per_second": 2.315, "step": 3600 }, { "epoch": 2.931854199683043, "grad_norm": 0.015782877802848816, "learning_rate": 1.4847017389390272e-05, "loss": 0.0045, "step": 3700 }, { "epoch": 2.931854199683043, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9998018741607666, "eval_loss": 0.004929765127599239, "eval_runtime": 67.901, "eval_samples_per_second": 74.329, "eval_steps_per_second": 2.327, "step": 3700 }, { "epoch": 3.011093502377179, "grad_norm": 2.817779779434204, "learning_rate": 1.3746423068456968e-05, "loss": 0.002, "step": 3800 }, { "epoch": 3.011093502377179, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.004618987441062927, "eval_runtime": 67.8421, "eval_samples_per_second": 74.393, "eval_steps_per_second": 2.329, "step": 3800 }, { "epoch": 3.0903328050713155, "grad_norm": 0.011266672052443027, "learning_rate": 1.2645828747523664e-05, "loss": 0.0028, "step": 3900 }, { "epoch": 3.0903328050713155, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.004977984819561243, "eval_runtime": 67.9701, "eval_samples_per_second": 74.253, "eval_steps_per_second": 2.325, "step": 3900 }, { "epoch": 3.1695721077654517, "grad_norm": 0.030996697023510933, "learning_rate": 1.154523442659036e-05, "loss": 0.0033, "step": 4000 }, { "epoch": 3.1695721077654517, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9992074370384216, "eval_loss": 0.004901626612991095, "eval_runtime": 68.0995, "eval_samples_per_second": 74.112, "eval_steps_per_second": 2.32, "step": 4000 }, { "epoch": 3.248811410459588, "grad_norm": 0.011963835917413235, "learning_rate": 1.0444640105657054e-05, "loss": 0.0052, "step": 4100 }, { "epoch": 3.248811410459588, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.0047536250203847885, "eval_runtime": 67.9001, "eval_samples_per_second": 74.33, "eval_steps_per_second": 2.327, "step": 4100 }, { "epoch": 3.3280507131537242, "grad_norm": 0.00867912545800209, "learning_rate": 9.34404578472375e-06, "loss": 0.0026, "step": 4200 }, { "epoch": 3.3280507131537242, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.999405562877655, "eval_loss": 0.004853234626352787, "eval_runtime": 67.9652, "eval_samples_per_second": 74.259, "eval_steps_per_second": 2.325, "step": 4200 }, { "epoch": 3.4072900158478605, "grad_norm": 0.012174161151051521, "learning_rate": 8.243451463790447e-06, "loss": 0.0043, "step": 4300 }, { "epoch": 3.4072900158478605, "eval_Embedding_Dataset_Dev_cosine_accuracy": 1.0, "eval_loss": 0.004362211097031832, "eval_runtime": 68.0327, "eval_samples_per_second": 74.185, "eval_steps_per_second": 2.322, "step": 4300 }, { "epoch": 3.4865293185419968, "grad_norm": 1.0564968585968018, "learning_rate": 7.142857142857143e-06, "loss": 0.0038, "step": 4400 }, { "epoch": 3.4865293185419968, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9998018741607666, "eval_loss": 0.004115838557481766, "eval_runtime": 67.8268, "eval_samples_per_second": 74.41, "eval_steps_per_second": 2.329, "step": 4400 }, { "epoch": 3.565768621236133, "grad_norm": 0.018350793048739433, "learning_rate": 6.04226282192384e-06, "loss": 0.003, "step": 4500 }, { "epoch": 3.565768621236133, "eval_Embedding_Dataset_Dev_cosine_accuracy": 1.0, "eval_loss": 0.0043433657847344875, "eval_runtime": 68.0255, "eval_samples_per_second": 74.193, "eval_steps_per_second": 2.323, "step": 4500 }, { "epoch": 3.6450079239302693, "grad_norm": 0.0198595579713583, "learning_rate": 4.941668500990535e-06, "loss": 0.003, "step": 4600 }, { "epoch": 3.6450079239302693, "eval_Embedding_Dataset_Dev_cosine_accuracy": 1.0, "eval_loss": 0.004538228269666433, "eval_runtime": 67.8043, "eval_samples_per_second": 74.435, "eval_steps_per_second": 2.33, "step": 4600 }, { "epoch": 3.7242472266244055, "grad_norm": 0.014352944679558277, "learning_rate": 3.841074180057231e-06, "loss": 0.003, "step": 4700 }, { "epoch": 3.7242472266244055, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9998018741607666, "eval_loss": 0.004474899731576443, "eval_runtime": 67.8863, "eval_samples_per_second": 74.345, "eval_steps_per_second": 2.327, "step": 4700 }, { "epoch": 3.8034865293185423, "grad_norm": 0.05333567038178444, "learning_rate": 2.740479859123927e-06, "loss": 0.0009, "step": 4800 }, { "epoch": 3.8034865293185423, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9998018741607666, "eval_loss": 0.0041495212353765965, "eval_runtime": 68.1146, "eval_samples_per_second": 74.096, "eval_steps_per_second": 2.32, "step": 4800 }, { "epoch": 3.882725832012678, "grad_norm": 0.017361685633659363, "learning_rate": 1.639885538190623e-06, "loss": 0.0048, "step": 4900 }, { "epoch": 3.882725832012678, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9998018741607666, "eval_loss": 0.004217915236949921, "eval_runtime": 67.5299, "eval_samples_per_second": 74.737, "eval_steps_per_second": 2.34, "step": 4900 }, { "epoch": 3.9619651347068148, "grad_norm": 0.021525979042053223, "learning_rate": 5.392912172573191e-07, "loss": 0.0035, "step": 5000 }, { "epoch": 3.9619651347068148, "eval_Embedding_Dataset_Dev_cosine_accuracy": 0.9996037483215332, "eval_loss": 0.004212545696645975, "eval_runtime": 67.678, "eval_samples_per_second": 74.574, "eval_steps_per_second": 2.335, "step": 5000 } ], "logging_steps": 100, "max_steps": 5048, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }