| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.012, |
| "eval_steps": 500, |
| "global_step": 150, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 8e-05, |
| "grad_norm": 0.18695563077926636, |
| "learning_rate": 4e-05, |
| "loss": 0.626, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.00016, |
| "grad_norm": 0.16322645545005798, |
| "learning_rate": 8e-05, |
| "loss": 0.6748, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.00024, |
| "grad_norm": 0.20551565289497375, |
| "learning_rate": 0.00012, |
| "loss": 0.8631, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.00032, |
| "grad_norm": 0.19189168512821198, |
| "learning_rate": 0.00016, |
| "loss": 0.7236, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.0004, |
| "grad_norm": 0.17240828275680542, |
| "learning_rate": 0.0002, |
| "loss": 0.6456, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.00048, |
| "grad_norm": 0.1921045035123825, |
| "learning_rate": 0.00016, |
| "loss": 0.7794, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.00056, |
| "grad_norm": 0.17362892627716064, |
| "learning_rate": 0.00012, |
| "loss": 0.5114, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.00064, |
| "grad_norm": 0.22235442698001862, |
| "learning_rate": 8e-05, |
| "loss": 0.8547, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.00072, |
| "grad_norm": 0.17304766178131104, |
| "learning_rate": 4e-05, |
| "loss": 0.7826, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.0008, |
| "grad_norm": 0.20972397923469543, |
| "learning_rate": 0.0, |
| "loss": 0.6725, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.00088, |
| "grad_norm": 0.18230140209197998, |
| "learning_rate": 0.0, |
| "loss": 0.8175, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.00096, |
| "grad_norm": 0.2014843076467514, |
| "learning_rate": 0.00019988795518207283, |
| "loss": 1.0219, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.00104, |
| "grad_norm": 0.24639324843883514, |
| "learning_rate": 0.0001998719487795118, |
| "loss": 0.8582, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.00112, |
| "grad_norm": 0.1707516610622406, |
| "learning_rate": 0.0001998559423769508, |
| "loss": 0.5871, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.0012, |
| "grad_norm": 0.17754444479942322, |
| "learning_rate": 0.00019983993597438976, |
| "loss": 0.8023, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.00128, |
| "grad_norm": 0.2088427096605301, |
| "learning_rate": 0.00019982392957182873, |
| "loss": 0.5597, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.00136, |
| "grad_norm": 0.2147207260131836, |
| "learning_rate": 0.0001998079231692677, |
| "loss": 0.9279, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.00144, |
| "grad_norm": 0.2416459172964096, |
| "learning_rate": 0.0001997919167667067, |
| "loss": 0.7524, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.00152, |
| "grad_norm": 0.18660244345664978, |
| "learning_rate": 0.00019977591036414566, |
| "loss": 0.723, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.0016, |
| "grad_norm": 0.24767373502254486, |
| "learning_rate": 0.00019975990396158463, |
| "loss": 0.8528, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00168, |
| "grad_norm": 0.2119741290807724, |
| "learning_rate": 0.00019974389755902363, |
| "loss": 0.8555, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.00176, |
| "grad_norm": 0.19749240577220917, |
| "learning_rate": 0.0001997278911564626, |
| "loss": 0.8141, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.00184, |
| "grad_norm": 0.15635022521018982, |
| "learning_rate": 0.00019971188475390156, |
| "loss": 1.0032, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.00192, |
| "grad_norm": 0.28329262137413025, |
| "learning_rate": 0.00019969587835134053, |
| "loss": 0.8104, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 0.2267996221780777, |
| "learning_rate": 0.00019967987194877953, |
| "loss": 0.5932, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.00208, |
| "grad_norm": 0.2392159253358841, |
| "learning_rate": 0.0001996638655462185, |
| "loss": 0.7813, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.00216, |
| "grad_norm": 0.29656457901000977, |
| "learning_rate": 0.00019964785914365746, |
| "loss": 0.7647, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.00224, |
| "grad_norm": 0.276050329208374, |
| "learning_rate": 0.00019963185274109646, |
| "loss": 0.7368, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.00232, |
| "grad_norm": 0.26816362142562866, |
| "learning_rate": 0.00019961584633853543, |
| "loss": 1.0461, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.0024, |
| "grad_norm": 0.2567765414714813, |
| "learning_rate": 0.0001995998399359744, |
| "loss": 1.0064, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.00248, |
| "grad_norm": 0.28481513261795044, |
| "learning_rate": 0.00019958383353341336, |
| "loss": 0.6283, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.00256, |
| "grad_norm": 0.19182950258255005, |
| "learning_rate": 0.00019956782713085236, |
| "loss": 0.5184, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.00264, |
| "grad_norm": 0.2858627438545227, |
| "learning_rate": 0.00019955182072829133, |
| "loss": 0.5853, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.00272, |
| "grad_norm": 0.23260071873664856, |
| "learning_rate": 0.0001995358143257303, |
| "loss": 0.5486, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.0028, |
| "grad_norm": 0.2574014365673065, |
| "learning_rate": 0.00019951980792316926, |
| "loss": 0.7127, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.00288, |
| "grad_norm": 0.27332785725593567, |
| "learning_rate": 0.00019950380152060826, |
| "loss": 0.9821, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.00296, |
| "grad_norm": 0.2918913960456848, |
| "learning_rate": 0.00019948779511804723, |
| "loss": 0.853, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.00304, |
| "grad_norm": 0.22690187394618988, |
| "learning_rate": 0.0001994717887154862, |
| "loss": 0.6959, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.00312, |
| "grad_norm": 0.24837082624435425, |
| "learning_rate": 0.00019945578231292518, |
| "loss": 0.7622, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.0032, |
| "grad_norm": 0.24773573875427246, |
| "learning_rate": 0.00019943977591036416, |
| "loss": 0.9853, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.00328, |
| "grad_norm": 0.2665715515613556, |
| "learning_rate": 0.00019942376950780313, |
| "loss": 0.7365, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.00336, |
| "grad_norm": 0.2815437912940979, |
| "learning_rate": 0.0001994077631052421, |
| "loss": 0.9859, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.00344, |
| "grad_norm": 0.23276300728321075, |
| "learning_rate": 0.00019939175670268108, |
| "loss": 0.7499, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.00352, |
| "grad_norm": 0.2659528851509094, |
| "learning_rate": 0.00019937575030012006, |
| "loss": 0.7896, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.0036, |
| "grad_norm": 0.2777968943119049, |
| "learning_rate": 0.00019935974389755903, |
| "loss": 0.7405, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.00368, |
| "grad_norm": 0.2703694999217987, |
| "learning_rate": 0.000199343737494998, |
| "loss": 0.8549, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.00376, |
| "grad_norm": 0.2913441061973572, |
| "learning_rate": 0.00019932773109243698, |
| "loss": 0.7648, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.00384, |
| "grad_norm": 0.21126149594783783, |
| "learning_rate": 0.00019931172468987596, |
| "loss": 0.6442, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.00392, |
| "grad_norm": 0.35344573855400085, |
| "learning_rate": 0.00019929571828731493, |
| "loss": 1.0157, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 0.211960569024086, |
| "learning_rate": 0.0001992797118847539, |
| "loss": 1.0145, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.00408, |
| "grad_norm": 0.17948386073112488, |
| "learning_rate": 0.00019926370548219288, |
| "loss": 0.4476, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.00416, |
| "grad_norm": 0.18907713890075684, |
| "learning_rate": 0.00019924769907963185, |
| "loss": 0.8988, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.00424, |
| "grad_norm": 0.9255684614181519, |
| "learning_rate": 0.00019923169267707086, |
| "loss": 1.3091, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.00432, |
| "grad_norm": 0.23128096759319305, |
| "learning_rate": 0.0001992156862745098, |
| "loss": 0.6738, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.0044, |
| "grad_norm": 0.32358431816101074, |
| "learning_rate": 0.00019919967987194878, |
| "loss": 1.0512, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.00448, |
| "grad_norm": 0.21004758775234222, |
| "learning_rate": 0.00019918367346938775, |
| "loss": 1.1109, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.00456, |
| "grad_norm": 0.17308218777179718, |
| "learning_rate": 0.00019916766706682676, |
| "loss": 0.7186, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.00464, |
| "grad_norm": 0.1969563513994217, |
| "learning_rate": 0.0001991516606642657, |
| "loss": 0.8231, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.00472, |
| "grad_norm": 0.20930427312850952, |
| "learning_rate": 0.00019913565426170468, |
| "loss": 0.7483, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.0048, |
| "grad_norm": 0.2239973247051239, |
| "learning_rate": 0.00019911964785914368, |
| "loss": 0.9065, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.00488, |
| "grad_norm": 0.21532970666885376, |
| "learning_rate": 0.00019910364145658266, |
| "loss": 0.7133, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.00496, |
| "grad_norm": 0.22679661214351654, |
| "learning_rate": 0.0001990876350540216, |
| "loss": 0.8632, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.00504, |
| "grad_norm": 0.18961389362812042, |
| "learning_rate": 0.00019907162865146058, |
| "loss": 0.7713, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.00512, |
| "grad_norm": 0.3985270857810974, |
| "learning_rate": 0.00019905562224889958, |
| "loss": 1.1621, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.0052, |
| "grad_norm": 0.1857418268918991, |
| "learning_rate": 0.00019903961584633856, |
| "loss": 0.7665, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.00528, |
| "grad_norm": 0.21082746982574463, |
| "learning_rate": 0.0001990236094437775, |
| "loss": 0.8936, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.00536, |
| "grad_norm": 0.2598806619644165, |
| "learning_rate": 0.0001990076030412165, |
| "loss": 0.8367, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.00544, |
| "grad_norm": 0.21064138412475586, |
| "learning_rate": 0.00019899159663865548, |
| "loss": 0.7481, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.00552, |
| "grad_norm": 0.17963984608650208, |
| "learning_rate": 0.00019897559023609445, |
| "loss": 0.857, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.0056, |
| "grad_norm": 0.2018403857946396, |
| "learning_rate": 0.0001989595838335334, |
| "loss": 0.6117, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.00568, |
| "grad_norm": 0.2090141773223877, |
| "learning_rate": 0.0001989435774309724, |
| "loss": 0.4831, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.00576, |
| "grad_norm": 0.19442321360111237, |
| "learning_rate": 0.00019892757102841138, |
| "loss": 0.6887, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.00584, |
| "grad_norm": 0.20884303748607635, |
| "learning_rate": 0.00019891156462585035, |
| "loss": 0.8775, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.00592, |
| "grad_norm": 0.23718436062335968, |
| "learning_rate": 0.00019889555822328933, |
| "loss": 0.9292, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 0.2717212438583374, |
| "learning_rate": 0.0001988795518207283, |
| "loss": 0.66, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.00608, |
| "grad_norm": 0.2522720396518707, |
| "learning_rate": 0.00019886354541816728, |
| "loss": 0.7293, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.00616, |
| "grad_norm": 0.22638511657714844, |
| "learning_rate": 0.00019884753901560625, |
| "loss": 0.9605, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.00624, |
| "grad_norm": 0.25353768467903137, |
| "learning_rate": 0.00019883153261304523, |
| "loss": 0.7305, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.00632, |
| "grad_norm": 0.26388710737228394, |
| "learning_rate": 0.0001988155262104842, |
| "loss": 1.0473, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.0064, |
| "grad_norm": 0.22967277467250824, |
| "learning_rate": 0.00019879951980792318, |
| "loss": 0.8024, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.00648, |
| "grad_norm": 0.21001595258712769, |
| "learning_rate": 0.00019878351340536215, |
| "loss": 0.6549, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.00656, |
| "grad_norm": 0.22154393792152405, |
| "learning_rate": 0.00019876750700280113, |
| "loss": 0.8322, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.00664, |
| "grad_norm": 0.2273344248533249, |
| "learning_rate": 0.0001987515006002401, |
| "loss": 0.8533, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.00672, |
| "grad_norm": 0.2042098492383957, |
| "learning_rate": 0.00019873549419767908, |
| "loss": 1.0104, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.0068, |
| "grad_norm": 0.18789270520210266, |
| "learning_rate": 0.00019871948779511805, |
| "loss": 0.8703, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.00688, |
| "grad_norm": 0.16704747080802917, |
| "learning_rate": 0.00019870348139255703, |
| "loss": 0.6079, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.00696, |
| "grad_norm": 0.20875659584999084, |
| "learning_rate": 0.000198687474989996, |
| "loss": 0.8806, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.00704, |
| "grad_norm": 0.17773783206939697, |
| "learning_rate": 0.000198671468587435, |
| "loss": 0.6195, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.00712, |
| "grad_norm": 0.20498760044574738, |
| "learning_rate": 0.00019865546218487395, |
| "loss": 0.8146, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.0072, |
| "grad_norm": 0.1688094437122345, |
| "learning_rate": 0.00019863945578231293, |
| "loss": 1.1415, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.00728, |
| "grad_norm": 0.22424210608005524, |
| "learning_rate": 0.0001986234493797519, |
| "loss": 0.8, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.00736, |
| "grad_norm": 0.21771728992462158, |
| "learning_rate": 0.0001986074429771909, |
| "loss": 0.5614, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.00744, |
| "grad_norm": 0.2241130769252777, |
| "learning_rate": 0.00019859143657462985, |
| "loss": 0.8084, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.00752, |
| "grad_norm": 0.1654769629240036, |
| "learning_rate": 0.00019857543017206883, |
| "loss": 0.687, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.0076, |
| "grad_norm": 0.16390787065029144, |
| "learning_rate": 0.0001985594237695078, |
| "loss": 0.5289, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.00768, |
| "grad_norm": 0.259437620639801, |
| "learning_rate": 0.0001985434173669468, |
| "loss": 0.5644, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.00776, |
| "grad_norm": 0.20152436196804047, |
| "learning_rate": 0.00019852741096438575, |
| "loss": 0.6532, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.00784, |
| "grad_norm": 0.22755707800388336, |
| "learning_rate": 0.00019851140456182473, |
| "loss": 0.7435, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.00792, |
| "grad_norm": 0.21967531740665436, |
| "learning_rate": 0.00019849539815926373, |
| "loss": 0.7607, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 0.18391412496566772, |
| "learning_rate": 0.0001984793917567027, |
| "loss": 0.7239, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.00808, |
| "grad_norm": 0.2660037577152252, |
| "learning_rate": 0.00019846338535414165, |
| "loss": 0.7299, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.00816, |
| "grad_norm": 0.1816340982913971, |
| "learning_rate": 0.00019844737895158062, |
| "loss": 0.7276, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.00824, |
| "grad_norm": 0.21206796169281006, |
| "learning_rate": 0.00019843137254901963, |
| "loss": 0.7689, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.00832, |
| "grad_norm": 0.18705548346042633, |
| "learning_rate": 0.0001984153661464586, |
| "loss": 0.7199, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.0084, |
| "grad_norm": 0.2467879205942154, |
| "learning_rate": 0.00019839935974389755, |
| "loss": 1.0206, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.00848, |
| "grad_norm": 0.2145715057849884, |
| "learning_rate": 0.00019838335334133655, |
| "loss": 0.8011, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.00856, |
| "grad_norm": 0.23377610743045807, |
| "learning_rate": 0.00019836734693877553, |
| "loss": 0.9455, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.00864, |
| "grad_norm": 0.13857395946979523, |
| "learning_rate": 0.0001983513405362145, |
| "loss": 0.603, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.00872, |
| "grad_norm": 0.21066828072071075, |
| "learning_rate": 0.00019833533413365345, |
| "loss": 0.7147, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.0088, |
| "grad_norm": 0.22423389554023743, |
| "learning_rate": 0.00019831932773109245, |
| "loss": 0.7619, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.00888, |
| "grad_norm": 0.20110934972763062, |
| "learning_rate": 0.00019830332132853143, |
| "loss": 0.6215, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.00896, |
| "grad_norm": 0.22843226790428162, |
| "learning_rate": 0.0001982873149259704, |
| "loss": 0.833, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.00904, |
| "grad_norm": 0.171301007270813, |
| "learning_rate": 0.00019827130852340938, |
| "loss": 0.9602, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.00912, |
| "grad_norm": 0.21754777431488037, |
| "learning_rate": 0.00019825530212084835, |
| "loss": 0.8931, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.0092, |
| "grad_norm": 0.16314199566841125, |
| "learning_rate": 0.00019823929571828732, |
| "loss": 0.6414, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.00928, |
| "grad_norm": 0.17339545488357544, |
| "learning_rate": 0.0001982232893157263, |
| "loss": 0.8579, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.00936, |
| "grad_norm": 0.2185641974210739, |
| "learning_rate": 0.00019820728291316527, |
| "loss": 0.5762, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.00944, |
| "grad_norm": 0.23066163063049316, |
| "learning_rate": 0.00019819127651060425, |
| "loss": 0.7929, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.00952, |
| "grad_norm": 0.16946138441562653, |
| "learning_rate": 0.00019817527010804322, |
| "loss": 0.6734, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.0096, |
| "grad_norm": 0.16290231049060822, |
| "learning_rate": 0.0001981592637054822, |
| "loss": 0.4331, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.00968, |
| "grad_norm": 0.14785629510879517, |
| "learning_rate": 0.00019814325730292117, |
| "loss": 0.5846, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.00976, |
| "grad_norm": 0.15986767411231995, |
| "learning_rate": 0.00019812725090036015, |
| "loss": 0.7937, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.00984, |
| "grad_norm": 0.22597737610340118, |
| "learning_rate": 0.00019811124449779912, |
| "loss": 0.645, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.00992, |
| "grad_norm": 0.16873855888843536, |
| "learning_rate": 0.0001980952380952381, |
| "loss": 0.783, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.24884037673473358, |
| "learning_rate": 0.00019807923169267707, |
| "loss": 0.806, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.01008, |
| "grad_norm": 0.1921387016773224, |
| "learning_rate": 0.00019806322529011605, |
| "loss": 0.7133, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.01016, |
| "grad_norm": 0.1714552938938141, |
| "learning_rate": 0.00019804721888755505, |
| "loss": 0.8823, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.01024, |
| "grad_norm": 0.17558862268924713, |
| "learning_rate": 0.000198031212484994, |
| "loss": 0.5438, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.01032, |
| "grad_norm": 0.20176133513450623, |
| "learning_rate": 0.00019801520608243297, |
| "loss": 0.6564, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.0104, |
| "grad_norm": 0.16648930311203003, |
| "learning_rate": 0.00019799919967987195, |
| "loss": 0.8251, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.01048, |
| "grad_norm": 0.15654760599136353, |
| "learning_rate": 0.00019798319327731095, |
| "loss": 0.7868, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.01056, |
| "grad_norm": 0.1604606807231903, |
| "learning_rate": 0.0001979671868747499, |
| "loss": 0.9408, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.01064, |
| "grad_norm": 0.1831110417842865, |
| "learning_rate": 0.00019795118047218887, |
| "loss": 0.6789, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.01072, |
| "grad_norm": 0.19563564658164978, |
| "learning_rate": 0.00019793517406962787, |
| "loss": 0.6197, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.0108, |
| "grad_norm": 0.15682204067707062, |
| "learning_rate": 0.00019791916766706685, |
| "loss": 0.3705, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.01088, |
| "grad_norm": 0.20387424528598785, |
| "learning_rate": 0.0001979031612645058, |
| "loss": 0.6203, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.01096, |
| "grad_norm": 0.18805289268493652, |
| "learning_rate": 0.00019788715486194477, |
| "loss": 0.5732, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.01104, |
| "grad_norm": 0.190113365650177, |
| "learning_rate": 0.00019787114845938377, |
| "loss": 0.4919, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.01112, |
| "grad_norm": 0.22532878816127777, |
| "learning_rate": 0.00019785514205682275, |
| "loss": 0.4651, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.0112, |
| "grad_norm": 0.23364323377609253, |
| "learning_rate": 0.0001978391356542617, |
| "loss": 0.9228, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.01128, |
| "grad_norm": 0.18550938367843628, |
| "learning_rate": 0.0001978231292517007, |
| "loss": 0.7556, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.01136, |
| "grad_norm": 0.21325847506523132, |
| "learning_rate": 0.00019780712284913967, |
| "loss": 0.6726, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.01144, |
| "grad_norm": 0.21966691315174103, |
| "learning_rate": 0.00019779111644657865, |
| "loss": 0.7203, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.01152, |
| "grad_norm": 0.17671513557434082, |
| "learning_rate": 0.0001977751100440176, |
| "loss": 0.7607, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.0116, |
| "grad_norm": 0.21979670226573944, |
| "learning_rate": 0.0001977591036414566, |
| "loss": 0.738, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.01168, |
| "grad_norm": 0.24346943199634552, |
| "learning_rate": 0.00019774309723889557, |
| "loss": 0.9575, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.01176, |
| "grad_norm": 0.17305152118206024, |
| "learning_rate": 0.00019772709083633455, |
| "loss": 0.7207, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.01184, |
| "grad_norm": 0.17260083556175232, |
| "learning_rate": 0.0001977110844337735, |
| "loss": 0.7765, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.01192, |
| "grad_norm": 0.20756393671035767, |
| "learning_rate": 0.0001976950780312125, |
| "loss": 0.7864, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 0.20012526214122772, |
| "learning_rate": 0.00019767907162865147, |
| "loss": 0.6435, |
| "step": 150 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 12500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 10, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.5678754959040512e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|