| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9976019184652278, | |
| "eval_steps": 500, | |
| "global_step": 312, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0031974420463629096, | |
| "grad_norm": 0.3446813225746155, | |
| "learning_rate": 1.875e-05, | |
| "loss": 2.181, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.006394884092725819, | |
| "grad_norm": 0.30540090799331665, | |
| "learning_rate": 3.75e-05, | |
| "loss": 1.9989, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.009592326139088728, | |
| "grad_norm": 0.33418309688568115, | |
| "learning_rate": 5.625e-05, | |
| "loss": 2.2171, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.012789768185451638, | |
| "grad_norm": 0.3458963632583618, | |
| "learning_rate": 7.5e-05, | |
| "loss": 2.1132, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.01598721023181455, | |
| "grad_norm": 0.3518303334712982, | |
| "learning_rate": 9.374999999999999e-05, | |
| "loss": 1.9712, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.019184652278177457, | |
| "grad_norm": 0.3810424506664276, | |
| "learning_rate": 0.0001125, | |
| "loss": 1.7775, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.02238209432454037, | |
| "grad_norm": 0.4941970109939575, | |
| "learning_rate": 0.00013125, | |
| "loss": 1.9399, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.025579536370903277, | |
| "grad_norm": 0.3915010094642639, | |
| "learning_rate": 0.00015, | |
| "loss": 1.8796, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.02877697841726619, | |
| "grad_norm": 0.3387204706668854, | |
| "learning_rate": 0.00016874999999999998, | |
| "loss": 1.8703, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.0319744204636291, | |
| "grad_norm": 0.3416554927825928, | |
| "learning_rate": 0.00018749999999999998, | |
| "loss": 1.8606, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.035171862509992005, | |
| "grad_norm": 0.36551880836486816, | |
| "learning_rate": 0.00020624999999999997, | |
| "loss": 1.5603, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.03836930455635491, | |
| "grad_norm": 0.3075932264328003, | |
| "learning_rate": 0.000225, | |
| "loss": 1.5475, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.04156674660271783, | |
| "grad_norm": 0.28144699335098267, | |
| "learning_rate": 0.00024375, | |
| "loss": 1.6917, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.04476418864908074, | |
| "grad_norm": 0.27931058406829834, | |
| "learning_rate": 0.0002625, | |
| "loss": 1.4981, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.047961630695443645, | |
| "grad_norm": 0.24638418853282928, | |
| "learning_rate": 0.00028125, | |
| "loss": 1.599, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.051159072741806554, | |
| "grad_norm": 0.49918419122695923, | |
| "learning_rate": 0.0003, | |
| "loss": 1.5411, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.05435651478816946, | |
| "grad_norm": 0.227300763130188, | |
| "learning_rate": 0.00029999155161863667, | |
| "loss": 1.4908, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.05755395683453238, | |
| "grad_norm": 0.24631308019161224, | |
| "learning_rate": 0.0002999662074262154, | |
| "loss": 1.5127, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.060751398880895285, | |
| "grad_norm": 0.20278117060661316, | |
| "learning_rate": 0.00029992397027763483, | |
| "loss": 1.5784, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.0639488409272582, | |
| "grad_norm": 0.20311613380908966, | |
| "learning_rate": 0.00029986484493070223, | |
| "loss": 1.5577, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0671462829736211, | |
| "grad_norm": 0.22366106510162354, | |
| "learning_rate": 0.00029978883804559716, | |
| "loss": 1.6616, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.07034372501998401, | |
| "grad_norm": 0.22588945925235748, | |
| "learning_rate": 0.00029969595818412183, | |
| "loss": 1.7524, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.07354116706634692, | |
| "grad_norm": 0.20929686725139618, | |
| "learning_rate": 0.000299586215808736, | |
| "loss": 1.5186, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.07673860911270983, | |
| "grad_norm": 0.2444813847541809, | |
| "learning_rate": 0.00029945962328137895, | |
| "loss": 1.5135, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.07993605115907274, | |
| "grad_norm": 0.21571452915668488, | |
| "learning_rate": 0.00029931619486207655, | |
| "loss": 1.4799, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.08313349320543566, | |
| "grad_norm": 0.2103520780801773, | |
| "learning_rate": 0.00029915594670733536, | |
| "loss": 1.6818, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.08633093525179857, | |
| "grad_norm": 0.24929186701774597, | |
| "learning_rate": 0.00029897889686832227, | |
| "loss": 1.4392, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.08952837729816147, | |
| "grad_norm": 0.24320849776268005, | |
| "learning_rate": 0.0002987850652888315, | |
| "loss": 1.5211, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.09272581934452438, | |
| "grad_norm": 0.23468714952468872, | |
| "learning_rate": 0.0002985744738030378, | |
| "loss": 1.5468, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.09592326139088729, | |
| "grad_norm": 0.2079857587814331, | |
| "learning_rate": 0.0002983471461330368, | |
| "loss": 1.5166, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0991207034372502, | |
| "grad_norm": 0.21627485752105713, | |
| "learning_rate": 0.0002981031078861733, | |
| "loss": 1.5507, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.10231814548361311, | |
| "grad_norm": 0.23451927304267883, | |
| "learning_rate": 0.00029784238655215626, | |
| "loss": 1.508, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.10551558752997602, | |
| "grad_norm": 0.2220710664987564, | |
| "learning_rate": 0.0002975650114999625, | |
| "loss": 1.5164, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.10871302957633892, | |
| "grad_norm": 0.22487205266952515, | |
| "learning_rate": 0.00029727101397452834, | |
| "loss": 1.4938, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.11191047162270183, | |
| "grad_norm": 0.2187684029340744, | |
| "learning_rate": 0.00029696042709322995, | |
| "loss": 1.3007, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.11510791366906475, | |
| "grad_norm": 0.2184438705444336, | |
| "learning_rate": 0.00029663328584215293, | |
| "loss": 1.5204, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.11830535571542766, | |
| "grad_norm": 0.21176907420158386, | |
| "learning_rate": 0.00029628962707215124, | |
| "loss": 1.5017, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.12150279776179057, | |
| "grad_norm": 0.2055819034576416, | |
| "learning_rate": 0.00029592948949469614, | |
| "loss": 1.2755, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.12470023980815348, | |
| "grad_norm": 0.220439150929451, | |
| "learning_rate": 0.00029555291367751573, | |
| "loss": 1.5057, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.1278976818545164, | |
| "grad_norm": 0.2324652075767517, | |
| "learning_rate": 0.00029515994204002484, | |
| "loss": 1.5839, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.1310951239008793, | |
| "grad_norm": 0.19285540282726288, | |
| "learning_rate": 0.0002947506188485468, | |
| "loss": 1.4434, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.1342925659472422, | |
| "grad_norm": 0.26798316836357117, | |
| "learning_rate": 0.00029432499021132737, | |
| "loss": 1.6137, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.1374900079936051, | |
| "grad_norm": 0.18407316505908966, | |
| "learning_rate": 0.0002938831040733405, | |
| "loss": 1.3876, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.14068745003996802, | |
| "grad_norm": 0.2084178477525711, | |
| "learning_rate": 0.0002934250102108876, | |
| "loss": 1.5409, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.14388489208633093, | |
| "grad_norm": 0.20955117046833038, | |
| "learning_rate": 0.00029295076022599077, | |
| "loss": 1.4635, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.14708233413269384, | |
| "grad_norm": 0.2144644558429718, | |
| "learning_rate": 0.00029246040754057976, | |
| "loss": 1.4585, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.15027977617905675, | |
| "grad_norm": 0.20352163910865784, | |
| "learning_rate": 0.0002919540073904744, | |
| "loss": 1.5338, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.15347721822541965, | |
| "grad_norm": 0.18800681829452515, | |
| "learning_rate": 0.0002914316168191626, | |
| "loss": 1.5031, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.15667466027178256, | |
| "grad_norm": 0.19407911598682404, | |
| "learning_rate": 0.00029089329467137456, | |
| "loss": 1.4457, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.15987210231814547, | |
| "grad_norm": 0.19459669291973114, | |
| "learning_rate": 0.0002903391015864543, | |
| "loss": 1.3383, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1630695443645084, | |
| "grad_norm": 0.22761711478233337, | |
| "learning_rate": 0.0002897690999915289, | |
| "loss": 1.5057, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.16626698641087131, | |
| "grad_norm": 0.22577515244483948, | |
| "learning_rate": 0.0002891833540944764, | |
| "loss": 1.3057, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.16946442845723422, | |
| "grad_norm": 0.2257939875125885, | |
| "learning_rate": 0.000288581929876693, | |
| "loss": 1.4777, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.17266187050359713, | |
| "grad_norm": 0.20815476775169373, | |
| "learning_rate": 0.0002879648950856608, | |
| "loss": 1.4252, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.17585931254996004, | |
| "grad_norm": 0.20832973718643188, | |
| "learning_rate": 0.0002873323192273162, | |
| "loss": 1.5008, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.17905675459632295, | |
| "grad_norm": 0.2152206003665924, | |
| "learning_rate": 0.00028668427355822034, | |
| "loss": 1.6078, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.18225419664268586, | |
| "grad_norm": 0.18941529095172882, | |
| "learning_rate": 0.0002860208310775327, | |
| "loss": 1.4449, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.18545163868904876, | |
| "grad_norm": 0.23700568079948425, | |
| "learning_rate": 0.00028534206651878777, | |
| "loss": 1.5582, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.18864908073541167, | |
| "grad_norm": 0.2555181384086609, | |
| "learning_rate": 0.0002846480563414768, | |
| "loss": 1.5682, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.19184652278177458, | |
| "grad_norm": 0.18711774051189423, | |
| "learning_rate": 0.0002839388787224353, | |
| "loss": 1.5051, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.1950439648281375, | |
| "grad_norm": 0.2022084891796112, | |
| "learning_rate": 0.00028321461354703604, | |
| "loss": 1.4694, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.1982414068745004, | |
| "grad_norm": 0.1778743863105774, | |
| "learning_rate": 0.0002824753424001914, | |
| "loss": 1.3847, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.2014388489208633, | |
| "grad_norm": 0.1981406807899475, | |
| "learning_rate": 0.0002817211485571623, | |
| "loss": 1.3561, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.20463629096722621, | |
| "grad_norm": 0.19981688261032104, | |
| "learning_rate": 0.0002809521169741782, | |
| "loss": 1.4506, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.20783373301358912, | |
| "grad_norm": 0.20264656841754913, | |
| "learning_rate": 0.0002801683342788671, | |
| "loss": 1.5316, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.21103117505995203, | |
| "grad_norm": 0.18628135323524475, | |
| "learning_rate": 0.000279369888760497, | |
| "loss": 1.4879, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.21422861710631494, | |
| "grad_norm": 0.2130441665649414, | |
| "learning_rate": 0.00027855687036003134, | |
| "loss": 1.6192, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.21742605915267785, | |
| "grad_norm": 0.19949516654014587, | |
| "learning_rate": 0.00027772937065999667, | |
| "loss": 1.4773, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.22062350119904076, | |
| "grad_norm": 0.20962868630886078, | |
| "learning_rate": 0.0002768874828741669, | |
| "loss": 1.4617, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.22382094324540366, | |
| "grad_norm": 0.21659812331199646, | |
| "learning_rate": 0.00027603130183706314, | |
| "loss": 1.5065, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2270183852917666, | |
| "grad_norm": 0.19917699694633484, | |
| "learning_rate": 0.00027516092399327094, | |
| "loss": 1.6265, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.2302158273381295, | |
| "grad_norm": 0.20580779016017914, | |
| "learning_rate": 0.0002742764473865763, | |
| "loss": 1.4508, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.23341326938449242, | |
| "grad_norm": 0.20578929781913757, | |
| "learning_rate": 0.0002733779716489217, | |
| "loss": 1.5362, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.23661071143085532, | |
| "grad_norm": 0.21730633080005646, | |
| "learning_rate": 0.0002724655979891828, | |
| "loss": 1.4373, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.23980815347721823, | |
| "grad_norm": 0.21635404229164124, | |
| "learning_rate": 0.000271539429181768, | |
| "loss": 1.3639, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.24300559552358114, | |
| "grad_norm": 0.24112968146800995, | |
| "learning_rate": 0.0002705995695550411, | |
| "loss": 1.5238, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.24620303756994405, | |
| "grad_norm": 0.20409514009952545, | |
| "learning_rate": 0.00026964612497956946, | |
| "loss": 1.4533, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.24940047961630696, | |
| "grad_norm": 0.21514864265918732, | |
| "learning_rate": 0.0002686792028561983, | |
| "loss": 1.4657, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.25259792166266987, | |
| "grad_norm": 0.20796911418437958, | |
| "learning_rate": 0.00026769891210395207, | |
| "loss": 1.4834, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.2557953637090328, | |
| "grad_norm": 0.20425471663475037, | |
| "learning_rate": 0.00026670536314776593, | |
| "loss": 1.4799, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.2589928057553957, | |
| "grad_norm": 0.1899542212486267, | |
| "learning_rate": 0.0002656986679060462, | |
| "loss": 1.4862, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.2621902478017586, | |
| "grad_norm": 0.20222659409046173, | |
| "learning_rate": 0.00026467893977806387, | |
| "loss": 1.4788, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.2653876898481215, | |
| "grad_norm": 0.1941121220588684, | |
| "learning_rate": 0.0002636462936311804, | |
| "loss": 1.4913, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.2685851318944844, | |
| "grad_norm": 0.21576811373233795, | |
| "learning_rate": 0.0002626008457879086, | |
| "loss": 1.5327, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.2717825739408473, | |
| "grad_norm": 0.1937507688999176, | |
| "learning_rate": 0.00026154271401280957, | |
| "loss": 1.4609, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.2749800159872102, | |
| "grad_norm": 0.18996623158454895, | |
| "learning_rate": 0.0002604720174992268, | |
| "loss": 1.4023, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.27817745803357313, | |
| "grad_norm": 0.20716165006160736, | |
| "learning_rate": 0.00025938887685585994, | |
| "loss": 1.5351, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.28137490007993604, | |
| "grad_norm": 0.20239269733428955, | |
| "learning_rate": 0.0002582934140931786, | |
| "loss": 1.4851, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.28457234212629895, | |
| "grad_norm": 0.20915232598781586, | |
| "learning_rate": 0.0002571857526096788, | |
| "loss": 1.3798, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.28776978417266186, | |
| "grad_norm": 0.20972570776939392, | |
| "learning_rate": 0.00025606601717798207, | |
| "loss": 1.4097, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.29096722621902477, | |
| "grad_norm": 0.20584455132484436, | |
| "learning_rate": 0.0002549343339307813, | |
| "loss": 1.5279, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.2941646682653877, | |
| "grad_norm": 0.1897670328617096, | |
| "learning_rate": 0.00025379083034663194, | |
| "loss": 1.603, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.2973621103117506, | |
| "grad_norm": 0.19150228798389435, | |
| "learning_rate": 0.000252635635235592, | |
| "loss": 1.3939, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.3005595523581135, | |
| "grad_norm": 0.1970176249742508, | |
| "learning_rate": 0.00025146887872471303, | |
| "loss": 1.468, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.3037569944044764, | |
| "grad_norm": 0.19097474217414856, | |
| "learning_rate": 0.000250290692243381, | |
| "loss": 1.4303, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.3069544364508393, | |
| "grad_norm": 0.21538837254047394, | |
| "learning_rate": 0.00024910120850851216, | |
| "loss": 1.5775, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.3101518784972022, | |
| "grad_norm": 0.1855296939611435, | |
| "learning_rate": 0.0002479005615096028, | |
| "loss": 1.413, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.3133493205435651, | |
| "grad_norm": 0.23258726298809052, | |
| "learning_rate": 0.00024668888649363583, | |
| "loss": 1.5517, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.31654676258992803, | |
| "grad_norm": 0.19402435421943665, | |
| "learning_rate": 0.0002454663199498463, | |
| "loss": 1.3835, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.31974420463629094, | |
| "grad_norm": 0.1976032257080078, | |
| "learning_rate": 0.00024423299959434636, | |
| "loss": 1.4637, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3229416466826539, | |
| "grad_norm": 0.19951173663139343, | |
| "learning_rate": 0.0002429890643546119, | |
| "loss": 1.3731, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.3261390887290168, | |
| "grad_norm": 0.20681437849998474, | |
| "learning_rate": 0.0002417346543538337, | |
| "loss": 1.4865, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.3293365307753797, | |
| "grad_norm": 0.36958593130111694, | |
| "learning_rate": 0.00024046991089513267, | |
| "loss": 1.4612, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.33253397282174263, | |
| "grad_norm": 0.20621562004089355, | |
| "learning_rate": 0.00023919497644564298, | |
| "loss": 1.357, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.33573141486810554, | |
| "grad_norm": 0.18956023454666138, | |
| "learning_rate": 0.00023790999462046394, | |
| "loss": 1.6554, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.33892885691446845, | |
| "grad_norm": 0.2084682583808899, | |
| "learning_rate": 0.0002366151101664822, | |
| "loss": 1.4853, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.34212629896083135, | |
| "grad_norm": 0.17509467899799347, | |
| "learning_rate": 0.00023531046894606703, | |
| "loss": 1.4028, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.34532374100719426, | |
| "grad_norm": 0.19247236847877502, | |
| "learning_rate": 0.00023399621792063928, | |
| "loss": 1.4353, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.34852118305355717, | |
| "grad_norm": 0.19204045832157135, | |
| "learning_rate": 0.00023267250513411733, | |
| "loss": 1.3393, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.3517186250999201, | |
| "grad_norm": 0.20329782366752625, | |
| "learning_rate": 0.00023133947969624028, | |
| "loss": 1.6107, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.354916067146283, | |
| "grad_norm": 0.2169138640165329, | |
| "learning_rate": 0.00022999729176577163, | |
| "loss": 1.4617, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.3581135091926459, | |
| "grad_norm": 0.22543761134147644, | |
| "learning_rate": 0.00022864609253358474, | |
| "loss": 1.4731, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.3613109512390088, | |
| "grad_norm": 0.19519487023353577, | |
| "learning_rate": 0.00022728603420563175, | |
| "loss": 1.597, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.3645083932853717, | |
| "grad_norm": 0.20843897759914398, | |
| "learning_rate": 0.00022591726998579843, | |
| "loss": 1.4963, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.3677058353317346, | |
| "grad_norm": 0.2149285078048706, | |
| "learning_rate": 0.00022453995405864638, | |
| "loss": 1.5095, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.37090327737809753, | |
| "grad_norm": 0.19521689414978027, | |
| "learning_rate": 0.00022315424157204518, | |
| "loss": 1.5709, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.37410071942446044, | |
| "grad_norm": 0.19614940881729126, | |
| "learning_rate": 0.00022176028861969535, | |
| "loss": 1.4573, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.37729816147082335, | |
| "grad_norm": 0.1948356330394745, | |
| "learning_rate": 0.00022035825222354552, | |
| "loss": 1.309, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.38049560351718625, | |
| "grad_norm": 0.20020437240600586, | |
| "learning_rate": 0.00021894829031610452, | |
| "loss": 1.5289, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.38369304556354916, | |
| "grad_norm": 0.20084881782531738, | |
| "learning_rate": 0.00021753056172265096, | |
| "loss": 1.5456, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.38689048760991207, | |
| "grad_norm": 0.17715269327163696, | |
| "learning_rate": 0.00021610522614334265, | |
| "loss": 1.4322, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.390087929656275, | |
| "grad_norm": 0.2064034342765808, | |
| "learning_rate": 0.00021467244413522673, | |
| "loss": 1.5772, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.3932853717026379, | |
| "grad_norm": 0.19036740064620972, | |
| "learning_rate": 0.00021323237709415413, | |
| "loss": 1.5086, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.3964828137490008, | |
| "grad_norm": 0.19214606285095215, | |
| "learning_rate": 0.0002117851872365989, | |
| "loss": 1.5296, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.3996802557953637, | |
| "grad_norm": 0.20223727822303772, | |
| "learning_rate": 0.00021033103758138529, | |
| "loss": 1.5354, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.4028776978417266, | |
| "grad_norm": 0.18433460593223572, | |
| "learning_rate": 0.00020887009193132456, | |
| "loss": 1.532, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.4060751398880895, | |
| "grad_norm": 0.18365609645843506, | |
| "learning_rate": 0.00020740251485476345, | |
| "loss": 1.3326, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.40927258193445243, | |
| "grad_norm": 0.19547204673290253, | |
| "learning_rate": 0.0002059284716670463, | |
| "loss": 1.4566, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.41247002398081534, | |
| "grad_norm": 0.2268918752670288, | |
| "learning_rate": 0.00020444812841189294, | |
| "loss": 1.6165, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.41566746602717825, | |
| "grad_norm": 0.21848422288894653, | |
| "learning_rate": 0.0002029616518426951, | |
| "loss": 1.6039, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.41886490807354115, | |
| "grad_norm": 0.19918426871299744, | |
| "learning_rate": 0.00020146920940373195, | |
| "loss": 1.4602, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.42206235011990406, | |
| "grad_norm": 0.18590374290943146, | |
| "learning_rate": 0.00019997096921130862, | |
| "loss": 1.2925, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.42525979216626697, | |
| "grad_norm": 0.19987183809280396, | |
| "learning_rate": 0.00019846710003481875, | |
| "loss": 1.4157, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.4284572342126299, | |
| "grad_norm": 0.20987945795059204, | |
| "learning_rate": 0.00019695777127773332, | |
| "loss": 1.4424, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.4316546762589928, | |
| "grad_norm": 0.21076463162899017, | |
| "learning_rate": 0.00019544315295851825, | |
| "loss": 1.4946, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.4348521183053557, | |
| "grad_norm": 0.20848603546619415, | |
| "learning_rate": 0.00019392341569148252, | |
| "loss": 1.4393, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.4380495603517186, | |
| "grad_norm": 0.21943925321102142, | |
| "learning_rate": 0.00019239873066755964, | |
| "loss": 1.6161, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.4412470023980815, | |
| "grad_norm": 0.23087991774082184, | |
| "learning_rate": 0.0001908692696350234, | |
| "loss": 1.3502, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 0.20302651822566986, | |
| "learning_rate": 0.00018933520488014166, | |
| "loss": 1.3896, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.44764188649080733, | |
| "grad_norm": 0.19597011804580688, | |
| "learning_rate": 0.00018779670920776877, | |
| "loss": 1.4437, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.45083932853717024, | |
| "grad_norm": 0.21784569323062897, | |
| "learning_rate": 0.00018625395592188036, | |
| "loss": 1.5956, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.4540367705835332, | |
| "grad_norm": 0.20360009372234344, | |
| "learning_rate": 0.00018470711880605122, | |
| "loss": 1.2507, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.4572342126298961, | |
| "grad_norm": 0.1850934773683548, | |
| "learning_rate": 0.00018315637210387947, | |
| "loss": 1.477, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.460431654676259, | |
| "grad_norm": 0.22538472712039948, | |
| "learning_rate": 0.00018160189049935892, | |
| "loss": 1.3688, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.4636290967226219, | |
| "grad_norm": 0.2093997299671173, | |
| "learning_rate": 0.00018004384909720188, | |
| "loss": 1.3953, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.46682653876898483, | |
| "grad_norm": 0.19743283092975616, | |
| "learning_rate": 0.00017848242340311424, | |
| "loss": 1.5111, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.47002398081534774, | |
| "grad_norm": 0.23592239618301392, | |
| "learning_rate": 0.0001769177893040258, | |
| "loss": 1.4628, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.47322142286171065, | |
| "grad_norm": 0.2107086479663849, | |
| "learning_rate": 0.00017535012304827736, | |
| "loss": 1.345, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.47641886490807356, | |
| "grad_norm": 0.212343230843544, | |
| "learning_rate": 0.00017377960122576732, | |
| "loss": 1.4294, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.47961630695443647, | |
| "grad_norm": 0.280923455953598, | |
| "learning_rate": 0.0001722064007480597, | |
| "loss": 1.6237, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4828137490007994, | |
| "grad_norm": 0.19629351794719696, | |
| "learning_rate": 0.00017063069882845575, | |
| "loss": 1.439, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.4860111910471623, | |
| "grad_norm": 0.2047591209411621, | |
| "learning_rate": 0.0001690526729620318, | |
| "loss": 1.3626, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.4892086330935252, | |
| "grad_norm": 0.18259218335151672, | |
| "learning_rate": 0.00016747250090564557, | |
| "loss": 1.3234, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.4924060751398881, | |
| "grad_norm": 0.20569853484630585, | |
| "learning_rate": 0.00016589036065791242, | |
| "loss": 1.4376, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.495603517186251, | |
| "grad_norm": 0.18437625467777252, | |
| "learning_rate": 0.0001643064304391547, | |
| "loss": 1.4705, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.4988009592326139, | |
| "grad_norm": 0.22610221803188324, | |
| "learning_rate": 0.00016272088867132637, | |
| "loss": 1.3045, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.5019984012789768, | |
| "grad_norm": 0.197098046541214, | |
| "learning_rate": 0.00016113391395791436, | |
| "loss": 1.531, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.5051958433253397, | |
| "grad_norm": 0.2230396866798401, | |
| "learning_rate": 0.00015954568506381994, | |
| "loss": 1.5164, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.5083932853717026, | |
| "grad_norm": 0.19642704725265503, | |
| "learning_rate": 0.0001579563808952216, | |
| "loss": 1.4442, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.5115907274180655, | |
| "grad_norm": 0.21066069602966309, | |
| "learning_rate": 0.00015636618047942222, | |
| "loss": 1.4251, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.5147881694644284, | |
| "grad_norm": 0.18799303472042084, | |
| "learning_rate": 0.0001547752629446827, | |
| "loss": 1.3866, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.5179856115107914, | |
| "grad_norm": 0.20167718827724457, | |
| "learning_rate": 0.00015318380750004352, | |
| "loss": 1.471, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.5211830535571543, | |
| "grad_norm": 0.20787064731121063, | |
| "learning_rate": 0.00015159199341513845, | |
| "loss": 1.5312, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.5243804956035172, | |
| "grad_norm": 0.19502943754196167, | |
| "learning_rate": 0.00015, | |
| "loss": 1.5153, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.5275779376498801, | |
| "grad_norm": 0.18463830649852753, | |
| "learning_rate": 0.00014840800658486158, | |
| "loss": 1.62, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.530775379696243, | |
| "grad_norm": 0.20096978545188904, | |
| "learning_rate": 0.00014681619249995646, | |
| "loss": 1.3816, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.533972821742606, | |
| "grad_norm": 0.20995350182056427, | |
| "learning_rate": 0.00014522473705531736, | |
| "loss": 1.4321, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.5371702637889688, | |
| "grad_norm": 0.1865735948085785, | |
| "learning_rate": 0.00014363381952057778, | |
| "loss": 1.4262, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.5403677058353318, | |
| "grad_norm": 0.1792657971382141, | |
| "learning_rate": 0.00014204361910477844, | |
| "loss": 1.5558, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.5435651478816946, | |
| "grad_norm": 0.2027653157711029, | |
| "learning_rate": 0.00014045431493618003, | |
| "loss": 1.3377, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5467625899280576, | |
| "grad_norm": 0.19514119625091553, | |
| "learning_rate": 0.0001388660860420856, | |
| "loss": 1.3874, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.5499600319744204, | |
| "grad_norm": 0.17817656695842743, | |
| "learning_rate": 0.00013727911132867365, | |
| "loss": 1.3716, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.5531574740207834, | |
| "grad_norm": 0.23043349385261536, | |
| "learning_rate": 0.00013569356956084528, | |
| "loss": 1.464, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.5563549160671463, | |
| "grad_norm": 0.19135528802871704, | |
| "learning_rate": 0.00013410963934208759, | |
| "loss": 1.3154, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.5595523581135092, | |
| "grad_norm": 0.20745159685611725, | |
| "learning_rate": 0.0001325274990943544, | |
| "loss": 1.4785, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.5627498001598721, | |
| "grad_norm": 0.20532263815402985, | |
| "learning_rate": 0.00013094732703796818, | |
| "loss": 1.5137, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.565947242206235, | |
| "grad_norm": 0.21446797251701355, | |
| "learning_rate": 0.00012936930117154425, | |
| "loss": 1.3701, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.5691446842525979, | |
| "grad_norm": 0.19260822236537933, | |
| "learning_rate": 0.0001277935992519403, | |
| "loss": 1.4443, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.5723421262989609, | |
| "grad_norm": 0.19996041059494019, | |
| "learning_rate": 0.00012622039877423265, | |
| "loss": 1.371, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.5755395683453237, | |
| "grad_norm": 0.19244007766246796, | |
| "learning_rate": 0.00012464987695172264, | |
| "loss": 1.3142, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5787370103916867, | |
| "grad_norm": 0.19164302945137024, | |
| "learning_rate": 0.00012308221069597418, | |
| "loss": 1.4773, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.5819344524380495, | |
| "grad_norm": 0.20002460479736328, | |
| "learning_rate": 0.00012151757659688571, | |
| "loss": 1.4264, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.5851318944844125, | |
| "grad_norm": 0.21552026271820068, | |
| "learning_rate": 0.00011995615090279813, | |
| "loss": 1.4049, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.5883293365307753, | |
| "grad_norm": 0.19300565123558044, | |
| "learning_rate": 0.00011839810950064109, | |
| "loss": 1.3554, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.5915267785771383, | |
| "grad_norm": 0.19941386580467224, | |
| "learning_rate": 0.00011684362789612053, | |
| "loss": 1.5601, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.5947242206235012, | |
| "grad_norm": 0.18221646547317505, | |
| "learning_rate": 0.00011529288119394878, | |
| "loss": 1.4828, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.5979216626698641, | |
| "grad_norm": 0.1901618093252182, | |
| "learning_rate": 0.00011374604407811962, | |
| "loss": 1.5442, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.601119104716227, | |
| "grad_norm": 0.17420263588428497, | |
| "learning_rate": 0.00011220329079223123, | |
| "loss": 1.285, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.60431654676259, | |
| "grad_norm": 0.23658356070518494, | |
| "learning_rate": 0.00011066479511985838, | |
| "loss": 1.2485, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.6075139888089528, | |
| "grad_norm": 0.20968788862228394, | |
| "learning_rate": 0.00010913073036497658, | |
| "loss": 1.3972, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.6107114308553158, | |
| "grad_norm": 0.2030273675918579, | |
| "learning_rate": 0.00010760126933244036, | |
| "loss": 1.6353, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.6139088729016786, | |
| "grad_norm": 0.1902075558900833, | |
| "learning_rate": 0.00010607658430851744, | |
| "loss": 1.2809, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.6171063149480416, | |
| "grad_norm": 0.20934785902500153, | |
| "learning_rate": 0.00010455684704148173, | |
| "loss": 1.3585, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.6203037569944044, | |
| "grad_norm": 0.2173265963792801, | |
| "learning_rate": 0.00010304222872226668, | |
| "loss": 1.2973, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.6235011990407674, | |
| "grad_norm": 0.19533811509609222, | |
| "learning_rate": 0.00010153289996518125, | |
| "loss": 1.4299, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.6266986410871302, | |
| "grad_norm": 0.2015613615512848, | |
| "learning_rate": 0.00010002903078869135, | |
| "loss": 1.4279, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.6298960831334932, | |
| "grad_norm": 0.20218639075756073, | |
| "learning_rate": 9.853079059626805e-05, | |
| "loss": 1.3212, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.6330935251798561, | |
| "grad_norm": 0.1902882307767868, | |
| "learning_rate": 9.703834815730487e-05, | |
| "loss": 1.3939, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.636290967226219, | |
| "grad_norm": 0.18366214632987976, | |
| "learning_rate": 9.555187158810702e-05, | |
| "loss": 1.4403, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.6394884092725819, | |
| "grad_norm": 0.1821315586566925, | |
| "learning_rate": 9.407152833295372e-05, | |
| "loss": 1.372, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6426858513189448, | |
| "grad_norm": 0.20973654091358185, | |
| "learning_rate": 9.259748514523653e-05, | |
| "loss": 1.4149, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.6458832933653078, | |
| "grad_norm": 0.18254290521144867, | |
| "learning_rate": 9.112990806867543e-05, | |
| "loss": 1.3052, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.6490807354116707, | |
| "grad_norm": 0.18717211484909058, | |
| "learning_rate": 8.966896241861473e-05, | |
| "loss": 1.4061, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.6522781774580336, | |
| "grad_norm": 0.17621521651744843, | |
| "learning_rate": 8.821481276340112e-05, | |
| "loss": 1.6093, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.6554756195043965, | |
| "grad_norm": 0.1912049949169159, | |
| "learning_rate": 8.676762290584585e-05, | |
| "loss": 1.353, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.6586730615507594, | |
| "grad_norm": 0.2157009094953537, | |
| "learning_rate": 8.532755586477324e-05, | |
| "loss": 1.4063, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.6618705035971223, | |
| "grad_norm": 0.18072722852230072, | |
| "learning_rate": 8.389477385665732e-05, | |
| "loss": 1.5591, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.6650679456434853, | |
| "grad_norm": 0.22034448385238647, | |
| "learning_rate": 8.246943827734897e-05, | |
| "loss": 1.4766, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.6682653876898481, | |
| "grad_norm": 0.21938645839691162, | |
| "learning_rate": 8.105170968389552e-05, | |
| "loss": 1.3791, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.6714628297362111, | |
| "grad_norm": 0.19702577590942383, | |
| "learning_rate": 7.964174777645448e-05, | |
| "loss": 1.5582, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6746602717825739, | |
| "grad_norm": 0.20586428046226501, | |
| "learning_rate": 7.823971138030466e-05, | |
| "loss": 1.4005, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.6778577138289369, | |
| "grad_norm": 0.1924622356891632, | |
| "learning_rate": 7.684575842795485e-05, | |
| "loss": 1.4078, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.6810551558752997, | |
| "grad_norm": 0.1937723606824875, | |
| "learning_rate": 7.546004594135356e-05, | |
| "loss": 1.2821, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.6842525979216627, | |
| "grad_norm": 0.22969581186771393, | |
| "learning_rate": 7.408273001420153e-05, | |
| "loss": 1.2398, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.6874500399680256, | |
| "grad_norm": 0.19231727719306946, | |
| "learning_rate": 7.271396579436825e-05, | |
| "loss": 1.3752, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.6906474820143885, | |
| "grad_norm": 0.20469219982624054, | |
| "learning_rate": 7.135390746641526e-05, | |
| "loss": 1.352, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.6938449240607514, | |
| "grad_norm": 0.19728676974773407, | |
| "learning_rate": 7.000270823422837e-05, | |
| "loss": 1.5623, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.6970423661071143, | |
| "grad_norm": 0.22052626311779022, | |
| "learning_rate": 6.866052030375974e-05, | |
| "loss": 1.4183, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.7002398081534772, | |
| "grad_norm": 0.19779476523399353, | |
| "learning_rate": 6.732749486588266e-05, | |
| "loss": 1.4014, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.7034372501998402, | |
| "grad_norm": 0.1978594809770584, | |
| "learning_rate": 6.600378207936069e-05, | |
| "loss": 1.4317, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.706634692246203, | |
| "grad_norm": 0.2020850032567978, | |
| "learning_rate": 6.468953105393297e-05, | |
| "loss": 1.4208, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.709832134292566, | |
| "grad_norm": 0.18292494118213654, | |
| "learning_rate": 6.338488983351777e-05, | |
| "loss": 1.3283, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.7130295763389288, | |
| "grad_norm": 0.2223280966281891, | |
| "learning_rate": 6.209000537953605e-05, | |
| "loss": 1.4245, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.7162270183852918, | |
| "grad_norm": 0.22692078351974487, | |
| "learning_rate": 6.080502355435701e-05, | |
| "loss": 1.5982, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.7194244604316546, | |
| "grad_norm": 0.19702717661857605, | |
| "learning_rate": 5.9530089104867386e-05, | |
| "loss": 1.3909, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.7226219024780176, | |
| "grad_norm": 0.22220925986766815, | |
| "learning_rate": 5.826534564616633e-05, | |
| "loss": 1.4322, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.7258193445243805, | |
| "grad_norm": 0.20837551355361938, | |
| "learning_rate": 5.701093564538806e-05, | |
| "loss": 1.3919, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.7290167865707434, | |
| "grad_norm": 0.1905641108751297, | |
| "learning_rate": 5.5767000405653636e-05, | |
| "loss": 1.446, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.7322142286171063, | |
| "grad_norm": 0.20399922132492065, | |
| "learning_rate": 5.453368005015363e-05, | |
| "loss": 1.3922, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.7354116706634692, | |
| "grad_norm": 0.19176483154296875, | |
| "learning_rate": 5.3311113506364116e-05, | |
| "loss": 1.3255, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7386091127098321, | |
| "grad_norm": 0.21297192573547363, | |
| "learning_rate": 5.209943849039722e-05, | |
| "loss": 1.3992, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.7418065547561951, | |
| "grad_norm": 0.20219087600708008, | |
| "learning_rate": 5.089879149148781e-05, | |
| "loss": 1.5462, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.7450039968025579, | |
| "grad_norm": 0.1977456510066986, | |
| "learning_rate": 4.9709307756618985e-05, | |
| "loss": 1.4046, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.7482014388489209, | |
| "grad_norm": 0.22329548001289368, | |
| "learning_rate": 4.853112127528698e-05, | |
| "loss": 1.5767, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.7513988808952837, | |
| "grad_norm": 0.20563232898712158, | |
| "learning_rate": 4.736436476440791e-05, | |
| "loss": 1.6348, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.7545963229416467, | |
| "grad_norm": 0.19388997554779053, | |
| "learning_rate": 4.6209169653368086e-05, | |
| "loss": 1.364, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.7577937649880095, | |
| "grad_norm": 0.2103840559720993, | |
| "learning_rate": 4.506566606921864e-05, | |
| "loss": 1.4538, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.7609912070343725, | |
| "grad_norm": 0.17306749522686005, | |
| "learning_rate": 4.3933982822017876e-05, | |
| "loss": 1.4435, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.7641886490807354, | |
| "grad_norm": 0.20918579399585724, | |
| "learning_rate": 4.2814247390321215e-05, | |
| "loss": 1.2357, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.7673860911270983, | |
| "grad_norm": 0.21173876523971558, | |
| "learning_rate": 4.1706585906821334e-05, | |
| "loss": 1.2602, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7705835331734612, | |
| "grad_norm": 0.19886651635169983, | |
| "learning_rate": 4.0611123144140075e-05, | |
| "loss": 1.4166, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.7737809752198241, | |
| "grad_norm": 0.19375504553318024, | |
| "learning_rate": 3.952798250077317e-05, | |
| "loss": 1.3777, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.7769784172661871, | |
| "grad_norm": 0.20145930349826813, | |
| "learning_rate": 3.84572859871904e-05, | |
| "loss": 1.3258, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.78017585931255, | |
| "grad_norm": 0.2076532244682312, | |
| "learning_rate": 3.739915421209133e-05, | |
| "loss": 1.3921, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.7833733013589129, | |
| "grad_norm": 0.19265635311603546, | |
| "learning_rate": 3.635370636881958e-05, | |
| "loss": 1.4043, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.7865707434052758, | |
| "grad_norm": 0.19883492588996887, | |
| "learning_rate": 3.532106022193615e-05, | |
| "loss": 1.346, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.7897681854516387, | |
| "grad_norm": 0.18948738276958466, | |
| "learning_rate": 3.4301332093953807e-05, | |
| "loss": 1.4363, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.7929656274980016, | |
| "grad_norm": 0.18976429104804993, | |
| "learning_rate": 3.3294636852234105e-05, | |
| "loss": 1.4316, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.7961630695443646, | |
| "grad_norm": 0.202013298869133, | |
| "learning_rate": 3.230108789604792e-05, | |
| "loss": 1.4532, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.7993605115907274, | |
| "grad_norm": 0.2116522341966629, | |
| "learning_rate": 3.132079714380171e-05, | |
| "loss": 1.5129, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.8025579536370904, | |
| "grad_norm": 0.19418169558048248, | |
| "learning_rate": 3.035387502043052e-05, | |
| "loss": 1.3265, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.8057553956834532, | |
| "grad_norm": 0.21084119379520416, | |
| "learning_rate": 2.9400430444958932e-05, | |
| "loss": 1.3929, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.8089528377298162, | |
| "grad_norm": 0.23588140308856964, | |
| "learning_rate": 2.846057081823201e-05, | |
| "loss": 1.2077, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.812150279776179, | |
| "grad_norm": 0.21185244619846344, | |
| "learning_rate": 2.7534402010817157e-05, | |
| "loss": 1.2874, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.815347721822542, | |
| "grad_norm": 0.184846431016922, | |
| "learning_rate": 2.6622028351078277e-05, | |
| "loss": 1.4785, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.8185451638689049, | |
| "grad_norm": 0.1995445042848587, | |
| "learning_rate": 2.5723552613423687e-05, | |
| "loss": 1.4153, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.8217426059152678, | |
| "grad_norm": 0.20493745803833008, | |
| "learning_rate": 2.4839076006729082e-05, | |
| "loss": 1.448, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.8249400479616307, | |
| "grad_norm": 0.1989341676235199, | |
| "learning_rate": 2.3968698162936854e-05, | |
| "loss": 1.4733, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.8281374900079936, | |
| "grad_norm": 0.20579148828983307, | |
| "learning_rate": 2.311251712583307e-05, | |
| "loss": 1.4746, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.8313349320543565, | |
| "grad_norm": 0.2025279700756073, | |
| "learning_rate": 2.2270629340003303e-05, | |
| "loss": 1.6248, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.8345323741007195, | |
| "grad_norm": 0.17980627715587616, | |
| "learning_rate": 2.1443129639968615e-05, | |
| "loss": 1.3753, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.8377298161470823, | |
| "grad_norm": 0.21116185188293457, | |
| "learning_rate": 2.063011123950295e-05, | |
| "loss": 1.2975, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.8409272581934453, | |
| "grad_norm": 0.20071591436862946, | |
| "learning_rate": 1.9831665721132954e-05, | |
| "loss": 1.444, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.8441247002398081, | |
| "grad_norm": 0.19569140672683716, | |
| "learning_rate": 1.9047883025821774e-05, | |
| "loss": 1.5126, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.8473221422861711, | |
| "grad_norm": 0.19419822096824646, | |
| "learning_rate": 1.827885144283769e-05, | |
| "loss": 1.3867, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.8505195843325339, | |
| "grad_norm": 0.19556277990341187, | |
| "learning_rate": 1.75246575998086e-05, | |
| "loss": 1.3758, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.8537170263788969, | |
| "grad_norm": 0.20848549902439117, | |
| "learning_rate": 1.678538645296391e-05, | |
| "loss": 1.4835, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.8569144684252598, | |
| "grad_norm": 0.19634144008159637, | |
| "learning_rate": 1.6061121277564743e-05, | |
| "loss": 1.4624, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.8601119104716227, | |
| "grad_norm": 0.19600766897201538, | |
| "learning_rate": 1.535194365852315e-05, | |
| "loss": 1.2323, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.8633093525179856, | |
| "grad_norm": 0.21323877573013306, | |
| "learning_rate": 1.4657933481212242e-05, | |
| "loss": 1.5224, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8665067945643485, | |
| "grad_norm": 0.18555647134780884, | |
| "learning_rate": 1.3979168922467298e-05, | |
| "loss": 1.3663, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.8697042366107114, | |
| "grad_norm": 0.19477520883083344, | |
| "learning_rate": 1.3315726441779629e-05, | |
| "loss": 1.4892, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.8729016786570744, | |
| "grad_norm": 0.19639001786708832, | |
| "learning_rate": 1.2667680772683825e-05, | |
| "loss": 1.2377, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.8760991207034372, | |
| "grad_norm": 0.21710480749607086, | |
| "learning_rate": 1.2035104914339188e-05, | |
| "loss": 1.3991, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.8792965627498002, | |
| "grad_norm": 0.21137666702270508, | |
| "learning_rate": 1.1418070123306989e-05, | |
| "loss": 1.5307, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.882494004796163, | |
| "grad_norm": 0.19870568811893463, | |
| "learning_rate": 1.0816645905523597e-05, | |
| "loss": 1.341, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.885691446842526, | |
| "grad_norm": 0.2340983897447586, | |
| "learning_rate": 1.0230900008471072e-05, | |
| "loss": 1.3578, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 0.18889744579792023, | |
| "learning_rate": 9.660898413545692e-06, | |
| "loss": 1.4085, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.8920863309352518, | |
| "grad_norm": 0.213284432888031, | |
| "learning_rate": 9.106705328625408e-06, | |
| "loss": 1.3843, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.8952837729816147, | |
| "grad_norm": 0.2060411274433136, | |
| "learning_rate": 8.568383180837368e-06, | |
| "loss": 1.473, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8984812150279776, | |
| "grad_norm": 0.18018406629562378, | |
| "learning_rate": 8.04599260952557e-06, | |
| "loss": 1.3782, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.9016786570743405, | |
| "grad_norm": 0.18678754568099976, | |
| "learning_rate": 7.539592459420219e-06, | |
| "loss": 1.4252, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.9048760991207034, | |
| "grad_norm": 0.2027515172958374, | |
| "learning_rate": 7.049239774009213e-06, | |
| "loss": 1.3717, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.9080735411670664, | |
| "grad_norm": 0.20960167050361633, | |
| "learning_rate": 6.574989789112372e-06, | |
| "loss": 1.2815, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.9112709832134293, | |
| "grad_norm": 0.19627049565315247, | |
| "learning_rate": 6.11689592665951e-06, | |
| "loss": 1.4348, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.9144684252597922, | |
| "grad_norm": 0.20119017362594604, | |
| "learning_rate": 5.675009788672596e-06, | |
| "loss": 1.3343, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.9176658673061551, | |
| "grad_norm": 0.18706481158733368, | |
| "learning_rate": 5.2493811514531635e-06, | |
| "loss": 1.3721, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.920863309352518, | |
| "grad_norm": 0.19794286787509918, | |
| "learning_rate": 4.840057959975169e-06, | |
| "loss": 1.3626, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.9240607513988809, | |
| "grad_norm": 0.1808895319700241, | |
| "learning_rate": 4.44708632248425e-06, | |
| "loss": 1.5342, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.9272581934452439, | |
| "grad_norm": 0.1820111721754074, | |
| "learning_rate": 4.070510505303814e-06, | |
| "loss": 1.4357, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.9304556354916067, | |
| "grad_norm": 0.1756613701581955, | |
| "learning_rate": 3.710372927848776e-06, | |
| "loss": 1.328, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.9336530775379697, | |
| "grad_norm": 0.19259536266326904, | |
| "learning_rate": 3.366714157847078e-06, | |
| "loss": 1.2882, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.9368505195843325, | |
| "grad_norm": 0.20220039784908295, | |
| "learning_rate": 3.0395729067700324e-06, | |
| "loss": 1.3903, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.9400479616306955, | |
| "grad_norm": 0.1991778463125229, | |
| "learning_rate": 2.728986025471641e-06, | |
| "loss": 1.3649, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.9432454036770583, | |
| "grad_norm": 0.20098921656608582, | |
| "learning_rate": 2.4349885000374657e-06, | |
| "loss": 1.4128, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.9464428457234213, | |
| "grad_norm": 0.18276216089725494, | |
| "learning_rate": 2.1576134478437313e-06, | |
| "loss": 1.3548, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.9496402877697842, | |
| "grad_norm": 0.21758389472961426, | |
| "learning_rate": 1.8968921138267091e-06, | |
| "loss": 1.4765, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.9528377298161471, | |
| "grad_norm": 0.18690507113933563, | |
| "learning_rate": 1.6528538669631997e-06, | |
| "loss": 1.5375, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.95603517186251, | |
| "grad_norm": 0.1706872582435608, | |
| "learning_rate": 1.4255261969622456e-06, | |
| "loss": 1.2775, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.9592326139088729, | |
| "grad_norm": 0.2452152669429779, | |
| "learning_rate": 1.2149347111684749e-06, | |
| "loss": 1.2828, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9624300559552358, | |
| "grad_norm": 0.17894317209720612, | |
| "learning_rate": 1.0211031316776919e-06, | |
| "loss": 1.4131, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.9656274980015987, | |
| "grad_norm": 0.21982480585575104, | |
| "learning_rate": 8.440532926646315e-07, | |
| "loss": 1.3501, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.9688249400479616, | |
| "grad_norm": 0.19508808851242065, | |
| "learning_rate": 6.838051379234099e-07, | |
| "loss": 1.3474, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.9720223820943246, | |
| "grad_norm": 0.1852046549320221, | |
| "learning_rate": 5.403767186210218e-07, | |
| "loss": 1.3791, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.9752198241406874, | |
| "grad_norm": 0.18738119304180145, | |
| "learning_rate": 4.137841912639328e-07, | |
| "loss": 1.4893, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.9784172661870504, | |
| "grad_norm": 0.20034608244895935, | |
| "learning_rate": 3.0404181587811994e-07, | |
| "loss": 1.4388, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.9816147082334132, | |
| "grad_norm": 0.20295751094818115, | |
| "learning_rate": 2.1116195440278872e-07, | |
| "loss": 1.4804, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.9848121502797762, | |
| "grad_norm": 0.207365021109581, | |
| "learning_rate": 1.3515506929778762e-07, | |
| "loss": 1.4719, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.988009592326139, | |
| "grad_norm": 0.2223723828792572, | |
| "learning_rate": 7.602972236513405e-08, | |
| "loss": 1.3123, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.991207034372502, | |
| "grad_norm": 0.2046136111021042, | |
| "learning_rate": 3.3792573784585665e-08, | |
| "loss": 1.4272, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9944044764188649, | |
| "grad_norm": 0.21449051797389984, | |
| "learning_rate": 8.448381363307388e-09, | |
| "loss": 1.3367, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.9976019184652278, | |
| "grad_norm": 0.21067871153354645, | |
| "learning_rate": 0.0, | |
| "loss": 1.4037, | |
| "step": 312 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 312, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.138882997433958e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |